In [None]:
import warnings
# Install a global "ignore" filter: every warning raised after this cell runs is
# suppressed, so none of them appear in the cell outputs below.
# NOTE(review): this also hides warnings that may signal real problems
# (e.g. deprecations); consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [None]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [None]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

## Read the CSV and perform the basic cleaning

In [None]:
# Location of the input CSV (resolved relative to the notebook's working directory)
file_path = Path('Team_merge.csv')

# Read the file, discard every row that contains at least one null value, and
# renumber the index so it is contiguous again after the rows are removed.
df = (
    pd.read_csv(file_path)
    .dropna()
    .reset_index(drop=True)
)

# Preview the first rows of the cleaned frame
df.head()

In [None]:
# Display the column labels of the cleaned frame
df.columns

In [None]:
# Columns removed from the frame before the features/target are built.
# The labels must match the CSV header exactly: drop() raises KeyError for any
# label that is missing, which also means this cell cannot be re-run on an
# already-trimmed `df` without reloading the data first.
columns_to_drop = [
    'MATCHUP', 'GAME_CLOCK', 'CLOSEST_DEFENDER', 'CLOSEST_DEFENDER_PLAYER_ID',
    'player_name', 'player_id', 'DATE', 'TIMESTAMP', 'Name (Shooter)',
    'FGM', 'FGA', '3PM', '3PA', 'FTM', 'FTA', 'AST/TOV', 'STL/TOV',
    'Age_x', 'Birth_Place', 'Birthdate', 'Collage', 'Experience', 'Pos',
    'Team', 'BMI', 'CD Last Name', 'CD First Name', 'CD NAME', 'Name (CD)',
    'Age_y', 'TOV%.1', 'FT/FGA.1', 'Arena', 'Attend.', 'Attend./G',
]
df = df.drop(columns=columns_to_drop)


## Split into training and testing

In [None]:
# Target: the SHOT_RESULT column
y = df["SHOT_RESULT"]

# Features: every remaining column except the target.
# NOTE: one-hot encoding with pd.get_dummies(X) is intentionally not applied here.
X = df.drop(columns=["SHOT_RESULT"])

In [None]:
# Preview only the first rows of the feature frame (same convention as the
# df.head() preview after loading) instead of rendering the whole frame.
X.head()

In [None]:
# 'Team' is one of the columns dropped from df before X is built, so indexing
# X['Team'] raises KeyError on a fresh Restart & Run All. Look the dtype up
# defensively: show it when the column exists, otherwise show a short notice.
X.dtypes.get('Team', "'Team' is not a column of X")

In [None]:
# 'MATCHUP' is dropped from df before X is built; guard the lookup so the cell
# evaluates to False instead of raising KeyError when the column is absent.
'MATCHUP' in X.columns and X['MATCHUP'].dtypes == 'object'

In [None]:
# Collect the names of the feature columns whose dtype is still `object`
# (i.e. not yet numeric), then print them one per line.
object_columns = [name for name, dtype in X.dtypes.items() if dtype == 'object']
for name in object_columns:
    print(name)

In [None]:
# 'Experience' is among the columns dropped from df earlier in the notebook, so
# an unconditional X['Experience'] lookup raises KeyError on a fresh run.
# Convert only when the column is present and not numeric yet; the dtype check
# also makes the cell safe to re-run (.str is unavailable on a float column).
if 'Experience' in X.columns and not pd.api.types.is_numeric_dtype(X['Experience']):
    # Replace the literal character 'R' with '0' and cast the column to float.
    # NOTE(review): 'R' presumably marks rookies — confirm against the data.
    X['Experience'] = X['Experience'].str.replace('R', '0', regex=False).astype(float)

In [None]:
# Show the dtype of every feature column
X.dtypes

In [None]:
# Feature column labels as a plain Python list
X.columns.tolist()

**TODO:** Add `GAME_CLOCK` and `Experience` back as features after converting their data types.
1. Convert columns typed as "object" that actually hold numbers to numeric types, and drop timestamps, names, and categorical variables that aren't directly related to the target.
2. For the categorical columns you want to keep, use `pd.get_dummies` (ONLY for these, not for the float or int columns).
3. Turn `GAME_CLOCK` into the number of seconds elapsed since the beginning of the game, instead of just HH:MM:SS.

In [None]:
# Summary statistics for the feature columns
X.describe()

In [None]:
# Check the balance of our target values: count how many rows fall into each
# SHOT_RESULT class
y.value_counts()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing (the value train_test_split uses when no
# size is given); the fixed seed keeps the split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

## Ensemble Learners
1. Train the model using the training data.
2. Calculate the balanced accuracy score using `sklearn.metrics`.
3. Print the confusion matrix using `sklearn.metrics`.
4. Generate a classification report using `classification_report_imbalanced` from imbalanced-learn.

## Balanced Random Forest Classifier

In [None]:
# Resample the training data with the BalancedRandomForestClassifier.
# random_state is pinned (same seed as the EasyEnsembleClassifier cell) so the
# fitted forest, and every metric and feature importance derived from it, is
# reproducible across Restart & Run All.
from imblearn.ensemble import BalancedRandomForestClassifier
random_forest = BalancedRandomForestClassifier(n_estimators=100, random_state=1)
random_forest = random_forest.fit(X_train, y_train)

In [None]:
# Calculate the balanced accuracy score of the balanced random forest on the
# held-out test split. balanced_accuracy_score is already imported in the
# imports cell at the top of the notebook, so it is not re-imported here.
y_pred = random_forest.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

In [None]:
# Display the confusion matrix for the current y_pred.
# confusion_matrix is already imported in the imports cell at the top of the
# notebook, so it is not re-imported here.
confusion_matrix(y_test, y_pred)

In [None]:
# Print the imbalanced classification report for the current y_pred.
# classification_report_imbalanced is already imported in the imports cell at
# the top of the notebook, so it is not re-imported here.
print(classification_report_imbalanced(y_test, y_pred))

In [None]:
# Pair each importance score from the fitted forest with its feature name and
# list the pairs from most to least important.
feature_names = X.columns
importance_pairs = zip(random_forest.feature_importances_, feature_names)
sorted(importance_pairs, reverse=True)

## Easy Ensemble AdaBoost Classifier

In [None]:
# Build and fit the EasyEnsembleClassifier in one step; fit() returns the
# fitted estimator, which is what `easy` ends up bound to.
from imblearn.ensemble import EasyEnsembleClassifier
easy = EasyEnsembleClassifier(
    n_estimators=100,
    random_state=1,
).fit(X_train, y_train)

In [None]:
# Calculate the balanced accuracy score of the easy ensemble on the test split.
# Note: y_pred is reassigned here, replacing the balanced random forest
# predictions stored under the same name earlier in the notebook.
y_pred = easy.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

In [None]:
# Display the confusion matrix for the current y_pred (true labels first,
# predictions second)
confusion_matrix(y_test, y_pred)

In [None]:
# Print the imbalanced classification report for the current y_pred
print(classification_report_imbalanced(y_test, y_pred))