In [1]:
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation and convergence warnings raised by
# pandas / scikit-learn / imbalanced-learn; consider filtering by category.
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [3]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

## Read the CSV and perform the basic cleaning

In [4]:
# Read the merged dataset, discard every row that contains at least one
# missing value, and renumber the surviving rows from 0.
file_path = Path('Team_merge.csv')
df = (
    pd.read_csv(file_path)
    .dropna()
    .reset_index(drop=True)
)

# Preview the first rows of the cleaned frame
df.head()

Unnamed: 0,MATCHUP,SHOT_NUMBER,PERIOD,GAME_CLOCK,SHOT_DIST,SHOT_RESULT,CLOSEST_DEFENDER,CLOSEST_DEFENDER_PLAYER_ID,CLOSE_DEF_DIST,player_name,...,TOV%,ORB%,FT/FGA,eFG%.1,TOV%.1,DRB%,FT/FGA.1,Arena,Attend.,Attend./G
0,"MAR 04, 2015 - CHA @ BKN",1,1,00:01:09,7.7,made,"Anderson, Alan",101187,1.3,Brian Roberts,...,11.2,22.1,0.202,0.487,12.0,79.3,0.188,Time Warner Cable Arena,704886,17192
1,"MAR 04, 2015 - CHA @ BKN",2,1,00:02:31,20.0,missed,"Anderson, Alan",101187,1.8,Gerald Henderson,...,11.2,22.1,0.202,0.487,12.0,79.3,0.188,Time Warner Cable Arena,704886,17192
2,"MAR 04, 2015 - CHA @ BKN",5,3,00:11:36,21.2,made,"Anderson, Alan",101187,4.1,Gerald Henderson,...,11.2,22.1,0.202,0.487,12.0,79.3,0.188,Time Warner Cable Arena,704886,17192
3,"MAR 04, 2015 - CHA @ BKN",6,3,00:09:52,4.3,made,"Anderson, Alan",101187,3.6,Gerald Henderson,...,11.2,22.1,0.202,0.487,12.0,79.3,0.188,Time Warner Cable Arena,704886,17192
4,"MAR 04, 2015 - CHA @ BKN",8,3,00:03:39,21.9,made,"Anderson, Alan",101187,8.0,Gerald Henderson,...,11.2,22.1,0.202,0.487,12.0,79.3,0.188,Time Warner Cable Arena,704886,17192


In [5]:
# Inspect the column names of the cleaned frame
df.columns

Index(['MATCHUP', 'SHOT_NUMBER', 'PERIOD', 'GAME_CLOCK', 'SHOT_DIST',
       'SHOT_RESULT', 'CLOSEST_DEFENDER', 'CLOSEST_DEFENDER_PLAYER_ID',
       'CLOSE_DEF_DIST', 'player_name', 'player_id', 'DATE', 'TIMESTAMP',
       'Name (Shooter)', 'Games Played', 'MIN', 'PTS', 'FGM', 'FGA', 'FG%',
       '3PM', '3PA', '3P%', 'FTM', 'FTA', 'FT%', 'OREB', 'DREB', 'REB', 'AST',
       'STL', 'BLK', 'TOV', 'PF', 'EFF', 'AST/TOV', 'STL/TOV', 'Age_x',
       'Birth_Place', 'Birthdate', 'Collage', 'Experience', 'Height (Shooter)',
       'Pos', 'Team', 'Weight', 'BMI', 'CD Last Name', 'CD First Name',
       'CD NAME', 'Name (CD)', 'Height (CD)', 'Rk', 'Age_y', 'W', 'L', 'PW',
       'PL', 'MOV', 'SOS', 'SRS', 'ORtg', 'DRtg', 'NRtg', 'Pace', 'FTr',
       '3PAr', 'TS%', 'eFG%', 'TOV%', 'ORB%', 'FT/FGA', 'eFG%.1', 'TOV%.1',
       'DRB%', 'FT/FGA.1', 'Arena', 'Attend.', 'Attend./G'],
      dtype='object')

In [6]:
# Remove the listed columns from the frame before building the feature matrix.
# NOTE: re-running this cell on the already-trimmed frame raises KeyError,
# because the columns no longer exist.
df = df.drop(
    columns=[
        'player_id',
        'DATE',
        'Birth_Place',
        'Birthdate',
        'Collage',
        'Pos',
        'BMI',
        'Age_y',
        'Arena',
    ]
)

## Split into training and testing

In [7]:
# Create our features.
# Only numeric columns are kept: the frame still contains raw text columns
# (e.g. MATCHUP, GAME_CLOCK, CLOSEST_DEFENDER, player_name), and the estimator
# fitted later cannot convert those strings to floats, so leaving them in X
# makes the `.fit()` call raise. Blanket one-hot encoding with pd.get_dummies
# is avoided because it creates one column per distinct value of every text
# column; encode selected categorical columns explicitly if they are needed.
X = df.drop(columns="SHOT_RESULT").select_dtypes(include="number")

# Create our target (the shot outcome label, e.g. "made" / "missed")
y = df["SHOT_RESULT"]

In [None]:
# Summary statistics for the feature matrix
X.describe()

In [None]:
# Check the balance of our target values (number of rows per SHOT_RESULT class)
y.value_counts()

In [None]:
# Split features and target into training and test sets.
# random_state=42 fixes the shuffle so the split is the same on every run;
# the test size is left at scikit-learn's default.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

## Ensemble Learners
1. Train the model using the training data.
2. Calculate the balanced accuracy score with `balanced_accuracy_score` from `sklearn.metrics`.
3. Print the confusion matrix with `confusion_matrix` from `sklearn.metrics`.
4. Generate a classification report using `classification_report_imbalanced` from imbalanced-learn.

Balanced Random Forest Classifier

In [None]:
# Fit a BalancedRandomForestClassifier (100 trees) on the training split.
# No separate resampling step is needed: per the imbalanced-learn docs the
# classifier balances the bootstrap sample drawn for each tree internally.
# random_state=42 (same seed as the train/test split) makes the fitted forest,
# and therefore every metric and feature importance below, reproducible.
from imblearn.ensemble import BalancedRandomForestClassifier
random_forest = BalancedRandomForestClassifier(
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)  # fit() returns the fitted estimator itself

In [None]:
# Calculate the balanced accuracy score on the test split.
# y_pred is reused by the confusion-matrix and classification-report cells below.
from sklearn.metrics import balanced_accuracy_score
y_pred = random_forest.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

In [None]:
# Display the confusion matrix for the test-split predictions
# (scikit-learn convention: rows are true labels, columns are predicted labels)
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

In [None]:
# Print the imbalanced classification report for the test-split predictions.
# print() is required here: the function returns a pre-formatted text table.
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

In [None]:
# Rank the features from most to least important according to the fitted forest.
# Each entry is an (importance, feature name) pair, sorted in descending order.
feature_names = X.columns
importance_by_feature = zip(random_forest.feature_importances_, feature_names)
sorted(importance_by_feature, reverse=True)