In [1]:
import warnings
warnings.filterwarnings('ignore')
from IPython.display import Audio
sound_file = './applause3.wav'
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from xgboost import plot_importance
import matplotlib.pyplot as plt
from matplotlib import pyplot
import seaborn as sns
from sklearn import svm, tree
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix, precision_score, recall_score, accuracy_score, f1_score, roc_curve, auc, r2_score, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier, GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler, Normalizer
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier

In [2]:
players_df2 = pd.read_csv('players_df2.csv')

In [3]:
players_df2.head()

Unnamed: 0,GAME_ID,Match_Date,Team,Opponent,LOCATION,MATCH_OUTCOME,FINAL_MARGIN,SHOT_NUMBER,PERIOD,GAME_CLOCK,...,Defender_age,Player_id,Player_name,Player_years_experience,Player_G,Player_F,Player_C,Player_height(in),Player_weight,Player_age
0,21400899,2015-03-04,CHA,BKN,A,W,24,1,1,69,...,32.0,203148,Brian Roberts,2,1,0,0,73,173.0,29.0
1,21400899,2015-03-04,CHA,BKN,A,W,24,2,1,14,...,26.0,203148,Brian Roberts,2,1,0,0,73,173.0,29.0
2,21400899,2015-03-04,CHA,BKN,A,W,24,3,1,0,...,26.0,203148,Brian Roberts,2,1,0,0,73,173.0,29.0
3,21400899,2015-03-04,CHA,BKN,A,W,24,4,2,707,...,23.0,203148,Brian Roberts,2,1,0,0,73,173.0,29.0
4,21400899,2015-03-04,CHA,BKN,A,W,24,5,2,634,...,27.0,203148,Brian Roberts,2,1,0,0,73,173.0,29.0


# All Features

In [4]:
Target = players_df2["SHOT_RESULT"]

In [5]:
headers = ["FINAL_MARGIN",
                      "SHOT_NUMBER",
                      "PERIOD",
                      "GAME_CLOCK",
                      "SHOT_CLOCK",
                      "DRIBBLES",
                      "TOUCH_TIME",
                      "SHOT_DIST",
                      "PTS_TYPE",
                      "Defender_distance",
                      "Defender_years_experience",
                      "Defender_G",
                      "Defender_F",
                      "Defender_C",
                      "Defender_height(in)",
                      "Defender_weight",
                      "Defender_age",
                      "Player_years_experience",
                      "Player_G",
                      "Player_F",
                      "Player_C",
                      "Player_height(in)",
                      "Player_weight",
                      "Player_age"]
Features = players_df2[headers]

In [6]:
print('Shots made', np.count_nonzero(Target))
print('Shots missed', Target.size-np.count_nonzero(Target))
print('Total shots', Target.size)
print('Precision score', (Target.size-np.count_nonzero(Target))/Target.size)

Shots made 56960
Shots missed 68883
Total shots 125843
Precision score 0.5473725197269613


## Prep the data for model

In [7]:
X_train, X_test, y_train, y_test = train_test_split(Features.values, Target.values, test_size=0.25, random_state=8165)

In [8]:
X_train = StandardScaler().fit_transform(X_train)
X_test = StandardScaler().fit_transform(X_test)

In [9]:
pca = PCA(n_components=4)
X_train = pca.fit_transform(X_train)
X_test = pca.fit_transform(X_test)

In [10]:
X_train_feature_names = ['PCA1', 'PCA2', 'PCA3', 'PCA4']
df_trans = pd.DataFrame(X_train, columns = X_train_feature_names)
df_trans['target'] = y_train
df_trans.head()

Unnamed: 0,PCA1,PCA2,PCA3,PCA4,target
0,1.956804,3.150166,0.439118,-0.421457,1
1,-3.19741,1.095753,-0.765944,2.179625,0
2,0.379802,5.101061,3.187259,-1.520707,0
3,-3.390915,0.293494,-1.222491,0.513101,1
4,-2.027555,-2.542645,0.765556,2.774384,0


In [11]:
print(X_train.shape)
print(y_train.shape)
print(pca.components_.shape)
print(pca.explained_variance_)

(94382, 4)
(94382,)
(4, 24)
[5.36702801 2.6577223  2.02369684 1.9752136 ]


## Baseline Model - RandomForest

In [12]:
model = RandomForestClassifier(random_state=8165)
model.fit(X_train, y_train)
train_preds = model.predict(X_train)
val_preds = model.predict(X_test)
training_accuracy = accuracy_score(y_train, train_preds)
val_accuracy = accuracy_score(y_test, val_preds)
training_precision = precision_score(y_train, train_preds)
val_precision = precision_score(y_test, val_preds)

print("Training Accuracy:   {:.4}%".format(training_accuracy * 100))
print("Validation Accuracy: {:.4}%".format(val_accuracy * 100))
print('----------------------------')
print("Training precision:   {:.4}%".format(training_precision * 100))
print("Validation precision: {:.4}%".format(val_precision * 100))
print('----------------------------')
print("Training Confusion Matrix:")
print(confusion_matrix(y_train, train_preds))
print("Validation Confusion Matrix:")
print(confusion_matrix(y_test, val_preds))

Training Accuracy:   97.9%
Validation Accuracy: 53.75%
----------------------------
Training precision:   99.46%
Validation precision: 48.42%
----------------------------
Training Confusion Matrix:
[[51428   223]
 [ 1763 40968]]
Validation Confusion Matrix:
[[11993  5239]
 [ 9311  4918]]


## Tuning the RandomForest

In [13]:
parameters_for_testing = {
    'n_estimators': [100, 200],
    'criterion': ['entropy', 'gini'],
    'max_depth': [5, 10, 20],
    'min_samples_leaf': [0.05, 0.1],
    'min_samples_split': [0.05, 0.1]
}

rf_model = RandomForestClassifier(random_state=8165)

gsearch = GridSearchCV(estimator = rf_model,
                        param_grid = parameters_for_testing,
                        scoring='accuracy',
                        cv=3)

gsearch.fit(X_train, y_train)

print('Best params')
print(gsearch.best_params_)
print('Best score: {:.4}%'.format(gsearch.best_score_ * 100))

Best params
{'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 0.05, 'min_samples_split': 0.05, 'n_estimators': 100}
Best score: 56.77%


In [14]:
parameters_for_testing = {
    'n_estimators': [100, 200],
    'criterion': ['entropy', 'gini'],
    'max_depth': [5, 10, 20],
    'min_samples_leaf': [0.05, 0.1],
    'min_samples_split': [0.05, 0.1]
}

rf_model = RandomForestClassifier(random_state=8165)

gsearch = GridSearchCV(estimator = rf_model,
                        param_grid = parameters_for_testing,
                        scoring='precision',
                        cv=3)

gsearch.fit(X_train, y_train)

print('Best params')
print(gsearch.best_params_)
print('Best score: {:.4}%'.format(gsearch.best_score_ * 100))

Best params
{'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 0.05, 'min_samples_split': 0.05, 'n_estimators': 100}
Best score: 55.07%


In [15]:
Audio(sound_file, autoplay=True)