In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt 
import seaborn as sns

from sklearn.model_selection import (cross_val_score, train_test_split, 
                                     cross_val_predict, StratifiedKFold, GridSearchCV)

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, plot_confusion_matrix, classification_report, ConfusionMatrixDisplay
from sklearn.ensemble import RandomForestClassifier

np.random.seed(23)

In [2]:
df_final = pd.read_csv('Data/data_final')

In [3]:
y_train = pd.read_csv('Data/y_train_processed.csv')
y_test = pd.read_csv('Data/y_test_processed.csv')

X_train = pd.read_csv('Data/x_train_processed.csv')
X_test = pd.read_csv('Data/x_test_processed.csv')

## Let's create a function to print out cross val score

In [4]:
def model_scores(cv_scores, model, X, y, model_name):  
    print(f'Score from {model_name}: {model.score(X, y):.4f}')
    print(f'Min and Max scores are: [{cv_scores.min():.4f}, {cv_scores.max():.4f}]')
    print(f'Confidence interval is : {cv_scores.mean():.4f} \u00B1 {2*cv_scores.std():.4f}')

## Now let's create a Random Forest Model

In [5]:
rf = RandomForestClassifier(n_jobs=-1, random_state=23, n_estimators=20, class_weight='balanced', max_depth=10)
rf.fit(X_train, np.ravel(y_train))

RandomForestClassifier(class_weight='balanced', max_depth=10, n_estimators=20,
                       n_jobs=-1, random_state=23)

In [6]:
X = df_final.drop('shot_made_flag', axis=1)
y = df_final.shot_made_flag

In [7]:
cv = StratifiedKFold(n_splits=10, random_state=42, shuffle=True)

scores = cross_val_score(rf, X, y, cv=cv)

model_scores(scores, rf, X_test, y_test, 'random forest')

Score from random forest: 0.6070
Min and Max scores are: [0.5965, 0.6261]
Confidence interval is : 0.6101 ± 0.0163


### For our Random Forests model our accuracy was 60.70% which is 5.7% better than our baseline of 55%. Our mean cross val score for our model is 61.01% with a confidence interval of ± 1.63% which is a small interval showing we have low variance.

## Now let's look at the feature importance of our Random Forest model to identify the features that are most relevant.

In [8]:
df_feature_importance = pd.DataFrame({'feature':X_train.columns,
                                      'importance':rf.feature_importances_})
df_feature_importance.sort_values('importance', ascending=False).head()

Unnamed: 0,feature,importance
19,shot_distance,0.303423
2,combined_shot_type_Dunk,0.168515
3,combined_shot_type_Jump_Shot,0.106335
0,minutes_remaining,0.102162
4,combined_shot_type_Layup,0.069738


## According to our Random Forest model the above 5 features have the best predictive power (level of importance).

In [9]:
df_final.groupby('shot_made_flag').describe()['shot_distance']

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
shot_made_flag,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0.0,14232.0,15.1276,9.193606,0.0,8.0,16.0,22.0,79.0
1.0,11465.0,11.383428,9.213679,0.0,0.0,12.0,19.0,43.0


### Top 2 features
 - shot_distance: We can see that the average shot distance for made shots is 11.38 feet and for misses is 15.12 feet. We want shots to be closer to 11 feet for better odds of making it.
 - combined_shot_type_Dunk: We want to make sure dunks are emphasized since our model predicted this as being important and leading to more successful shots made.