In [13]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import train_test_split, KFold, cross_val_score, cross_validate, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, mean_absolute_error

from tqdm import tqdm 

In [14]:
box_scores = pd.read_csv("../datasets/box_scores_cleaned", index_col = None)

In [15]:
box_scores.head()

Unnamed: 0.1,Unnamed: 0,away_assist_percentage,away_assists,away_block_percentage,away_blocks,away_defensive_rating,away_defensive_rebound_percentage,away_defensive_rebounds,away_effective_field_goal_percentage,away_field_goal_attempts,...,home_total_rebound_percentage,home_total_rebounds,home_true_shooting_percentage,home_turnover_percentage,home_turnovers,home_two_point_field_goal_attempts,home_two_point_field_goal_percentage,home_two_point_field_goals,home_wins,pace
0,0,77.8,21,6.7,4,112.0,73.3,33,0.407,70,...,51.1,44,0.561,13.3,22,57,0.421,24,0,98.3
1,1,54.3,19,3.3,2,113.6,70.3,26,0.528,72,...,48.6,37,0.577,14.6,19,57,0.509,29,0,88.9
2,2,47.1,16,6.5,4,95.1,79.5,35,0.433,90,...,47.8,48,0.521,17.4,12,63,0.381,24,0,98.9
3,3,56.4,22,8.2,5,112.4,71.7,33,0.506,84,...,46.4,45,0.536,10.4,13,69,0.464,32,0,89.8
4,4,65.1,28,3.2,2,119.9,78.9,30,0.512,86,...,47.4,41,0.599,8.9,10,71,0.577,41,0,92.6


In [16]:
box_scores.drop(['Unnamed: 0'], axis = 1, inplace = True)

In [17]:
box_scores.columns

Index(['away_assist_percentage', 'away_assists', 'away_block_percentage',
       'away_blocks', 'away_defensive_rating',
       'away_defensive_rebound_percentage', 'away_defensive_rebounds',
       'away_effective_field_goal_percentage', 'away_field_goal_attempts',
       'away_field_goal_percentage', 'away_field_goals',
       'away_free_throw_attempt_rate', 'away_free_throw_attempts',
       'away_free_throw_percentage', 'away_free_throws', 'away_losses',
       'away_minutes_played', 'away_offensive_rating',
       'away_offensive_rebound_percentage', 'away_offensive_rebounds',
       'away_personal_fouls', 'away_points', 'away_steal_percentage',
       'away_steals', 'away_three_point_attempt_rate',
       'away_three_point_field_goal_attempts',
       'away_three_point_field_goal_percentage',
       'away_three_point_field_goals', 'away_total_rebound_percentage',
       'away_total_rebounds', 'away_true_shooting_percentage',
       'away_turnover_percentage', 'away_turnovers',
  

## Random Forest Model

In [19]:
# getting rid of some columns we don't need

drop_col = ['away_points', 'home_points']

In [20]:
X = box_scores.drop(drop_col, 1) # includes our target vectors
y = box_scores[['home_points', 'away_points']] # split out our target vector

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [22]:
parameters = {'bootstrap': False,
              'min_samples_leaf': 3,
              'n_estimators': 15,
              'min_samples_split': 10,
              'max_features': 'sqrt',
              'max_depth': 6}
model = RandomForestRegressor(**parameters)
model.fit(X_train, y_train)

RandomForestRegressor(bootstrap=False, criterion='mse', max_depth=6,
                      max_features='sqrt', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=3, min_samples_split=10,
                      min_weight_fraction_leaf=0.0, n_estimators=15,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [26]:
# GridSearch
leafs = [1, 5, 10, 20, 50, 100]
params = {'n_estimators' : range(5,200,4), # I want to look at a whole range of estimators
             'max_depth': range(1,20,2), # 
             'min_samples_leaf': leafs
             }
gs = GridSearchCV(estimator = model, # use our RF model as the estimator
                  param_grid = params,
                  cv = 10, # 10 fold cross validation
                  n_jobs = -1) # all processors!

gs.fit(X_train, y_train) # fit the GridSearchCV object
gs.best_params_ # shows the best parameters fitted

{'max_depth': 19, 'min_samples_leaf': 1, 'n_estimators': 189}

In [27]:
# print our simulation against the real games

print(gs.predict(X_test).astype(int), y_test)

[[ 93 100]
 [106 109]
 [ 99 112]
 ...
 [ 95 106]
 [113  97]
 [ 92  86]]        home_points  away_points
22416           94          100
24350          106          110
1296           100          113
17170          108          109
22073          100          111
...            ...          ...
16134          113           80
22356          114          112
5808            96          107
12388          115           98
5292            92           86

[6268 rows x 2 columns]


In [28]:
# printing our train & test scores

print(gs.score(X_train, y_train))
print(gs.score(X_test, y_test))

0.9974447670884682
0.9856417206242964




In [32]:
y_pred = gs.predict(X_test) # saving predictions as their own variable
y_true = y_test # the "true" values

print(mean_absolute_error(y_pred, y_true))

0.9374840952997033


## Dean Oliver RF Model

In [18]:
# getting rid of stats not covered in the Four Factors

four_fac = box_scores[['away_effective_field_goal_percentage', 'home_effective_field_goal_percentage',
                        'away_turnover_percentage', 'home_turnover_percentage',
                        'away_defensive_rebound_percentage', 'home_defensive_rebound_percentage',
                        'away_offensive_rebound_percentage', 'home_offensive_rebound_percentage',
                        'away_free_throw_percentage', 'home_free_throw_percentage',
                        'away_points', 'home_points']]

In [19]:
drop_col = ['away_points', 'home_points']

In [20]:
X = four_fac.drop(drop_col, 1) # includes our target vectors
y = four_fac[['home_points', 'away_points']] # split out our target vector

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .25) # I want to train on 75% of the data 

In [22]:
# same model as above

parameters = {'bootstrap': True,
              'min_samples_leaf': 3,
              'n_estimators': 15,
              'min_samples_split': 10,
              'max_features': 'sqrt',
              'max_depth': 6}
model = RandomForestRegressor(**parameters)
model.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=6,
                      max_features='sqrt', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=3, min_samples_split=10,
                      min_weight_fraction_leaf=0.0, n_estimators=15,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [23]:
# GridSearch
leafs = [1, 5, 10, 20, 50, 100]
params = {'n_estimators' : range(5,200,4),
             'max_depth': range(1,20,2),
             'min_samples_leaf': leafs
             }
gs = GridSearchCV(estimator = model,
                  param_grid = params,
                  cv = 10,
                  n_jobs = -1) # all processors!

gs.fit(X_train, y_train) # fit the GridSearchCV object
gs.best_params_ 

{'max_depth': 19, 'min_samples_leaf': 1, 'n_estimators': 173}

In [24]:
print(gs.score(X_train, y_train))
print(gs.score(X_test, y_test))

0.9163095370696099
0.8013575129175174




In [25]:
y_pred = gs.predict(X_test)
y_true = y_test

print(mean_absolute_error(y_pred, y_true))

4.372239281418894


In [28]:
results = pd.DataFrame(gs.cv_results_)
results.sort_values(by = ['rank_test_score'])

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_min_samples_leaf,param_n_estimators,params,split0_test_score,split1_test_score,...,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
2688,7.036617,0.059334,0.088153,0.002891,19,1,173,"{'max_depth': 19, 'min_samples_leaf': 1, 'n_es...",0.800888,0.793742,...,0.778831,0.776327,0.798162,0.796057,0.787237,0.787866,0.786720,0.790673,0.008274,1
2691,7.419541,0.086976,0.090866,0.002547,19,1,185,"{'max_depth': 19, 'min_samples_leaf': 1, 'n_es...",0.800781,0.792857,...,0.777239,0.776140,0.798144,0.796448,0.787416,0.786996,0.787880,0.790565,0.008615,2
2690,7.324101,0.051239,0.088889,0.002262,19,1,181,"{'max_depth': 19, 'min_samples_leaf': 1, 'n_es...",0.800701,0.792759,...,0.777640,0.775882,0.798843,0.796928,0.788541,0.785989,0.787725,0.790502,0.008494,3
2675,4.897772,0.084379,0.078018,0.025221,19,1,121,"{'max_depth': 19, 'min_samples_leaf': 1, 'n_es...",0.802609,0.794664,...,0.776708,0.776695,0.797052,0.794969,0.787213,0.785256,0.786804,0.790238,0.008702,4
2677,5.115726,0.035249,0.068526,0.008437,19,1,129,"{'max_depth': 19, 'min_samples_leaf': 1, 'n_es...",0.801508,0.793254,...,0.778225,0.777252,0.796201,0.795743,0.786912,0.785948,0.787408,0.790234,0.008034,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
245,0.031258,0.001020,0.007586,0.004040,1,100,5,"{'max_depth': 1, 'min_samples_leaf': 100, 'n_e...",0.085833,0.148522,...,0.085637,0.217985,0.089192,0.243152,0.242005,0.140742,0.169677,0.157802,0.057978,2936
150,0.091988,0.003983,0.015620,0.012795,1,20,17,"{'max_depth': 1, 'min_samples_leaf': 20, 'n_es...",0.117382,0.241265,...,0.151804,0.131578,0.153160,0.172028,0.151668,0.134242,0.151287,0.155716,0.031886,2937
1,0.054802,0.005309,0.004696,0.001209,1,1,9,"{'max_depth': 1, 'min_samples_leaf': 1, 'n_est...",0.129256,0.179734,...,0.127131,0.152604,0.096774,0.154294,0.173323,0.130476,0.158042,0.151532,0.031170,2938
148,0.051382,0.002800,0.010535,0.007572,1,20,9,"{'max_depth': 1, 'min_samples_leaf': 20, 'n_es...",0.129661,0.122878,...,0.114313,0.141323,0.189392,0.189178,0.115267,0.188668,0.185914,0.147773,0.034508,2939


Even though we lost some of the accuracy from our first Random Forest model - 80% accuracy with a MAE of less than 3 scores is not bad especially considering the improvement on the Linear Regression model. Despite the improvement in score - this model still takes a prohibitively long time to train, and overfits the training data.