In [55]:
# Imports

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score

from sportsreference.nba.teams import Teams
from sportsreference.nba.schedule import Schedule

In [2]:
stats = pd.read_csv('./2020_team_stats.csv', index_col = None)
box_scores = pd.read_csv("./dataset.csv", index_col = None)

To model results I'll be using stats pulled using SportsReference API. For my first model I want to try a random forest before I engineer our "4 factor" features. The below code block pulls all the box scores for the season, but instead of doing this every time I open the notebook, I have read in the season-to-date in the cell above.

In [3]:
# dataset = pd.DataFrame()
# teams = Teams()
# for team in teams:
    # dataset = pd.concat([dataset, team.schedule.dataframe_extended])
# dataset.to_csv("./dataset.csv")
# dataset.head()

In [4]:
box_scores.head()

Unnamed: 0.1,Unnamed: 0,away_assist_percentage,away_assists,away_block_percentage,away_blocks,away_defensive_rating,away_defensive_rebound_percentage,away_defensive_rebounds,away_effective_field_goal_percentage,away_field_goal_attempts,...,home_two_point_field_goal_percentage,home_two_point_field_goals,home_wins,location,losing_abbr,losing_name,pace,winner,winning_abbr,winning_name
0,201910240HOU,67.4,31,23.8,10,101.1,82.5,47,0.545,99,...,0.566,30,0,"Toyota Center, Houston, Texas",HOU,Houston Rockets,109.8,Away,MIL,Milwaukee Bucks
1,201910260MIL,71.7,33,7.5,3,104.0,90.2,46,0.535,101,...,0.526,30,0,"Fiserv Forum, Milwaukee, Wisconsin",MIL,Milwaukee Bucks,109.8,Away,MIA,Miami Heat
2,201910280MIL,42.2,19,5.6,3,122.9,82.6,38,0.51,100,...,0.541,33,0,"Fiserv Forum, Milwaukee, Wisconsin",CLE,Cleveland Cavaliers,104.9,Home,MIL,Milwaukee Bucks
3,201910300BOS,55.3,21,7.8,4,113.3,88.9,40,0.549,82,...,0.649,24,0,"TD Garden, Boston, Massachusetts",MIL,Milwaukee Bucks,102.4,Home,BOS,Boston Celtics
4,201911010ORL,51.1,24,7.4,4,89.6,82.5,47,0.597,93,...,0.652,30,0,"Amway Center, Orlando, Florida",ORL,Orlando Magic,101.5,Away,MIL,Milwaukee Bucks


In [5]:
box_scores.columns

Index(['Unnamed: 0', 'away_assist_percentage', 'away_assists',
       'away_block_percentage', 'away_blocks', 'away_defensive_rating',
       'away_defensive_rebound_percentage', 'away_defensive_rebounds',
       'away_effective_field_goal_percentage', 'away_field_goal_attempts',
       'away_field_goal_percentage', 'away_field_goals',
       'away_free_throw_attempt_rate', 'away_free_throw_attempts',
       'away_free_throw_percentage', 'away_free_throws', 'away_losses',
       'away_minutes_played', 'away_offensive_rating',
       'away_offensive_rebound_percentage', 'away_offensive_rebounds',
       'away_personal_fouls', 'away_points', 'away_steal_percentage',
       'away_steals', 'away_three_point_attempt_rate',
       'away_three_point_field_goal_attempts',
       'away_three_point_field_goal_percentage',
       'away_three_point_field_goals', 'away_total_rebound_percentage',
       'away_total_rebounds', 'away_true_shooting_percentage',
       'away_turnover_percentage', 'away_

In [6]:
box_scores.away_points.dtypes

dtype('int64')

In [7]:
box_scores.dropna().drop_duplicates()

Unnamed: 0.1,Unnamed: 0,away_assist_percentage,away_assists,away_block_percentage,away_blocks,away_defensive_rating,away_defensive_rebound_percentage,away_defensive_rebounds,away_effective_field_goal_percentage,away_field_goal_attempts,...,home_two_point_field_goal_percentage,home_two_point_field_goals,home_wins,location,losing_abbr,losing_name,pace,winner,winning_abbr,winning_name
0,201910240HOU,67.4,31,23.8,10,101.1,82.5,47,0.545,99,...,0.566,30,0,"Toyota Center, Houston, Texas",HOU,Houston Rockets,109.8,Away,MIL,Milwaukee Bucks
1,201910260MIL,71.7,33,7.5,3,104.0,90.2,46,0.535,101,...,0.526,30,0,"Fiserv Forum, Milwaukee, Wisconsin",MIL,Milwaukee Bucks,109.8,Away,MIA,Miami Heat
2,201910280MIL,42.2,19,5.6,3,122.9,82.6,38,0.510,100,...,0.541,33,0,"Fiserv Forum, Milwaukee, Wisconsin",CLE,Cleveland Cavaliers,104.9,Home,MIL,Milwaukee Bucks
3,201910300BOS,55.3,21,7.8,4,113.3,88.9,40,0.549,82,...,0.649,24,0,"TD Garden, Boston, Massachusetts",MIL,Milwaukee Bucks,102.4,Home,BOS,Boston Celtics
4,201911010ORL,51.1,24,7.4,4,89.6,82.5,47,0.597,93,...,0.652,30,0,"Amway Center, Orlando, Florida",ORL,Orlando Magic,101.5,Away,MIL,Milwaukee Bucks
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1679,202001280CHO,44.7,17,12.5,7,103.7,68.0,34,0.494,86,...,0.537,29,0,"Spectrum Center, Charlotte, North Carolina",NYK,New York Knicks,93.6,Home,CHO,Charlotte Hornets
1683,202002060NYK,64.7,22,12.5,9,105.7,76.2,32,0.513,75,...,0.568,25,0,"Madison Square Garden (IV), New York, New York",ORL,Orlando Magic,99.3,Home,NYK,New York Knicks
1689,202002260CHO,51.2,22,11.3,6,114.4,74.4,29,0.529,85,...,0.557,39,0,"Spectrum Center, Charlotte, North Carolina",NYK,New York Knicks,93.6,Home,CHO,Charlotte Hornets
1736,202001200CHO,56.8,25,9.3,4,93.0,78.0,32,0.554,92,...,0.469,30,0,"Spectrum Center, Charlotte, North Carolina",CHO,Charlotte Hornets,89.2,Away,ORL,Orlando Magic


In [26]:
box_scores.info

<bound method DataFrame.info of         Unnamed: 0  away_assist_percentage  away_assists  \
0     201910240HOU                    67.4            31   
1     201910260MIL                    71.7            33   
2     201910280MIL                    42.2            19   
3     201910300BOS                    55.3            21   
4     201911010ORL                    51.1            24   
...            ...                     ...           ...   
1809  202002220CHO                    61.9            26   
1810  202002250IND                    71.0            22   
1811  202002260CHO                    51.2            22   
1812  202002280TOR                    67.6            23   
1813  202003010CHO                    57.1            20   

      away_block_percentage  away_blocks  away_defensive_rating  \
0                      23.8           10                  101.1   
1                       7.5            3                  104.0   
2                       5.6            3      

In [39]:
# getting rid of some columns we don't need

drop_col = ['Unnamed: 0', 'date', 'away_points', 'home_points', 'location',
            'losing_abbr', 'losing_name', 'winner', 'winning_abbr', 'losing_abbr', 'winning_name', 'losing_name']

In [40]:
X = box_scores.drop(drop_col, 1)
y = box_scores[['home_points', 'away_points']] # split out our target vector

In [41]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [42]:
parameters = {'bootstrap': False,
              'min_samples_leaf': 3,
              'n_estimators': 50,
              'min_samples_split': 10,
              'max_features': 'sqrt',
              'max_depth': 6}
model = RandomForestRegressor(**parameters)
model.fit(X_train, y_train)

RandomForestRegressor(bootstrap=False, criterion='mse', max_depth=6,
                      max_features='sqrt', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=3, min_samples_split=10,
                      min_weight_fraction_leaf=0.0, n_estimators=50,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [43]:
# print our simulation

print(model.predict(X_test).astype(int), y_test)

[[ 93  95]
 [116 118]
 [105 115]
 [125 111]
 [110 111]
 [110 112]
 [126 103]
 [113  93]
 [102  95]
 [107 116]
 [127 104]
 [ 94  91]
 [103 111]
 [114 108]
 [ 99 106]
 [100 121]
 [112 113]
 [124 112]
 [100 125]
 [119 100]
 [100 121]
 [105 117]
 [107 113]
 [126 105]
 [106 115]
 [126 127]
 [104 103]
 [117 114]
 [123 117]
 [114 118]
 [119 109]
 [111 119]
 [108 118]
 [105 113]
 [108 115]
 [117 112]
 [106  98]
 [129 111]
 [115 107]
 [120  91]
 [110 126]
 [106  92]
 [123 121]
 [114 121]
 [104 100]
 [109 118]
 [103 106]
 [128 115]
 [119  94]
 [106  98]
 [120 112]
 [ 97 107]
 [113 102]
 [102  89]
 [105 102]
 [126 127]
 [125 127]
 [123 124]
 [126 117]
 [117 123]
 [107 104]
 [114  94]
 [122 117]
 [101  94]
 [109 115]
 [122 121]
 [101 112]
 [104 120]
 [113 106]
 [110 121]
 [104  93]
 [107 102]
 [118 119]
 [107  94]
 [110 105]
 [110 121]
 [106 107]
 [124 117]
 [121 122]
 [124 108]
 [115  92]
 [107 102]
 [121 122]
 [ 94  99]
 [119 119]
 [121 125]
 [ 96 113]
 [109 122]
 [127 113]
 [116 107]
 [102 107]

In [44]:
print(model.predict(X_test).astype(int))

[[ 93  95]
 [116 118]
 [105 115]
 [125 111]
 [110 111]
 [110 112]
 [126 103]
 [113  93]
 [102  95]
 [107 116]
 [127 104]
 [ 94  91]
 [103 111]
 [114 108]
 [ 99 106]
 [100 121]
 [112 113]
 [124 112]
 [100 125]
 [119 100]
 [100 121]
 [105 117]
 [107 113]
 [126 105]
 [106 115]
 [126 127]
 [104 103]
 [117 114]
 [123 117]
 [114 118]
 [119 109]
 [111 119]
 [108 118]
 [105 113]
 [108 115]
 [117 112]
 [106  98]
 [129 111]
 [115 107]
 [120  91]
 [110 126]
 [106  92]
 [123 121]
 [114 121]
 [104 100]
 [109 118]
 [103 106]
 [128 115]
 [119  94]
 [106  98]
 [120 112]
 [ 97 107]
 [113 102]
 [102  89]
 [105 102]
 [126 127]
 [125 127]
 [123 124]
 [126 117]
 [117 123]
 [107 104]
 [114  94]
 [122 117]
 [101  94]
 [109 115]
 [122 121]
 [101 112]
 [104 120]
 [113 106]
 [110 121]
 [104  93]
 [107 102]
 [118 119]
 [107  94]
 [110 105]
 [110 121]
 [106 107]
 [124 117]
 [121 122]
 [124 108]
 [115  92]
 [107 102]
 [121 122]
 [ 94  99]
 [119 119]
 [121 125]
 [ 96 113]
 [109 122]
 [127 113]
 [116 107]
 [102 107]

In [45]:
print(y_test)

      home_points  away_points
1714           87           93
216           116          122
1069          104          117
384           134          109
491           116          109
...           ...          ...
760           106          111
1015          111          121
445            94          103
1154           87           91
713           107           95

[454 rows x 2 columns]


In [46]:
model.score(X_train, y_train)



0.9064518537763443

In [47]:
model.score(X_test, y_test)



0.8682699964441247

In [56]:
lr = LinearRegression()
lr.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [57]:
print(lr.score(X_train, y_train))
print(lr.score(X_test, y_test))

0.9990007134569756
0.9988646065117824




In [58]:
print(lr.predict(X_test).astype(int))

[[ 86  92]
 [116 122]
 [103 116]
 [133 108]
 [115 109]
 [102 109]
 [144 101]
 [116  89]
 [ 99  93]
 [104 116]
 [133 101]
 [ 93  84]
 [ 98 111]
 [115 106]
 [ 98 106]
 [ 88 126]
 [119 121]
 [128 111]
 [ 92 133]
 [119 101]
 [ 91 123]
 [101 118]
 [105 112]
 [130 102]
 [101 109]
 [129 134]
 [103 104]
 [115 118]
 [125 116]
 [117 123]
 [117 108]
 [116 126]
 [107 117]
 [ 98 108]
 [106 115]
 [116 111]
 [102  95]
 [138 111]
 [114 103]
 [119  91]
 [111 130]
 [119  91]
 [128 123]
 [119 124]
 [104  98]
 [106 118]
 [ 98 102]
 [132 117]
 [122  88]
 [109  95]
 [125 113]
 [ 92 102]
 [116 104]
 [104  80]
 [105  99]
 [131 139]
 [133 137]
 [126 130]
 [128 118]
 [116 127]
 [105  99]
 [117  93]
 [126 118]
 [100  93]
 [108 114]
 [124 121]
 [ 99 111]
 [106 127]
 [122 111]
 [106 126]
 [ 99  91]
 [106 100]
 [117 120]
 [108  88]
 [107 105]
 [109 127]
 [104 105]
 [128 123]
 [117 123]
 [126 106]
 [110  87]
 [111 103]
 [117 123]
 [ 85  96]
 [121 120]
 [119 130]
 [ 85 112]
 [111 124]
 [129 114]
 [117 105]
 [ 99 109]

In [59]:
print(y_test)

      home_points  away_points
1714           87           93
216           116          122
1069          104          117
384           134          109
491           116          109
...           ...          ...
760           106          111
1015          111          121
445            94          103
1154           87           91
713           107           95

[454 rows x 2 columns]
