In [2]:
# Imports

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score

from sportsreference.nba.teams import Teams
from sportsreference.nba.schedule import Schedule

In [3]:
stats = pd.read_csv('../2020_team_stats.csv', index_col = None)
box_scores = pd.read_csv("../dataset.csv", index_col = None)

To model results I'll be using stats pulled using SportsReference API. For my first model I want to try a random forest before I engineer our "4 factor" features. The below code block pulls all the box scores for the season, but instead of doing this every time I open the notebook, I have read in the season-to-date in the cell above.

In [4]:
# dataset = pd.DataFrame()
# teams = Teams()
# for team in teams:
    # dataset = pd.concat([dataset, team.schedule.dataframe_extended])
# dataset.to_csv("./dataset.csv")
# dataset.head()

In [5]:
box_scores.head()

Unnamed: 0.1,Unnamed: 0,away_assist_percentage,away_assists,away_block_percentage,away_blocks,away_defensive_rating,away_defensive_rebound_percentage,away_defensive_rebounds,away_effective_field_goal_percentage,away_field_goal_attempts,...,home_two_point_field_goal_percentage,home_two_point_field_goals,home_wins,location,losing_abbr,losing_name,pace,winner,winning_abbr,winning_name
0,201910240HOU,67.4,31,23.8,10,101.1,82.5,47,0.545,99,...,0.566,30,0,"Toyota Center, Houston, Texas",HOU,Houston Rockets,109.8,Away,MIL,Milwaukee Bucks
1,201910260MIL,71.7,33,7.5,3,104.0,90.2,46,0.535,101,...,0.526,30,0,"Fiserv Forum, Milwaukee, Wisconsin",MIL,Milwaukee Bucks,109.8,Away,MIA,Miami Heat
2,201910280MIL,42.2,19,5.6,3,122.9,82.6,38,0.51,100,...,0.541,33,0,"Fiserv Forum, Milwaukee, Wisconsin",CLE,Cleveland Cavaliers,104.9,Home,MIL,Milwaukee Bucks
3,201910300BOS,55.3,21,7.8,4,113.3,88.9,40,0.549,82,...,0.649,24,0,"TD Garden, Boston, Massachusetts",MIL,Milwaukee Bucks,102.4,Home,BOS,Boston Celtics
4,201911010ORL,51.1,24,7.4,4,89.6,82.5,47,0.597,93,...,0.652,30,0,"Amway Center, Orlando, Florida",ORL,Orlando Magic,101.5,Away,MIL,Milwaukee Bucks


In [6]:
box_scores.columns

Index(['Unnamed: 0', 'away_assist_percentage', 'away_assists',
       'away_block_percentage', 'away_blocks', 'away_defensive_rating',
       'away_defensive_rebound_percentage', 'away_defensive_rebounds',
       'away_effective_field_goal_percentage', 'away_field_goal_attempts',
       'away_field_goal_percentage', 'away_field_goals',
       'away_free_throw_attempt_rate', 'away_free_throw_attempts',
       'away_free_throw_percentage', 'away_free_throws', 'away_losses',
       'away_minutes_played', 'away_offensive_rating',
       'away_offensive_rebound_percentage', 'away_offensive_rebounds',
       'away_personal_fouls', 'away_points', 'away_steal_percentage',
       'away_steals', 'away_three_point_attempt_rate',
       'away_three_point_field_goal_attempts',
       'away_three_point_field_goal_percentage',
       'away_three_point_field_goals', 'away_total_rebound_percentage',
       'away_total_rebounds', 'away_true_shooting_percentage',
       'away_turnover_percentage', 'away_

In [7]:
box_scores.away_points.dtypes

dtype('int64')

In [8]:
box_scores.dropna().drop_duplicates()

Unnamed: 0.1,Unnamed: 0,away_assist_percentage,away_assists,away_block_percentage,away_blocks,away_defensive_rating,away_defensive_rebound_percentage,away_defensive_rebounds,away_effective_field_goal_percentage,away_field_goal_attempts,...,home_two_point_field_goal_percentage,home_two_point_field_goals,home_wins,location,losing_abbr,losing_name,pace,winner,winning_abbr,winning_name
0,201910240HOU,67.4,31,23.8,10,101.1,82.5,47,0.545,99,...,0.566,30,0,"Toyota Center, Houston, Texas",HOU,Houston Rockets,109.8,Away,MIL,Milwaukee Bucks
1,201910260MIL,71.7,33,7.5,3,104.0,90.2,46,0.535,101,...,0.526,30,0,"Fiserv Forum, Milwaukee, Wisconsin",MIL,Milwaukee Bucks,109.8,Away,MIA,Miami Heat
2,201910280MIL,42.2,19,5.6,3,122.9,82.6,38,0.510,100,...,0.541,33,0,"Fiserv Forum, Milwaukee, Wisconsin",CLE,Cleveland Cavaliers,104.9,Home,MIL,Milwaukee Bucks
3,201910300BOS,55.3,21,7.8,4,113.3,88.9,40,0.549,82,...,0.649,24,0,"TD Garden, Boston, Massachusetts",MIL,Milwaukee Bucks,102.4,Home,BOS,Boston Celtics
4,201911010ORL,51.1,24,7.4,4,89.6,82.5,47,0.597,93,...,0.652,30,0,"Amway Center, Orlando, Florida",ORL,Orlando Magic,101.5,Away,MIL,Milwaukee Bucks
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1679,202001280CHO,44.7,17,12.5,7,103.7,68.0,34,0.494,86,...,0.537,29,0,"Spectrum Center, Charlotte, North Carolina",NYK,New York Knicks,93.6,Home,CHO,Charlotte Hornets
1683,202002060NYK,64.7,22,12.5,9,105.7,76.2,32,0.513,75,...,0.568,25,0,"Madison Square Garden (IV), New York, New York",ORL,Orlando Magic,99.3,Home,NYK,New York Knicks
1689,202002260CHO,51.2,22,11.3,6,114.4,74.4,29,0.529,85,...,0.557,39,0,"Spectrum Center, Charlotte, North Carolina",NYK,New York Knicks,93.6,Home,CHO,Charlotte Hornets
1736,202001200CHO,56.8,25,9.3,4,93.0,78.0,32,0.554,92,...,0.469,30,0,"Spectrum Center, Charlotte, North Carolina",CHO,Charlotte Hornets,89.2,Away,ORL,Orlando Magic


In [9]:
box_scores.info

<bound method DataFrame.info of         Unnamed: 0  away_assist_percentage  away_assists  \
0     201910240HOU                    67.4            31   
1     201910260MIL                    71.7            33   
2     201910280MIL                    42.2            19   
3     201910300BOS                    55.3            21   
4     201911010ORL                    51.1            24   
...            ...                     ...           ...   
1809  202002220CHO                    61.9            26   
1810  202002250IND                    71.0            22   
1811  202002260CHO                    51.2            22   
1812  202002280TOR                    67.6            23   
1813  202003010CHO                    57.1            20   

      away_block_percentage  away_blocks  away_defensive_rating  \
0                      23.8           10                  101.1   
1                       7.5            3                  104.0   
2                       5.6            3      

In [10]:
# getting rid of some columns we don't need

drop_col = ['Unnamed: 0', 'date', 'away_points', 'home_points', 'location',
            'losing_abbr', 'losing_name', 'winner', 'winning_abbr', 'losing_abbr', 'winning_name', 'losing_name']

In [11]:
X = box_scores.drop(drop_col, 1) # includes our target vectors
y = box_scores[['home_points', 'away_points']] # split out our target vector

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [13]:
parameters = {'bootstrap': True,
              'min_samples_leaf': 3,
              'n_estimators': 50,
              'min_samples_split': 10,
              'max_features': 'sqrt',
              'max_depth': 6}
model = RandomForestRegressor(**parameters)
model.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=6,
                      max_features='sqrt', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=3, min_samples_split=10,
                      min_weight_fraction_leaf=0.0, n_estimators=50,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [14]:
# print our simulation

print(model.predict(X_test).astype(int), y_test)

[[124 118]
 [109 121]
 [112 102]
 [106 117]
 [112 100]
 [103 102]
 [103 103]
 [125 105]
 [125 115]
 [112 122]
 [109 101]
 [108  95]
 [105 113]
 [100 124]
 [102 119]
 [123 108]
 [109 100]
 [129 110]
 [106 117]
 [113 115]
 [136 136]
 [129 109]
 [107 118]
 [127  96]
 [123 104]
 [105 118]
 [122 111]
 [105 109]
 [125 110]
 [123 117]
 [118 101]
 [124 107]
 [111 109]
 [118 119]
 [112 112]
 [111 104]
 [121 120]
 [117 114]
 [103 112]
 [120 109]
 [127  96]
 [124 106]
 [129  94]
 [115 115]
 [122  90]
 [103  98]
 [106 119]
 [104  95]
 [111 117]
 [ 97 102]
 [104 113]
 [125 115]
 [124 111]
 [119 117]
 [111 110]
 [102 108]
 [108 114]
 [118 110]
 [118 123]
 [107 128]
 [117 100]
 [120 126]
 [107 111]
 [102 114]
 [122 105]
 [109 114]
 [122 116]
 [114 107]
 [128 118]
 [109 125]
 [113 109]
 [ 98 112]
 [136 124]
 [131 116]
 [108 107]
 [108 100]
 [125 114]
 [129 123]
 [116 109]
 [110 109]
 [114 107]
 [123 113]
 [117 114]
 [112 107]
 [114 105]
 [ 96 113]
 [113 112]
 [109 113]
 [106 102]
 [119 112]
 [119 110]

In [15]:
print(model.score(X_train, y_train))
print(model.score(X_test, y_test))

0.8894566013304056
0.8529084895347032




In [17]:
lr = LinearRegression()
lr.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [18]:
print(lr.score(X_train, y_train))
print(lr.score(X_test, y_test))

0.9989792775005047
0.9989166155618341




In [19]:
print(lr.predict(X_test).astype(int))

[[128 116]
 [111 122]
 [111 101]
 [ 99 119]
 [110  97]
 [103 104]
 [100 104]
 [126 103]
 [125 116]
 [118 120]
 [106 100]
 [103  87]
 [102 111]
 [ 91 128]
 [ 95 120]
 [126 106]
 [106 100]
 [139 110]
 [101 117]
 [113 113]
 [156 158]
 [139 111]
 [103 116]
 [142  93]
 [126 104]
 [101 121]
 [126 117]
 [106 111]
 [130 103]
 [126 117]
 [124 103]
 [133 103]
 [113 108]
 [125 126]
 [110 111]
 [122 111]
 [120 114]
 [120 114]
 [103 112]
 [123 105]
 [142  93]
 [129 110]
 [132  87]
 [117 115]
 [119  80]
 [103  94]
 [105 121]
 [100  86]
 [116 120]
 [ 91  99]
 [107 114]
 [127 122]
 [128 111]
 [121 116]
 [107 108]
 [ 98 104]
 [103 110]
 [124 122]
 [115 122]
 [103 135]
 [116 102]
 [117 128]
 [105 107]
 [105 121]
 [122 101]
 [111 116]
 [128 123]
 [114 106]
 [133 117]
 [106 130]
 [119 110]
 [ 80 105]
 [150 132]
 [137 116]
 [104 106]
 [110 100]
 [133 112]
 [139 132]
 [123 104]
 [100 102]
 [114 106]
 [127 119]
 [115 115]
 [107 101]
 [111 104]
 [ 98 116]
 [121 117]
 [105 113]
 [105 103]
 [115 106]
 [121 113]