In [46]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network._multilayer_perceptron import MLPRegressor
from google.oauth2 import service_account
from datetime import datetime as date
from scipy.stats import pearsonr
from scipy.stats import spearmanr
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score,make_scorer,mean_squared_error
from sklearn.model_selection import GridSearchCV
from statsmodels.tsa.arima.model import ARIMA

import pandas as pd
import pandas_gbq
import numpy as np
import xgboost

### Gathering Data and Feature Engineering

In [47]:
player_features= ["min_3gm_avg", "fgm_3gm_avg", "fga_3gm_avg", "fg%_3gm_avg", "3pm_3gm_avg", 
                   "3pa_3gm_avg", "3p%_3gm_avg", "ftm_3gm_avg", "fta_3gm_avg", "ft%_3gm_avg", 
                   "oreb_3gm_avg", "dreb_3gm_avg", "reb_3gm_avg", "ast_3gm_avg", "stl_3gm_avg", 
                   "blk_3gm_avg", "to_3gm_avg", "pf_3gm_avg", "pts_3gm_avg", "plus_mins_3gm_avg",
                   "pts_season", "pts_momentum", "min_season", "min_momentum", "fgm_season", "fgm_momentum", 
                    "fga_season", "fga_momentum", "fg%_season", "fg%_momentum", "3pm_season", "3pm_momentum", 
                    "3pa_season", "3pa_momentum", "3p%_season", "3p%_momentum", "ftm_season", "ftm_momentum", 
                    "fta_season", "fta_momentum", "ft%_season", "ft%_momentum", "oreb_season", "oreb_momentum", 
                    "dreb_season", "dreb_momentum", "reb_season", "reb_momentum", "ast_season", "ast_momentum", 
                    "stl_season", "stl_momentum", "blk_season", "blk_momentum", "to_season", "to_momentum", 
                    "pf_season", "pf_momentum", "plus_mins_season", "plus_mins_momentum"]

team_features = ["home","away","offrtg_3gm_avg", "defrtg_3gm_avg", "netrtg_3gm_avg", "ast%_3gm_avg", "ast_to_3gm_avg", 
                    "ast_ratio_3gm_avg", "oreb%_3gm_avg", "dreb%_3gm_avg", "reb%_3gm_avg", "tov%_3gm_avg", 
                    "efg%_3gm_avg", "ts%_3gm_avg", "pace_3gm_avg", "pie_3gm_avg",
                    "netrtg_season", "netrtg_momentum", "offrtg_season", "offrtg_momentum", "defrtg_season", "defrtg_momentum", 
                    "ast%_season", "ast%_momentum", "ast_to_season", "ast_to_momentum", "ast_ratio_season", "ast_ratio_momentum", 
                    "oreb%_season", "oreb%_momentum", "dreb%_season", "dreb%_momentum", "reb%_season", "reb%_momentum", 
                    "tov%_season", "tov%_momentum", "efg%_season", "efg%_momentum", "ts%_season", "ts%_momentum", 
                    "pace_season", "pace_momentum", "pie_season", "pie_momentum"]

#using shifted windows for rolling data to prevent data leakage
player_query = f""" 
SELECT player,team,game_id,game_date,matchup,pts,reb,ast,`3pm`, {','.join([f'`{player}`' for player in player_features])},season
from `capstone_data.player_modeling_data`
order by game_date asc
"""

team_query = f"""
SELECT team,game_id,game_date,home,away, {', '.join([f'`{team}`' for team in team_features])}
from `capstone_data.team_modeling_data`
order by game_date asc
"""


In [48]:
try:
    full_data = pd.read_csv('full_data.csv')

except:
    nba_player_data = pd.DataFrame(pandas_gbq.read_gbq(player_query,project_id='miscellaneous-projects-444203'))
    team_data = pd.DataFrame(pandas_gbq.read_gbq(team_query,project_id='miscellaneous-projects-444203'))
    opponent_data = team_data.rename(columns={
    col: ('matchup' if col == 'team' else 'game_id' if col == 'game_id' else f'opponent_{col}')
    for col in team_data.columns})
    features_for_team = ['home','away'] + team_features
    features_for_player = ['pts','reb','ast','blk','stl'] + player_features
    full_data = nba_player_data.merge(team_data, on = ['game_id','team'], how = 'inner',suffixes=('','remove'))
    full_data = nba_player_data.merge(opponent_data,on = ['game_id','matchup'])
    full_data.drop([column for column in full_data.columns if 'remove' in column],axis = 1 , inplace=True)
    full_data.to_csv('full_data.csv',mode = 'x')



Downloading: 100%|[32m██████████[0m|
Downloading: 100%|[32m██████████[0m|


In [49]:
data_ordered = full_data.sort_values('game_date')

data_ordered.dropna(inplace=True)

data_ordered = data_ordered.groupby(['player','season']).apply(lambda x: x.iloc[3:]).reset_index(drop=True)

  data_ordered = data_ordered.groupby(['player','season']).apply(lambda x: x.iloc[3:]).reset_index(drop=True)


In [50]:
pd.set_option('display.max_columns',100)

### Linear Model

In [51]:
numeric_columns = full_data.select_dtypes(include=['number']).columns.tolist()
numeric_columns = [column for column in numeric_columns if column not in ['home','pts','away','reb','ast','blk','stl','3pm']]

features = {feature:[] for feature in ['pts','reb','ast','3pm']}

In [52]:
for category in features.keys():
    print(category)
    for column in numeric_columns:
        correlation = pearsonr(full_data[column],full_data[category])
        if correlation[1] < .05:
            print(column)
            print(f'correalation {correlation[0]} p_value {correlation[1]}')
            features[category].append(column)

pts
min_3gm_avg
correalation 0.5490739394432836 p_value 0.0
fgm_3gm_avg
correalation 0.6433296507777794 p_value 0.0
fga_3gm_avg
correalation 0.6613742328846867 p_value 0.0
fg%_3gm_avg
correalation 0.19966856930412394 p_value 0.0
3pm_3gm_avg
correalation 0.40219479112557166 p_value 0.0
3pa_3gm_avg
correalation 0.4571765411393891 p_value 0.0
3p%_3gm_avg
correalation 0.22848134678035573 p_value 0.0
ftm_3gm_avg
correalation 0.5570888974632981 p_value 0.0
fta_3gm_avg
correalation 0.5527660992054668 p_value 0.0
ft%_3gm_avg
correalation 0.4652825124036077 p_value 0.0
oreb_3gm_avg
correalation 0.14721870215168717 p_value 0.0
dreb_3gm_avg
correalation 0.42330731916710035 p_value 0.0
reb_3gm_avg
correalation 0.3754762876748695 p_value 0.0
ast_3gm_avg
correalation 0.4860797860730296 p_value 0.0
stl_3gm_avg
correalation 0.2875710485962085 p_value 0.0
blk_3gm_avg
correalation 0.16745655440119817 p_value 0.0
to_3gm_avg
correalation 0.5074384822306152 p_value 0.0
pf_3gm_avg
correalation 0.28847130346

In [53]:
#Checking spearmanr 

for category in features.keys():
    print(category)
    for column in numeric_columns:
        correlation = spearmanr(full_data[column],full_data[category])
        if correlation[1] < .05 and column not in features[category]:
            print(column)
            print(f'correalation {correlation[0]} p_value {correlation[1]}')
            features[category].append(category)

pts
opponent_reb%_season
correalation -0.009093202518908963 p_value 0.005181055269378617
reb
3p%_3gm_avg
correalation 0.05129346887028831 p_value 4.312963099077646e-56
3p%_momentum
correalation -0.012200545236413278 p_value 0.0001761873464005649
opponent_reb%_3gm_avg
correalation -0.006613977875529583 p_value 0.04201827384984019
opponent_pace_3gm_avg
correalation 0.015215522836575825 p_value 2.8977587475467627e-06
opponent_ast%_season
correalation -0.007042447999180673 p_value 0.03038254659221552
opponent_ast_ratio_season
correalation -0.006821460732270323 p_value 0.035982320775923136
ast
plus_mins_momentum
correalation -0.006739262131136064 p_value 0.038278854174455916
opponent_reb%_season
correalation -0.006511936182348656 p_value 0.04528820468551557
opponent_efg%_season
correalation 0.009685645160408152 p_value 0.0029041926909369243
3pm
reb_momentum
correalation -0.01552750182564823 p_value 1.8072390972477286e-06
opponent_dreb%_season
correalation -0.008468076154123353 p_value 0.009

In [54]:
for category in features.keys():
    features[category] = [f for f in features[category] if f != category]


In [55]:
split_index = int(len(data_ordered) * .8)

train_data = data_ordered.iloc[:split_index]
test_data = data_ordered[split_index:]

for category in features.keys():
    features_list = [f for f in features[category] if f != category]
    x_train,y_train = train_data[features_list],train_data[category]
    x_test, y_test = test_data[features_list],test_data[category]
    linear_model = LinearRegression()

    linear_model.fit(x_train,y_train)

    output = pd.DataFrame({'prediction':linear_model.predict(x_test), 'actual':y_test})
    print(category)
    print(r2_score(y_true=output['actual'],y_pred=output['prediction']))

pts
0.5401367245485902
reb
0.43323986297709594
ast
0.5647322144819158
3pm
0.3666347703988847


In [56]:
corr_matrix = full_data[features['reb']].corr()
high_corr_vars = np.where(abs(corr_matrix) > 0.8)
high_corr_pairs = [(corr_matrix.index[x], corr_matrix.columns[y]) 
                   for x, y in zip(*high_corr_vars) if x != y and x < y]
print(high_corr_pairs)  # Drop one from each highly correlated pair


[('min_3gm_avg', 'fgm_3gm_avg'), ('min_3gm_avg', 'fga_3gm_avg'), ('min_3gm_avg', 'pts_3gm_avg'), ('fgm_3gm_avg', 'fga_3gm_avg'), ('fgm_3gm_avg', 'pts_3gm_avg'), ('fgm_3gm_avg', 'pts_season'), ('fgm_3gm_avg', 'fgm_season'), ('fga_3gm_avg', 'pts_3gm_avg'), ('fga_3gm_avg', 'pts_season'), ('fga_3gm_avg', 'fgm_season'), ('fga_3gm_avg', 'fga_season'), ('3pm_3gm_avg', '3pa_3gm_avg'), ('3pa_3gm_avg', '3pm_season'), ('3pa_3gm_avg', '3pa_season'), ('ftm_3gm_avg', 'fta_3gm_avg'), ('dreb_3gm_avg', 'reb_3gm_avg'), ('reb_3gm_avg', 'reb_season'), ('ast_3gm_avg', 'ast_season'), ('pts_3gm_avg', 'pts_season'), ('pts_3gm_avg', 'fgm_season'), ('pts_3gm_avg', 'fga_season'), ('plus_mins_3gm_avg', 'plus_mins_momentum'), ('pts_season', 'min_season'), ('pts_season', 'fgm_season'), ('pts_season', 'fga_season'), ('pts_season', 'ftm_season'), ('pts_season', 'fta_season'), ('pts_season', 'ft%_season'), ('pts_season', 'to_season'), ('pts_momentum', 'fgm_momentum'), ('pts_momentum', 'fga_momentum'), ('min_season', '

### XGboost

In [57]:
param_grid = {'max_depth':[2,3,4],'eta':[.01,.05,.1,.3],'booster':['gbtree','dart']}
param_linear = {'boost':['gblinear'],'lambda':[0,.1,1],'alpha':[0,.1,1]}

In [58]:
xgb_regressor = xgboost.XGBRegressor()
mse_score = make_scorer(mean_squared_error,greater_is_better=False)
r2_score = make_scorer(r2_score)
scoring = {'MSE':mse_score,'r2':r2_score}
grid_search = GridSearchCV(estimator=xgb_regressor,param_grid=param_grid,scoring = scoring,cv=5,n_jobs=1,verbose=0,refit='r2')

In [59]:
xg_features = [feature for feature in data_ordered.columns if data_ordered[feature].dtype == 'float' and feature not in features.keys()]

In [60]:
for category in features.keys():
    
    x_train,y_train = train_data[xg_features],train_data[category]
    x_test, y_test = test_data[xg_features],test_data[category]
    # print(y_test)
    grid_search.fit(x_train,y_train)

    print(category)
    print(grid_search.best_params_)
    print(grid_search.best_score_)

pts
{'booster': 'dart', 'eta': 0.05, 'max_depth': 4}
0.5403287610650266
reb
{'booster': 'dart', 'eta': 0.1, 'max_depth': 3}
0.4785974454265366


KeyboardInterrupt: 