In [19]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network._multilayer_perceptron import MLPRegressor
from google.oauth2 import service_account
from datetime import datetime as date
from scipy.stats import pearsonr
from scipy.stats import spearmanr
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score,make_scorer,mean_squared_error
from sklearn.model_selection import GridSearchCV
from statsmodels.tsa.arima.model import ARIMA

import pandas as pd
import pandas_gbq
import numpy as np
import xgboost

### Gathering Data

In [None]:
player_3gm_avg = ["min_3gm_avg", "fgm_3gm_avg", "fga_3gm_avg", "fg%_3gm_avg", "3pm_3gm_avg", 
                   "3pa_3gm_avg", "3p%_3gm_avg", "ftm_3gm_avg", "fta_3gm_avg", "ft%_3gm_avg", 
                   "oreb_3gm_avg", "dreb_3gm_avg", "reb_3gm_avg", "ast_3gm_avg", "stl_3gm_avg", 
                   "blk_3gm_avg", "to_3gm_avg", "pf_3gm_avg", "pts_3gm_avg", "plus_mins_3gm_avg"]

teams_3gm_avg = ["home","away","offrtg_3gm_avg", "defrtg_3gm_avg", "netrtg_3gm_avg", "ast%_3gm_avg", "ast_to_3gm_avg", 
                    "ast_ratio_3gm_avg", "oreb%_3gm_avg", "dreb%_3gm_avg", "reb%_3gm_avg", "tov%_3gm_avg", 
                    "efg%_3gm_avg", "ts%_3gm_avg", "pace_3gm_avg", "pie_3gm_avg"]

#using shifted windows for rolling data to prevent data leakage
player_query = f""" 
SELECT player,team,game_id,game_date,matchup,pts,reb,ast,`3pm`, {','.join([f'`{player}`' for player in player_3gm_avg])},season
from `capstone_data.player_modeling_data`
order by game_date asc
"""

team_query = f"""
SELECT team,game_id,game_date,home,away, {', '.join([f'`{team}`' for team in teams_3gm_avg])}
from `capstone_data.team_modeling_data`
order by game_date asc
"""


In [21]:
try:
    full_data = pd.read_csv('full_data.csv')

except:
    nba_player_data = pd.DataFrame(pandas_gbq.read_gbq(player_query,project_id='miscellaneous-projects-444203'))
    team_data = pd.DataFrame(pandas_gbq.read_gbq(team_query,project_id='miscellaneous-projects-444203'))
    opponent_data = team_data.rename(columns={
    col: ('matchup' if col == 'team' else 'game_id' if col == 'game_id' else f'opponent_{col}')
    for col in team_data.columns})
    features_for_team = ['home','away'] + teams_3gm_avg
    features_for_player = ['pts','reb','ast','blk','stl'] + player_3gm_avg
    full_data = nba_player_data.merge(team_data, on = ['game_id','team'], how = 'inner',suffixes=('','remove'))
    full_data = nba_player_data.merge(opponent_data,on = ['game_id','matchup'])
    full_data.drop([column for column in full_data.columns if 'remove' in column],axis = 1 , inplace=True)
    full_data.to_csv('full_data.csv',mode = 'x')

In [22]:
pd.set_option('display.max_columns',100)

### Linear Model

In [None]:
numeric_columns = full_data.select_dtypes(include=['number']).columns.tolist()
numeric_columns = [column for column in numeric_columns if column not in ['home','pts','away','reb','ast','blk','stl','3pm']]

features = {feature:[] for feature in ['pts','reb','ast','3pm']}

In [24]:
for category in features.keys():
    print(category)
    for column in numeric_columns:
        correlation = pearsonr(full_data[column],full_data[category])
        if correlation[1] < .05:
            print(column)
            print(f'correalation {correlation[0]} p_value {correlation[1]}')
            features[category].append(column)

pts
Unnamed: 0
correalation 0.007609100937364834 p_value 0.0193210162348264
min_3gm_avg
correalation 0.5490279038245118 p_value 0.0
fgm_3gm_avg
correalation 0.6432319813982464 p_value 0.0
fga_3gm_avg
correalation 0.6613150314735703 p_value 0.0
fg%_3gm_avg
correalation 0.19956333362861087 p_value 0.0
3pm_3gm_avg
correalation 0.4020853403705054 p_value 0.0
3pa_3gm_avg
correalation 0.4571040900199037 p_value 0.0
3p%_3gm_avg
correalation 0.22845278617947226 p_value 0.0
ftm_3gm_avg
correalation 0.5570031936999896 p_value 0.0
fta_3gm_avg
correalation 0.5526959854368131 p_value 0.0
ft%_3gm_avg
correalation 0.4652846987450218 p_value 0.0
oreb_3gm_avg
correalation 0.14725375732882032 p_value 0.0
dreb_3gm_avg
correalation 0.42343659545274853 p_value 0.0
reb_3gm_avg
correalation 0.37560482974397 p_value 0.0
ast_3gm_avg
correalation 0.48600521626876547 p_value 0.0
stl_3gm_avg
correalation 0.28749160322405337 p_value 0.0
blk_3gm_avg
correalation 0.16746462964148384 p_value 0.0
to_3gm_avg
correalati

In [25]:
#Checking spearmanr 

for category in features.keys():
    print(category)
    for column in numeric_columns:
        correlation = spearmanr(full_data[column],full_data[category])
        if correlation[1] < .05 and column not in features[category]:
            print(column)
            print(f'correalation {correlation[0]} p_value {correlation[1]}')
            features[category].append(category)

pts
reb
3p%_3gm_avg
correalation 0.051423248595957885 p_value 2.287601173702068e-56
opponent_reb%_3gm_avg
correalation -0.006613977875529583 p_value 0.04201827384984019
opponent_pace_3gm_avg
correalation 0.015215522836575825 p_value 2.8977587475467627e-06
ast
blk
opponent_defrtg_3gm_avg
correalation 0.007645724968407421 p_value 0.018746274334895553
opponent_oreb%_3gm_avg
correalation 0.007202733030362762 p_value 0.02680505776355268
opponent_pace_3gm_avg
correalation 0.007454192877624759 p_value 0.021925635177532928
stl
opponent_ast%_3gm_avg
correalation 0.008671378261001199 p_value 0.007678986252161165
opponent_efg%_3gm_avg
correalation -0.0078789939088199 p_value 0.015424529340088576
opponent_ts%_3gm_avg
correalation -0.009342795394107876 p_value 0.0040750577232058875
opponent_pace_3gm_avg
correalation 0.012311613860064161 p_value 0.0001536660923506407
3pm


In [26]:
data_ordered = full_data.sort_values('game_date')

In [27]:
for category in features.keys():
    features[category] = [f for f in features[category] if f != category]


In [28]:
split_index = int(len(data_ordered) * .8)

train_data = data_ordered.iloc[:split_index]
test_data = data_ordered[split_index:]

for category in features.keys():
    features_list = [f for f in features[category] if f != category]
    x_train,y_train = train_data[features_list],train_data[category]
    x_test, y_test = test_data[features_list],test_data[category]
    linear_model = LinearRegression()

    linear_model.fit(x_train,y_train)

    output = pd.DataFrame({'prediction':linear_model.predict(x_test), 'actual':y_test})
    print(category)
    print(r2_score(y_true=output['actual'],y_pred=output['prediction']))

pts
0.4831329047141588
reb
0.3969844537849755
ast
0.4558905738127811
blk
0.1483182645051161
stl
0.08499658766602225
3pm
0.3024831516223184


In [29]:
corr_matrix = full_data[features['reb']].corr()
high_corr_vars = np.where(abs(corr_matrix) > 0.8)
high_corr_pairs = [(corr_matrix.index[x], corr_matrix.columns[y]) 
                   for x, y in zip(*high_corr_vars) if x != y and x < y]
print(high_corr_pairs)  # Drop one from each highly correlated pair


[('Unnamed: 0', 'game_id'), ('min_3gm_avg', 'fgm_3gm_avg'), ('min_3gm_avg', 'fga_3gm_avg'), ('min_3gm_avg', 'pts_3gm_avg'), ('fgm_3gm_avg', 'fga_3gm_avg'), ('fgm_3gm_avg', 'pts_3gm_avg'), ('fga_3gm_avg', 'pts_3gm_avg'), ('3pm_3gm_avg', '3pa_3gm_avg'), ('ftm_3gm_avg', 'fta_3gm_avg'), ('dreb_3gm_avg', 'reb_3gm_avg')]


### XGboost

In [30]:
param_grid = {'max_depth':[2,3,4],'eta':[.01,.05,.1,.3],'booster':['gbtree','dart']}
param_linear = {'boost':['gblinear'],'lambda':[0,.1,1],'alpha':[0,.1,1]}

In [31]:
xgb_regressor = xgboost.XGBRegressor()
mse_score = make_scorer(mean_squared_error,greater_is_better=False)
r2_score = make_scorer(r2_score)
scoring = {'MSE':mse_score,'r2':r2_score}
grid_search = GridSearchCV(estimator=xgb_regressor,param_grid=param_grid,scoring = scoring,cv=5,n_jobs=1,verbose=0,refit='r2')

In [32]:
xg_features = [feature for feature in data_ordered.columns if data_ordered[feature].dtype == 'float' and feature not in features.keys()]

In [33]:
for category in features.keys():
    
    x_train,y_train = train_data[xg_features],train_data[category]
    x_test, y_test = test_data[xg_features],test_data[category]
    # print(y_test)
    grid_search.fit(x_train,y_train)

    print(category)
    print(grid_search.best_params_)
    print(grid_search.best_score_)

pts
{'booster': 'dart', 'eta': 0.1, 'max_depth': 4}
0.49552866782078675
reb
{'booster': 'gbtree', 'eta': 0.1, 'max_depth': 4}
0.4055309187577899
ast
{'booster': 'dart', 'eta': 0.1, 'max_depth': 4}
0.47181123352774446
blk
{'booster': 'gbtree', 'eta': 0.05, 'max_depth': 4}
0.1534699190331332
stl
{'booster': 'gbtree', 'eta': 0.1, 'max_depth': 2}
0.08425126079461823
3pm
{'booster': 'dart', 'eta': 0.1, 'max_depth': 3}
0.2994354971806124
