In [3]:
from sklearn.linear_model import LinearRegression,Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network._multilayer_perceptron import MLPRegressor
from statsmodels.tsa.statespace.sarimax import SARIMAX
from google.oauth2 import service_account
from datetime import datetime as date
from scipy.stats import pearsonr
from scipy.stats import spearmanr
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import TimeSeriesSplit 
from sklearn.metrics import r2_score,make_scorer,mean_squared_error
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import mutual_info_regression
from statsmodels.tsa.stattools import adfuller

import joblib
import pickle
import pandas as pd
import pandas_gbq
import numpy as np
import xgboost
import lightgbm
import os

### Gathering Data and Feature Engineering

In [4]:


#using shifted windows for rolling data to prevent data leakage
player_query = f""" 
SELECT *
from `capstone_data.player_modeling_data_partitioned`
order by game_date asc
"""

team_query = f"""
SELECT *
from `capstone_data.team_modeling_data_partitioned`
order by game_date asc
"""


In [5]:
 
try:
    full_data = pd.read_csv('full_data.csv')

except:
    nba_player_data = pd.DataFrame(pandas_gbq.read_gbq(player_query,project_id='miscellaneous-projects-444203',progress_bar_type='tqdm'))
    team_data = pd.DataFrame(pandas_gbq.read_gbq(team_query,project_id='miscellaneous-projects-444203',progress_bar_type='tqdm'))
    team_data  = team_data.merge(team_data,on='game_id',suffixes=('',"_opponent"))
    team_data = team_data[team_data["team_id"] != team_data["team_id_opponent"]]
    full_data = nba_player_data.merge(team_data, on = ['game_id','team'], how = 'inner',suffixes=('','remove'))
    full_data.drop([column for column in full_data.columns if 'remove' in column],axis = 1 , inplace=True) 
    full_data.drop([column for column in full_data.columns if '_1' in column],axis = 1 , inplace=True)
    full_data.to_csv('full_data.csv',mode = 'x')

In [6]:
data_ordered = full_data.sort_values('game_date')

data_ordered.dropna(inplace=True)


#### Feature Engineering Ideas 

* (ratio of 3pa and fga and 3pm and 3pa) TS% for players efg% 
* for players assist_to_turnover ratio assist ratio, 
* rebound_cahnce, defesnive reb %, 
* ast_ratio_season * pace, 
* home * pts season - data pts 3pm avg,
* cold_streak pts_3gm_avg < pts_season boolean, 
* away difficulty away * opponent_defrtg_3gm_avg,
* home_performance = data_ordered[data_ordered["home"] == 1].groupby("team")["pts_season"].mean()
* away_performance = data_ordered[data_ordered["away"] == 1].groupby("team")["pts_season"].mean() these would be to see how the team performance changes 


In [7]:
# data_ordered['pts_per_min_3gm'] = data_ordered['pts_3gm_avg']/data_ordered['min_3gm_avg']
# data_ordered['pts_per_min_season'] = data_ordered['pts_season']/data_ordered['min_season']
# data_ordered['pts_per_min_momentum'] = data_ordered['pts_per_min_3gm'] - data_ordered['pts_per_min_season']

# data_ordered['fg3m_per_min_3gm'] = data_ordered['fg3m_3gm_avg']/data_ordered['min_3gm_avg']
# data_ordered['fg3m_per_min_season'] = data_ordered['fg3m_season']/data_ordered['min_season']
# data_ordered['fg3m_per_min_momentum'] = data_ordered['fg3m_per_min_3gm'] - data_ordered['fg3m_per_min_season'] 

# data_ordered['reb_per_min_3gm'] = data_ordered['reb_3gm_avg']/data_ordered['min_3gm_avg']
# data_ordered['reb_per_min_season'] = data_ordered['reb_season']/data_ordered['min_season']
# data_ordered['reb_per_min_momentum'] = data_ordered['fg3m_per_min_3gm'] - data_ordered['reb_per_min_season']


In [8]:
data_ordered = data_ordered.groupby(['player','season']).apply(lambda x: x.iloc[3:]).reset_index(drop=True)

  data_ordered = data_ordered.groupby(['player','season']).apply(lambda x: x.iloc[3:]).reset_index(drop=True)


In [9]:
data_ordered.sort_values(by='game_date',inplace=True)

In [10]:
data_ordered['game_date'] = pd.to_datetime(data_ordered['game_date'])

In [11]:
data_ordered['days_ago'] = (data_ordered['game_date'].max() - data_ordered['game_date']).dt.days
data_ordered['time_decay_weight'] = 1 / (1 + np.log(1 + data_ordered['days_ago']))

In [12]:
pd.set_option('display.max_columns',100000)

In [13]:
try:
    data_ordered = data_ordered.drop('Unnamed: 0', axis =1)
except KeyError:
    print('Irregular column not made')

In [14]:
# Fill NaNs with the column mean, but only for numeric columns
data_ordered.fillna(data_ordered.select_dtypes(include=['number']).mean(), inplace=True)


In [15]:
numeric_columns = data_ordered.select_dtypes(include=['number']).columns.tolist()
numeric_columns = [column for column in numeric_columns if column not in ['pts','reb','ast','blk','stl','3pm','game_id','game_date','days_ago','time_decay_weight','team_id', "gp_rank", "w_rank", "l_rank", "w_pct_rank", "min_rank", "fgm_rank",
    "fga_rank", "fg_pct_rank", "fg3m_rank", "fg3a_rank", "fg3_pct_rank",
    "ftm_rank", "fta_rank", "ft_pct_rank", "oreb_rank", "dreb_rank",
    "reb_rank", "ast_rank", "tov_rank", "stl_rank", "blk_rank",
    "blka_rank", "pf_rank", "pfd_rank", "pts_rank", "plus_minus_rank",]]

numeric_columns = [feature for feature in numeric_columns if any(keyword in feature for keyword in ["3gm_avg", "season", "momentum"])]
features = {feature:[] for feature in ['pts','reb','ast','3pm']}

In [16]:
numeric_columns

['min_3gm_avg',
 'min_season',
 'min_momentum',
 'fgm_3gm_avg',
 'fgm_season',
 'fgm_momentum',
 'fga_3gm_avg',
 'fga_season',
 'fga_momentum',
 'fg_pct_3gm_avg',
 'fg_pct_season',
 'fg_pct_momentum',
 'fg3m_3gm_avg',
 'fg3m_season',
 'fg3m_momentum',
 'fg3a_3gm_avg',
 'fg3a_season',
 'fg3a_momentum',
 'fg3_pct_3gm_avg',
 'fg3_pct_season',
 'fg3_pct_momentum',
 'ftm_3gm_avg',
 'ftm_season',
 'ftm_momentum',
 'fta_3gm_avg',
 'fta_season',
 'fta_momentum',
 'ft_pct_3gm_avg',
 'ft_pct_season',
 'ft_pct_momentum',
 'oreb_3gm_avg',
 'oreb_season',
 'oreb_momentum',
 'dreb_3gm_avg',
 'dreb_season',
 'dreb_momentum',
 'reb_3gm_avg',
 'reb_season',
 'reb_momentum',
 'ast_3gm_avg',
 'ast_season',
 'ast_momentum',
 'stl_3gm_avg',
 'stl_season',
 'stl_momentum',
 'blk_3gm_avg',
 'blk_season',
 'blk_momentum',
 'to_3gm_avg',
 'to_season',
 'to_momentum',
 'pf_3gm_avg',
 'pf_season',
 'pf_momentum',
 'pts_3gm_avg',
 'pts_season',
 'pts_momentum',
 'plus_minus_3gm_avg',
 'plus_minus_season',
 'plus_

In [17]:
split_index = int(len(data_ordered) * .80)

train_data = data_ordered.iloc[:split_index]
test_data = data_ordered[split_index:]

In [18]:


for category in features.keys():
    x = train_data[numeric_columns]
    y = train_data[category]

    mi_scores = mutual_info_regression(x, y)
    mi_scores = pd.Series(mi_scores, index=numeric_columns)
    selected_features = mi_scores[mi_scores > 0.1].index.tolist()  # Keep features with MI > 0.05

    features[category] = selected_features


In [19]:
#These values appeared to have non-linear relationships applying transformations
# data_ordered['ft%_season'] = np.log1p(data_ordered['ft%_season'])
# data_ordered['stl_3gm_avg'] = np.log1p(data_ordered['stl_3gm_avg'])
# data_ordered['stl_season'] = np.log1p(data_ordered['stl_season'])
# data_ordered['to_season'] = data_ordered['to_season']**2 
# data_ordered['to_3gm_avg'] = data_ordered['to_3gm_avg']**2 

In [20]:
tscv = TimeSeriesSplit(n_splits=5)

In [21]:
saved_models = {category:{} for category in ['pts','reb','ast','3pm']} 
saved_results = {category:{} for category in ['pts','reb','ast','3pm']}

#### SHAP
Applying shap to help reduce collinearity

### Linear Model

In [22]:

for category in features.keys():

    features_list = [f for f in features[category] if f != category]
    print(len(features_list))
    x_train,y_train = train_data[features_list],train_data[category]
    x_test, y_test = test_data[features_list],test_data[category]
    linear_model = LinearRegression()

    linear_model.fit(x_train,y_train)

    y_pred = linear_model.predict(x_test)
    print(category)
    print(r2_score(y_true=y_test,y_pred=y_pred))

    saved_results[category]['linear_model']={'r2':{r2_score(y_true=y_test,y_pred=y_pred)}, 'mse':{mean_squared_error(y_true=y_test,y_pred=y_pred)}}
    saved_models[category]['linear_model'] = linear_model

173
pts
0.5972425215458068
120
reb
0.5331961310637111
100
ast
0.5745602343485372
92
3pm
0.40394212744145974


In [23]:
for category in features.keys():
    features_list = [f for f in features[category] if f != category]
    x_train,y_train = train_data[features_list],train_data[category]
    x_test, y_test = test_data[features_list],test_data[category]
    ridge_model = Ridge(alpha=1)

    ridge_model.fit(x_train,y_train)

    output = pd.DataFrame({'prediction':ridge_model.predict(x_test), 'actual':y_test})
    print(category)
    print(r2_score(y_true=output['actual'],y_pred=output['prediction']))

pts
0.5972451814379129
reb
0.5332161552544693
ast
0.5745602379669869
3pm
0.403942153590657


In [24]:
# from sklearn.feature_selection import RFE
# from sklearn.linear_model import Ridge, Lasso

# features = {}

# for category in ['pts', 'reb', 'ast', '3pm']:
#     x_train = train_data[numeric_columns]
#     y_train = train_data[category]

#     # Use Lasso to select features automatically
#     lasso = Lasso(alpha=0.01)  # Adjust alpha based on tuning
#     lasso.fit(x_train, y_train)

#     # Keep only features with nonzero coefficients
#     selected_features = [f for f, coef in zip(numeric_columns, lasso.coef_) if coef != 0]

#     # Cap the number of features (e.g., max 50)
#     selected_features = selected_features[:min(len(selected_features), 50)]

#     features[category] = selected_features

In [25]:
from sklearn.model_selection import cross_val_score

for category in features.keys():
    features_list = features[category]

    x_train, y_train = train_data[features_list], train_data[category]
    x_test, y_test = test_data[features_list], test_data[category]

    linear_model = Ridge(alpha=1.0)  # Use Ridge instead of LinearRegression
    linear_model.fit(x_train, y_train)

    # Cross-validation score instead of just test R²
    cv_r2 = cross_val_score(linear_model, x_train, y_train, cv=5, scoring='r2').mean()

    y_pred = linear_model.predict(x_test)
    test_r2 = r2_score(y_test, y_pred)

    print(f"{category}: Cross-Val R² = {cv_r2:.4f}, Test R² = {test_r2:.4f}")


pts: Cross-Val R² = 0.5725, Test R² = 0.5972
reb: Cross-Val R² = 0.5316, Test R² = 0.5332
ast: Cross-Val R² = 0.5818, Test R² = 0.5746
3pm: Cross-Val R² = 0.4018, Test R² = 0.4039


### XGboost

In [26]:
scaler = StandardScaler()

scaled_data = scaler.fit_transform(data_ordered[numeric_columns])

scaled_data_df = pd.DataFrame(scaled_data,columns=numeric_columns)

split_index = int(len(data_ordered) * .80)

scaled_train_data = scaled_data_df.iloc[:split_index]
scaled_test_data = scaled_data_df[split_index:]

In [27]:
param_grid = {'max_depth':[3,6,9],'learning_rate':[.01,.05,.1,.3],'booster':['gbtree','dart'],'subsample':[.5,.7,.9],'colsample_bytree':[.5,.7,.9],'n_estimators':[100,300,500]}
param_linear = {'booster':['gblinear'],'lambda':[0,.1,1,10],'alpha':[0,.1,1,10]}

In [28]:
xgb_regressor = xgboost.XGBRegressor()
mse_score = make_scorer(mean_squared_error,greater_is_better=False)
r2_scorer = make_scorer(r2_score)
scoring = {'MSE':mse_score,'r2':r2_scorer}
grid_search = GridSearchCV(estimator=xgb_regressor,param_grid=param_grid,scoring = scoring,cv=tscv,n_jobs=1,verbose=0,refit='r2')
grid_linear_search = GridSearchCV(estimator=xgb_regressor,param_grid=param_linear,scoring = scoring,cv=tscv,n_jobs=3,verbose=0,refit='r2')


In [29]:
xg_features =  features

In [30]:
xg_features['reb']

['min_3gm_avg',
 'min_season',
 'fgm_3gm_avg',
 'fgm_season',
 'fg_pct_season',
 'fta_season',
 'oreb_3gm_avg',
 'oreb_season',
 'dreb_3gm_avg',
 'dreb_season',
 'reb_3gm_avg',
 'reb_season',
 'blk_season',
 'pf_season',
 'pts_3gm_avg',
 'pts_season',
 'fgm_rank_season',
 'fgm_rank_momentum',
 'fga_rank_3gm_avg',
 'fga_rank_season',
 'fga_rank_momentum',
 'fg_pct_rank_3gm_avg',
 'fg_pct_rank_season',
 'fg_pct_rank_momentum',
 'fg3m_rank_season',
 'fg3m_rank_momentum',
 'fg3a_rank_3gm_avg',
 'fg3a_rank_season',
 'fg3a_rank_momentum',
 'fg3_pct_rank_3gm_avg',
 'fg3_pct_rank_season',
 'fg3_pct_rank_momentum',
 'ftm_rank_3gm_avg',
 'ftm_rank_season',
 'ftm_rank_momentum',
 'fta_rank_3gm_avg',
 'fta_rank_season',
 'fta_rank_momentum',
 'ft_pct_rank_3gm_avg',
 'ft_pct_rank_season',
 'ft_pct_rank_momentum',
 'oreb_rank_season',
 'oreb_rank_momentum',
 'dreb_rank_season',
 'dreb_rank_momentum',
 'reb_rank_3gm_avg',
 'reb_rank_season',
 'reb_rank_momentum',
 'ast_rank_season',
 'ast_rank_moment

In [31]:
for category in features.keys():
    x_train,y_train = scaled_train_data[xg_features[category]],train_data[category]
    x_test, y_test = scaled_test_data[xg_features[category]],test_data[category]

    fit_params = {'eval_set':[(x_test,y_test)],'early_stopping_rounds':20,'verbose':False}

    grid_linear_search.estimator.set_params(eval_metric='rmse')


    grid_linear_search.fit(x_train,y_train)


    print(category)
    print(grid_linear_search.best_params_)
    print(grid_linear_search.best_score_)

    y_pred = grid_linear_search.best_estimator_.predict(x_test)

    saved_models[category]['XGboost'] = grid_linear_search.best_estimator_
    saved_results[category]['XGboost']={'r2':{r2_score(y_true=y_test,y_pred=y_pred)}, 'mse':{mean_squared_error(y_true=y_test,y_pred=y_pred)}}


pts
{'alpha': 0, 'booster': 'gblinear', 'lambda': 0}
0.5746073167783181
reb
{'alpha': 0, 'booster': 'gblinear', 'lambda': 0}
0.5322043614336138
ast
{'alpha': 0, 'booster': 'gblinear', 'lambda': 0}
0.5796047472616144
3pm
{'alpha': 0, 'booster': 'gblinear', 'lambda': 0}
0.3997743089742949


### LightGBM

In [32]:
light = lightgbm.LGBMRegressor(boosting_type='gbdt', n_estimators=500)  # Using hist for faster training

param_grid = {
    'num_leaves': [31, 50],  
    'learning_rate': [0.01, 0.1],  
    'max_depth': [-1, 10],  
}


In [33]:
light_grid_search = GridSearchCV(estimator=light,param_grid=param_grid,cv=tscv,verbose=0,n_jobs=4)

In [34]:
split_index = int(len(data_ordered) * .80)

train_data = data_ordered.iloc[:split_index]
test_data = data_ordered[split_index:]
for category in features.keys():
    x_train,y_train = train_data[features[category]],train_data[category]
    x_test,y_test = test_data[features[category]],test_data[category]

    light_grid_search.fit(x_train,y_train)

    best_model = light_grid_search.best_estimator_
    print(category)
    print("Best Parameters:", light_grid_search.best_params_)

    y_pred = best_model.predict(x_test)

    mse = mean_squared_error(y_test,y_pred)
    r2 = r2_score(y_test,y_pred)

    saved_models[category]['lightgbm'] = best_model
    print(f'MSE: {mse}')
    print(f'R2: {r2}')

    saved_results[category]['lightgbm']={'r2':{r2_score(y_true=y_test,y_pred=y_pred)}, 'mse':{mean_squared_error(y_true=y_test,y_pred=y_pred)}}


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.020365 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 41312
[LightGBM] [Info] Number of data points in the train set: 64498, number of used features: 173
[LightGBM] [Info] Start training from score 9.752783
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.055914 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 41378
[LightGBM] [Info] Number of data points in the train set: 128992, number of used features: 173
[LightGBM] [Info] Start training from score 9.903932
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.200758 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 41412
[LightGBM] [Info] Number of data points in the tra



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.144560 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 41412
[LightGBM] [Info] Number of data points in the train set: 193486, number of used features: 173
[LightGBM] [Info] Start training from score 10.073039
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.158133 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 41416
[LightGBM] [Info] Number of data points in the train set: 257980, number of used features: 173
[LightGBM] [Info] Start training from score 10.218370
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.199141 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 41411
[LightGBM] [Info] Number of data points in the train set: 322474, number of used features: 173
[LightGBM] [Info

  _data = np.array(data, dtype=dtype, copy=copy,


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.092624 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 41414
[LightGBM] [Info] Number of data points in the train set: 386968, number of used features: 173
[LightGBM] [Info] Start training from score 10.389676
pts
Best Parameters: {'learning_rate': 0.01, 'max_depth': -1, 'num_leaves': 50}
MSE: 30.827364605526707
R2: 0.6135017686869715
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.013903 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 29490
[LightGBM] [Info] Number of data points in the train set: 64498, number of used features: 120
[LightGBM] [Info] Start training from score 4.145818
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.077988 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not 

  _data = np.array(data, dtype=dtype, copy=copy,


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.068758 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 29538
[LightGBM] [Info] Number of data points in the train set: 386968, number of used features: 120
[LightGBM] [Info] Start training from score 4.189031
reb
Best Parameters: {'learning_rate': 0.01, 'max_depth': 10, 'num_leaves': 31}
MSE: 5.599741326148006
R2: 0.535635523749175
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011143 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 24425
[LightGBM] [Info] Number of data points in the train set: 64498, number of used features: 100
[LightGBM] [Info] Start training from score 2.107879
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.060209 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total 

  _data = np.array(data, dtype=dtype, copy=copy,


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.041508 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 24471
[LightGBM] [Info] Number of data points in the train set: 386968, number of used features: 100
[LightGBM] [Info] Start training from score 2.278346
ast
Best Parameters: {'learning_rate': 0.01, 'max_depth': 10, 'num_leaves': 31}
MSE: 2.9915439188610473
R2: 0.5764292641649603
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010055 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22559
[LightGBM] [Info] Number of data points in the train set: 64498, number of used features: 92
[LightGBM] [Info] Start training from score 0.826010
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.050342 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total

  _data = np.array(data, dtype=dtype, copy=copy,


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.032754 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22607
[LightGBM] [Info] Number of data points in the train set: 386968, number of used features: 92
[LightGBM] [Info] Start training from score 1.062313
3pm
Best Parameters: {'learning_rate': 0.01, 'max_depth': -1, 'num_leaves': 50}
MSE: 1.2867866885051822
R2: 0.43836569622782817


In [35]:
joblib.dump(saved_models,'models.pkl')

['models.pkl']

In [36]:
with open('saved_performance.txt', 'w') as file:
    for category, models in saved_results.items():
        file.write(f"Category: {category}\n")
        for model, metrics in models.items():
            file.write(f"  Model: {model}\n")
            for metric, value in metrics.items():
                file.write(f"    {metric}: {value}\n")
        file.write("\n")  # Newline between categories
