In [75]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network._multilayer_perceptron import MLPRegressor
from google.oauth2 import service_account
from datetime import datetime as date
from scipy.stats import pearsonr
from scipy.stats import spearmanr

import pandas as pd
import pandas_gbq


In [76]:
# Rolling three-game-average column names for players, derived from the stat stems.
player_3gm_avg = [
    f"{stat}_3gm_avg"
    for stat in (
        "min", "fgm", "fga", "fg%", "3pm", "3pa", "3p%", "ftm", "fta", "ft%",
        "oreb", "dreb", "reb", "ast", "stl", "blk", "to", "pf", "pts", "plus_mins",
    )
]

# Rolling three-game-average column names for teams.
teams_3gm_avg = [
    f"{stat}_3gm_avg"
    for stat in (
        "offrtg", "defrtg", "netrtg", "ast%", "ast_to", "ast_ratio", "oreb%",
        "dreb%", "reb%", "tov%", "efg%", "ts%", "pace", "pie",
    )
]

# Author's note: the rolling columns use shifted windows to prevent data leakage.
# Column names are backtick-quoted because several contain '%' or start with a digit.
player_query = (
    " \n"
    "SELECT player,team,game_id,game_date,matchup,pts,reb,ast,blk,stl,`3pm`, "
    + ",".join(f"`{name}`" for name in player_3gm_avg)
    + ",season\n"
    "from `capstone_data.player_modeling_data`\n"
    "order by game_date asc\n"
)

team_query = (
    "\n"
    "SELECT team,game_id,game_date,home,away, "
    + ", ".join(f"`{name}`" for name in teams_3gm_avg)
    + "\n"
    "from `capstone_data.team_modeling_data`\n"
    "order by game_date asc\n"
)


In [77]:
# Load the merged player/team modelling frame, preferring the local CSV cache
# and falling back to BigQuery (then writing the cache) when it does not exist.

# Defined unconditionally so these names exist whichever branch runs below.
features_for_team = ['home','away'] + teams_3gm_avg
features_for_player = ['pts','reb','ast','blk','stl'] + player_3gm_avg

try:
    # index_col=0: the cache is written with the DataFrame index as its first
    # column; without this it comes back as an extra numeric "Unnamed: 0" column.
    # dtype: game_id values such as "0022100001" carry leading zeros and must
    # stay strings, otherwise the cached frame differs from a fresh download.
    full_data = pd.read_csv('full_data.csv', index_col=0, dtype={'game_id': str})

except FileNotFoundError:
    # Only a missing cache triggers a download; any other read error (corrupt
    # file, permissions) now surfaces instead of being silently swallowed.
    nba_player_data = pandas_gbq.read_gbq(player_query, project_id='miscellaneous-projects-444203')
    team_data = pandas_gbq.read_gbq(team_query, project_id='miscellaneous-projects-444203')
    # Columns present in both frames (other than the join keys) get the
    # 'remove' suffix on the team side and are dropped right after the merge.
    full_data = nba_player_data.merge(team_data, on=['game_id','team'], how='inner', suffixes=('','remove'))
    full_data = full_data.drop(columns=[column for column in full_data.columns if 'remove' in column])
    # mode='x' refuses to overwrite an existing file; safe here because this
    # branch only runs when the cache was not found.
    full_data.to_csv('full_data.csv', mode='x')

Downloading: 100%|[32m██████████[0m|
Downloading: 100%|[32m██████████[0m|


In [78]:
# Show up to 100 columns when a DataFrame is displayed instead of eliding the middle ones.
pd.options.display.max_columns = 100

In [79]:
# Display only the rows belonging to the 2021-2022 season.
full_data.loc[full_data['season'].eq('2021-2022')]

Unnamed: 0,player,team,game_id,game_date,matchup,pts,reb,ast,blk,stl,3pm,min_3gm_avg,fgm_3gm_avg,fga_3gm_avg,fg%_3gm_avg,3pm_3gm_avg,3pa_3gm_avg,3p%_3gm_avg,ftm_3gm_avg,fta_3gm_avg,ft%_3gm_avg,oreb_3gm_avg,dreb_3gm_avg,reb_3gm_avg,ast_3gm_avg,stl_3gm_avg,blk_3gm_avg,to_3gm_avg,pf_3gm_avg,pts_3gm_avg,plus_mins_3gm_avg,season,home,away,offrtg_3gm_avg,defrtg_3gm_avg,netrtg_3gm_avg,ast%_3gm_avg,ast_to_3gm_avg,ast_ratio_3gm_avg,oreb%_3gm_avg,dreb%_3gm_avg,reb%_3gm_avg,tov%_3gm_avg,efg%_3gm_avg,ts%_3gm_avg,pace_3gm_avg,pie_3gm_avg
0,Blake Griffin,BKN,0022100001,2021-10-19,MIL,6.0,5.0,0.0,0.0,1.0,0.0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,2021-2022,0,1,0.00,0.0,0.0,0.00,0.0,0.00,0.0,0.00,0.0,0.0,0.00,0.00,0.0,0.00
1,Bruce Brown,BKN,0022100001,2021-10-19,MIL,0.0,1.0,0.0,1.0,0.0,0.0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,2021-2022,0,1,0.00,0.0,0.0,0.00,0.0,0.00,0.0,0.00,0.0,0.0,0.00,0.00,0.0,0.00
2,Cam Thomas,BKN,0022100001,2021-10-19,MIL,2.0,0.0,0.0,0.0,0.0,0.0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,2021-2022,0,1,0.00,0.0,0.0,0.00,0.0,0.00,0.0,0.00,0.0,0.0,0.00,0.00,0.0,0.00
3,DeAndre' Bembry,BKN,0022100001,2021-10-19,MIL,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,2021-2022,0,1,0.00,0.0,0.0,0.00,0.0,0.00,0.0,0.00,0.0,0.0,0.00,0.00,0.0,0.00
4,James Harden,BKN,0022100001,2021-10-19,MIL,20.0,8.0,8.0,2.0,1.0,4.0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,2021-2022,0,1,0.00,0.0,0.0,0.00,0.0,0.00,0.0,0.00,0.0,0.0,0.00,0.00,0.0,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26024,Jordan Schakel,WAS,0022101217,2022-04-10,CHA,0.0,1.0,0.0,0.0,0.0,0.0,8.74,0.33,3.33,8.33,0.33,2.00,11.10,0.67,0.67,33.33,0.67,1.67,2.33,0.00,0.33,0.00,0.33,0.00,1.67,1.33,2021-2022,0,1,107.47,115.5,-8.0,63.27,2.2,20.37,17.8,80.67,52.2,15.3,57.93,59.47,100.5,47.87
26025,Raul Neto,WAS,0022101217,2022-04-10,CHA,9.0,1.0,2.0,0.0,2.0,0.0,10.90,2.67,4.00,62.70,0.67,2.00,27.77,1.00,1.33,25.00,0.00,0.33,0.33,1.00,0.33,0.00,2.00,1.33,7.00,3.67,2021-2022,0,1,107.47,115.5,-8.0,63.27,2.2,20.37,17.8,80.67,52.2,15.3,57.93,59.47,100.5,47.87
26026,Rui Hachimura,WAS,0022101217,2022-04-10,CHA,21.0,4.0,2.0,0.0,1.0,1.0,32.43,6.67,13.67,47.90,2.00,4.67,45.00,1.67,4.00,44.43,0.67,4.00,4.67,1.67,0.00,0.33,0.67,2.00,17.00,-5.33,2021-2022,0,1,107.47,115.5,-8.0,63.27,2.2,20.37,17.8,80.67,52.2,15.3,57.93,59.47,100.5,47.87
26027,Tomas Satoransky,WAS,0022101217,2022-04-10,CHA,5.0,6.0,9.0,0.0,1.0,1.0,21.98,3.33,6.00,56.80,0.33,1.67,16.67,0.00,0.00,0.00,0.00,1.67,1.67,6.67,0.00,0.00,2.00,1.67,7.00,-3.00,2021-2022,0,1,107.47,115.5,-8.0,63.27,2.2,20.37,17.8,80.67,52.2,15.3,57.93,59.47,100.5,47.87


In [80]:
# Candidate predictor columns: every numeric column except the prediction
# targets and the home/away flags.
numeric_columns = [
    column
    for column in full_data.select_dtypes(include=['number']).columns
    if column not in ('home', 'pts', 'away', 'reb', 'ast', 'blk', 'stl', '3pm')
]

# One independent (initially empty) list of selected feature names per target.
features = {target: list() for target in ('pts', 'reb', 'ast', 'blk', 'stl', '3pm')}

In [81]:
# Pearson screening: for each target, report and keep every candidate column
# whose correlation with the target has a p-value below .05.
for category, selected in features.items():
    print(category)
    for column in numeric_columns:
        coefficient, p_value = pearsonr(full_data[column], full_data[category])
        # Written as "not <" so a NaN p-value is skipped, as a plain "< .05" test would.
        if not p_value < .05:
            continue
        print(column)
        print(f'correalation {coefficient} p_value {p_value}')
        selected.append(column)

pts
min_3gm_avg
correalation 0.5486889965243066 p_value 0.0
fgm_3gm_avg
correalation 0.6430823375522571 p_value 0.0
fga_3gm_avg
correalation 0.661132634828649 p_value 0.0
fg%_3gm_avg
correalation 0.19962367609809753 p_value 0.0
3pm_3gm_avg
correalation 0.40194698694664055 p_value 0.0
3pa_3gm_avg
correalation 0.45691807192824174 p_value 0.0
3p%_3gm_avg
correalation 0.2284038462760503 p_value 0.0
ftm_3gm_avg
correalation 0.557024628574515 p_value 0.0
fta_3gm_avg
correalation 0.5527263640963419 p_value 0.0
ft%_3gm_avg
correalation 0.4651612663458078 p_value 0.0
oreb_3gm_avg
correalation 0.14754343386593016 p_value 0.0
dreb_3gm_avg
correalation 0.42366094719028907 p_value 0.0
reb_3gm_avg
correalation 0.37587866889528604 p_value 0.0
ast_3gm_avg
correalation 0.4859240173887479 p_value 0.0
stl_3gm_avg
correalation 0.28797732665685843 p_value 0.0
blk_3gm_avg
correalation 0.16724826321037015 p_value 0.0
to_3gm_avg
correalation 0.507209656219346 p_value 0.0
pf_3gm_avg
correalation 0.288100636343

In [82]:
# Second screening pass using Spearman rank correlation. Only columns not
# already selected for a target (by the Pearson pass) are tested and reported;
# any with a p-value below .05 are added to that target's feature list.
for category in features.keys():
    print(category)
    for column in numeric_columns:
        if column in features[category]:
            # Already selected; skip the rank-correlation computation entirely.
            continue
        correlation = spearmanr(full_data[column], full_data[category])
        if correlation[1] < .05:
            print(column)
            print(f'correalation {correlation[0]} p_value {correlation[1]}')
            # Record the feature column itself. Previously `category` was
            # appended here, which put the target's own name into its feature
            # list (once per significant column) and discarded the column found.
            features[category].append(column)

pts
netrtg_3gm_avg
correalation -0.009342556698808522 p_value 0.004214770463893035
reb
3p%_3gm_avg
correalation 0.05185284191308604 p_value 7.101001649598678e-57
offrtg_3gm_avg
correalation -0.00945352803490869 p_value 0.0037841553556739265
netrtg_3gm_avg
correalation -0.008158035327815831 p_value 0.012461743078396438
ast_to_3gm_avg
correalation -0.010976222553042488 p_value 0.0007736802990724306
ast_ratio_3gm_avg
correalation -0.011680677280958424 p_value 0.00034645388749476945
tov%_3gm_avg
correalation 0.009897473605338105 p_value 0.0024325914059684815
efg%_3gm_avg
correalation -0.012094912687457294 p_value 0.00021162059100593712
ts%_3gm_avg
correalation -0.012662467602088069 p_value 0.00010506681062603841
pace_3gm_avg
correalation 0.006445648030749776 p_value 0.0483500942940912
pie_3gm_avg
correalation -0.0065332959573559 p_value 0.04537889713472116
ast
netrtg_3gm_avg
correalation -0.009960139740785846 p_value 0.0022823380695304847
blk
stl
offrtg_3gm_avg
correalation -0.011018900041

In [83]:
# Order the rows by game date, earliest first, as a new frame; full_data is left untouched.
data_ordered = full_data.sort_values(by='game_date', ascending=True)

In [84]:
# Display the date-sorted frame as the cell's rich output.
data_ordered

Unnamed: 0,player,team,game_id,game_date,matchup,pts,reb,ast,blk,stl,3pm,min_3gm_avg,fgm_3gm_avg,fga_3gm_avg,fg%_3gm_avg,3pm_3gm_avg,3pa_3gm_avg,3p%_3gm_avg,ftm_3gm_avg,fta_3gm_avg,ft%_3gm_avg,oreb_3gm_avg,dreb_3gm_avg,reb_3gm_avg,ast_3gm_avg,stl_3gm_avg,blk_3gm_avg,to_3gm_avg,pf_3gm_avg,pts_3gm_avg,plus_mins_3gm_avg,season,home,away,offrtg_3gm_avg,defrtg_3gm_avg,netrtg_3gm_avg,ast%_3gm_avg,ast_to_3gm_avg,ast_ratio_3gm_avg,oreb%_3gm_avg,dreb%_3gm_avg,reb%_3gm_avg,tov%_3gm_avg,efg%_3gm_avg,ts%_3gm_avg,pace_3gm_avg,pie_3gm_avg
0,Blake Griffin,BKN,0022100001,2021-10-19,MIL,6.0,5.0,0.0,0.0,1.0,0.0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,2021-2022,0,1,0.0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
21,Moses Moody,GSW,0022100002,2021-10-19,LAL,2.0,2.0,0.0,0.0,0.0,0.0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,2021-2022,0,1,0.0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
22,Nemanja Bjelica,GSW,0022100002,2021-10-19,LAL,15.0,11.0,4.0,0.0,1.0,1.0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,2021-2022,0,1,0.0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
23,Otto Porter Jr,GSW,0022100002,2021-10-19,LAL,5.0,1.0,0.0,0.0,1.0,1.0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,2021-2022,0,1,0.0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
24,Stephen Curry,GSW,0022100002,2021-10-19,LAL,21.0,10.0,10.0,0.0,3.0,2.0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,2021-2022,0,1,0.0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93689,LeBron James,LAL,0022400692,2025-02-01,NYK,33.0,11.0,12.0,0.0,0.0,3.0,31.88,9.33,17.33,54.27,1.00,5.33,20.83,6.00,7.00,83.63,0.33,5.67,6.00,9.33,0.33,0.00,4.33,0.67,25.67,5.67,2024-2025,0,1,118.2,107.57,10.63,60.30,1.71,19.13,26.07,70.90,51.73,17.63,58.60,62.83,99.17,57.47
93690,Max Christie,LAL,0022400692,2025-02-01,NYK,15.0,3.0,2.0,1.0,0.0,1.0,30.56,2.67,7.00,37.10,2.33,5.67,42.23,1.00,1.00,66.67,0.00,3.33,3.33,2.67,0.33,0.33,1.33,2.33,8.67,9.00,2024-2025,0,1,118.2,107.57,10.63,60.30,1.71,19.13,26.07,70.90,51.73,17.63,58.60,62.83,99.17,57.47
93691,Rui Hachimura,LAL,0022400692,2025-02-01,NYK,21.0,3.0,2.0,1.0,1.0,3.0,25.81,6.00,10.00,58.33,2.33,4.33,51.10,0.67,0.67,33.33,0.67,3.67,4.33,1.00,0.67,0.33,0.67,1.33,15.00,8.33,2024-2025,0,1,118.2,107.57,10.63,60.30,1.71,19.13,26.07,70.90,51.73,17.63,58.60,62.83,99.17,57.47
93693,Bam Adebayo,MIA,0022400693,2025-02-01,SAS,30.0,12.0,9.0,3.0,2.0,2.0,39.61,7.67,13.67,55.57,1.00,1.67,50.00,5.00,6.00,86.10,2.33,9.33,11.67,5.33,0.33,0.67,3.67,3.00,21.33,5.33,2024-2025,0,1,110.4,112.20,-1.73,67.63,1.61,19.63,24.40,73.80,50.87,17.73,56.27,60.03,95.43,50.87
