In [4]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network._multilayer_perceptron import MLPRegressor
from google.oauth2 import service_account
from datetime import datetime as date
from scipy.stats import pearsonr
from scipy.stats import spearmanr

import pandas as pd
import pandas_gbq


In [6]:
# Per-player columns holding three-game averages of box-score stats.
player_3gm_avg = [
    "min_3gm_avg",
    "fgm_3gm_avg", "fga_3gm_avg", "fg%_3gm_avg",
    "3pm_3gm_avg", "3pa_3gm_avg", "3p%_3gm_avg",
    "ftm_3gm_avg", "fta_3gm_avg", "ft%_3gm_avg",
    "oreb_3gm_avg", "dreb_3gm_avg", "reb_3gm_avg",
    "ast_3gm_avg", "stl_3gm_avg", "blk_3gm_avg",
    "to_3gm_avg", "pf_3gm_avg", "pts_3gm_avg",
    "plus_mins_3gm_avg",
]

# Per-team columns holding three-game averages of team-level metrics.
teams_3gm_avg = [
    "offrtg_3gm_avg", "defrtg_3gm_avg", "netrtg_3gm_avg",
    "ast%_3gm_avg", "ast_to_3gm_avg", "ast_ratio_3gm_avg",
    "oreb%_3gm_avg", "dreb%_3gm_avg", "reb%_3gm_avg",
    "tov%_3gm_avg", "efg%_3gm_avg", "ts%_3gm_avg",
    "pace_3gm_avg", "pie_3gm_avg",
]


def _backticked(columns, separator):
    """Wrap every column name in backticks and join the names with `separator`.

    Quoting is required because several names contain `%` or start with a digit.
    """
    return separator.join(f'`{name}`' for name in columns)


# NOTE: the *_3gm_avg columns are meant to be shifted rolling windows (to prevent
# data leakage); that computation happens upstream and is not visible here.
player_query = (
    " \n"
    "SELECT player,team,game_id,game_date,matchup,pts,reb,ast,blk,stl,`3pm`, "
    f"{_backticked(player_3gm_avg, ',')},season\n"
    "from `capstone_data.player_modeling_data`\n"
    "order by game_date asc\n"
)

team_query = (
    "\n"
    "SELECT team,game_id,game_date,home,away, "
    f"{_backticked(teams_3gm_avg, ', ')}\n"
    "from `capstone_data.team_modeling_data`\n"
    "order by game_date asc\n"
)


In [7]:
# Load the merged player/team modeling frame from a local CSV cache, or build it
# from BigQuery (and write the cache) when the cache file does not exist yet.
FULL_DATA_CACHE = 'full_data.csv'
GBQ_PROJECT_ID = 'miscellaneous-projects-444203'

try:
    # Read game_id as text: the recorded output shows it with leading zeros
    # (e.g. 0022100002), which default CSV type inference would strip by
    # parsing the column as an integer.
    full_data = pd.read_csv(FULL_DATA_CACHE, dtype={'game_id': str})
    # Cache files written by earlier versions of this cell stored the row index
    # as an unnamed first column; drop it so it is never mistaken for a feature.
    full_data = full_data.drop(columns=['Unnamed: 0'], errors='ignore')

except FileNotFoundError:
    # Only a missing cache triggers the download. Any other failure (corrupt
    # CSV, interrupt, permission error) is surfaced instead of being swallowed.
    nba_player_data = pandas_gbq.read_gbq(player_query, project_id=GBQ_PROJECT_ID)
    team_data = pandas_gbq.read_gbq(team_query, project_id=GBQ_PROJECT_ID)
    features_for_team = ['home','away'] + teams_3gm_avg
    features_for_player = ['pts','reb','ast','blk','stl'] + player_3gm_avg
    # Columns present in both frames keep the player-side copy; the team-side
    # duplicate is tagged with the 'remove' suffix and dropped right after.
    merged_data = nba_player_data.merge(
        team_data, on=['game_id','team'], how='inner', suffixes=('','remove')
    )
    full_data = merged_data.drop(
        columns=[column for column in merged_data.columns if 'remove' in column]
    )
    # mode='x' refuses to overwrite an existing cache; index=False keeps the
    # row index out of the file so a reload has the same columns as this frame.
    full_data.to_csv(FULL_DATA_CACHE, mode='x', index=False)

Downloading: 100%|[32m██████████[0m|
Downloading: 100%|[32m██████████[0m|


In [12]:
# Show up to 100 columns when a DataFrame is rendered, so wide frames are not elided.
pd.options.display.max_columns = 100

In [13]:
# Display only the rows whose season label is '2021-2022'.
full_data.loc[full_data['season'].eq('2021-2022')]

Unnamed: 0,player,team,game_id,game_date,matchup,pts,reb,ast,blk,stl,3pm,min_3gm_avg,fgm_3gm_avg,fga_3gm_avg,fg%_3gm_avg,3pm_3gm_avg,3pa_3gm_avg,3p%_3gm_avg,ftm_3gm_avg,fta_3gm_avg,ft%_3gm_avg,oreb_3gm_avg,dreb_3gm_avg,reb_3gm_avg,ast_3gm_avg,stl_3gm_avg,blk_3gm_avg,to_3gm_avg,pf_3gm_avg,pts_3gm_avg,plus_mins_3gm_avg,season,home,away,offrtg_3gm_avg,defrtg_3gm_avg,netrtg_3gm_avg,ast%_3gm_avg,ast_to_3gm_avg,ast_ratio_3gm_avg,oreb%_3gm_avg,dreb%_3gm_avg,reb%_3gm_avg,tov%_3gm_avg,efg%_3gm_avg,ts%_3gm_avg,pace_3gm_avg,pie_3gm_avg
0,Andrew Wiggins,GSW,0022100002,2021-10-19,LAL,12.0,7.0,1.0,0.0,1.0,2.0,17.61,1.00,4.33,20.63,0.33,2.67,11.10,0.33,0.67,16.67,0.00,2.67,2.67,4.00,1.00,0.33,0.67,1.00,2.67,-2.33,2021-2022,0,1,107.70,127.57,-19.87,73.27,1.86,18.83,28.40,73.00,49.27,15.13,48.73,54.57,99.33,41.33
1,Brook Lopez,MIL,0022100001,2021-10-19,BKN,8.0,5.0,0.0,3.0,1.0,2.0,5.24,1.00,1.67,66.67,0.33,0.67,33.33,0.33,0.33,33.33,0.00,1.33,1.33,1.67,0.33,0.00,0.67,0.33,2.67,-0.67,2021-2022,1,0,123.73,116.70,7.07,53.47,1.70,17.33,23.70,74.87,49.93,14.17,62.23,65.43,99.33,51.43
2,Bruce Brown,BKN,0022100001,2021-10-19,MIL,0.0,1.0,0.0,1.0,0.0,0.0,24.66,7.67,13.67,57.23,1.67,4.00,41.67,3.00,3.00,33.33,2.00,4.00,6.00,0.00,0.67,1.67,0.67,1.33,20.00,10.33,2021-2022,0,1,122.67,112.27,10.40,55.13,2.56,17.70,29.33,79.67,53.17,11.70,57.17,60.67,97.50,52.73
3,DeAndre' Bembry,BKN,0022100001,2021-10-19,MIL,0.0,0.0,0.0,0.0,0.0,0.0,14.15,3.33,4.00,88.90,0.00,0.00,0.00,0.00,0.67,0.00,2.00,5.00,7.00,0.67,0.00,0.33,0.33,1.67,6.67,-12.33,2021-2022,0,1,122.67,112.27,10.40,55.13,2.56,17.70,29.33,79.67,53.17,11.70,57.17,60.67,97.50,52.73
4,Gary Payton II,GSW,0022100002,2021-10-19,LAL,0.0,0.0,0.0,0.0,0.0,0.0,20.58,2.67,7.67,39.17,1.67,5.00,33.33,0.67,0.67,33.33,0.67,2.00,2.67,0.33,1.33,0.33,0.67,2.67,7.67,2.33,2021-2022,0,1,107.70,127.57,-19.87,73.27,1.86,18.83,28.40,73.00,49.27,15.13,48.73,54.57,99.33,41.33
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26024,Lance Stephenson,IND,0022101216,2022-04-10,BKN,10.0,1.0,5.0,0.0,1.0,0.0,12.63,2.67,4.67,55.00,0.67,0.67,66.67,1.00,1.00,66.67,0.00,2.67,2.67,3.00,0.67,0.33,1.67,1.67,7.00,-9.00,2021-2022,0,1,116.17,125.37,-9.20,68.67,1.73,20.10,33.97,69.57,51.67,16.83,55.67,59.87,102.67,47.50
26025,Mo Bamba,ORL,0022101227,2022-04-10,MIA,21.0,10.0,1.0,2.0,0.0,5.0,20.83,4.33,9.67,46.17,2.67,6.00,31.03,0.67,1.33,16.67,1.67,7.00,8.67,1.67,0.00,2.00,0.33,2.00,12.00,-6.67,2021-2022,1,0,101.80,119.97,-18.10,65.70,2.59,18.30,20.00,75.93,45.83,11.07,49.80,51.90,100.83,39.70
26026,Nathan Knight,MIN,0022101224,2022-04-10,CHI,17.0,8.0,3.0,0.0,0.0,1.0,4.89,1.67,2.67,66.67,0.33,0.67,33.33,0.67,0.67,33.33,1.00,0.33,1.33,0.33,0.00,0.33,0.33,0.67,4.33,-2.33,2021-2022,1,0,119.43,120.63,-1.27,63.20,2.22,19.37,22.20,73.80,47.10,12.53,58.33,61.80,106.17,48.23
26027,Rayjon Tucker,MIL,0022101218,2022-04-10,CLE,15.0,4.0,4.0,0.0,2.0,3.0,3.22,0.00,0.33,0.00,0.00,0.33,0.00,1.00,1.33,25.00,0.00,0.33,0.33,2.00,0.67,0.00,0.00,0.67,1.00,2.00,2021-2022,0,1,123.80,106.23,17.60,60.90,2.79,20.00,28.60,76.67,53.30,10.63,57.50,61.17,103.33,58.63


In [30]:
# Per-game stats that every candidate column is correlated against
# (presumably the modeling targets - confirm against the modeling cells).
categories_lists = ['pts','reb','ast','blk','stl','3pm']

# Columns that must never be treated as candidate predictors:
#   - the stats in categories_lists themselves and the home/away flags;
#   - 'Unnamed: 0', the row index that a CSV round-trip can add as a column;
#   - 'game_id', an identifier that becomes numeric when parsed from CSV.
non_feature_columns = set(categories_lists) | {'home', 'away', 'Unnamed: 0', 'game_id'}

# Candidate predictors: every remaining numeric column, in frame order.
numeric_columns = [
    column
    for column in full_data.select_dtypes(include=['number']).columns
    if column not in non_feature_columns
]

In [None]:
# Pearson correlation of every candidate column against each stat in
# categories_lists; only pairs with p-value below .05 are printed.
for category in categories_lists:
    print(category)
    target_values = full_data[category]
    for column in numeric_columns:
        coefficient, p_value = pearsonr(full_data[column], target_values)
        if not p_value < .05:
            continue
        print(column)
        # The label spelling is kept as-is so re-runs match previously recorded output.
        print(f'correalation {coefficient} p_value {p_value}')

In [47]:
# Same report as the Pearson cell, but with Spearman rank correlation:
# only pairs with p-value below .05 are printed.

for category in categories_lists:
    print(category)
    target_values = full_data[category]
    for column in numeric_columns:
        coefficient, p_value = spearmanr(full_data[column], target_values)
        if not p_value < .05:
            continue
        print(column)
        # The label spelling is kept as-is so re-runs match the recorded output.
        print(f'correalation {coefficient} p_value {p_value}')

pts
min_3gm_avg
correalation 0.5985003071785863 p_value 0.0
fgm_3gm_avg
correalation 0.6157081143352493 p_value 0.0
fga_3gm_avg
correalation 0.6344386300228595 p_value 0.0
fg%_3gm_avg
correalation 0.21673619084073514 p_value 0.0
3pm_3gm_avg
correalation 0.3980710354509093 p_value 0.0
3pa_3gm_avg
correalation 0.4386371704211845 p_value 0.0
3p%_3gm_avg
correalation 0.24986003284187405 p_value 0.0
ftm_3gm_avg
correalation 0.5049267791961549 p_value 0.0
fta_3gm_avg
correalation 0.49954772502727474 p_value 0.0
ft%_3gm_avg
correalation 0.4597005663199633 p_value 0.0
oreb_3gm_avg
correalation 0.20014954487767903 p_value 0.0
dreb_3gm_avg
correalation 0.44343453998230614 p_value 0.0
reb_3gm_avg
correalation 0.4121561380747874 p_value 0.0
ast_3gm_avg
correalation 0.47551354373928384 p_value 0.0
stl_3gm_avg
correalation 0.31122332417785614 p_value 0.0
blk_3gm_avg
correalation 0.19181011705935308 p_value 0.0
to_3gm_avg
correalation 0.4794539743735904 p_value 0.0
pf_3gm_avg
correalation 0.318793272