In [84]:
import pandas as pd
import plotly.express as px

In [85]:
# Directory (relative to the notebook) that the CSV files are read from and written to
loc = './stats/'

# Clean outputs

Every row dropped from the outputs must also be dropped from the inputs so the two tables stay row-aligned.

In [86]:
# Matchup-time threshold in minutes; only rows with strictly more matchup time are kept
MIN_MATCHUP_MINS = 4

In [87]:
# Load the feature table and the target table from the stats directory
inputs = pd.read_csv(loc + 'inputs.csv')
outputs = pd.read_csv(loc + 'outputs.csv')

# Set minutes threshold: build the row mask once from the outputs and apply
# the same mask to both frames so they keep the same rows.
long_enough = outputs['MATCHUP_TIME_SEC'] > MIN_MATCHUP_MINS * 60
inputs = inputs[long_enough]
outputs = outputs[long_enough]

def per_100_poss(x, poss=None):
    """Scale a stat to a per-100-possessions rate.

    Args:
        x: Stat values (scalar or pandas Series).
        poss: Possession counts to divide by. When omitted, the notebook-level
            ``outputs['PARTIAL_POSS']`` is looked up at call time, which is the
            original behaviour; pass it explicitly to avoid depending on
            whatever ``outputs`` currently holds.

    Returns:
        ``x / poss * 100``. Series operands are aligned on index by pandas.
    """
    if poss is None:
        poss = outputs['PARTIAL_POSS']
    return x / poss * 100

# Set stats to per 100 possessions.
# The first 8 columns (by position) pass through unchanged; every column from
# position 8 onward is rescaled.
# NOTE(review): because the selection is positional, non-counting columns such
# as MATCHUP_TIME_SEC and HELP_FG_PERC are rescaled too — confirm that is intended.
passthrough_cols = outputs.columns[0:8]
outputs = outputs.apply(lambda x: x if x.name in passthrough_cols else per_100_poss(x))

# Remove rows with a zero in PLAYER_PTS.
# NOTE(review): the mask frame covers three columns, but only PLAYER_PTS is
# actually used to filter — confirm the other two should not filter as well.
check = outputs[outputs.columns[8:11]] != 0
has_points = check['PLAYER_PTS']
inputs = inputs[has_points]
# .copy() makes `outputs` an independent frame, so the column assignments
# below do not raise SettingWithCopyWarning on a filtered slice.
outputs = outputs[has_points].copy()

# Merge average points per game and points per 100 possessions into h2h dataframe
off_data = inputs[['OFF_PLAYER_ID', 'SEASON_ID', 'OFF_PTS', 'OFF_PTS_PER_100']]

# Get average possessions per game (points per game / points per possession)
poss = off_data['OFF_PTS'] / (off_data['OFF_PTS_PER_100'] / 100)

# Convert per 100 possessions to points per game; assignments align on the
# shared row index of the two filtered frames.
outputs['PLAYER_PPG'] = outputs['PLAYER_PTS'] / 100 * poss
outputs['OFF_PTS'] = off_data['OFF_PTS']
outputs['OFF_PTS_PER_100'] = off_data['OFF_PTS_PER_100']

print(outputs.shape, inputs.shape)
outputs.sort_values(by=['PLAYER_PTS'], ascending=False).head()

(49286, 30) (49286, 79)


Unnamed: 0,SEASON_ID,OFF_PLAYER_ID,OFF_PLAYER_NAME,DEF_PLAYER_ID,DEF_PLAYER_NAME,GP,MATCHUP_MIN,PARTIAL_POSS,PLAYER_PTS,TEAM_PTS,...,HELP_FGM,HELP_FGA,HELP_FG_PERC,MATCHUP_FTM,MATCHUP_FTA,SFL,MATCHUP_TIME_SEC,PLAYER_PPG,OFF_PTS,OFF_PTS_PER_100
222961,22020,202331,Paul George,1629028,Deandre Ayton,3,4:05,17.8,207.865169,241.573034,...,0.0,0.0,0.0,5.617978,16.853933,11.235955,1374.157303,142.448777,23.3,34.0
455528,22018,202689,Kemba Walker,203954,Joel Embiid,3,4:39,25.1,175.298805,207.171315,...,0.0,0.0,0.0,23.904382,27.888446,15.936255,1113.545817,126.769757,25.6,35.4
396890,22018,202689,Kemba Walker,201143,Al Horford,3,4:08,19.0,168.421053,215.789474,...,0.0,0.0,0.0,36.842105,42.105263,26.315789,1306.315789,121.796015,25.6,35.4
268116,22019,203915,Spencer Dinwiddie,201143,Al Horford,4,4:35,25.7,155.642023,221.789883,...,0.0,0.0,0.0,23.346304,31.128405,19.455253,1070.428016,102.763644,20.6,31.2
16914,22021,1629029,Luka Doncic,202685,Jonas Valanciunas,4,4:27,21.9,146.118721,223.744292,...,0.0,0.0,0.0,9.13242,9.13242,4.56621,1217.351598,103.685254,28.1,39.6


In [88]:
# Histogram of estimated points per game, coloured by season
ppg = outputs['PLAYER_PPG']
fig = px.histogram(
    x=ppg,
    color=outputs['SEASON_ID'],
    labels={'x': 'Estimated points per game versus opponent'},
)

# Summary statistics for the same column
print(f'Mean: {ppg.mean()!s}')
print(f'Standard deviation: {ppg.std()!s}')

fig.show()

Mean: 12.449944589848759
Standard deviation: 9.050558387186921


In [89]:
# Histogram of points per 100 possessions, coloured by season
pp100 = outputs['PLAYER_PTS']
fig = px.histogram(
    x=pp100,
    color=outputs['SEASON_ID'],
    labels={'x': 'PP100'},
)

# Summary statistics for the same column
print(f'Mean: {pp100.mean()!s}')
print(f'Standard deviation: {pp100.std()!s}')

fig.show()

Mean: 21.0669166368908
Standard deviation: 12.996656905585525


# Examine Input Correlation

Compute the pairwise correlation between the input features and drop the redundant ones (correlation > 0.9).

In [90]:
# Offensive-side features: every input column whose name starts with OFF
off_stats = inputs.filter(regex='^OFF').copy()

# Display correlation matrix.
# Correlation is only defined for numeric data, so numeric/bool columns are
# selected explicitly: on pandas >= 2.0, .corr() raises on non-numeric columns
# (such as OFF_PLAYER_NAME) instead of silently skipping them.
fig = px.imshow(off_stats.select_dtypes(include=['number', 'bool']).corr())
fig.show()

# Drop columns with correlation > 0.9
off_cols = ['OFF_FGM', 'OFF_EFF_FG_PCT', 'OFF_OREB', 'OFF_PAINT_TOUCH_FGA', 'OFF_PAINT_TOUCH_FGM', 'OFF_PAINT_TOUCH_TOV', 'OFF_PAINT_TOUCH_PASSES', 'OFF_PAINT_TOUCH_PTS', 'OFF_DREB', 'OFF_FG3M', 'OFF_FTM', 'OFF_PTS_PER_100']
# Rebind instead of mutating in place so the step reads as a plain transformation
off_stats = off_stats.drop(columns=off_cols)
fig = px.imshow(off_stats.select_dtypes(include=['number', 'bool']).corr())
fig.show()

print('Columns before removal: ' + str(len(off_cols) + len(off_stats.columns)))
print('Columns after removal: ' + str(len(off_stats.columns)))

Columns before removal: 44
Columns after removal: 32


In [91]:
# Defensive-side features: every input column whose name starts with DEF
def_stats = inputs.filter(regex='^DEF').copy()

# Display correlation matrix.
# Correlation is only defined for numeric data, so numeric/bool columns are
# selected explicitly: on pandas >= 2.0, .corr() raises on non-numeric columns
# (such as DEF_PLAYER_NAME) instead of silently skipping them.
fig = px.imshow(def_stats.select_dtypes(include=['number', 'bool']).corr())
fig.show()

# Drop columns with correlation > 0.9
def_cols = ['DEF_D_FGM', 'DEF_OPP_PTS_PAINT', 'DEF_CONTESTED_SHOTS_2PT', 'DEF_DEFLECTIONS', 'DEF_OPP_PTS_FB', 'DEF_OPP_PTS_2ND_CHANCE', 'DEF_OPP_PTS_OFF_TOV', 'DEF_G', 'DEF_PCT_PLUSMINUS']
# Rebind instead of mutating in place so the step reads as a plain transformation
def_stats = def_stats.drop(columns=def_cols)
fig = px.imshow(def_stats.select_dtypes(include=['number', 'bool']).corr())
fig.show()

print('Columns before removal: ' + str(len(def_cols) + len(def_stats.columns)))
print('Columns after removal: ' + str(len(def_stats.columns)))

Columns before removal: 34
Columns after removal: 25


In [92]:
# Strip the columns listed in off_cols and def_cols, then the player
# identifier/name columns, producing a new frame at each step.
player_key_cols = ['DEF_PLAYER_ID', 'DEF_PLAYER_NAME', 'OFF_PLAYER_ID', 'OFF_PLAYER_NAME']
inputs_shortened = (
    inputs
    .drop(columns=off_cols + def_cols)
    .drop(columns=player_key_cols)
)

# Output to CSV (row index omitted from both files)
inputs_shortened.to_csv(loc + 'X.csv', index=False)
outputs.to_csv(loc + 'y.csv', index=False)