In [1]:
import pandas as pd
import plotly.express as px

In [2]:
loc = './stats/'

# Clean outputs

Make sure to drop from inputs too

In [3]:
MIN_MATCHUP_MINS = 2

In [4]:
inputs = pd.read_csv(loc + 'inputs.csv')
outputs = pd.read_csv(loc + 'outputs.csv')

# Set minutes threshold
inputs = inputs[outputs['MATCHUP_TIME_SEC'] > MIN_MATCHUP_MINS * 60]
outputs = outputs[outputs['MATCHUP_TIME_SEC'] > MIN_MATCHUP_MINS * 60]

def per_100_poss(x):
    return x / outputs['PARTIAL_POSS'] * 100

# Set stats to per 100 possessions
outputs = outputs.apply(lambda x: per_100_poss(x) if x.name not in outputs.columns[0:8] else x)

# Remove rows with zeros in important columns
check = outputs != 0
inputs = inputs[check['PLAYER_PTS'] & check['MATCHUP_AST'] & check['MATCHUP_FG3A']]
outputs = outputs[check['PLAYER_PTS'] & check['MATCHUP_AST'] & check['MATCHUP_FG3A']]

# Merge average points per game and points per 100 possessions into h2h dataframe
off_data = inputs[['OFF_PLAYER_ID', 'SEASON_ID', 'OFF_PTS', 'OFF_PTS_PER_100', 'OFF_AST']]

# Get average possessions per game
poss = off_data['OFF_PTS'] / (off_data['OFF_PTS_PER_100'] / 100)

# Convert per 100 possessions to points per game
outputs['PLAYER_PPG'] = outputs['PLAYER_PTS'] / 100 * poss
outputs['PLAYER_APG'] = outputs['MATCHUP_AST'] / 100 * poss
outputs['PLAYER_3PG'] = outputs['MATCHUP_FG3M'] / 100 * poss
outputs['OFF_PTS'] = off_data['OFF_PTS']
outputs['OFF_PTS_PER_100'] = off_data['OFF_PTS_PER_100']

print(outputs.shape, inputs.shape)
outputs.sort_values(by=['PLAYER_PTS'], ascending=False).head()


(41443, 32) (41443, 79)


Unnamed: 0,SEASON_ID,OFF_PLAYER_ID,OFF_PLAYER_NAME,DEF_PLAYER_ID,DEF_PLAYER_NAME,GP,MATCHUP_MIN,PARTIAL_POSS,PLAYER_PTS,TEAM_PTS,...,HELP_FG_PERC,MATCHUP_FTM,MATCHUP_FTA,SFL,MATCHUP_TIME_SEC,PLAYER_PPG,PLAYER_APG,PLAYER_3PG,OFF_PTS,OFF_PTS_PER_100
502466,22018,203903,Jordan Clarkson,1628464,Daniel Theis,4,2:24,11.2,223.214286,303.571429,...,0.0,35.714286,35.714286,17.857143,1283.928571,125.838926,20.134228,15.100671,16.8,29.8
29119,22021,1629630,Ja Morant,203486,Mason Plumlee,2,2:23,12.6,222.222222,238.095238,...,0.0,23.809524,23.809524,15.873016,1131.746032,156.862745,11.204482,5.602241,27.6,39.1
493539,22018,203897,Zach LaVine,1628386,Jarrett Allen,3,3:07,13.2,212.121212,287.878788,...,0.0,7.575758,7.575758,7.575758,1417.424242,152.341598,5.440771,5.440771,23.7,33.0
222961,22020,202331,Paul George,1629028,Deandre Ayton,3,4:05,17.8,207.865169,241.573034,...,0.0,5.617978,16.853933,11.235955,1374.157303,142.448777,11.549901,23.099802,23.3,34.0
56166,22021,1629630,Ja Morant,1627826,Ivica Zubac,3,2:60,18.6,204.301075,284.946237,...,0.0,10.752688,21.505376,10.752688,965.053763,144.212524,15.180266,7.590133,27.6,39.1


In [5]:
# Points per game estimated
fig = px.histogram(x=outputs['PLAYER_PPG'], color=outputs['SEASON_ID'], labels={
    'x': 'Estimated points per game versus opponent'
})

print('Mean: ' + str(outputs['PLAYER_PPG'].mean()))
print('Standard deviation: ' + str(outputs['PLAYER_PPG'].std()))

fig.show()

# Assists per game estimated
fig = px.histogram(x=outputs['PLAYER_3PG'], color=outputs['SEASON_ID'], labels={
    'x': 'Estimated assists per game versus opponent'
})

print('Mean: ' + str(outputs['PLAYER_3PG'].mean()))
print('Standard deviation: ' + str(outputs['PLAYER_3PG'].std()))

fig.show()

Mean: 17.108036315250555
Standard deviation: 13.987156991816521


Mean: 2.31617451493954
Standard deviation: 2.698773451005894


In [6]:
# Points per 100 possessions
fig = px.histogram(x=outputs['PLAYER_PTS'], color=outputs['SEASON_ID'], labels={
    'x': 'Points per 100 possessions'
})

print('Mean: ' + str(outputs['PLAYER_PTS'].mean()))
print('Standard deviation: ' + str(outputs['PLAYER_PTS'].std()))

fig.show()

# Assists per 100 possessions
fig = px.histogram(x=outputs['MATCHUP_AST'], color=outputs['SEASON_ID'], labels={
    'x': 'Assists per 100 possessions'
})

print('Mean: ' + str(outputs['MATCHUP_AST'].mean()))
print('Standard deviation: ' + str(outputs['MATCHUP_AST'].std()))

fig.show()

Mean: 27.805306243545722
Standard deviation: 19.784157439035052


Mean: 8.047532073164174
Standard deviation: 5.716665481558683


# Examine Input Correlation

Find correlation between inputs and remove redundant ones

In [7]:
off_stats = inputs.filter(regex='^OFF').copy()

# Display correlation matrix
fig = px.imshow(off_stats.corr())
fig.show()

# Drop columns with correlation > 0.9
off_cols = ['OFF_FGM', 'OFF_EFF_FG_PCT', 'OFF_OREB', 'OFF_PAINT_TOUCH_FGA', 'OFF_PAINT_TOUCH_FGM', 'OFF_PAINT_TOUCH_TOV', 'OFF_PAINT_TOUCH_PASSES', 'OFF_PAINT_TOUCH_PTS', 'OFF_DREB', 'OFF_FG3M', 'OFF_FTM']
off_stats.drop(columns=off_cols, inplace=True)
fig = px.imshow(off_stats.corr())
fig.show()

print('Columns before removal: ' + str(len(off_cols) + len(off_stats.columns)))
print('Columns after removal: ' + str(len(off_stats.columns)))

Columns before removal: 44
Columns after removal: 33


In [8]:
def_stats = inputs.filter(regex='^DEF').copy()

# Display correlation matrix
fig = px.imshow(def_stats.corr())
fig.show()

# Drop columns with correlation > 0.9
def_cols = ['DEF_D_FGM', 'DEF_OPP_PTS_PAINT', 'DEF_CONTESTED_SHOTS_2PT', 'DEF_DEFLECTIONS', 'DEF_OPP_PTS_FB', 'DEF_OPP_PTS_2ND_CHANCE', 'DEF_OPP_PTS_OFF_TOV', 'DEF_G', 'DEF_PCT_PLUSMINUS']
def_stats.drop(columns=def_cols, inplace=True)
fig = px.imshow(def_stats.corr())
fig.show()

print('Columns before removal: ' + str(len(def_cols) + len(def_stats.columns)))
print('Columns after removal: ' + str(len(def_stats.columns)))

Columns before removal: 34
Columns after removal: 25


In [9]:
inputs_shortened = inputs.drop(columns=off_cols + def_cols)
inputs_shortened.drop(columns=['DEF_PLAYER_ID', 'DEF_PLAYER_NAME', 'OFF_PLAYER_ID', 'OFF_PLAYER_NAME'], inplace=True)

# Output to CSV
inputs_shortened.to_csv(loc + 'X.csv', index=False)
outputs.to_csv(loc + 'y.csv', index=False)