In [93]:
import pandas as pd
import plotly.express as px

In [94]:
# Directory holding the CSV files. File names are appended by plain string
# concatenation (e.g. loc + 'inputs.csv'), so the trailing slash is required.
loc = './stats/'

# Clean outputs

Every row removed from `outputs` must also be removed from `inputs`, so that the two tables stay row-aligned.

In [102]:
# Minimum matchup length, in minutes. It is multiplied by 60 and compared with
# MATCHUP_TIME_SEC; only rows strictly above the threshold are kept.
MIN_MATCHUP_MINS = 2

In [103]:
inputs = pd.read_csv(loc + 'inputs.csv')
outputs = pd.read_csv(loc + 'outputs.csv')

# Every filter in this notebook is a boolean mask built from `outputs` and then
# applied to both frames. That is only meaningful when row i of each file
# describes the same matchup, so fail loudly here instead of relying on pandas'
# index alignment of the mask.
if not inputs.index.equals(outputs.index):
    raise ValueError(
        'inputs.csv and outputs.csv must be row-aligned '
        '(got {} and {} rows)'.format(len(inputs), len(outputs))
    )

# Set minutes threshold: keep matchups strictly longer than MIN_MATCHUP_MINS.
# The mask is computed once, before either frame is reassigned, so both frames
# are filtered by exactly the same rows.
long_enough = outputs['MATCHUP_TIME_SEC'] > MIN_MATCHUP_MINS * 60
inputs = inputs[long_enough]
outputs = outputs[long_enough]

def per_100_poss(x, poss=None):
    """Rescale values to a per-100-possessions rate.

    Parameters
    ----------
    x : pandas.Series or scalar
        Values to rescale.
    poss : pandas.Series or scalar, optional
        Number of possessions to divide by. When omitted, the notebook-level
        ``outputs['PARTIAL_POSS']`` column is used (looked up at call time),
        which is the original behaviour.

    Returns
    -------
    ``x / poss * 100``. For Series operands pandas aligns them on their index.
    """
    if poss is None:
        # Backward-compatible default: take the denominator from the global frame.
        poss = outputs['PARTIAL_POSS']
    return x / poss * 100

# Set stats to per 100 possessions. The first 8 columns are passed through
# untouched; every other column is divided by PARTIAL_POSS and multiplied by 100.
# NOTE(review): "every other column" includes MATCHUP_TIME_SEC and percentage
# columns such as HELP_FG_PERC -- confirm that rescaling those is intended.
passthrough_cols = outputs.columns[0:8]
outputs = outputs.apply(
    lambda col: col if col.name in passthrough_cols else per_100_poss(col)
)

# Remove rows with zeros in important columns.
# NOTE(review): `check` is computed for three columns, but only PLAYER_PTS is
# actually used to filter.
check = outputs[outputs.columns[8:11]] != 0
inputs = inputs[check['PLAYER_PTS']]
# .copy() makes `outputs` an independent frame, so the column assignments below
# do not write onto a filtered slice (which triggers SettingWithCopyWarning).
outputs = outputs[check['PLAYER_PTS']].copy()

# Offensive player's scoring columns, taken from the inputs
off_data = inputs[['OFF_PLAYER_ID', 'SEASON_ID', 'OFF_PTS', 'OFF_PTS_PER_100']]

# Get average possessions per game: OFF_PTS / (OFF_PTS_PER_100 / 100)
poss = off_data['OFF_PTS'] / (off_data['OFF_PTS_PER_100'] / 100)

# Convert per 100 possessions to points per game
outputs['PLAYER_PPG'] = outputs['PLAYER_PTS'] / 100 * poss
# Carry the offensive player's scoring columns over (aligned on the row index)
outputs['OFF_PTS'] = off_data['OFF_PTS']
outputs['OFF_PTS_PER_100'] = off_data['OFF_PTS_PER_100']


print(outputs.shape, inputs.shape)
outputs.sort_values(by=['PLAYER_PTS'], ascending=False).head()

(98055, 30) (98055, 79)


Unnamed: 0,SEASON_ID,OFF_PLAYER_ID,OFF_PLAYER_NAME,DEF_PLAYER_ID,DEF_PLAYER_NAME,GP,MATCHUP_MIN,PARTIAL_POSS,PLAYER_PTS,TEAM_PTS,...,HELP_FGM,HELP_FGA,HELP_FG_PERC,MATCHUP_FTM,MATCHUP_FTA,SFL,MATCHUP_TIME_SEC,PLAYER_PPG,OFF_PTS,OFF_PTS_PER_100
150789,22020,1629029,Luka Doncic,202689,Kemba Walker,2,2:16,9.8,224.489796,234.693878,...,0.0,0.0,0.0,20.408163,30.612245,10.204082,1385.714286,159.037528,27.7,39.1
502466,22018,203903,Jordan Clarkson,1628464,Daniel Theis,4,2:24,11.2,223.214286,303.571429,...,0.0,0.0,0.0,35.714286,35.714286,17.857143,1283.928571,125.838926,16.8,29.8
29119,22021,1629630,Ja Morant,203486,Mason Plumlee,2,2:23,12.6,222.222222,238.095238,...,0.0,0.0,0.0,23.809524,23.809524,15.873016,1131.746032,156.862745,27.6,39.1
465221,22018,2544,LeBron James,1626167,Myles Turner,2,2:17,11.5,217.391304,234.782609,...,0.0,0.0,0.0,26.086957,52.173913,34.782609,1194.782609,166.849348,27.4,35.7
335641,22019,201942,DeMar DeRozan,1627826,Ivica Zubac,4,3:08,16.6,216.86747,319.277108,...,0.0,0.0,0.0,36.144578,36.144578,24.096386,1133.73494,154.605519,22.1,31.0


In [104]:
# Distribution of the estimated points per game, coloured by season
ppg = outputs['PLAYER_PPG']
fig = px.histogram(
    x=ppg,
    color=outputs['SEASON_ID'],
    labels={'x': 'Estimated points per game versus opponent'},
)

# Summary statistics, printed before the figure is rendered
print('Mean:', ppg.mean())
print('Standard deviation:', ppg.std())

fig.show()

Mean: 15.227022342009704
Standard deviation: 12.554381654508733


In [105]:
# Distribution of points per 100 possessions, coloured by season
pp100 = outputs['PLAYER_PTS']
fig = px.histogram(
    x=pp100,
    color=outputs['SEASON_ID'],
    labels={'x': 'PP100'},
)

# Summary statistics, printed before the figure is rendered
print('Mean:', pp100.mean())
print('Standard deviation:', pp100.std())

fig.show()

Mean: 26.41323333859418
Standard deviation: 18.280084990310595


# Examine Input Correlation

Compute the pairwise correlations between the input features and remove the redundant (highly correlated) ones.

In [106]:
# All columns whose name starts with OFF
off_stats = inputs.filter(regex='^OFF')

# Display correlation matrix.
# Only numeric/bool columns can be correlated. Selecting them explicitly keeps
# text columns matched by the regex (e.g. OFF_PLAYER_NAME) from raising under
# pandas >= 2.0, where DataFrame.corr() no longer drops them implicitly.
fig = px.imshow(off_stats.select_dtypes(include=['number', 'bool']).corr())
fig.show()

# Drop columns with correlation > 0.9 (hand-picked from the matrix above)
off_cols = ['OFF_FGM', 'OFF_EFF_FG_PCT', 'OFF_OREB', 'OFF_PAINT_TOUCH_FGA', 'OFF_PAINT_TOUCH_FGM', 'OFF_PAINT_TOUCH_TOV', 'OFF_PAINT_TOUCH_PASSES', 'OFF_PAINT_TOUCH_PTS', 'OFF_DREB', 'OFF_FG3M', 'OFF_FTM', 'OFF_PTS_PER_100']
off_total = len(off_stats.columns)
# Reassign instead of inplace=True; drop() raises KeyError if a listed column is missing
off_stats = off_stats.drop(columns=off_cols)
fig = px.imshow(off_stats.select_dtypes(include=['number', 'bool']).corr())
fig.show()

print('Columns before removal: ' + str(off_total))
print('Columns after removal: ' + str(len(off_stats.columns)))

Columns before removal: 44
Columns after removal: 32


In [107]:
# All columns whose name starts with DEF
def_stats = inputs.filter(regex='^DEF')

# Display correlation matrix.
# Only numeric/bool columns can be correlated. Selecting them explicitly keeps
# text columns matched by the regex (e.g. DEF_PLAYER_NAME) from raising under
# pandas >= 2.0, where DataFrame.corr() no longer drops them implicitly.
fig = px.imshow(def_stats.select_dtypes(include=['number', 'bool']).corr())
fig.show()

# Drop columns with correlation > 0.9 (hand-picked from the matrix above)
def_cols = ['DEF_D_FGM', 'DEF_OPP_PTS_PAINT', 'DEF_CONTESTED_SHOTS_2PT', 'DEF_DEFLECTIONS', 'DEF_OPP_PTS_FB', 'DEF_OPP_PTS_2ND_CHANCE', 'DEF_OPP_PTS_OFF_TOV', 'DEF_G', 'DEF_PCT_PLUSMINUS']
def_total = len(def_stats.columns)
# Reassign instead of inplace=True; drop() raises KeyError if a listed column is missing
def_stats = def_stats.drop(columns=def_cols)
fig = px.imshow(def_stats.select_dtypes(include=['number', 'bool']).corr())
fig.show()

print('Columns before removal: ' + str(def_total))
print('Columns after removal: ' + str(len(def_stats.columns)))

Columns before removal: 34
Columns after removal: 25


In [109]:
# Columns left out of the model inputs: the highly correlated stats listed in
# off_cols / def_cols plus the player id and name columns.
id_cols = ['DEF_PLAYER_ID', 'DEF_PLAYER_NAME', 'OFF_PLAYER_ID', 'OFF_PLAYER_NAME']
inputs_shortened = inputs.drop(columns=off_cols + def_cols + id_cols)

# Output to CSV (row index omitted from both files)
for frame, filename in ((inputs_shortened, 'X.csv'), (outputs, 'y.csv')):
    frame.to_csv(loc + filename, index=False)