In [1]:
import pandas as pd
import plotly.express as px

In [2]:
loc = './stats/'

# Clean outputs

Make sure to drop from inputs too

In [3]:
MIN_MATCHUP_MINS = 5
MIN_POSS = 30

In [4]:
inputs = pd.read_csv(loc + 'inputs.csv')
outputs = pd.read_csv(loc + 'outputs.csv')

# Set threshold
# inputs = inputs[outputs['MATCHUP_TIME_SEC'] > MIN_MATCHUP_MINS * 60]
# outputs = outputs[outputs['MATCHUP_TIME_SEC'] > MIN_MATCHUP_MINS * 60]
inputs = inputs[outputs['PARTIAL_POSS'] > MIN_POSS]
outputs = outputs[outputs['PARTIAL_POSS'] > MIN_POSS]

def per_100_poss(x):
    return x / outputs['PARTIAL_POSS'] * 100

# Set stats to per 100 possessions
outputs = outputs.apply(lambda x: per_100_poss(x) if x.name not in outputs.columns[0:8] else x)

# Remove rows with zeros in important columns
check = outputs != 0
inputs = inputs[check['PLAYER_PTS'] & check['MATCHUP_AST'] & check['MATCHUP_FG3A']]
outputs = outputs[check['PLAYER_PTS'] & check['MATCHUP_AST'] & check['MATCHUP_FG3A']]

# Merge average points per game and points per 100 possessions into h2h dataframe
off_data = inputs[['OFF_PLAYER_ID', 'SEASON_ID', 'OFF_PTS', 'OFF_PTS_PER_100', 'OFF_AST']]

# Get average possessions per game
poss = off_data['OFF_PTS'] / (off_data['OFF_PTS_PER_100'] / 100)

# Convert per 100 possessions to points per game
outputs['PLAYER_PPG'] = outputs['PLAYER_PTS'] / 100 * poss
outputs['PLAYER_APG'] = outputs['MATCHUP_AST'] / 100 * poss
outputs['PLAYER_3PG'] = outputs['MATCHUP_FG3M'] / 100 * poss
outputs['OFF_PTS'] = off_data['OFF_PTS']
outputs['OFF_PTS_PER_100'] = off_data['OFF_PTS_PER_100']

print(outputs.shape, inputs.shape)
outputs.sort_values(by=['PLAYER_PTS'], ascending=False).head()


(17085, 32) (17085, 78)


Unnamed: 0,SEASON_ID,OFF_PLAYER_ID,OFF_PLAYER_NAME,DEF_PLAYER_ID,DEF_PLAYER_NAME,GP,MATCHUP_MIN,PARTIAL_POSS,PLAYER_PTS,TEAM_PTS,...,HELP_FG_PERC,MATCHUP_FTM,MATCHUP_FTA,SFL,MATCHUP_TIME_SEC,PLAYER_PPG,PLAYER_APG,PLAYER_3PG,OFF_PTS,OFF_PTS_PER_100
56149,22021,1629029,Luka Doncic,1627826,Ivica Zubac,3,11:22,54.3,112.338858,208.103131,...,0.0,3.683241,5.524862,1.841621,1255.248619,78.723394,14.196022,14.196022,27.4,39.1
46400,22021,1627742,Brandon Ingram,1626174,Christian Wood,3,5:18,32.6,107.361963,165.644172,...,0.0,9.202454,12.269939,9.202454,975.460123,70.137925,4.007881,8.015763,17.9,27.4
493483,22018,202695,Kawhi Leonard,1628386,Jarrett Allen,4,6:13,33.0,100.0,184.848485,...,0.0,3.030303,3.030303,3.030303,1130.909091,41.935484,5.083089,2.541544,10.4,24.8
430258,22018,202689,Kemba Walker,203083,Andre Drummond,4,7:03,34.8,94.827586,146.551724,...,0.0,11.494253,17.241379,5.747126,1216.37931,36.847291,2.233169,3.349754,6.8,17.5
443243,22018,203081,Damian Lillard,203500,Steven Adams,4,8:06,44.5,87.640449,161.797753,...,0.0,13.483146,13.483146,6.741573,1092.134831,69.246775,21.3067,1.775558,25.6,32.4


In [5]:
# Points per game estimated
fig = px.histogram(x=outputs['PLAYER_PPG'], color=outputs['SEASON_ID'], labels={
    'x': 'Estimated points per game versus opponent'
})

print('Mean: ' + str(outputs['PLAYER_PPG'].mean()))
print('Standard deviation: ' + str(outputs['PLAYER_PPG'].std()))

fig.show()

# Assists per game estimated
fig = px.histogram(x=outputs['PLAYER_3PG'], color=outputs['SEASON_ID'], labels={
    'x': 'Estimated assists per game versus opponent'
})

print('Mean: ' + str(outputs['PLAYER_3PG'].mean()))
print('Standard deviation: ' + str(outputs['PLAYER_3PG'].std()))

fig.show()

Mean: 10.453574201287928
Standard deviation: 7.150327012275329


Mean: 1.2970310382371697
Standard deviation: 1.3826333063598057


In [6]:
# Points per 100 possessions
fig = px.histogram(x=outputs['PLAYER_PTS'], color=outputs['SEASON_ID'], labels={
    'x': 'Points per 100 possessions'
})

print('Mean: ' + str(outputs['PLAYER_PTS'].mean()))
print('Standard deviation: ' + str(outputs['PLAYER_PTS'].std()))

fig.show()

# Assists per 100 possessions
fig = px.histogram(x=outputs['MATCHUP_AST'], color=outputs['SEASON_ID'], labels={
    'x': 'Assists per 100 possessions'
})

print('Mean: ' + str(outputs['MATCHUP_AST'].mean()))
print('Standard deviation: ' + str(outputs['MATCHUP_AST'].std()))

fig.show()

Mean: 20.764124926114906
Standard deviation: 11.586096962327767


Mean: 5.875634139122012
Standard deviation: 3.89954999199105


# Examine Input Correlation

Find correlation between inputs and remove redundant ones

In [7]:
off_stats = inputs.filter(regex='^OFF').copy()

# Display correlation matrix
fig = px.imshow(off_stats.corr())
fig.show()

# Drop columns with correlation > 0.9
off_cols = ['OFF_FGM', 'OFF_EFF_FG_PCT', 'OFF_OREB', 'OFF_PAINT_TOUCH_FGA', 'OFF_PAINT_TOUCH_FGM', 'OFF_PAINT_TOUCH_TOV', 'OFF_PAINT_TOUCH_PASSES', 'OFF_PAINT_TOUCH_PTS', 'OFF_FG3M', 'OFF_FTM', 'OFF_PTS', 'OFF_DREB', 'OFF_TOUCHES']
off_stats.drop(columns=off_cols, inplace=True)
fig = px.imshow(off_stats.corr())
fig.show()

print('Columns before removal: ' + str(len(off_cols) + len(off_stats.columns)))
print('Columns after removal: ' + str(len(off_stats.columns)))

Columns before removal: 43
Columns after removal: 30


In [8]:
def_stats = inputs.filter(regex='^DEF').copy()

# Display correlation matrix
fig = px.imshow(def_stats.corr())
fig.show()

# Drop columns with correlation > 0.9
def_cols = ['DEF_D_FGM', 'DEF_OPP_PTS_PAINT', 'DEF_CONTESTED_SHOTS_2PT', 'DEF_DEFLECTIONS', 'DEF_OPP_PTS_FB', 'DEF_OPP_PTS_2ND_CHANCE', 'DEF_OPP_PTS_OFF_TOV', 'DEF_G', 'DEF_PCT_PLUSMINUS']
def_stats.drop(columns=def_cols, inplace=True)
fig = px.imshow(def_stats.corr())
fig.show()

print('Columns before removal: ' + str(len(def_cols) + len(def_stats.columns)))
print('Columns after removal: ' + str(len(def_stats.columns)))

Columns before removal: 34
Columns after removal: 25


In [9]:
inputs_shortened = inputs.drop(columns=off_cols + def_cols)
inputs_shortened.drop(columns=['DEF_PLAYER_ID', 'DEF_PLAYER_NAME', 'OFF_PLAYER_ID'], inplace=True)

# Output to CSV
inputs_shortened.to_csv(loc + 'X.csv', index=False)
outputs.to_csv(loc + 'y.csv', index=False)