In [429]:
import pandas as pd
import requests
import json
import numpy as np

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.model_selection import train_test_split

import shap
import plotly.express as px

from minisom import MiniSom

# For shap to work with keras, disable v2 behavior
tf.compat.v1.disable_v2_behavior()

In [430]:
season = '2021-22'
percentile = 0.9

df = pd.read_csv('./stats/' + season + '_' + str(percentile) + '_h2h_stats.csv')
# df = pd.read_csv('./stats/h2h_combined.csv')
df = df.fillna(0)

print(df.shape)
df.head()

(17023, 27)


Unnamed: 0,SEASON_ID,OFF_PLAYER_ID,OFF_PLAYER_NAME,DEF_PLAYER_ID,DEF_PLAYER_NAME,GP,MATCHUP_MIN,PARTIAL_POSS,PLAYER_PTS,TEAM_PTS,...,MATCHUP_FG3A,MATCHUP_FG3_PCT,HELP_BLK,HELP_FGM,HELP_FGA,HELP_FG_PERC,MATCHUP_FTM,MATCHUP_FTA,SFL,MATCHUP_TIME_SEC
0,22021,203952,Andrew Wiggins,1626220,Royce O'Neale,3,15:29,72.2,19,73,...,8,0.5,0,0,0,0,1,4,2,928.6
1,22021,203952,Andrew Wiggins,1630162,Anthony Edwards,3,13:10,67.8,12,60,...,5,0.4,0,0,0,0,0,0,0,790.1
2,22021,203952,Andrew Wiggins,1630581,Josh Giddey,3,12:32,63.9,10,66,...,4,0.25,0,0,0,0,1,3,2,751.5
3,22021,203952,Andrew Wiggins,203084,Harrison Barnes,2,12:15,60.5,10,68,...,4,0.5,0,0,0,0,0,0,0,735.5
4,22021,203952,Andrew Wiggins,203496,Robert Covington,3,9:55,52.8,21,62,...,3,1.0,0,0,0,0,4,4,1,595.3


# Convert totals to per 100 possessions

Most limited sample size, convert each to per possession and then multiply by 100 (more standard to measure in per 100 possessions rather than per possession, as generally nba teams average [100 possessions per game](https://www.teamrankings.com/nba/stat/possessions-per-game)).

In [431]:
MIN_MATCHUP_MINS = 4

In [432]:
h2h_df = df.copy()
h2h_df = h2h_df[h2h_df['MATCHUP_TIME_SEC'] > MIN_MATCHUP_MINS * 60] # Must have played more than x minutes
h2h_df.drop(columns=['TEAM_PTS', 'MATCHUP_FG_PCT', 'MATCHUP_FG3_PCT'], inplace=True)

def per_100_poss(x):
    return x / h2h_df['PARTIAL_POSS'] * 100

# Set stats to per 100 possessions
h2h_df = h2h_df.apply(lambda x: per_100_poss(x) if x.name not in h2h_df.columns[0:8] else x)

h2h_df.sort_values('DEF_PLAYER_NAME', ascending=True, inplace=True)

# Remove rows with zeros in important columns
check = h2h_df[h2h_df.columns[8:11]] != 0
h2h_df = h2h_df[check['PLAYER_PTS'] & check['MATCHUP_AST']]

print(h2h_df.shape)
h2h_df.head()

(2002, 24)


Unnamed: 0,SEASON_ID,OFF_PLAYER_ID,OFF_PLAYER_NAME,DEF_PLAYER_ID,DEF_PLAYER_NAME,GP,MATCHUP_MIN,PARTIAL_POSS,PLAYER_PTS,MATCHUP_AST,...,MATCHUP_FG3M,MATCHUP_FG3A,HELP_BLK,HELP_FGM,HELP_FGA,HELP_FG_PERC,MATCHUP_FTM,MATCHUP_FTA,SFL,MATCHUP_TIME_SEC
7116,22021,1629630,Ja Morant,203932,Aaron Gordon,3,15:11,71.2,29.494382,5.617978,...,2.808989,5.617978,0.0,0.0,0.0,0.0,1.404494,4.213483,1.404494,1279.073034
2824,22021,1630175,Cole Anthony,203932,Aaron Gordon,2,6:31,30.8,25.974026,6.493506,...,6.493506,9.74026,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1269.480519
555,22021,1630162,Anthony Edwards,203932,Aaron Gordon,3,16:20,83.5,17.964072,1.197605,...,3.592814,8.383234,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1173.772455
16082,22021,1629027,Trae Young,203932,Aaron Gordon,2,14:04,75.6,27.777778,13.227513,...,2.645503,10.582011,0.0,0.0,0.0,0.0,1.322751,2.645503,1.322751,1116.005291
9423,22021,201950,Jrue Holiday,203932,Aaron Gordon,2,7:32,39.5,12.658228,7.594937,...,0.0,5.063291,0.0,0.0,0.0,0.0,2.531646,2.531646,2.531646,1144.303797


# Sanitize Outputs

The program is only as good as the data you give it

In [433]:
features = ['PLAYER_PTS']

som_data = h2h_df[features].values # h2h_df.columns[7:10]
som = MiniSom(20, 20, len(features), sigma=5, learning_rate=0.5, # som_data.shape[1] in 3
              neighborhood_function='gaussian', random_seed=10)

%time som.train(som_data, 20000, random_order=False)  # random training

quantization_errors = np.linalg.norm(som.quantization(som_data) - som_data, axis=1)

error_threshold = np.percentile(quantization_errors, 
                               100*(1-.25))+(0.5*(np.percentile(quantization_errors, 
                               100*(1-.25)) - np.percentile(quantization_errors, 100*(1-.75))))

is_outlier = quantization_errors > error_threshold
error_threshold

Wall time: 879 ms


0.13246975900978475

In [434]:
px.histogram(x=quantization_errors, labels={
    'x': 'error'
})

In [435]:
fig = px.scatter(h2h_df, x='PLAYER_PTS', y='MATCHUP_TIME_SEC', color=is_outlier, 
                    labels={
                     'PLAYER_PTS': 'Points',
                     'MATCHUP_AST': 'Assists',
                     'MATCHUP_TOV': 'Turnovers',
                     'MATCHUP_TIME_SEC': 'Matchup Time'
                    })
fig.show()

In [436]:
fig = px.scatter(h2h_df[is_outlier == False], x='PLAYER_PTS', y='MATCHUP_AST', 
                    labels={
                     'PLAYER_PTS': 'Points',
                     'MATCHUP_AST': 'Assists',
                     'MATCHUP_TOV': 'Turnovers'
                    })
fig.show()

In [437]:
# Remove outliers
out = h2h_df[is_outlier == False]
print(out.shape)

(1674, 24)


# Retrieve Offensive and Defensive Stats

Retrieves defensive data for each defending player

In [438]:
data = pd.read_csv('./stats/'+ season + '_def_stats.csv')
# data = pd.read_csv('./stats/def_combined.csv')
data = data.fillna(0)

# Get def stats only from selected defenders (rename player_id to def_player_id to merge arrays)
data = data.add_prefix('DEF_')
data.rename(columns={'DEF_SEASON_ID': 'SEASON_ID', 'DEF_DEF_RATING': 'DEF_RATING', 
                     'DEF_DEF_BOXOUTS': 'DEF_BOXOUTS'}, inplace=True)
def_df = pd.merge(data, out[['DEF_PLAYER_ID', 'SEASON_ID']], how ='inner', on=['DEF_PLAYER_ID', 'SEASON_ID'])
def_df.drop(columns=['DEF_GP', 'DEF_G', 'DEF_D_FG_PCT', 'DEF_DREB_PCT', 'DEF_PCT_STL', 'DEF_PCT_BLK'], inplace=True)

# Add offensive player (helps merging offensive stats)
print(def_df.shape[0], out.shape[0])
def_df['OFF_PLAYER_ID'] = out['OFF_PLAYER_ID'].to_numpy()
def_df['OFF_PLAYER_NAME'] = out['OFF_PLAYER_NAME'].to_numpy()

print(def_df.columns)
def_df

1674 1674
Index(['DEF_PLAYER_ID', 'DEF_PLAYER_NAME', 'DEF_AGE', 'DEF_D_FGM', 'DEF_D_FGA',
       'DEF_NORMAL_FG_PCT', 'DEF_PCT_PLUSMINUS', 'DEF_W', 'DEF_L', 'DEF_MIN',
       'DEF_STL', 'DEF_BLK', 'DEF_DREB', 'DEF_CONTESTED_SHOTS',
       'DEF_CONTESTED_SHOTS_2PT', 'DEF_CONTESTED_SHOTS_3PT', 'DEF_DEFLECTIONS',
       'DEF_CHARGES_DRAWN', 'DEF_BOXOUTS', 'DEF_PCT_BOX_OUTS_REB', 'SEASON_ID',
       'DEF_RATING', 'DEF_OPP_PTS_OFF_TOV', 'DEF_OPP_PTS_2ND_CHANCE',
       'DEF_OPP_PTS_FB', 'DEF_OPP_PTS_PAINT', 'DEF_DEF_WS', 'OFF_PLAYER_ID',
       'OFF_PLAYER_NAME'],
      dtype='object')


Unnamed: 0,DEF_PLAYER_ID,DEF_PLAYER_NAME,DEF_AGE,DEF_D_FGM,DEF_D_FGA,DEF_NORMAL_FG_PCT,DEF_PCT_PLUSMINUS,DEF_W,DEF_L,DEF_MIN,...,DEF_PCT_BOX_OUTS_REB,SEASON_ID,DEF_RATING,DEF_OPP_PTS_OFF_TOV,DEF_OPP_PTS_2ND_CHANCE,DEF_OPP_PTS_FB,DEF_OPP_PTS_PAINT,DEF_DEF_WS,OFF_PLAYER_ID,OFF_PLAYER_NAME
0,203932,Aaron Gordon,26.0,4.98,11.50,0.462,-0.029,16.0,36.0,27.3,...,0.560,22021,109.6,9.7,8.3,5.3,27.2,0.076,1629630,Ja Morant
1,203932,Aaron Gordon,26.0,4.98,11.50,0.462,-0.029,16.0,36.0,27.3,...,0.560,22021,109.6,9.7,8.3,5.3,27.2,0.076,1630175,Cole Anthony
2,203932,Aaron Gordon,26.0,4.98,11.50,0.462,-0.029,16.0,36.0,27.3,...,0.560,22021,109.6,9.7,8.3,5.3,27.2,0.076,1630162,Anthony Edwards
3,203932,Aaron Gordon,26.0,4.98,11.50,0.462,-0.029,16.0,36.0,27.3,...,0.560,22021,109.6,9.7,8.3,5.3,27.2,0.076,1629027,Trae Young
4,203932,Aaron Gordon,26.0,4.98,11.50,0.462,-0.029,16.0,36.0,27.3,...,0.560,22021,109.6,9.7,8.3,5.3,27.2,0.076,1627749,Dejounte Murray
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1669,203897,Zach LaVine,26.0,5.34,11.49,0.446,0.019,9.0,9.0,16.2,...,0.667,22021,108.8,4.1,4.1,3.7,14.8,0.049,1629018,Gary Trent Jr.
1670,203897,Zach LaVine,26.0,5.34,11.49,0.446,0.019,9.0,9.0,16.2,...,0.667,22021,108.8,4.1,4.1,3.7,14.8,0.049,1626181,Norman Powell
1671,203897,Zach LaVine,26.0,5.34,11.49,0.446,0.019,9.0,9.0,16.2,...,0.667,22021,108.8,4.1,4.1,3.7,14.8,0.049,1626179,Terry Rozier
1672,1630533,Ziaire Williams,20.0,3.74,8.29,0.442,0.009,26.0,18.0,37.0,...,0.455,22021,111.7,13.7,10.2,11.3,36.8,0.081,203114,Khris Middleton


In [439]:
data = pd.read_csv('./stats/' + season + '_' + str(percentile) + '_off_stats.csv')
# data = pd.read_csv('./stats/off_combined.csv')
data = data.fillna(0)

# Get off stats only from selected offensive players
data = data.add_prefix('OFF_')
data.rename(columns={'OFF_SEASON_ID': 'SEASON_ID'}, inplace=True)
combine = pd.merge(def_df, data, how='inner', on=['OFF_PLAYER_ID', 'SEASON_ID'])

# Get correct sort
combine.sort_values(['DEF_PLAYER_ID', 'OFF_PLAYER_ID'], inplace=True)
print(combine.shape)
print(combine.columns)
combine.head()

(1674, 68)
Index(['DEF_PLAYER_ID', 'DEF_PLAYER_NAME', 'DEF_AGE', 'DEF_D_FGM', 'DEF_D_FGA',
       'DEF_NORMAL_FG_PCT', 'DEF_PCT_PLUSMINUS', 'DEF_W', 'DEF_L', 'DEF_MIN',
       'DEF_STL', 'DEF_BLK', 'DEF_DREB', 'DEF_CONTESTED_SHOTS',
       'DEF_CONTESTED_SHOTS_2PT', 'DEF_CONTESTED_SHOTS_3PT', 'DEF_DEFLECTIONS',
       'DEF_CHARGES_DRAWN', 'DEF_BOXOUTS', 'DEF_PCT_BOX_OUTS_REB', 'SEASON_ID',
       'DEF_RATING', 'DEF_OPP_PTS_OFF_TOV', 'DEF_OPP_PTS_2ND_CHANCE',
       'DEF_OPP_PTS_FB', 'DEF_OPP_PTS_PAINT', 'DEF_DEF_WS', 'OFF_PLAYER_ID',
       'OFF_PLAYER_NAME', 'OFF_AGE', 'OFF_GP', 'OFF_MIN', 'OFF_FGM', 'OFF_FGA',
       'OFF_FG_PCT', 'OFF_FG3M', 'OFF_FG3A', 'OFF_FG3_PCT', 'OFF_FTM',
       'OFF_FTA', 'OFF_FT_PCT', 'OFF_OREB', 'OFF_DREB', 'OFF_REB', 'OFF_AST',
       'OFF_TOV', 'OFF_BLKA', 'OFF_PF', 'OFF_PTS', 'OFF_TOUCHES',
       'OFF_PAINT_TOUCHES', 'OFF_PAINT_TOUCH_FGM', 'OFF_PAINT_TOUCH_FGA',
       'OFF_PAINT_TOUCH_PASSES', 'OFF_PAINT_TOUCH_TOV', 'OFF_DRIVE_PTS',
       'OFF_DRIVE_

Unnamed: 0,DEF_PLAYER_ID,DEF_PLAYER_NAME,DEF_AGE,DEF_D_FGM,DEF_D_FGA,DEF_NORMAL_FG_PCT,DEF_PCT_PLUSMINUS,DEF_W,DEF_L,DEF_MIN,...,OFF_CATCH_SHOOT_FG_PCT,OFF_PULL_UP_PTS,OFF_PULL_UP_FG_PCT,OFF_PAINT_TOUCH_PTS,OFF_PAINT_TOUCH_FG_PCT,OFF_POST_TOUCH_PTS,OFF_POST_TOUCH_FG_PCT,OFF_ELBOW_TOUCH_PTS,OFF_ELBOW_TOUCH_FG_PCT,OFF_EFF_FG_PCT
1421,2544,LeBron James,37.0,5.03,12.55,0.463,-0.062,32.0,7.0,20.4,...,0.432,3.3,0.377,2.7,0.648,0.9,0.475,0.5,0.8,0.568
1625,2544,LeBron James,37.0,5.03,12.55,0.463,-0.062,32.0,7.0,20.4,...,0.319,2.2,0.394,5.9,0.651,2.7,0.467,1.9,0.535,0.499
1593,2544,LeBron James,37.0,5.03,12.55,0.463,-0.062,32.0,7.0,20.4,...,0.352,1.7,0.321,5.5,0.717,1.5,0.545,1.6,0.516,0.553
1440,2544,LeBron James,37.0,5.03,12.55,0.463,-0.062,32.0,7.0,20.4,...,0.422,5.9,0.422,0.8,0.629,0.0,0.0,0.6,0.6,0.561
674,2546,Carmelo Anthony,37.0,4.5,10.1,0.461,-0.015,2.0,1.0,6.1,...,0.336,0.1,0.13,10.0,0.69,2.0,0.507,3.8,0.649,0.606


In [440]:
# Sort head to head stats by same sort
h2h_df.sort_values(['DEF_PLAYER_ID', 'OFF_PLAYER_ID'], inplace=True)
h2h_df.head()

Unnamed: 0,SEASON_ID,OFF_PLAYER_ID,OFF_PLAYER_NAME,DEF_PLAYER_ID,DEF_PLAYER_NAME,GP,MATCHUP_MIN,PARTIAL_POSS,PLAYER_PTS,MATCHUP_AST,...,MATCHUP_FG3M,MATCHUP_FG3A,HELP_BLK,HELP_FGM,HELP_FGA,HELP_FG_PERC,MATCHUP_FTM,MATCHUP_FTA,SFL,MATCHUP_TIME_SEC
6854,22021,203507,Giannis Antetokounmpo,2544,LeBron James,1,4:06,20.8,48.076923,19.230769,...,0.0,0.0,0.0,0.0,0.0,0.0,19.230769,19.230769,9.615385,1181.25
7,22021,203952,Andrew Wiggins,2544,LeBron James,2,8:06,47.0,10.638298,4.255319,...,2.12766,4.255319,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1032.978723
10899,22021,204001,Kristaps Porzingis,2544,LeBron James,1,8:07,34.8,22.988506,2.873563,...,0.0,8.62069,0.0,0.0,0.0,0.0,5.747126,5.747126,2.873563,1398.275862
2475,22021,1626174,Christian Wood,2544,LeBron James,3,14:16,76.2,31.496063,5.249344,...,5.249344,6.56168,0.0,0.0,0.0,0.0,2.624672,2.624672,1.312336,1123.35958
5655,22021,1627734,Domantas Sabonis,2544,LeBron James,2,5:42,24.7,8.097166,4.048583,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1383.805668


**Machine Learning Implementation**

Utilizes the keras Model from Tensorflow to predict offensive player's stats.
If there is an error, make sure to first install tensorflow. This can be done through Anaconda, or through the command 

```
!pip install tensorflow
```

In [488]:
# # Clean data to use on model
# X = combine.drop(columns=['DEF_PLAYER_ID', 'DEF_PLAYER_NAME', 'OFF_PLAYER_ID', 'OFF_PLAYER_NAME', 'SEASON_ID'])
# y = out['PLAYER_PTS']

# # Save to csvs
# X.to_csv(r'.\stats\cleaned\\' + season + '_X.csv', index=False)
# y.to_csv(r'.\stats\cleaned\\' + season + '_y.csv', index=False)

X = pd.read_csv('./stats/cleaned/X_combined.csv')
y = pd.read_csv('./stats/cleaned/y_combined.csv')

print(X.shape, y.shape)

# Apply normalization to input
X = (X - X.mean()) / X.std()

# Generate training and verification data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

print('Training Entries')
X_train.info()
X_train.describe()

print('Testing Entries')
X_test.info()
X_test.describe()

X_train = X_train.to_numpy()
X_test = X_test.to_numpy()
y_train = y_train.to_numpy()
y_test = y_test.to_numpy()

(5475, 63) (5475, 1)
Training Entries
<class 'pandas.core.frame.DataFrame'>
Int64Index: 4380 entries, 2392 to 1848
Data columns (total 63 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   DEF_AGE                  4380 non-null   float64
 1   DEF_D_FGM                4380 non-null   float64
 2   DEF_D_FGA                4380 non-null   float64
 3   DEF_NORMAL_FG_PCT        4380 non-null   float64
 4   DEF_PCT_PLUSMINUS        4380 non-null   float64
 5   DEF_W                    4380 non-null   float64
 6   DEF_L                    4380 non-null   float64
 7   DEF_MIN                  4380 non-null   float64
 8   DEF_STL                  4380 non-null   float64
 9   DEF_BLK                  4380 non-null   float64
 10  DEF_DREB                 4380 non-null   float64
 11  DEF_CONTESTED_SHOTS      4380 non-null   float64
 12  DEF_CONTESTED_SHOTS_2PT  4380 non-null   float64
 13  DEF_CONTESTED_SHOTS_3PT  4380 non-nul

**Keras Model Philosophy**

1. Normalize inputs
2. Dropout layer to prevent overfitting
3. Dense layer

In [489]:
# Create model
model = Sequential()
model.add(Dense(int(X.shape[1] / 2), activation='relu', input_shape=(X.shape[1],)))
model.add(Dropout(0.5))
model.add(Dense(1, activation='relu')) # y.shape[1]

# Compile model
model.compile(optimizer='nadam', loss='mean_squared_error', metrics=['accuracy'])

model.summary()

Model: "sequential_28"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_64 (Dense)            (None, 31)                1984      
                                                                 
 dropout_28 (Dropout)        (None, 31)                0         
                                                                 
 dense_65 (Dense)            (None, 1)                 32        
                                                                 
Total params: 2,016
Trainable params: 2,016
Non-trainable params: 0
_________________________________________________________________


In [490]:
# Fit model
bsize = int(X_train.shape[0] * 0.1) # 10% batch size

%time history = model.fit(X_train, y_train, epochs=1000, validation_split=0.3, batch_size=bsize, verbose=0)


`Model.state_updates` will be removed in a future version. This property should not be used in TensorFlow 2.0, as `updates` are applied automatically.



Wall time: 11.6 s


In [491]:
y_pred = model.predict(X_test)
score = model.evaluate(X_test, y_test)
print(score)

diff = abs(y_pred.flatten() - y_test.flatten())
print(diff.std(), diff.mean())
print(y_pred.mean(), y_test.mean())
predictions = pd.DataFrame(data={'predicted': y_pred.flatten(), 'actual': y_test.flatten(), 'abs_diff': diff})
predictions.sort_values(by='abs_diff', inplace=True)
# predictions[predictions['actual'] < 10]
predictions


`Model.state_updates` will be removed in a future version. This property should not be used in TensorFlow 2.0, as `updates` are applied automatically.



[122.22689031313543, 0.0]
6.544636674862985 8.91036593728205
24.807655 26.14819035690796


Unnamed: 0,predicted,actual,abs_diff
848,25.519732,25.535420,0.015689
444,25.530586,25.547445,0.016859
146,23.702478,23.758099,0.055621
936,20.665405,20.725389,0.059983
256,27.180122,27.118644,0.061478
...,...,...,...
184,24.937679,56.470588,31.532909
322,19.179728,52.173913,32.994185
282,23.042009,57.471264,34.429255
262,22.712065,60.728745,38.016680


In [492]:
fig = px.histogram(x=predictions['abs_diff'], labels={
    'x': 'difference'
})

# Fix to make reliable metric to compare to
print('Percent acceptable: ' + str(len(predictions[predictions['abs_diff'] < 5]) / len(predictions) * 100))

fig.show()

Percent acceptable: 33.15068493150685


In [493]:
# Plot training and validation loss
fig = px.line()
fig.add_scatter(y=history.history['loss'], name='Train')
fig.add_scatter(y=history.history['val_loss'], name='Validation')

fig.update_layout(title='Train vs Validation Loss',
                   xaxis_title='Epochs',
                   yaxis_title='Loss')

fig.show()

In [494]:
# compute SHAP values
explainer = shap.DeepExplainer(model, X_train)
shap_values = explainer.shap_values(X_test)

shap.initjs()
shap.force_plot(explainer.expected_value[0], shap_values[0][0], features = X.columns)


Your TensorFlow version is newer than 2.4.0 and so graph support has been removed in eager mode and some static graphs may not be supported. See PR #1483 for discussion.

