In [92]:
import pandas as pd
import requests
import json
import numpy as np

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.model_selection import train_test_split

import shap
import plotly.express as px

from minisom import MiniSom

# For shap to work with keras, disable v2 behavior
tf.compat.v1.disable_v2_behavior()

In [93]:
season = '2020-21'
percentile = 0.2

df = pd.read_csv('./stats/' + season + '_' + str(percentile) + '_h2h_stats.csv')
# df = pd.read_csv('./stats/h2h_combined.csv')
df = df.fillna(0)

print(df.shape)
df.head()

(119185, 27)


Unnamed: 0,SEASON_ID,OFF_PLAYER_ID,OFF_PLAYER_NAME,DEF_PLAYER_ID,DEF_PLAYER_NAME,GP,MATCHUP_MIN,PARTIAL_POSS,PLAYER_PTS,TEAM_PTS,...,MATCHUP_FG3A,MATCHUP_FG3_PCT,HELP_BLK,HELP_FGM,HELP_FGA,HELP_FG_PERC,MATCHUP_FTM,MATCHUP_FTA,SFL,MATCHUP_TIME_SEC
0,22020,203932,Aaron Gordon,202710,Jimmy Butler,3,12:10,61.1,15,69,...,2,0.0,0,0,0,0,1,2,1,729.9
1,22020,203932,Aaron Gordon,1629647,Darius Bazley,2,10:08,56.1,9,58,...,5,0.2,0,0,0,0,0,0,0,608.1
2,22020,203932,Aaron Gordon,202699,Tobias Harris,2,10:45,53.8,4,58,...,2,0.0,0,0,0,0,0,0,0,645.0
3,22020,203932,Aaron Gordon,203937,Kyle Anderson,2,10:41,52.5,5,48,...,5,0.2,0,0,0,0,0,0,0,640.5
4,22020,203932,Aaron Gordon,202695,Kawhi Leonard,3,10:12,51.8,11,44,...,2,0.5,0,0,0,0,0,0,0,612.2


# Convert totals to per 100 possessions

Most limited sample size, convert each to per possession and then multiply by 100 (more standard to measure in per 100 possessions rather than per possession, as generally nba teams average [100 possessions per game](https://www.teamrankings.com/nba/stat/possessions-per-game)).

In [112]:
MIN_MATCHUP_MINS = 10

In [148]:
h2h_df = df.copy()
h2h_df = h2h_df[h2h_df['MATCHUP_TIME_SEC'] > MIN_MATCHUP_MINS * 60] # Must have played more than x minutes
h2h_df.drop(columns=['TEAM_PTS', 'MATCHUP_FG_PCT', 'MATCHUP_FG3_PCT'], inplace=True)

def per_100_poss(x):
    return x / h2h_df['PARTIAL_POSS'] * 100

# Set stats to per 100 possessions
h2h_df = h2h_df.apply(lambda x: per_100_poss(x) if x.name not in h2h_df.columns[0:8] else x)

h2h_df.sort_values('DEF_PLAYER_NAME', ascending=True, inplace=True)

# Remove rows with zeros in important columns
check = h2h_df[h2h_df.columns[8:11]] != 0
h2h_df = h2h_df[check['PLAYER_PTS'] & check['MATCHUP_AST']]

print(h2h_df.shape)
h2h_df.head()

(1591, 24)


Unnamed: 0,SEASON_ID,OFF_PLAYER_ID,OFF_PLAYER_NAME,DEF_PLAYER_ID,DEF_PLAYER_NAME,GP,MATCHUP_MIN,PARTIAL_POSS,PLAYER_PTS,MATCHUP_AST,...,MATCHUP_FG3M,MATCHUP_FG3A,HELP_BLK,HELP_FGM,HELP_FGA,HELP_FG_PERC,MATCHUP_FTM,MATCHUP_FTA,SFL,MATCHUP_TIME_SEC
66422,22020,203482,Kelly Olynyk,203932,Aaron Gordon,4,11:12,62.4,16.025641,8.012821,...,0.0,3.205128,0.0,0.0,0.0,0.0,3.205128,3.205128,1.602564,1076.121795
65573,22020,202695,Kawhi Leonard,203932,Aaron Gordon,3,21:57,95.8,20.876827,7.306889,...,2.087683,5.219207,0.0,0.0,0.0,0.0,2.087683,2.087683,1.043841,1374.217119
96184,22020,1629628,RJ Barrett,203932,Aaron Gordon,3,17:16,80.6,18.610422,1.240695,...,3.722084,7.444169,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1284.863524
57502,22020,202710,Jimmy Butler,203932,Aaron Gordon,3,14:50,76.9,14.304291,6.501951,...,0.0,0.0,0.0,0.0,0.0,0.0,3.90117,6.501951,3.90117,1157.86736
25637,22020,201942,DeMar DeRozan,203932,Aaron Gordon,2,12:46,63.5,23.622047,17.322835,...,0.0,0.0,0.0,0.0,0.0,0.0,7.874016,7.874016,4.724409,1206.456693


# Sanitize Outputs

The program is only as good as the data you give it

In [114]:
features = ['PLAYER_PTS']

som_data = h2h_df[features].values # h2h_df.columns[7:10]
som = MiniSom(20, 20, len(features), sigma=5, learning_rate=0.5, # som_data.shape[1] in 3
              neighborhood_function='gaussian', random_seed=10)

%time som.train(som_data, 20000, random_order=False)  # random training

quantization_errors = np.linalg.norm(som.quantization(som_data) - som_data, axis=1)

error_threshold = np.percentile(quantization_errors, 
                               100*(1-.25))+(0.5*(np.percentile(quantization_errors, 
                               100*(1-.25)) - np.percentile(quantization_errors, 100*(1-.75))))

is_outlier = quantization_errors > error_threshold
error_threshold

Wall time: 906 ms


0.0878056907608249

In [115]:
px.histogram(x=quantization_errors, labels={
    'x': 'error'
})

In [116]:
fig = px.scatter(h2h_df, x='PLAYER_PTS', y='MATCHUP_TIME_SEC', color=is_outlier, 
                    labels={
                     'PLAYER_PTS': 'Points',
                     'MATCHUP_AST': 'Assists',
                     'MATCHUP_TOV': 'Turnovers',
                     'MATCHUP_TIME_SEC': 'Matchup Time'
                    })
fig.show()

In [117]:
fig = px.scatter(h2h_df[is_outlier == False], x='PLAYER_PTS', y='MATCHUP_AST', 
                    labels={
                     'PLAYER_PTS': 'Points',
                     'MATCHUP_AST': 'Assists',
                     'MATCHUP_TOV': 'Turnovers'
                    })
fig.show()

In [150]:
# Remove outliers
out = h2h_df[is_outlier == False]
print(out.shape)

(1324, 24)
(1591, 24)


# Retrieve Offensive and Defensive Stats

Retrieves defensive data for each defending player

In [151]:
data = pd.read_csv('./stats/'+ season + '_def_stats.csv')
# data = pd.read_csv('./stats/def_combined.csv')
data = data.fillna(0)

# Get def stats only from selected defenders (rename player_id to def_player_id to merge arrays)
data = data.add_prefix('DEF_')
data.rename(columns={'DEF_SEASON_ID': 'SEASON_ID', 'DEF_DEF_RATING': 'DEF_RATING', 
                     'DEF_DEF_BOXOUTS': 'DEF_BOXOUTS'}, inplace=True)
def_df = pd.merge(data, out[['DEF_PLAYER_ID', 'SEASON_ID']], how ='inner', on=['DEF_PLAYER_ID', 'SEASON_ID'])
def_df.drop(columns=['DEF_GP', 'DEF_G', 'DEF_D_FG_PCT', 'DEF_DREB_PCT', 'DEF_PCT_STL', 'DEF_PCT_BLK'], inplace=True)

# Add offensive player (helps merging offensive stats)
print(def_df.shape[0], out.shape[0])
def_df['OFF_PLAYER_ID'] = out['OFF_PLAYER_ID'].to_numpy()
def_df['OFF_PLAYER_NAME'] = out['OFF_PLAYER_NAME'].to_numpy()

print(def_df.columns)
def_df

1591 1591
Index(['DEF_PLAYER_ID', 'DEF_PLAYER_NAME', 'DEF_AGE', 'DEF_D_FGM', 'DEF_D_FGA',
       'DEF_NORMAL_FG_PCT', 'DEF_PCT_PLUSMINUS', 'DEF_W', 'DEF_L', 'DEF_MIN',
       'DEF_STL', 'DEF_BLK', 'DEF_DREB', 'DEF_CONTESTED_SHOTS',
       'DEF_CONTESTED_SHOTS_2PT', 'DEF_CONTESTED_SHOTS_3PT', 'DEF_DEFLECTIONS',
       'DEF_CHARGES_DRAWN', 'DEF_BOXOUTS', 'DEF_PCT_BOX_OUTS_REB', 'SEASON_ID',
       'DEF_RATING', 'DEF_OPP_PTS_OFF_TOV', 'DEF_OPP_PTS_2ND_CHANCE',
       'DEF_OPP_PTS_FB', 'DEF_OPP_PTS_PAINT', 'DEF_DEF_WS', 'OFF_PLAYER_ID',
       'OFF_PLAYER_NAME'],
      dtype='object')


Unnamed: 0,DEF_PLAYER_ID,DEF_PLAYER_NAME,DEF_AGE,DEF_D_FGM,DEF_D_FGA,DEF_NORMAL_FG_PCT,DEF_PCT_PLUSMINUS,DEF_W,DEF_L,DEF_MIN,...,DEF_PCT_BOX_OUTS_REB,SEASON_ID,DEF_RATING,DEF_OPP_PTS_OFF_TOV,DEF_OPP_PTS_2ND_CHANCE,DEF_OPP_PTS_FB,DEF_OPP_PTS_PAINT,DEF_DEF_WS,OFF_PLAYER_ID,OFF_PLAYER_NAME
0,203932,Aaron Gordon,25.0,4.64,10.08,0.471,-0.011,7.0,13.0,19.6,...,0.423,22020,110.9,6.6,5.7,4.9,18.8,0.056,203482,Kelly Olynyk
1,203932,Aaron Gordon,25.0,4.64,10.08,0.471,-0.011,7.0,13.0,19.6,...,0.423,22020,110.9,6.6,5.7,4.9,18.8,0.056,202695,Kawhi Leonard
2,203932,Aaron Gordon,25.0,4.64,10.08,0.471,-0.011,7.0,13.0,19.6,...,0.423,22020,110.9,6.6,5.7,4.9,18.8,0.056,1629628,RJ Barrett
3,203932,Aaron Gordon,25.0,4.64,10.08,0.471,-0.011,7.0,13.0,19.6,...,0.423,22020,110.9,6.6,5.7,4.9,18.8,0.056,202710,Jimmy Butler
4,203932,Aaron Gordon,25.0,4.64,10.08,0.471,-0.011,7.0,13.0,19.6,...,0.423,22020,110.9,6.6,5.7,4.9,18.8,0.056,201942,DeMar DeRozan
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1586,1629627,Zion Williamson,20.0,6.13,13.74,0.465,-0.019,52.0,20.0,30.8,...,0.156,22020,107.3,11.5,7.9,7.8,33.3,0.121,1629020,Jarred Vanderbilt
1587,1629627,Zion Williamson,20.0,6.13,13.74,0.465,-0.019,52.0,20.0,30.8,...,0.156,22020,107.3,11.5,7.9,7.8,33.3,0.121,200794,Paul Millsap
1588,1629627,Zion Williamson,20.0,6.13,13.74,0.465,-0.019,52.0,20.0,30.8,...,0.156,22020,107.3,11.5,7.9,7.8,33.3,0.121,1627884,Derrick Jones Jr.
1589,1629627,Zion Williamson,20.0,6.13,13.74,0.465,-0.019,52.0,20.0,30.8,...,0.156,22020,107.3,11.5,7.9,7.8,33.3,0.121,1630172,Patrick Williams


In [152]:
data = pd.read_csv('./stats/' + season + '_' + str(percentile) + '_off_stats.csv')
# data = pd.read_csv('./stats/off_combined.csv')
data = data.fillna(0)

# Get off stats only from selected offensive players
data = data.add_prefix('OFF_')
data.rename(columns={'OFF_SEASON_ID': 'SEASON_ID'}, inplace=True)
combine = pd.merge(def_df, data, how='inner', on=['OFF_PLAYER_ID', 'SEASON_ID'])

# Get correct sort
combine.sort_values(['DEF_PLAYER_ID', 'OFF_PLAYER_ID'], inplace=True)
print(combine.shape)
print(combine.columns)
combine.head()

(1591, 68)
Index(['DEF_PLAYER_ID', 'DEF_PLAYER_NAME', 'DEF_AGE', 'DEF_D_FGM', 'DEF_D_FGA',
       'DEF_NORMAL_FG_PCT', 'DEF_PCT_PLUSMINUS', 'DEF_W', 'DEF_L', 'DEF_MIN',
       'DEF_STL', 'DEF_BLK', 'DEF_DREB', 'DEF_CONTESTED_SHOTS',
       'DEF_CONTESTED_SHOTS_2PT', 'DEF_CONTESTED_SHOTS_3PT', 'DEF_DEFLECTIONS',
       'DEF_CHARGES_DRAWN', 'DEF_BOXOUTS', 'DEF_PCT_BOX_OUTS_REB', 'SEASON_ID',
       'DEF_RATING', 'DEF_OPP_PTS_OFF_TOV', 'DEF_OPP_PTS_2ND_CHANCE',
       'DEF_OPP_PTS_FB', 'DEF_OPP_PTS_PAINT', 'DEF_DEF_WS', 'OFF_PLAYER_ID',
       'OFF_PLAYER_NAME', 'OFF_AGE', 'OFF_GP', 'OFF_MIN', 'OFF_FGM', 'OFF_FGA',
       'OFF_FG_PCT', 'OFF_FG3M', 'OFF_FG3A', 'OFF_FG3_PCT', 'OFF_FTM',
       'OFF_FTA', 'OFF_FT_PCT', 'OFF_OREB', 'OFF_DREB', 'OFF_REB', 'OFF_AST',
       'OFF_TOV', 'OFF_BLKA', 'OFF_PF', 'OFF_PTS', 'OFF_TOUCHES',
       'OFF_PAINT_TOUCHES', 'OFF_PAINT_TOUCH_FGM', 'OFF_PAINT_TOUCH_FGA',
       'OFF_PAINT_TOUCH_PASSES', 'OFF_PAINT_TOUCH_TOV', 'OFF_DRIVE_PTS',
       'OFF_DRIVE_

Unnamed: 0,DEF_PLAYER_ID,DEF_PLAYER_NAME,DEF_AGE,DEF_D_FGM,DEF_D_FGA,DEF_NORMAL_FG_PCT,DEF_PCT_PLUSMINUS,DEF_W,DEF_L,DEF_MIN,...,OFF_CATCH_SHOOT_FG_PCT,OFF_PULL_UP_PTS,OFF_PULL_UP_FG_PCT,OFF_PAINT_TOUCH_PTS,OFF_PAINT_TOUCH_FG_PCT,OFF_POST_TOUCH_PTS,OFF_POST_TOUCH_FG_PCT,OFF_ELBOW_TOUCH_PTS,OFF_ELBOW_TOUCH_FG_PCT,OFF_EFF_FG_PCT
66,2544,LeBron James,36.0,3.95,9.93,0.461,-0.063,40.0,21.0,33.0,...,0.336,6.3,0.488,0.8,0.625,0.9,0.567,0.5,0.538,0.505
554,2544,LeBron James,36.0,3.95,9.93,0.461,-0.063,40.0,21.0,33.0,...,0.379,1.8,0.398,0.7,0.512,0.0,0.0,0.1,0.8,0.551
1486,2544,LeBron James,36.0,3.95,9.93,0.461,-0.063,40.0,21.0,33.0,...,0.375,3.6,0.34,1.5,0.585,0.5,0.323,0.6,0.44,0.491
1563,2544,LeBron James,36.0,3.95,9.93,0.461,-0.063,40.0,21.0,33.0,...,0.331,1.5,0.313,3.1,0.603,0.3,0.462,0.6,0.545,0.501
968,2546,Carmelo Anthony,37.0,3.67,7.96,0.462,-0.001,22.0,15.0,23.8,...,0.317,0.4,0.333,4.8,0.63,0.1,0.429,1.6,0.477,0.537


In [153]:
# Sort head to head stats by same sort
h2h_df.sort_values(['DEF_PLAYER_ID', 'OFF_PLAYER_ID'], inplace=True)
h2h_df.head()

Unnamed: 0,SEASON_ID,OFF_PLAYER_ID,OFF_PLAYER_NAME,DEF_PLAYER_ID,DEF_PLAYER_NAME,GP,MATCHUP_MIN,PARTIAL_POSS,PLAYER_PTS,MATCHUP_AST,...,MATCHUP_FG3M,MATCHUP_FG3A,HELP_BLK,HELP_FGM,HELP_FGA,HELP_FG_PERC,MATCHUP_FTM,MATCHUP_FTA,SFL,MATCHUP_TIME_SEC
25638,22020,201942,DeMar DeRozan,2544,LeBron James,3,13:27,62.9,20.667727,4.769475,...,4.769475,6.3593,0.0,0.0,0.0,0.0,3.17965,3.17965,1.589825,1282.988871
63998,22020,203200,Justin Holiday,2544,LeBron James,2,11:26,58.3,17.152659,1.715266,...,3.430532,13.722127,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1177.358491
56523,22020,203924,Jerami Grant,2544,LeBron James,2,13:43,58.1,13.769363,3.442341,...,1.72117,8.605852,0.0,0.0,0.0,0.0,1.72117,1.72117,1.72117,1416.351119
66754,22020,1626162,Kelly Oubre Jr.,2544,LeBron James,3,14:42,86.4,16.203704,1.157407,...,1.157407,6.944444,0.0,0.0,0.0,0.0,1.157407,2.314815,0.0,1020.601852
9231,22020,1629634,Brandon Clarke,2546,Carmelo Anthony,3,11:20,68.9,8.708273,1.451379,...,0.0,0.0,0.0,0.0,0.0,0.0,2.902758,2.902758,1.451379,987.082729


**Machine Learning Implementation**

Utilizes the keras Model from Tensorflow to predict offensive player's stats.
If there is an error, make sure to first install tensorflow. This can be done through Anaconda, or through the command 

```
!pip install tensorflow
```

In [154]:
# # Clean data to use on model
X = combine.drop(columns=['DEF_PLAYER_ID', 'DEF_PLAYER_NAME', 'OFF_PLAYER_ID', 'OFF_PLAYER_NAME', 'SEASON_ID'])
y = out['PLAYER_PTS']

# Save to csvs
# X.to_csv(r'.\stats\cleaned\\' + season + '_X.csv', index=False)
# y.to_csv(r'.\stats\cleaned\\' + season + '_y.csv', index=False)

# X = pd.read_csv('./stats/cleaned/X_combined.csv')
# y = pd.read_csv('./stats/cleaned/y_combined.csv')

print(X.shape, y.shape)

# Apply normalization to input
X = (X - X.mean()) / X.std()

# Generate training and verification data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

print('Training Entries')
X_train.info()
X_train.describe()

print('Testing Entries')
X_test.info()
X_test.describe()

X_train = X_train.to_numpy()
X_test = X_test.to_numpy()
y_train = y_train.to_numpy()
y_test = y_test.to_numpy()

(1591, 63) (1591,)
Training Entries
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1272 entries, 698 to 1552
Data columns (total 63 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   DEF_AGE                  1272 non-null   float64
 1   DEF_D_FGM                1272 non-null   float64
 2   DEF_D_FGA                1272 non-null   float64
 3   DEF_NORMAL_FG_PCT        1272 non-null   float64
 4   DEF_PCT_PLUSMINUS        1272 non-null   float64
 5   DEF_W                    1272 non-null   float64
 6   DEF_L                    1272 non-null   float64
 7   DEF_MIN                  1272 non-null   float64
 8   DEF_STL                  1272 non-null   float64
 9   DEF_BLK                  1272 non-null   float64
 10  DEF_DREB                 1272 non-null   float64
 11  DEF_CONTESTED_SHOTS      1272 non-null   float64
 12  DEF_CONTESTED_SHOTS_2PT  1272 non-null   float64
 13  DEF_CONTESTED_SHOTS_3PT  1272 non-null  

**Keras Model Philosophy**

1. Normalize inputs
2. Dropout layer to prevent overfitting
3. Dense layer

In [219]:
# Create model
model = Sequential()
model.add(Dense(int(X.shape[1] / 2), activation='relu', input_shape=(X.shape[1],)))
model.add(Dropout(0.5))
model.add(Dense(1, activation='relu')) # y.shape[1]

# Compile model
model.compile(optimizer='nadam', loss='mean_squared_error', metrics=['accuracy'])

model.summary()

Model: "sequential_18"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_36 (Dense)            (None, 31)                1984      
                                                                 
 dropout_18 (Dropout)        (None, 31)                0         
                                                                 
 dense_37 (Dense)            (None, 1)                 32        
                                                                 
Total params: 2,016
Trainable params: 2,016
Non-trainable params: 0
_________________________________________________________________


In [220]:
# Fit model
bsize = int(X_train.shape[0] * 0.1) # 10% batch size

%time history = model.fit(X_train, y_train, epochs=10000, validation_split=0.6, batch_size=bsize, verbose=0)


`Model.state_updates` will be removed in a future version. This property should not be used in TensorFlow 2.0, as `updates` are applied automatically.



Wall time: 59.9 s


In [221]:
y_pred = model.predict(X_test)
score = model.evaluate(X_test, y_test)
print(score)

diff = abs(y_pred.flatten() - y_test.flatten())
print(diff.std(), diff.mean())
print(y_pred.mean(), y_test.mean())
predictions = pd.DataFrame(data={'predicted': y_pred.flatten(), 'actual': y_test.flatten(), 'abs_diff': diff})
predictions.sort_values(by='abs_diff', inplace=True)
predictions[predictions['actual'] <10]
# predictions


`Model.state_updates` will be removed in a future version. This property should not be used in TensorFlow 2.0, as `updates` are applied automatically.



[99.45735929004825, 0.0]
6.126680433721523 7.8689990561922105
20.937819 19.07229053966266


Unnamed: 0,predicted,actual,abs_diff
82,7.239316,9.316770,2.077455
27,10.409878,7.102273,3.307605
70,13.385365,9.328358,4.057006
42,12.881271,8.223684,4.657587
0,14.305829,9.646302,4.659527
...,...,...,...
301,30.759468,8.344924,22.414545
10,31.960779,9.345794,22.614985
80,27.079601,1.858736,25.220865
310,37.397858,4.464286,32.933572


In [222]:
fig = px.histogram(x=predictions['abs_diff'], labels={
    'x': 'difference'
})

# nth percentile difference (Lower number better)
print(predictions.quantile(q=0.75)['abs_diff'])
print('Percent acceptable: ' + str(len(predictions[predictions['abs_diff'] < 5]) / len(predictions) * 100))

fig.show()

10.971606861281348
Percent acceptable: 38.24451410658307


In [223]:
# Plot training and validation loss
fig = px.line()
fig.add_scatter(y=history.history['loss'], name='Train')
fig.add_scatter(y=history.history['val_loss'], name='Validation')

fig.update_layout(title='Train vs Validation Loss',
                   xaxis_title='Epochs',
                   yaxis_title='Loss')

fig.show()

In [224]:
# compute SHAP values
explainer = shap.DeepExplainer(model, X_train)
shap_values = explainer.shap_values(X_test)

shap.initjs()
shap.force_plot(explainer.expected_value[0], shap_values[0][0], features = X.columns)


Your TensorFlow version is newer than 2.4.0 and so graph support has been removed in eager mode and some static graphs may not be supported. See PR #1483 for discussion.

