In [64]:
import pandas as pd
import requests
import json

import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split
import numpy as np

import shap
import plotly.express as px

from minisom import MiniSom

# For shap to work with keras, disable v2 behavior
tf.compat.v1.disable_v2_behavior()

In [84]:
season = '2021-22'
percentile = 0.5

df = pd.read_csv('./stats/' + season + '_' + str(percentile) + '_h2h_stats.csv')
df = df.fillna(0)

print(df.shape)
df.head()

(36315, 27)


Unnamed: 0,SEASON_ID,OFF_PLAYER_ID,OFF_PLAYER_NAME,DEF_PLAYER_ID,DEF_PLAYER_NAME,GP,MATCHUP_MIN,PARTIAL_POSS,PLAYER_PTS,TEAM_PTS,...,MATCHUP_FG3A,MATCHUP_FG3_PCT,HELP_BLK,HELP_FGM,HELP_FGA,HELP_FG_PERC,MATCHUP_FTM,MATCHUP_FTA,SFL,MATCHUP_TIME_SEC
0,22021,201142,Kevin Durant,1629731,Dean Wade,2,21:09,105.8,29,99,...,4,0.5,0,0,0,0,9,9,4,1268.9
1,22021,201142,Kevin Durant,202699,Tobias Harris,3,12:36,68.8,28,93,...,5,0.6,0,0,0,0,3,4,2,756.2
2,22021,201142,Kevin Durant,203924,Jerami Grant,2,13:37,68.1,16,54,...,2,1.0,0,0,0,0,0,0,0,817.4
3,22021,201142,Kevin Durant,1629680,Matisse Thybulle,3,12:08,66.5,23,72,...,4,0.25,0,0,0,0,6,6,2,727.5
4,22021,201142,Kevin Durant,1627884,Derrick Jones Jr.,3,12:12,64.0,35,71,...,6,0.333,0,0,0,0,7,7,3,732.4


**Convert totals to per 100 possessions**

Most limited sample size, convert each to per possession and then multiply by 100 (more standard to measure in per 100 possessions rather than per possession, as generally nba teams average [100 possessions per game](https://www.teamrankings.com/nba/stat/possessions-per-game)).

In [85]:
MIN_MATCHUP_MINS = 10

In [86]:
h2h_df = df.copy()
h2h_df = h2h_df[h2h_df['MATCHUP_TIME_SEC'] > MIN_MATCHUP_MINS * 60] # Must have played more than x minutes
h2h_df.drop(columns=['TEAM_PTS', 'MATCHUP_FG_PCT', 'MATCHUP_FG3_PCT'], inplace=True)

def per_100_poss(x):
    return x / h2h_df['PARTIAL_POSS'] * 100

# Set stats to per 100 possessions
h2h_df = h2h_df.apply(lambda x: per_100_poss(x) if x.name not in h2h_df.columns[0:8] else x)

h2h_df.sort_values('DEF_PLAYER_NAME', ascending=True, inplace=True)

# Remove rows with zeros in important columns
check = h2h_df[h2h_df.columns[8:11]] != 0
h2h_df = h2h_df[check['PLAYER_PTS']] # & check['MATCHUP_AST']

print(h2h_df.shape)
h2h_df.head()

(674, 24)


Unnamed: 0,SEASON_ID,OFF_PLAYER_ID,OFF_PLAYER_NAME,DEF_PLAYER_ID,DEF_PLAYER_NAME,GP,MATCHUP_MIN,PARTIAL_POSS,PLAYER_PTS,MATCHUP_AST,...,MATCHUP_FG3M,MATCHUP_FG3A,HELP_BLK,HELP_FGM,HELP_FGA,HELP_FG_PERC,MATCHUP_FTM,MATCHUP_FTA,SFL,MATCHUP_TIME_SEC
7825,22021,1627749,Dejounte Murray,203932,Aaron Gordon,3,10:10,55.4,7.220217,12.635379,...,0.0,1.805054,0.0,0.0,0.0,0.0,3.610108,3.610108,1.805054,1100.722022
15307,22021,202704,Reggie Jackson,203932,Aaron Gordon,2,12:06,54.2,20.295203,11.070111,...,3.690037,12.915129,0.0,0.0,0.0,0.0,1.845018,3.690037,1.845018,1338.560886
4693,22021,1627742,Brandon Ingram,203932,Aaron Gordon,1,12:10,54.3,3.683241,5.524862,...,0.0,5.524862,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1344.383057
5250,22021,1630162,Anthony Edwards,203932,Aaron Gordon,3,16:20,83.5,17.964072,1.197605,...,3.592814,8.383234,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1173.772455
3342,22021,203897,Zach LaVine,203932,Aaron Gordon,2,10:32,54.7,27.422303,10.968921,...,3.656307,12.797075,0.0,0.0,0.0,0.0,1.828154,3.656307,0.0,1154.844607


In [87]:
som_data = h2h_df[h2h_df.columns[7:]].values
som = MiniSom(20, 20, som_data.shape[1], sigma=5, learning_rate=0.5,
              neighborhood_function='gaussian', random_seed=10)

%time som.train(som_data, 20000, random_order=False)  # random training

quantization_errors = np.linalg.norm(som.quantization(som_data) - som_data, axis=1)

error_threshold = np.percentile(quantization_errors, 
                               100*(1-.25))+(2*(np.percentile(quantization_errors, 
                               100*(1-.25)) - np.percentile(quantization_errors, 100*(1-.75))))

is_outlier = quantization_errors > error_threshold
error_threshold

Wall time: 7.22 s


26.553343166780927

In [88]:
px.histogram(x=quantization_errors, labels={
    'x': 'error'
})

In [89]:
fig = px.scatter_3d(h2h_df, x='PLAYER_PTS', y='MATCHUP_AST', z='MATCHUP_TOV', color=is_outlier, 
                    labels={
                     'PLAYER_PTS': 'Points',
                     'MATCHUP_AST': 'Assists',
                     'MATCHUP_TOV': 'Turnovers'
                    })
fig.show()

In [90]:
fig = px.scatter_3d(h2h_df[is_outlier == False], x='PLAYER_PTS', y='MATCHUP_AST', z='MATCHUP_TOV', 
                    labels={
                     'PLAYER_PTS': 'Points',
                     'MATCHUP_AST': 'Assists',
                     'MATCHUP_TOV': 'Turnovers'
                    })
fig.show()

In [91]:
# Remove outliers
h2h_df = h2h_df[is_outlier == False]
print(h2h_df.shape)

(634, 24)


# Retrieve Offensive and Defensive Stats

Retrieves defensive data for each defending player

In [92]:
data = pd.read_csv('./stats/'+ season + '_def_stats.csv')
data = data.fillna(0)

# Get def stats only from selected defenders (rename player_id to def_player_id to merge arrays)
data.rename(columns={'PLAYER_ID': 'DEF_PLAYER_ID'}, inplace=True)
def_df = pd.merge(data, h2h_df[['DEF_PLAYER_ID', 'SEASON_ID']], how ='inner', on=['DEF_PLAYER_ID', 'SEASON_ID'])
def_df = def_df[['DEF_PLAYER_ID', 'PLAYER_NAME', 'AGE', 'STL', 'BLK', 'DREB', 'CONTESTED_SHOTS', 'SEASON_ID']]
def_df.rename(columns={'PLAYER_NAME': 'DEF_PLAYER_NAME', 'AGE': 
                       'DEF_AGE', 'STL': 'DEF_STL', 'BLK': 'DEF_BLK', 'DREB': 'DEF_DREB', 
                       'CONTESTED_SHOTS': 'DEF_CONTESTED_SHOTS'}, inplace=True)

# Add offensive player (helps merging offensive stats)
def_df['OFF_PLAYER_ID'] = h2h_df['OFF_PLAYER_ID'].to_numpy()
def_df['OFF_PLAYER_NAME'] = h2h_df['OFF_PLAYER_NAME'].to_numpy()

print(def_df.shape)
print(def_df.columns)

(634, 10)
Index(['DEF_PLAYER_ID', 'DEF_PLAYER_NAME', 'DEF_AGE', 'DEF_STL', 'DEF_BLK',
       'DEF_DREB', 'DEF_CONTESTED_SHOTS', 'SEASON_ID', 'OFF_PLAYER_ID',
       'OFF_PLAYER_NAME'],
      dtype='object')


In [93]:
data = pd.read_csv('./stats/' + season + '_' + str(percentile) + '_off_stats.csv')
data = data.fillna(0)

# Get off stats only from selected offensive players
data = data[['PLAYER_ID', 'AGE', 'FGM', 'FGA', 'FG3M', 'FG3A', 'OREB', 'AST', 'PTS', 'SEASON_ID']]
data = data.add_prefix('OFF_')
data.rename(columns={'OFF_SEASON_ID': 'SEASON_ID'}, inplace=True)
combine = pd.merge(def_df, data, how='inner', on=['OFF_PLAYER_ID', 'SEASON_ID'])

# Get correct sort
combine.sort_values(['DEF_PLAYER_ID', 'OFF_PLAYER_ID'], inplace=True)
combine.head()

Unnamed: 0,DEF_PLAYER_ID,DEF_PLAYER_NAME,DEF_AGE,DEF_STL,DEF_BLK,DEF_DREB,DEF_CONTESTED_SHOTS,SEASON_ID,OFF_PLAYER_ID,OFF_PLAYER_NAME,OFF_AGE,OFF_FGM,OFF_FGA,OFF_FG3M,OFF_FG3A,OFF_OREB,OFF_AST,OFF_PTS
312,2544,LeBron James,37.0,0.6,0.3,2.3,3.2,22021,1626174,Christian Wood,26.0,6.5,13.5,1.8,5.0,1.7,2.2,17.7
358,2544,LeBron James,37.0,0.6,0.3,2.3,3.2,22021,1628991,Jaren Jackson Jr.,22.0,5.7,13.6,1.7,5.4,1.4,1.1,16.6
241,2544,LeBron James,37.0,0.6,0.3,2.3,3.2,22021,1630180,Saddiq Bey,22.0,5.4,13.9,2.4,7.3,1.4,2.6,15.5
371,101108,Chris Paul,36.0,1.0,1.4,9.5,10.02,22021,1626181,Norman Powell,28.0,6.2,13.6,2.3,5.7,0.4,2.1,18.7
370,101108,Chris Paul,36.0,1.0,1.4,9.5,10.02,22021,1630170,Devin Vassell,21.0,4.4,10.1,1.8,5.1,0.6,1.5,11.4


In [94]:
# Sort head to head stats by same sort
h2h_df.sort_values(['DEF_PLAYER_ID', 'OFF_PLAYER_ID'], inplace=True)
h2h_df.head()

Unnamed: 0,SEASON_ID,OFF_PLAYER_ID,OFF_PLAYER_NAME,DEF_PLAYER_ID,DEF_PLAYER_NAME,GP,MATCHUP_MIN,PARTIAL_POSS,PLAYER_PTS,MATCHUP_AST,...,MATCHUP_FG3M,MATCHUP_FG3A,HELP_BLK,HELP_FGM,HELP_FGA,HELP_FG_PERC,MATCHUP_FTM,MATCHUP_FTA,SFL,MATCHUP_TIME_SEC
12296,22021,1626174,Christian Wood,2544,LeBron James,3,14:16,76.2,31.496063,5.249344,...,5.249344,6.56168,0.0,0.0,0.0,0.0,2.624672,2.624672,1.312336,1123.35958
15003,22021,1628991,Jaren Jackson Jr.,2544,LeBron James,4,16:07,75.0,4.0,1.333333,...,1.333333,6.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1289.066667
18600,22021,1630180,Saddiq Bey,2544,LeBron James,2,10:08,48.4,6.198347,4.132231,...,2.066116,4.132231,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1255.785124
9254,22021,1626181,Norman Powell,101108,Chris Paul,3,12:52,65.0,21.538462,0.0,...,4.615385,12.307692,0.0,0.0,0.0,0.0,1.538462,1.538462,1.538462,1187.538462
32326,22021,1630170,Devin Vassell,101108,Chris Paul,3,10:07,45.1,33.259424,2.217295,...,6.651885,8.86918,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1345.676275


**Machine Learning Implementation**

Utilizes the keras Model from Tensorflow to predict offensive player's stats.
If there is an error, make sure to first install tensorflow. This can be done through Anaconda, or through the command 

```
!pip install tensorflow
```

In [95]:
# Clean data to use on model
X = combine.drop(columns=['DEF_PLAYER_ID', 'DEF_PLAYER_NAME', 'OFF_PLAYER_ID', 'OFF_PLAYER_NAME', 'SEASON_ID'])
y = h2h_df['PLAYER_PTS']
print(X.shape, y.shape)

# Apply normalization to input
X = (X - X.mean()) / X.std()

# Generate training and verification data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

print('Training Entries')
X_train.info()
X_train.describe()

print('Testing Entries')
X_test.info()
X_test.describe()

X_train = X_train.to_numpy()
X_test = X_test.to_numpy()
y_train = y_train.to_numpy()
y_test = y_test.to_numpy()

(634, 13) (634,)
Training Entries
<class 'pandas.core.frame.DataFrame'>
Int64Index: 507 entries, 404 to 34
Data columns (total 13 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   DEF_AGE              507 non-null    float64
 1   DEF_STL              507 non-null    float64
 2   DEF_BLK              507 non-null    float64
 3   DEF_DREB             507 non-null    float64
 4   DEF_CONTESTED_SHOTS  507 non-null    float64
 5   OFF_AGE              507 non-null    float64
 6   OFF_FGM              507 non-null    float64
 7   OFF_FGA              507 non-null    float64
 8   OFF_FG3M             507 non-null    float64
 9   OFF_FG3A             507 non-null    float64
 10  OFF_OREB             507 non-null    float64
 11  OFF_AST              507 non-null    float64
 12  OFF_PTS              507 non-null    float64
dtypes: float64(13)
memory usage: 55.5 KB
Testing Entries
<class 'pandas.core.frame.DataFrame'>
Int64Index: 12

**Keras Model Philosophy**

1. Normalize inputs
2. Dropout layer to prevent overfitting
3. Dense layer

In [96]:
from tensorflow.keras.layers.experimental.preprocessing import Normalization
from tensorflow.keras import layers, Sequential
from tensorflow.keras.layers import Dense, Dropout

# Create model
model = Sequential()
# model.add(Dropout(0.2, ))
model.add(Dense(11, activation='relu', input_shape=(X.shape[1],)))
# model.add(Dense(20, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(1, activation='relu')) # y.shape[1]

# Compile model
model.compile(optimizer='nadam',loss='mean_squared_error', metrics=['accuracy'])

model.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_8 (Dense)             (None, 11)                154       
                                                                 
 dropout_3 (Dropout)         (None, 11)                0         
                                                                 
 dense_9 (Dense)             (None, 1)                 12        
                                                                 
Total params: 166
Trainable params: 166
Non-trainable params: 0
_________________________________________________________________


In [97]:
# Fit model
%time history = model.fit(X_train, y_train, epochs=1000, validation_split=0.7, batch_size=800, verbose=0)


`Model.state_updates` will be removed in a future version. This property should not be used in TensorFlow 2.0, as `updates` are applied automatically.



Wall time: 11.5 s


In [98]:
y_pred = model.predict(X_test)
score = model.evaluate(X_test, y_test)
print(score)

diff = abs(y_pred.flatten() - y_test)
print(diff.std(), diff.mean())
print(y_pred.mean(), y_test.mean())
predictions = pd.DataFrame(data={'predicted': y_pred.flatten(), 'actual': y_test, 'abs_diff': diff})
predictions.sort_values(by='abs_diff', inplace=True)
predictions


`Model.state_updates` will be removed in a future version. This property should not be used in TensorFlow 2.0, as `updates` are applied automatically.



[132.96685863104392, 0.0]
7.83892095075726 8.456841683769227
15.581363 20.661737600289385


Unnamed: 0,predicted,actual,abs_diff
103,6.393223,6.451613,0.058390
55,12.610605,12.522361,0.088244
90,9.326221,9.433962,0.107741
19,9.395297,9.505703,0.110406
10,21.301952,21.420519,0.118566
...,...,...,...
118,16.476734,43.786982,27.310248
28,22.698389,52.023121,29.324732
68,37.856621,4.767580,33.089040
125,8.518131,41.825095,33.306964


In [99]:
px.histogram(x=predictions['abs_diff'], labels={
    'x': 'difference'
})

In [100]:
# Plot training and validation loss
fig = px.line()
fig.add_scatter(y=history.history['loss'], name='Train')
fig.add_scatter(y=history.history['val_loss'], name='Validation')

fig.update_layout(title='Train vs Validation Loss',
                   xaxis_title='Epochs',
                   yaxis_title='Loss')

fig.show()

In [101]:
# compute SHAP values
explainer = shap.DeepExplainer(model, X_train)
shap_values = explainer.shap_values(X_test)

shap.initjs()
shap.force_plot(explainer.expected_value[0], shap_values[0][0], features = X.columns)


Your TensorFlow version is newer than 2.4.0 and so graph support has been removed in eager mode. See PR #1483 for discussion.

