In [764]:
import pandas as pd
import requests
import json

import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split
import numpy as np

import shap
import plotly.express as px

from minisom import MiniSom

# For shap to work with keras, disable v2 behavior
tf.compat.v1.disable_v2_behavior()

In [765]:
player = 'scurry'

df = pd.read_csv('./stats/player//' + player + '_combined.csv')
df = df.fillna(0)

print(df.shape)
df.head()

(1329, 27)


Unnamed: 0,SEASON_ID,OFF_PLAYER_ID,OFF_PLAYER_NAME,DEF_PLAYER_ID,DEF_PLAYER_NAME,GP,MATCHUP_MIN,PARTIAL_POSS,PLAYER_PTS,TEAM_PTS,...,MATCHUP_FG3A,MATCHUP_FG3_PCT,HELP_BLK,HELP_FGM,HELP_FGA,HELP_FG_PERC,MATCHUP_FTM,MATCHUP_FTA,SFL,MATCHUP_TIME_SEC
0,22017,201939,Stephen Curry,201566,Russell Westbrook,3,20:43,107.1,13,66,...,8,0.125,0,0,0,0,0,0,0,1242.7
1,22017,201939,Stephen Curry,201952,Jeff Teague,2,13:41,77.4,14,72,...,7,0.286,0,0,0,0,0,1,2,821.3
2,22017,201939,Stephen Curry,203901,Elfrid Payton,2,13:26,69.8,13,77,...,7,0.143,0,0,0,0,0,0,0,806.4
3,22017,201939,Stephen Curry,203915,Spencer Dinwiddie,2,12:26,65.0,8,59,...,4,0.0,0,0,0,0,0,0,2,745.6
4,22017,201939,Stephen Curry,200765,Rajon Rondo,2,11:42,61.0,11,44,...,5,0.2,0,0,0,0,0,0,0,702.5


**Convert totals to per 100 possessions**

Most limited sample size, convert each to per possession and then multiply by 100 (more standard to measure in per 100 possessions rather than per possession, as generally nba teams average [100 possessions per game](https://www.teamrankings.com/nba/stat/possessions-per-game)).

In [929]:
MIN_MATCHUP_MINS = 1
MAX_PP100 = 60

In [930]:
off_df = df.copy()
off_df = off_df[off_df['MATCHUP_TIME_SEC'] > MIN_MATCHUP_MINS * 60] # Must have played more than x minutes
off_df.drop(columns=['TEAM_PTS', 'MATCHUP_FG_PCT', 'MATCHUP_FG3_PCT'], inplace=True)

def per_100_poss(x):
    return x / off_df['PARTIAL_POSS'] * 100

# Set stats to per 100 possessions
off_df = off_df.apply(lambda x: per_100_poss(x) if x.name not in off_df.columns[0:8] else x)

off_df.sort_values('DEF_PLAYER_NAME', ascending=True, inplace=True)

# Remove rows with zeros in important columns
check = off_df[off_df.columns[8:11]] != 0
off_df = off_df[check['PLAYER_PTS'] & check['MATCHUP_AST']]
off_df = off_df[off_df['PLAYER_PTS'] < MAX_PP100]

print(off_df.shape)
off_df.head()

(348, 24)


Unnamed: 0,SEASON_ID,OFF_PLAYER_ID,OFF_PLAYER_NAME,DEF_PLAYER_ID,DEF_PLAYER_NAME,GP,MATCHUP_MIN,PARTIAL_POSS,PLAYER_PTS,MATCHUP_AST,...,MATCHUP_FG3M,MATCHUP_FG3A,HELP_BLK,HELP_FGM,HELP_FGA,HELP_FG_PERC,MATCHUP_FTM,MATCHUP_FTA,SFL,MATCHUP_TIME_SEC
376,22018,201939,Stephen Curry,202329,Al-Farouq Aminu,3,3:15,19.6,45.918367,5.102041,...,5.102041,15.306122,0.0,0.0,0.0,0.0,10.204082,10.204082,0.0,992.857143
1048,22021,201939,Stephen Curry,202692,Alec Burks,1,4:58,26.4,41.666667,7.575758,...,7.575758,26.515152,0.0,0.0,0.0,0.0,3.787879,7.575758,0.0,1129.166667
740,22020,201939,Stephen Curry,1627936,Alex Caruso,2,3:29,23.4,47.008547,4.273504,...,8.547009,8.547009,0.0,0.0,0.0,0.0,12.820513,17.094017,8.547009,891.452991
35,22017,201939,Stephen Curry,1626150,Andrew Harrison,2,4:11,24.8,28.225806,4.032258,...,4.032258,16.129032,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1011.290323
104,22017,201939,Stephen Curry,203952,Andrew Wiggins,2,1:50,10.4,28.846154,28.846154,...,9.615385,19.230769,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1059.615385


In [931]:
som_data = off_df[off_df.columns[7:]].values
som = MiniSom(20, 20, som_data.shape[1], sigma=5, learning_rate=0.5,
              neighborhood_function='gaussian', random_seed=10)

%time som.train(som_data, 20000, random_order=False)  # random training

quantization_errors = np.linalg.norm(som.quantization(som_data) - som_data, axis=1)

error_threshold = np.percentile(quantization_errors, 
                               100*(1-.25))+(0.5*(np.percentile(quantization_errors, 
                               100*(1-.25)) - np.percentile(quantization_errors, 100*(1-.75))))

is_outlier = quantization_errors > error_threshold
off_df['OUTLIER'] = is_outlier
error_threshold

Wall time: 1.53 s


27.74202622333563

In [932]:
px.histogram(x=quantization_errors, labels={
    'x': 'error'
})

In [933]:
fig = px.scatter_3d(off_df, x='PLAYER_PTS', y='MATCHUP_AST', z='MATCHUP_TOV', color='OUTLIER', 
                    labels={
                     'PLAYER_PTS': 'Points',
                     'MATCHUP_AST': 'Assists',
                     'MATCHUP_TOV': 'Turnovers'
                    })
fig.show()

In [934]:
fig = px.scatter_3d(off_df[off_df['OUTLIER'] == False], x='PLAYER_PTS', y='MATCHUP_AST', z='MATCHUP_TOV', 
                    labels={
                     'PLAYER_PTS': 'Points',
                     'MATCHUP_AST': 'Assists',
                     'MATCHUP_TOV': 'Turnovers'
                    })
fig.show()

In [935]:
# Remove outliers
off_df = off_df[off_df['OUTLIER'] == False]
print(off_df.shape)

(298, 25)


**Retrieve Defensive Stats**

Retrieves defensive data for each defending player

In [936]:
# data = pd.read_csv('./stats/' + season + '_def_stats.csv')
data = pd.read_csv('./stats/combined.csv')
data = data.fillna(0)

# Get def stats only from selected defenders (rename def_player_id to player_id to merge arrays)
comp = off_df.rename(columns={'DEF_PLAYER_ID': 'PLAYER_ID'})
def_df = pd.merge(data, comp[['PLAYER_ID', 'SEASON_ID']], how ='inner', on=['PLAYER_ID', 'SEASON_ID'])

print(def_df.shape)
print(def_df.columns)
def_df.sort_values('PLAYER_NAME', ascending=True, inplace=True)
def_df.head()

(298, 33)
Index(['PLAYER_ID', 'PLAYER_NAME', 'AGE', 'GP', 'G', 'D_FGM', 'D_FGA',
       'D_FG_PCT', 'NORMAL_FG_PCT', 'PCT_PLUSMINUS', 'W', 'L', 'MIN', 'STL',
       'BLK', 'DREB', 'CONTESTED_SHOTS', 'CONTESTED_SHOTS_2PT',
       'CONTESTED_SHOTS_3PT', 'DEFLECTIONS', 'CHARGES_DRAWN', 'DEF_BOXOUTS',
       'PCT_BOX_OUTS_REB', 'SEASON_ID', 'DEF_RATING', 'DREB_PCT', 'PCT_STL',
       'PCT_BLK', 'OPP_PTS_OFF_TOV', 'OPP_PTS_2ND_CHANCE', 'OPP_PTS_FB',
       'OPP_PTS_PAINT', 'DEF_WS'],
      dtype='object')


Unnamed: 0,PLAYER_ID,PLAYER_NAME,AGE,GP,G,D_FGM,D_FGA,D_FG_PCT,NORMAL_FG_PCT,PCT_PLUSMINUS,...,SEASON_ID,DEF_RATING,DREB_PCT,PCT_STL,PCT_BLK,OPP_PTS_OFF_TOV,OPP_PTS_2ND_CHANCE,OPP_PTS_FB,OPP_PTS_PAINT,DEF_WS
66,202329,Al-Farouq Aminu,28.0,81,81,5.1,11.05,0.461,0.466,-0.005,...,22018,106.5,0.232,0.29,0.396,9.7,8.0,8.5,25.3,0.092
240,202692,Alec Burks,30.0,50,50,4.66,10.44,0.446,0.452,-0.005,...,22021,104.0,0.087,0.196,0.058,7.9,6.4,6.9,23.2,0.122
164,1627936,Alex Caruso,27.0,58,58,3.36,7.64,0.44,0.456,-0.016,...,22020,104.6,0.14,0.289,0.262,6.8,5.3,5.3,18.6,0.094
0,1626150,Andrew Harrison,23.0,56,56,3.66,8.57,0.427,0.446,-0.019,...,22017,103.7,0.15,0.148,0.529,5.7,5.0,4.0,17.2,0.072
1,203952,Andrew Wiggins,23.0,82,82,6.21,13.4,0.463,0.456,0.007,...,22017,107.6,0.082,0.052,0.122,4.9,3.6,3.4,12.2,0.034


**Machine Learning Implementation**

Utilizes the keras Model from Tensorflow to predict offensive player's stats.
If there is an error, make sure to first install tensorflow. This can be done through Anaconda, or through the command 

```
!pip install tensorflow
```

In [937]:
# Clean data to use on model
X = def_df[def_df.columns[2:]].drop(columns=['GP', 'G', 'W', 'L', 'MIN', 'D_FGM', 'D_FGA', 'NORMAL_FG_PCT', 'AGE']) #  'SEASON_ID'
# X = def_df[['SEASON_ID', 'PCT_PLUSMINUS', 'DEF_WS', 'BLK', 'CONTESTED_SHOTS_2PT']]
y = off_df[off_df.columns[8:11]]
y = off_df[off_df.columns[8]]
print(X.shape, y.shape)

# Apply normalization to input
X = (X - X.mean()) / X.std()

# Generate training and verification data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

print('Training Entries')
X_train.info()
X_train.describe()

print('Testing Entries')
X_test.info()
X_test.describe()

X_train = X_train.to_numpy()
X_test = X_test.to_numpy()
y_train = y_train.to_numpy()
y_test = y_test.to_numpy()

(298, 22) (298,)
Training Entries
<class 'pandas.core.frame.DataFrame'>
Int64Index: 238 entries, 73 to 146
Data columns (total 22 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   D_FG_PCT             238 non-null    float64
 1   PCT_PLUSMINUS        238 non-null    float64
 2   STL                  238 non-null    float64
 3   BLK                  238 non-null    float64
 4   DREB                 238 non-null    float64
 5   CONTESTED_SHOTS      238 non-null    float64
 6   CONTESTED_SHOTS_2PT  238 non-null    float64
 7   CONTESTED_SHOTS_3PT  238 non-null    float64
 8   DEFLECTIONS          238 non-null    float64
 9   CHARGES_DRAWN        238 non-null    float64
 10  DEF_BOXOUTS          238 non-null    float64
 11  PCT_BOX_OUTS_REB     238 non-null    float64
 12  SEASON_ID            238 non-null    float64
 13  DEF_RATING           238 non-null    float64
 14  DREB_PCT             238 non-null    float64
 15  PCT_S

**Keras Model Philosophy**

1. Normalize inputs
2. Dropout layer to prevent overfitting
3. Dense layer

In [1001]:
from tensorflow.keras.layers.experimental.preprocessing import Normalization
from tensorflow.keras import layers, Sequential
from tensorflow.keras.layers import Dense, Dropout

# Create model
model = Sequential()
model.add(Dropout(0.2, input_shape=(X.shape[1],)))
model.add(Dense(11, activation='relu'))
# model.add(Dense(20, activation='relu'))
model.add(Dropout(0.7))
model.add(Dense(1, activation='relu')) # y.shape[1]

# Compile model
model.compile(optimizer='nadam',loss='mean_squared_error', metrics=['accuracy'])

model.summary()

Model: "sequential_98"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dropout_171 (Dropout)       (None, 22)                0         
                                                                 
 dense_270 (Dense)           (None, 11)                253       
                                                                 
 dropout_172 (Dropout)       (None, 11)                0         
                                                                 
 dense_271 (Dense)           (None, 1)                 12        
                                                                 
Total params: 265
Trainable params: 265
Non-trainable params: 0
_________________________________________________________________


In [1002]:
# Fit model
%time history = model.fit(X_train, y_train, epochs=1000, validation_split=0.7, batch_size=10, verbose=0)


`Model.state_updates` will be removed in a future version. This property should not be used in TensorFlow 2.0, as `updates` are applied automatically.



Wall time: 28.3 s


In [1003]:
y_pred = model.predict(X_test)
score = model.evaluate(X_test, y_test)
print(score)

diff = abs(y_pred.flatten() - y_test)
print(diff.std(), diff.mean())
print(y_pred.mean(), y_test.mean())
predictions = pd.DataFrame(data={'predicted': y_pred.flatten(), 'actual': y_test, 'abs_diff': diff})
predictions.sort_values(by='abs_diff')


`Model.state_updates` will be removed in a future version. This property should not be used in TensorFlow 2.0, as `updates` are applied automatically.



[242.50684916178386, 0.0]
8.342856164587186 13.149281370458532
22.930265 30.473594427600098


Unnamed: 0,predicted,actual,abs_diff
52,26.414686,26.737968,0.323282
21,23.026365,23.391813,0.365448
17,24.108477,25.157233,1.048756
28,20.179325,18.957346,1.221979
32,21.637701,20.408163,1.229538
33,18.564598,20.576132,2.011534
45,26.083452,28.571429,2.487976
55,24.897991,22.38806,2.509931
25,26.939632,22.69289,4.246743
15,16.017157,21.428571,5.411415


In [1004]:
px.histogram(x=predictions['abs_diff'], labels={
    'x': 'difference'
})

In [1005]:
# Plot training and validation loss
fig = px.line()
fig.add_scatter(y=history.history['loss'], name='Train')
fig.add_scatter(y=history.history['val_loss'], name='Validation')

fig.update_layout(title='Train vs Validation Loss',
                   xaxis_title='Epochs',
                   yaxis_title='Loss')

fig.show()

In [1006]:
# compute SHAP values
explainer = shap.DeepExplainer(model, X_train)
shap_values = explainer.shap_values(X_test)

shap.initjs()
shap.force_plot(explainer.expected_value[0], shap_values[0][0], features = X.columns)


Your TensorFlow version is newer than 2.4.0 and so graph support has been removed in eager mode and some static graphs may not be supported. See PR #1483 for discussion.

