In [91]:
from math import sqrt

import pandas as pd
import numpy as np

import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

import shap
import plotly.express as px

# For shap to work with keras, disable v2 behavior
tf.compat.v1.disable_v2_behavior()

In [31]:
loc = './stats/'

In [32]:
X = pd.read_csv(loc + 'X.csv')
y = pd.read_csv(loc + 'y.csv')

y = y['PLAYER_PTS']

# Apply normalization to input
X = (X - X.mean()) / X.std()

# Generate training and verification data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

print('Training Entries')
X_train.info()
X_train.describe()

print('Testing Entries')
X_test.info()
X_test.describe()

X_train = X_train.to_numpy()
X_test = X_test.to_numpy()
y_train = y_train.to_numpy()
y_test = y_test.to_numpy()

Training Entries
<class 'pandas.core.frame.DataFrame'>
Int64Index: 9608 entries, 9582 to 6098
Data columns (total 47 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   DEF_AGE                  9608 non-null   float64
 1   DEF_GP                   9608 non-null   float64
 2   DEF_D_FGA                9608 non-null   float64
 3   DEF_D_FG_PCT             9608 non-null   float64
 4   DEF_NORMAL_FG_PCT        9608 non-null   float64
 5   DEF_W                    9608 non-null   float64
 6   DEF_L                    9608 non-null   float64
 7   DEF_MIN                  9608 non-null   float64
 8   DEF_STL                  9608 non-null   float64
 9   DEF_BLK                  9608 non-null   float64
 10  DEF_DREB                 9608 non-null   float64
 11  DEF_CONTESTED_SHOTS      9608 non-null   float64
 12  DEF_CONTESTED_SHOTS_3PT  9608 non-null   float64
 13  DEF_CHARGES_DRAWN        9608 non-null   float64
 14  DEF_

# Baseline Model

A basic model that predicts the offensive player's points per 100 possessions will be their average points per 100 possessions

In [33]:
X_base = X.copy()
y_base = y.copy()

X_base = X_base['OFF_PTS'].to_numpy()
y_base = y_base.to_numpy()

rmse = np.sqrt(np.mean((X_base - y_base) ** 2))
mae = mean_absolute_error(y_base, X_base)
print('RMSE: ' + str(rmse))
print('MAE: ' + str(mae))

RMSE: 22.44333668838082
MAE: 19.649724944053894


# Sequential Model

Creates a simple Keras model composed of single stack layers connected sequentially.

In [34]:
# Create model
model = Sequential()
model.add(Dense(int(X.shape[1] / 2), activation='relu', input_shape=(X.shape[1],)))
model.add(Dropout(0.7))
model.add(Dense(1, activation='relu')) # y.shape[1]

# Compile model
model.compile(optimizer='nadam', loss='mean_squared_error', metrics=['accuracy'])

model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_2 (Dense)             (None, 23)                1104      
                                                                 
 dropout_1 (Dropout)         (None, 23)                0         
                                                                 
 dense_3 (Dense)             (None, 1)                 24        
                                                                 
Total params: 1,128
Trainable params: 1,128
Non-trainable params: 0
_________________________________________________________________


In [35]:
X[X.isnull().values]

Unnamed: 0,DEF_AGE,DEF_GP,DEF_D_FGA,DEF_D_FG_PCT,DEF_NORMAL_FG_PCT,DEF_W,DEF_L,DEF_MIN,DEF_STL,DEF_BLK,...,OFF_DRIVE_FG_PCT,OFF_CATCH_SHOOT_PTS,OFF_CATCH_SHOOT_FG_PCT,OFF_PULL_UP_PTS,OFF_PULL_UP_FG_PCT,OFF_PAINT_TOUCH_FG_PCT,OFF_POST_TOUCH_PTS,OFF_POST_TOUCH_FG_PCT,OFF_ELBOW_TOUCH_PTS,OFF_ELBOW_TOUCH_FG_PCT


In [36]:
# Set batch size
bsize = int(X_train.shape[0] * 0.6)

# Fit model
%time history = model.fit(X_train, y_train, epochs=8000, validation_split=0.2, batch_size=bsize, verbose=0)

  updates = self.state_updates


Wall time: 50.6 s


In [40]:
y_pred = model.predict(X_test)

# Evaluate error
score = model.evaluate(X_test, y_test)
mae = mean_absolute_error(y_test, y_pred)
print('RMSE: ' + str(sqrt(score[0])))
print('MAE: ' + str(mae))

diff = abs(y_pred.flatten() - y_test.flatten())
predictions = pd.DataFrame(data={'predicted': y_pred.flatten(), 'actual': y_test.flatten(), 'abs_diff': diff})
predictions.sort_values(by='abs_diff')
# predictions.sort_values(by='predicted', ascending=False)

RMSE: 10.257852147328032
MAE: 7.62516613572777


Unnamed: 0,predicted,actual,abs_diff
920,18.470783,18.466110,0.004673
1315,18.279028,18.263859,0.015169
692,19.586308,19.570566,0.015742
1863,17.340597,17.320574,0.020023
1019,21.190456,21.213339,0.022882
...,...,...,...
788,17.147385,66.126633,48.979248
119,24.996490,75.435657,50.439167
1535,33.778465,85.993372,52.214906
2295,38.987881,103.428454,64.440573


In [38]:
# Plot training and validation loss
fig = px.line()
fig.add_scatter(y=history.history['loss'], name='Train')
fig.add_scatter(y=history.history['val_loss'], name='Validation')

fig.update_layout(title='Train vs Validation Loss',
                   xaxis_title='Epochs',
                   yaxis_title='Loss')

fig.show()

In [80]:
fig = px.histogram(predictions, x='predicted')
fig.show()

In [92]:
# compute SHAP values
explainer = shap.DeepExplainer(model, X_train)
shap_values = explainer.shap_values(X_test)

shap.initjs()
shap.force_plot(explainer.expected_value[0], shap_values[0][0], features = X.columns)

keras is no longer supported, please use tf.keras instead.
Your TensorFlow version is newer than 2.4.0 and so graph support has been removed in eager mode and some static graphs may not be supported. See PR #1483 for discussion.





You have provided over 5k background samples! For better performance consider using smaller random sample.


In [144]:
# Predict matchup
off_id = 201939
def_id = 1630200

# Get offense stats
data = pd.read_csv('./stats/2021-22_0.9_off_stats.csv')
data = data.add_prefix('OFF_')
data.rename(columns={'OFF_SEASON_ID': 'SEASON_ID'}, inplace=True)
inputs_o = data[data['OFF_PLAYER_ID'] == off_id]

# Get defensive stats
def_data = pd.read_csv('./stats/2021-22_def_stats.csv')
def_data = def_data.fillna(0)
def_data = def_data.add_prefix('DEF_')
def_data.rename(columns={'DEF_SEASON_ID': 'SEASON_ID', 'DEF_DEF_RATING': 'DEF_RATING', 
                     'DEF_DEF_BOXOUTS': 'DEF_BOXOUTS'}, inplace=True)
inputs_d = def_data[def_data['DEF_PLAYER_ID'] == def_id]
print(inputs_d['DEF_PLAYER_NAME'])

# Get both inputs together
inputs_o.drop(columns=['OFF_PLAYER_ID', 'OFF_PTS_PER_100'], inplace=True)
inputs_d.drop(columns=['DEF_PLAYER_ID', 'DEF_PLAYER_NAME', 'SEASON_ID'], inplace=True)
inputs_o.drop(columns=['OFF_FGM', 'OFF_FGA', 'OFF_EFF_FG_PCT', 'OFF_OREB', 'OFF_PAINT_TOUCH_FGA', 'OFF_PAINT_TOUCH_FGM', 'OFF_PAINT_TOUCH_TOV', 'OFF_PAINT_TOUCH_PASSES', 'OFF_PAINT_TOUCH_PTS', 'OFF_DREB', 'OFF_FG3A', 'OFF_FG3M', 'OFF_FTM', 'OFF_FTA'], inplace=True)
inputs_d.drop(columns=['DEF_D_FGM', 'DEF_OPP_PTS_PAINT', 'DEF_CONTESTED_SHOTS_2PT', 'DEF_DEFLECTIONS', 'DEF_OPP_PTS_FB', 'DEF_OPP_PTS_2ND_CHANCE', 'DEF_OPP_PTS_OFF_TOV', 'DEF_PCT_PLUSMINUS', 'DEF_G'], inplace=True)

inputs_o = inputs_o.reset_index(drop=True)
inputs_d = inputs_d.reset_index(drop=True)
input = pd.concat([inputs_d, inputs_o], axis=1)
print(input['SEASON_ID'])

X = pd.read_csv('./stats/X.csv')
# X = X[X['SEASON_ID'] == 22021]

input = (input - X.mean()) / X.std()

input = input.fillna(0)

model.predict(input.to_numpy())

537    Tre Jones
Name: DEF_PLAYER_NAME, dtype: object
0    22021
Name: SEASON_ID, dtype: int64


array([[16.808788]], dtype=float32)