In [361]:
from math import sqrt

import pandas as pd
import numpy as np

import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

import shap
import plotly.express as px

# For shap to work with keras, disable v2 behavior
tf.compat.v1.disable_v2_behavior()

In [362]:
loc = './stats/'

In [363]:
X = pd.read_csv(loc + 'X.csv')
y = pd.read_csv(loc + 'y.csv')

y = y['PLAYER_PPG']

# Apply normalization to input
X = (X - X.mean()) / X.std()

# Baseline Model

A basic model that predicts the offensive player's points per 100 possessions will be their average points per 100 possessions

In [364]:
X_base = X.copy()
y_base = y.copy()

X_base = X_base['OFF_PTS'].to_numpy()
y_base = y_base.to_numpy()

rmse = np.sqrt(np.mean((X_base - y_base) ** 2))
mae = mean_absolute_error(y_base, X_base)
print('RMSE: ' + str(rmse))
print('MAE: ' + str(mae))

RMSE: 19.428045942801486
MAE: 15.227051267591593


# Sequential Model

Creates a simple Keras model composed of single stack layers connected sequentially.

In [365]:
# Remove points from input dataset
X_seq = X.copy()
X_seq.drop(columns=['OFF_POST_TOUCH_PTS', 'OFF_POST_TOUCH_FG_PCT', 'OFF_PTS'], inplace=True)

# Sort columns alphabetically
X_seq = X_seq.sort_index(axis=1)

# Generate training and verification data
X_train, X_test, y_train, y_test = train_test_split(X_seq, y, test_size=0.2)

print('Training Entries')
X_train.info()
X_train.describe()

print('Testing Entries')
X_test.info()
X_test.describe()

X_train = X_train.to_numpy()
X_test = X_test.to_numpy()
y_train = y_train.to_numpy()
y_test = y_test.to_numpy()

Training Entries
<class 'pandas.core.frame.DataFrame'>
Int64Index: 78444 entries, 55479 to 8607
Data columns (total 51 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   DEF_AGE                   78444 non-null  float64
 1   DEF_BLK                   78444 non-null  float64
 2   DEF_BOXOUTS               78444 non-null  float64
 3   DEF_CHARGES_DRAWN         78444 non-null  float64
 4   DEF_CONTESTED_SHOTS       78444 non-null  float64
 5   DEF_CONTESTED_SHOTS_3PT   78444 non-null  float64
 6   DEF_DEF_WS                78444 non-null  float64
 7   DEF_DREB                  78444 non-null  float64
 8   DEF_DREB_PCT              78444 non-null  float64
 9   DEF_D_FGA                 78444 non-null  float64
 10  DEF_D_FG_PCT              78444 non-null  float64
 11  DEF_GP                    78444 non-null  float64
 12  DEF_L                     78444 non-null  float64
 13  DEF_MIN                   78444 non-null 

In [366]:
# Create model
model = Sequential()
model.add(Dense(int(X_seq.shape[1] / 2), activation='relu', input_shape=(X_seq.shape[1],)))
model.add(Dropout(0.7))
model.add(Dense(1, activation='relu')) # y.shape[1]

# Compile model
model.compile(optimizer='nadam', loss='mean_squared_error', metrics=['accuracy'])

model.summary()

Model: "sequential_14"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_28 (Dense)            (None, 25)                1300      
                                                                 
 dropout_14 (Dropout)        (None, 25)                0         
                                                                 
 dense_29 (Dense)            (None, 1)                 26        
                                                                 
Total params: 1,326
Trainable params: 1,326
Non-trainable params: 0
_________________________________________________________________


In [367]:
# Set batch size
bsize = int(X_train.shape[0] * 0.6)

# Fit model
%time history = model.fit(X_train, y_train, epochs=10000, validation_split=0.2, batch_size=bsize, verbose=0)

`Model.state_updates` will be removed in a future version. This property should not be used in TensorFlow 2.0, as `updates` are applied automatically.


In [None]:
y_pred = model.predict(X_test)

# Evaluate error
score = model.evaluate(X_test, y_test)
mae = mean_absolute_error(y_test, y_pred)
print('RMSE: ' + str(sqrt(score[0])))
print('MAE: ' + str(mae))

diff = abs(y_pred.flatten() - y_test.flatten())
predictions = pd.DataFrame(data={'predicted': y_pred.flatten(), 'actual': y_test.flatten(), 'abs_diff': diff})
# predictions.sort_values(by='abs_diff')
predictions.sort_values(by='abs_diff', ascending=False)

`Model.state_updates` will be removed in a future version. This property should not be used in TensorFlow 2.0, as `updates` are applied automatically.


RMSE: 10.102969860181902
MAE: 6.987576292986406


Unnamed: 0,predicted,actual,abs_diff
12840,39.335922,154.605519,115.269597
73,33.288517,143.478722,110.190205
5203,53.913010,144.212524,90.299514
5959,26.672016,113.039059,86.367043
13758,29.013069,105.732074,76.719005
...,...,...,...
4102,9.247110,9.245149,0.001962
1645,16.418240,16.416397,0.001843
11897,8.694698,8.693492,0.001206
12150,6.821604,6.822798,0.001194


In [None]:
# Plot training and validation loss
fig = px.line()
fig.add_scatter(y=history.history['loss'], name='Train')
fig.add_scatter(y=history.history['val_loss'], name='Validation')

fig.update_layout(title='Train vs Validation Loss',
                   xaxis_title='Epochs',
                   yaxis_title='Loss')

fig.show()

In [None]:
fig = px.histogram(predictions, x='predicted')
fig.show()

12.64915979608916


In [None]:
fig = px.histogram(predictions, x='actual')
fig.show()

In [None]:
# compute SHAP values
explainer = shap.DeepExplainer(model, X_train[0:1000])
shap_values = explainer.shap_values(X_test)

shap.initjs()
shap.force_plot(explainer.expected_value[0], shap_values[0][0], features = X_seq.columns)

Your TensorFlow version is newer than 2.4.0 and so graph support has been removed in eager mode and some static graphs may not be supported. See PR #1483 for discussion.


In [None]:
# Predict matchup
off_id = 201142
def_id = 201939

# Get offense stats
data = pd.read_csv('./stats/2021-22_off_stats.csv')
data = data.add_prefix('OFF_')
data.rename(columns={'OFF_SEASON_ID': 'SEASON_ID'}, inplace=True)
inputs_o = data[data['OFF_PLAYER_ID'] == off_id]

# Get defensive stats
def_data = pd.read_csv('./stats/2021-22_def_stats.csv')
def_data = def_data.fillna(0)
def_data = def_data.add_prefix('DEF_')
def_data.rename(columns={'DEF_SEASON_ID': 'SEASON_ID', 'DEF_DEF_RATING': 'DEF_RATING', 
                     'DEF_DEF_BOXOUTS': 'DEF_BOXOUTS'}, inplace=True)
inputs_d = def_data[def_data['DEF_PLAYER_ID'] == def_id]
print(inputs_d['DEF_PLAYER_NAME'])

# Get both inputs together
inputs_o.drop(columns=['OFF_PLAYER_ID', 'OFF_PTS_PER_100'], inplace=True)
inputs_d.drop(columns=['DEF_PLAYER_ID', 'DEF_PLAYER_NAME', 'SEASON_ID'], inplace=True)
inputs_o.drop(columns=['OFF_POST_TOUCH_PTS', 'OFF_POST_TOUCH_FG_PCT', 'OFF_FGM', 'OFF_PTS', 'OFF_EFF_FG_PCT', 'OFF_OREB', 'OFF_PAINT_TOUCH_FGA', 'OFF_PAINT_TOUCH_FGM', 'OFF_PAINT_TOUCH_TOV', 'OFF_PAINT_TOUCH_PASSES', 'OFF_PAINT_TOUCH_PTS', 'OFF_DREB', 'OFF_FG3M', 'OFF_FTM'], inplace=True)
inputs_d.drop(columns=['DEF_D_FGM', 'DEF_OPP_PTS_PAINT', 'DEF_CONTESTED_SHOTS_2PT', 'DEF_DEFLECTIONS', 'DEF_OPP_PTS_FB', 'DEF_OPP_PTS_2ND_CHANCE', 'DEF_OPP_PTS_OFF_TOV', 'DEF_PCT_PLUSMINUS', 'DEF_G'], inplace=True)

inputs_o = inputs_o.reset_index(drop=True)
inputs_d = inputs_d.reset_index(drop=True)
X_predict = pd.concat([inputs_d, inputs_o], axis=1)

# Normalize inputs
X_seqtest = pd.read_csv('./stats/X.csv')
X_seqtest.drop(columns=['OFF_POST_TOUCH_PTS', 'OFF_POST_TOUCH_FG_PCT', 'OFF_PTS'], inplace=True)
X_seqtest = X_seqtest[X_seqtest['SEASON_ID'] == 22021]

X_predict = (X_predict - X_seqtest.mean()) / X_seqtest.std()

X_predict = X_predict.fillna(0)

X_predict = X_predict.sort_index(axis=1)

model.predict(X_predict.to_numpy())

508    Stephen Curry
Name: DEF_PLAYER_NAME, dtype: object



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


array([[25.49113]], dtype=float32)

In [None]:
test = pd.read_csv('./stats/y.csv')
test = test[test['OFF_PLAYER_ID'] == 201939]
test['PLAYER_PTS'].mean()

41.780257704344145