In [1]:
from math import sqrt

import pandas as pd
import numpy as np

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

import shap
import plotly.express as px

# For shap to work with keras, disable v2 behavior
tf.compat.v1.disable_v2_behavior()

Instructions for updating:
non-resource variables are not supported in the long term


In [2]:
season = '2021-22'
percentile = 0.2

df = pd.read_csv('./stats/' + season + '_' + str(percentile) + '_h2h_stats.csv')
# df = pd.read_csv('./stats/h2h_combined.csv')
df = df.fillna(0)

print(df.shape)
df.head()

(115711, 27)


Unnamed: 0,SEASON_ID,OFF_PLAYER_ID,OFF_PLAYER_NAME,DEF_PLAYER_ID,DEF_PLAYER_NAME,GP,MATCHUP_MIN,PARTIAL_POSS,PLAYER_PTS,TEAM_PTS,...,MATCHUP_FG3A,MATCHUP_FG3_PCT,HELP_BLK,HELP_FGM,HELP_FGA,HELP_FG_PERC,MATCHUP_FTM,MATCHUP_FTA,SFL,MATCHUP_TIME_SEC
0,22021,203932,Aaron Gordon,203084,Harrison Barnes,3,29:17,141.2,25,157,...,5,0.6,0,0,0,0,2,2,1,1757.0
1,22021,203932,Aaron Gordon,1628404,Josh Hart,3,21:29,99.2,22,92,...,4,0.25,0,0,0,0,1,2,1,1289.1
2,22021,203932,Aaron Gordon,202711,Bojan Bogdanovic,3,17:12,92.0,16,93,...,4,0.25,0,0,0,0,1,2,0,1031.6
3,22021,203932,Aaron Gordon,1628991,Jaren Jackson Jr.,3,14:31,68.4,11,67,...,3,0.333,0,0,0,0,4,4,2,870.7
4,22021,203932,Aaron Gordon,1630532,Franz Wagner,2,14:09,67.8,5,66,...,2,0.5,0,0,0,0,0,0,0,849.4


# Convert totals to per 100 possessions

Most limited sample size, convert each to per possession and then multiply by 100 (more standard to measure in per 100 possessions rather than per possession, as generally nba teams average [100 possessions per game](https://www.teamrankings.com/nba/stat/possessions-per-game)).

In [3]:
MIN_MATCHUP_MINS = 8

In [4]:
h2h_df = df.copy()

# Set minutes threshold and drop unnecessary columns
h2h_df = h2h_df[h2h_df['MATCHUP_TIME_SEC'] > MIN_MATCHUP_MINS * 60] # Must have played more than x minutes
h2h_df.drop(columns=['TEAM_PTS', 'MATCHUP_FG_PCT', 'MATCHUP_FG3_PCT'], inplace=True)

def per_100_poss(x):
    return x / h2h_df['PARTIAL_POSS'] * 100

# Set stats to per 100 possessions
h2h_df = h2h_df.apply(lambda x: per_100_poss(x) if x.name not in h2h_df.columns[0:8] else x)

h2h_df.sort_values('DEF_PLAYER_NAME', ascending=True, inplace=True)

# Remove rows with zeros in important columns
check = h2h_df[h2h_df.columns[8:11]] != 0
h2h_df = h2h_df[check['PLAYER_PTS']]

print(h2h_df.shape)
h2h_df.sort_values(by=['PLAYER_PTS'], ascending=False).head()

(2547, 24)


Unnamed: 0,SEASON_ID,OFF_PLAYER_ID,OFF_PLAYER_NAME,DEF_PLAYER_ID,DEF_PLAYER_NAME,GP,MATCHUP_MIN,PARTIAL_POSS,PLAYER_PTS,MATCHUP_AST,...,MATCHUP_FG3M,MATCHUP_FG3A,HELP_BLK,HELP_FGM,HELP_FGA,HELP_FG_PERC,MATCHUP_FTM,MATCHUP_FTA,SFL,MATCHUP_TIME_SEC
76082,22021,1629029,Luka Doncic,1627826,Ivica Zubac,3,11:22,54.3,112.338858,20.257827,...,20.257827,36.832413,0.0,0.0,0.0,0.0,3.683241,5.524862,1.841621,1255.248619
57150,22021,203954,Joel Embiid,1628964,Mo Bamba,3,9:38,47.9,66.805846,6.263048,...,0.0,2.087683,0.0,0.0,0.0,0.0,20.876827,25.052192,14.613779,1205.636743
58056,22021,202685,Jonas Valanciunas,1627826,Ivica Zubac,3,15:07,72.6,66.115702,2.754821,...,13.774105,15.151515,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1249.035813
54436,22021,1628369,Jayson Tatum,203992,Bogdan Bogdanovic,4,11:25,55.1,65.335753,9.07441,...,10.889292,21.778584,0.0,0.0,0.0,0.0,3.629764,3.629764,0.0,1243.194192
57142,22021,203954,Joel Embiid,1629655,Daniel Gafford,3,13:10,67.1,62.593145,4.470939,...,2.980626,8.941878,0.0,0.0,0.0,0.0,14.90313,14.90313,10.432191,1177.645306


# Convert to projected points per game

Convert per 100 possessions numbers to projected points per game by taking average possessions and multiplying by points per single possession.

In [5]:
# Get offensive data
data = pd.read_csv('./stats/' + season + '_' + str(percentile) + '_off_stats.csv')
data = data.fillna(0)

# Merge average points per game and points per 100 possessions into h2h dataframe
data = data.add_prefix('OFF_')
data.rename(columns={'OFF_SEASON_ID': 'SEASON_ID'}, inplace=True)
data = data[['OFF_PLAYER_ID', 'SEASON_ID', 'OFF_PTS', 'OFF_PTS_PER_100']]
data = pd.merge(data, h2h_df[['OFF_PLAYER_ID', 'SEASON_ID', 'DEF_PLAYER_ID']], how ='inner', on=['OFF_PLAYER_ID', 'SEASON_ID'])
data = pd.merge(h2h_df, data, how='inner', on=['OFF_PLAYER_ID', 'SEASON_ID', 'DEF_PLAYER_ID'])

# Get average possessions per game
poss = data['OFF_PTS'] / (data['OFF_PTS_PER_100'] / 100)

# Convert per 100 possessions to points per game
data['PLAYER_PTS'] = data['PLAYER_PTS'] / 100 * poss

# Set h2h dataframe
h2h_df = data

data.sort_values(by=['PLAYER_PTS'], ascending=False).head()

Unnamed: 0,SEASON_ID,OFF_PLAYER_ID,OFF_PLAYER_NAME,DEF_PLAYER_ID,DEF_PLAYER_NAME,GP,MATCHUP_MIN,PARTIAL_POSS,PLAYER_PTS,MATCHUP_AST,...,HELP_BLK,HELP_FGM,HELP_FGA,HELP_FG_PERC,MATCHUP_FTM,MATCHUP_FTA,SFL,MATCHUP_TIME_SEC,OFF_PTS,OFF_PTS_PER_100
1024,22021,1629029,Luka Doncic,1627826,Ivica Zubac,3,11:22,54.3,80.283297,20.257827,...,0.0,0.0,0.0,0.0,3.683241,5.524862,1.841621,1255.248619,27.8,38.9
193,22021,1628369,Jayson Tatum,203992,Bogdan Bogdanovic,4,11:25,55.1,48.122651,9.07441,...,0.0,0.0,0.0,0.0,3.629764,3.629764,0.0,1243.194192,26.0,35.3
1892,22021,203954,Joel Embiid,1628964,Mo Bamba,3,9:38,47.9,44.53723,6.263048,...,0.0,0.0,0.0,0.0,20.876827,25.052192,14.613779,1205.636743,29.6,44.4
1067,22021,2544,LeBron James,203109,Jae Crowder,2,8:22,40.1,42.360586,0.0,...,0.0,0.0,0.0,0.0,4.987531,4.987531,2.493766,1251.122195,28.8,37.3
599,22021,201142,Kevin Durant,1627884,Derrick Jones Jr.,3,12:12,64.0,41.946171,3.125,...,0.0,0.0,0.0,0.0,10.9375,10.9375,4.6875,1144.375,29.3,38.2


In [8]:
# Points per game estimated
fig = px.histogram(x=data['PLAYER_PTS'], labels={
    'x': 'Estimated points per game versus opponent'
})

print('Mean: ' + str(data['PLAYER_PTS'].mean()))
print('Standard deviation: ' + str(data['PLAYER_PTS'].std()))

fig.show()

Mean: 12.229234257058865
Standard deviation: 7.720081048660234


In [10]:
# Points per game averages
fig = px.histogram(x=data['OFF_PTS'], labels={
    'x': 'Points per game average'
})

print('Mean: ' + str(data['OFF_PTS'].mean()))
print('Standard deviation: ' + str(data['OFF_PTS'].std()))

fig.show()

Mean: 15.335963879073418
Standard deviation: 6.297156608080484


# Retrieve Offensive and Defensive Stats

Retrieves defensive data for each defending player

In [11]:
data = pd.read_csv('./stats/'+ season + '_def_stats.csv')
# data = pd.read_csv('./stats/def_combined.csv')
data = data.fillna(0)

# Get def stats only from selected defenders (rename player_id to def_player_id to merge arrays)
data = data.add_prefix('DEF_')
data.rename(columns={'DEF_SEASON_ID': 'SEASON_ID', 'DEF_DEF_RATING': 'DEF_RATING', 
                     'DEF_DEF_BOXOUTS': 'DEF_BOXOUTS'}, inplace=True)
def_df = pd.merge(data, h2h_df[['DEF_PLAYER_ID', 'SEASON_ID']], how ='inner', on=['DEF_PLAYER_ID', 'SEASON_ID'])
def_df.drop(columns=['DEF_GP', 'DEF_G', 'DEF_D_FG_PCT', 'DEF_DREB_PCT', 'DEF_PCT_STL', 'DEF_PCT_BLK'], inplace=True)

# Add offensive player (helps merging offensive stats)
print(def_df.shape[0], h2h_df.shape[0])
def_df['OFF_PLAYER_ID'] = h2h_df['OFF_PLAYER_ID'].to_numpy()
def_df['OFF_PLAYER_NAME'] = h2h_df['OFF_PLAYER_NAME'].to_numpy()

print(def_df.columns)
def_df.head()

2547 2547
Index(['DEF_PLAYER_ID', 'DEF_PLAYER_NAME', 'DEF_AGE', 'DEF_D_FGM', 'DEF_D_FGA',
       'DEF_NORMAL_FG_PCT', 'DEF_PCT_PLUSMINUS', 'DEF_W', 'DEF_L', 'DEF_MIN',
       'DEF_STL', 'DEF_BLK', 'DEF_DREB', 'DEF_CONTESTED_SHOTS',
       'DEF_CONTESTED_SHOTS_2PT', 'DEF_CONTESTED_SHOTS_3PT', 'DEF_DEFLECTIONS',
       'DEF_CHARGES_DRAWN', 'DEF_BOXOUTS', 'DEF_PCT_BOX_OUTS_REB', 'SEASON_ID',
       'DEF_RATING', 'DEF_OPP_PTS_OFF_TOV', 'DEF_OPP_PTS_2ND_CHANCE',
       'DEF_OPP_PTS_FB', 'DEF_OPP_PTS_PAINT', 'DEF_DEF_WS', 'OFF_PLAYER_ID',
       'OFF_PLAYER_NAME'],
      dtype='object')


Unnamed: 0,DEF_PLAYER_ID,DEF_PLAYER_NAME,DEF_AGE,DEF_D_FGM,DEF_D_FGA,DEF_NORMAL_FG_PCT,DEF_PCT_PLUSMINUS,DEF_W,DEF_L,DEF_MIN,...,DEF_PCT_BOX_OUTS_REB,SEASON_ID,DEF_RATING,DEF_OPP_PTS_OFF_TOV,DEF_OPP_PTS_2ND_CHANCE,DEF_OPP_PTS_FB,DEF_OPP_PTS_PAINT,DEF_DEF_WS,OFF_PLAYER_ID,OFF_PLAYER_NAME
0,203932,Aaron Gordon,26.0,4.91,11.23,0.462,-0.024,28.0,16.0,22.9,...,0.176,22021,105.5,7.9,5.7,6.0,21.2,0.091,202711,Bojan Bogdanovic
1,203932,Aaron Gordon,26.0,4.91,11.23,0.462,-0.024,28.0,16.0,22.9,...,0.176,22021,105.5,7.9,5.7,6.0,21.2,0.091,1629027,Trae Young
2,203932,Aaron Gordon,26.0,4.91,11.23,0.462,-0.024,28.0,16.0,22.9,...,0.176,22021,105.5,7.9,5.7,6.0,21.2,0.091,1627742,Brandon Ingram
3,203932,Aaron Gordon,26.0,4.91,11.23,0.462,-0.024,28.0,16.0,22.9,...,0.176,22021,105.5,7.9,5.7,6.0,21.2,0.091,1628983,Shai Gilgeous-Alexander
4,203932,Aaron Gordon,26.0,4.91,11.23,0.462,-0.024,28.0,16.0,22.9,...,0.176,22021,105.5,7.9,5.7,6.0,21.2,0.091,202704,Reggie Jackson


In [12]:
data = pd.read_csv('./stats/' + season + '_' + str(percentile) + '_off_stats.csv')
# data = pd.read_csv('./stats/off_combined.csv')
data = data.fillna(0)

# Get off stats only from selected offensive players
data = data.add_prefix('OFF_')
data.rename(columns={'OFF_SEASON_ID': 'SEASON_ID'}, inplace=True)
combine = pd.merge(def_df, data, how='inner', on=['OFF_PLAYER_ID', 'SEASON_ID'])

# Get correct sort
combine.sort_values(['DEF_PLAYER_ID', 'OFF_PLAYER_ID'], inplace=True)
print(combine.shape)
print(combine.columns)
combine.head()

(2547, 69)
Index(['DEF_PLAYER_ID', 'DEF_PLAYER_NAME', 'DEF_AGE', 'DEF_D_FGM', 'DEF_D_FGA',
       'DEF_NORMAL_FG_PCT', 'DEF_PCT_PLUSMINUS', 'DEF_W', 'DEF_L', 'DEF_MIN',
       'DEF_STL', 'DEF_BLK', 'DEF_DREB', 'DEF_CONTESTED_SHOTS',
       'DEF_CONTESTED_SHOTS_2PT', 'DEF_CONTESTED_SHOTS_3PT', 'DEF_DEFLECTIONS',
       'DEF_CHARGES_DRAWN', 'DEF_BOXOUTS', 'DEF_PCT_BOX_OUTS_REB', 'SEASON_ID',
       'DEF_RATING', 'DEF_OPP_PTS_OFF_TOV', 'DEF_OPP_PTS_2ND_CHANCE',
       'DEF_OPP_PTS_FB', 'DEF_OPP_PTS_PAINT', 'DEF_DEF_WS', 'OFF_PLAYER_ID',
       'OFF_PLAYER_NAME', 'OFF_AGE', 'OFF_GP', 'OFF_MIN', 'OFF_FGM', 'OFF_FGA',
       'OFF_FG_PCT', 'OFF_FG3M', 'OFF_FG3A', 'OFF_FG3_PCT', 'OFF_FTM',
       'OFF_FTA', 'OFF_FT_PCT', 'OFF_OREB', 'OFF_DREB', 'OFF_REB', 'OFF_AST',
       'OFF_TOV', 'OFF_BLKA', 'OFF_PF', 'OFF_PTS', 'OFF_TOUCHES',
       'OFF_PAINT_TOUCHES', 'OFF_PAINT_TOUCH_FGM', 'OFF_PAINT_TOUCH_FGA',
       'OFF_PAINT_TOUCH_PASSES', 'OFF_PAINT_TOUCH_TOV', 'OFF_DRIVE_PTS',
       'OFF_DRIVE_

Unnamed: 0,DEF_PLAYER_ID,DEF_PLAYER_NAME,DEF_AGE,DEF_D_FGM,DEF_D_FGA,DEF_NORMAL_FG_PCT,DEF_PCT_PLUSMINUS,DEF_W,DEF_L,DEF_MIN,...,OFF_PULL_UP_PTS,OFF_PULL_UP_FG_PCT,OFF_PAINT_TOUCH_PTS,OFF_PAINT_TOUCH_FG_PCT,OFF_POST_TOUCH_PTS,OFF_POST_TOUCH_FG_PCT,OFF_ELBOW_TOUCH_PTS,OFF_ELBOW_TOUCH_FG_PCT,OFF_EFF_FG_PCT,OFF_PTS_PER_100
586,2544,LeBron James,37.0,5.08,12.64,0.461,-0.058,32.0,7.0,20.4,...,5.8,0.425,0.4,0.583,1.9,0.425,0.8,0.429,0.516,26.4
2011,2544,LeBron James,37.0,5.08,12.64,0.461,-0.058,32.0,7.0,20.4,...,1.1,0.346,1.4,0.523,0.0,0.5,0.8,0.704,0.473,15.9
1144,2544,LeBron James,37.0,5.08,12.64,0.461,-0.058,32.0,7.0,20.4,...,3.3,0.373,2.5,0.634,0.8,0.452,0.5,0.824,0.556,26.6
2237,2544,LeBron James,37.0,5.08,12.64,0.461,-0.058,32.0,7.0,20.4,...,2.2,0.394,5.9,0.651,2.7,0.467,1.9,0.535,0.499,32.0
208,2544,LeBron James,37.0,5.08,12.64,0.461,-0.058,32.0,7.0,20.4,...,1.7,0.333,5.4,0.714,1.5,0.533,1.7,0.534,0.56,26.8


In [13]:
# Sort head to head stats by same sort
h2h_df.sort_values(['DEF_PLAYER_ID', 'OFF_PLAYER_ID'], inplace=True)
print(h2h_df.shape)

h2h_df.head()

(2547, 26)


Unnamed: 0,SEASON_ID,OFF_PLAYER_ID,OFF_PLAYER_NAME,DEF_PLAYER_ID,DEF_PLAYER_NAME,GP,MATCHUP_MIN,PARTIAL_POSS,PLAYER_PTS,MATCHUP_AST,...,HELP_BLK,HELP_FGM,HELP_FGA,HELP_FG_PERC,MATCHUP_FTM,MATCHUP_FTA,SFL,MATCHUP_TIME_SEC,OFF_PTS,OFF_PTS_PER_100
1673,22021,202694,Marcus Morris Sr.,2544,LeBron James,3,11:55,56.6,10.707785,0.0,...,0.0,0.0,0.0,0.0,3.533569,3.533569,0.0,1263.250883,16.0,26.4
1677,22021,203937,Kyle Anderson,2544,LeBron James,4,10:23,60.2,3.134207,3.322259,...,0.0,0.0,0.0,0.0,3.322259,3.322259,1.66113,1034.219269,7.5,15.9
1674,22021,203952,Andrew Wiggins,2544,LeBron James,2,8:06,47.0,6.99888,4.255319,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1032.978723,17.5,26.6
1676,22021,204001,Kristaps Porzingis,2544,LeBron James,1,8:07,34.8,13.793103,2.873563,...,0.0,0.0,0.0,0.0,5.747126,5.747126,2.873563,1398.275862,19.2,32.0
1675,22021,1626174,Christian Wood,2544,LeBron James,3,14:16,76.2,20.801504,5.249344,...,0.0,0.0,0.0,0.0,2.624672,2.624672,1.312336,1123.35958,17.7,26.8


# Keras Model Implementation

Utilizes the keras Model from TensorFlow to predict offensive player's stats.
If there is an error, make sure to first install tensorflow. This can be done through Anaconda, or through the command 

```
!pip install tensorflow
```

In [30]:
# Clean data to use on model
# X = combine.drop(columns=['DEF_PLAYER_ID', 'DEF_PLAYER_NAME', 'OFF_PLAYER_ID', 'OFF_PLAYER_NAME', 'SEASON_ID'])
# y = h2h_df['PLAYER_PTS']

# Save to csvs
# X.to_csv('./stats/cleaned/' + season + '_X.csv', index=False)
# y.to_csv('./stats/cleaned/' + season + '_y.csv', index=False)

# Read saved data
inputs = pd.read_csv('./stats/cleaned/X_combined.csv')
y = pd.read_csv('./stats/cleaned/y_combined.csv')

inputs.drop(columns=['OFF_PTS_PER_100'], inplace=True)

print(inputs.shape, y.shape)

(15033, 63) (15033, 1)


# Examine Correlation

Find correlation between inputs and remove redundant inputs

In [43]:
off_stats = inputs.filter(regex='^OFF').copy()

# Display correlation matrix
fig = px.imshow(off_stats.corr())
fig.show()

# Drop columns with correlation > 0.9
off_cols = ['OFF_FGM', 'OFF_FGA', 'OFF_EFF_FG_PCT', 'OFF_OREB', 'OFF_PAINT_TOUCH_FGA', 'OFF_PAINT_TOUCH_FGM', 'OFF_PAINT_TOUCH_TOV', 'OFF_PAINT_TOUCH_PASSES', 'OFF_PAINT_TOUCH_PTS', 'OFF_DREB', 'OFF_FG3A', 'OFF_FG3M', 'OFF_FTM', 'OFF_FTA']
off_stats.drop(columns=off_cols, inplace=True)
fig = px.imshow(off_stats.corr())
fig.show()

print('Columns before removal: ' + str(len(off_cols) + len(off_stats.columns)))
print('Columns after removal: ' + str(len(off_stats.columns)))

Columns before removal: 39
Columns after removal: 25


In [44]:
def_stats = inputs.filter(regex='^DEF').copy()

# Display correlation matrix
fig = px.imshow(def_stats.corr())
fig.show()

# Drop columns with correlation > 0.9
def_cols = ['DEF_D_FGM', 'DEF_OPP_PTS_PAINT', 'DEF_CONTESTED_SHOTS_2PT', 'DEF_DEFLECTIONS', 'DEF_OPP_PTS_FB', 'DEF_OPP_PTS_2ND_CHANCE', 'DEF_OPP_PTS_OFF_TOV']
def_stats.drop(columns=def_cols, inplace=True)
fig = px.imshow(def_stats.corr())
fig.show()

print('Columns before removal: ' + str(len(def_cols) + len(def_stats.columns)))
print('Columns after removal: ' + str(len(def_stats.columns)))

Columns before removal: 24
Columns after removal: 17


In [45]:
X = inputs.drop(columns=off_cols + def_cols)

# Apply normalization to input
X = (X - X.mean()) / X.std()

# Generate training and verification data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

print('Training Entries')
X_train.info()
X_train.describe()

print('Testing Entries')
X_test.info()
X_test.describe()

X_train = X_train.to_numpy()
X_test = X_test.to_numpy()
y_train = y_train.to_numpy()
y_test = y_test.to_numpy()

Training Entries
<class 'pandas.core.frame.DataFrame'>
Int64Index: 12026 entries, 5570 to 2798
Data columns (total 42 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   DEF_AGE                  12026 non-null  float64
 1   DEF_D_FGA                12026 non-null  float64
 2   DEF_NORMAL_FG_PCT        12026 non-null  float64
 3   DEF_PCT_PLUSMINUS        12026 non-null  float64
 4   DEF_W                    12026 non-null  float64
 5   DEF_L                    12026 non-null  float64
 6   DEF_MIN                  12026 non-null  float64
 7   DEF_STL                  12026 non-null  float64
 8   DEF_BLK                  12026 non-null  float64
 9   DEF_DREB                 12026 non-null  float64
 10  DEF_CONTESTED_SHOTS      12026 non-null  float64
 11  DEF_CONTESTED_SHOTS_3PT  12026 non-null  float64
 12  DEF_CHARGES_DRAWN        12026 non-null  float64
 13  DEF_BOXOUTS              12026 non-null  float64
 14  DEF

# Baseline Model

A basic model that predicts the offensive player's points per 100 possessions will be their average points per 100 possessions

In [46]:
X_base = pd.read_csv('./stats/cleaned/X_combined.csv')
y_base = pd.read_csv('./stats/cleaned/y_combined.csv')

X_base = X_base['OFF_PTS'].to_numpy()
y_base = y_base['PLAYER_PTS'].to_numpy()

rmse = np.sqrt(np.mean((X_base - y_base) ** 2))
mae = mean_absolute_error(y_base, X_base)
print('RMSE: ' + str(rmse))
print('MAE: ' + str(mae))

RMSE: 6.731943803486539
MAE: 5.276775335692326


# Sequential Model

Creates a simple Keras model composed of single stack layers connected sequentially.

In [51]:
# Create model
model = Sequential()
model.add(Dense(int(X.shape[1] / 2), activation='relu', input_shape=(X.shape[1],)))
model.add(Dropout(0.7))
model.add(Dense(1, activation='relu')) # y.shape[1]

# Compile model
model.compile(optimizer='nadam', loss='mean_squared_error', metrics=['accuracy'])

model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_2 (Dense)             (None, 21)                903       
                                                                 
 dropout_1 (Dropout)         (None, 21)                0         
                                                                 
 dense_3 (Dense)             (None, 1)                 22        
                                                                 
Total params: 925
Trainable params: 925
Non-trainable params: 0
_________________________________________________________________


In [52]:
# Set batch size
bsize = int(X_train.shape[0] * 0.6)

# Fit model
%time history = model.fit(X_train, y_train, epochs=8000, validation_split=0.2, batch_size=bsize, verbose=0)


`Model.state_updates` will be removed in a future version. This property should not be used in TensorFlow 2.0, as `updates` are applied automatically.



Wall time: 2min 28s


In [59]:
y_pred = model.predict(X_test)

# Evaluate error
score = model.evaluate(X_test, y_test)
mae = mean_absolute_error(y_test, y_pred)
print('RMSE: ' + str(sqrt(score[0])))
print('MAE: ' + str(mae))

diff = abs(y_pred.flatten() - y_test.flatten())
predictions = pd.DataFrame(data={'predicted': y_pred.flatten(), 'actual': y_test.flatten(), 'abs_diff': diff})
predictions.sort_values(by='abs_diff')
# predictions.sort_values(by='predicted', ascending=False)

RMSE: 5.785278759588981
MAE: 4.426951874427377


Unnamed: 0,predicted,actual,abs_diff
1405,8.572535,8.567502,0.005032
2058,10.382192,10.374004,0.008188
2820,14.135057,14.123760,0.011297
1131,9.940875,9.952622,0.011747
1505,18.915707,18.903733,0.011974
...,...,...,...
2650,10.605659,34.726649,24.120991
2417,15.377887,39.600970,24.223083
2307,12.645611,38.267155,25.621544
2374,10.098077,36.391042,26.292965


In [60]:
# Plot training and validation loss
fig = px.line()
fig.add_scatter(y=history.history['loss'], name='Train')
fig.add_scatter(y=history.history['val_loss'], name='Validation')

fig.update_layout(title='Train vs Validation Loss',
                   xaxis_title='Epochs',
                   yaxis_title='Loss')

fig.show()

In [61]:
fig = px.histogram(x=predictions['abs_diff'], labels={
    'x': 'difference'
})

# print(predictions.quantile(q=0.5)['abs_diff'])
print('Mean: ' + str(predictions['abs_diff'].mean()))
print('Standard deviation: ' + str(predictions['abs_diff'].std()))

fig.show()

Mean: 4.426951874427374
Standard deviation: 3.725072077950692


# Predicted and Actual Distributions

Shown below are the distributions of the predicted and actual points per 100 possessions of the tested players

In [62]:
fig = px.histogram(x=predictions['predicted'], labels={
    'x': 'predicted'
})

# nth percentile difference (Lower number better)
print('Mean: ' + str(predictions['predicted'].mean()))
print('Standard deviation: ' + str(predictions['predicted'].std()))

fig.show()

Mean: 11.834338188171387
Standard deviation: 3.971040964126587


In [63]:
fig = px.histogram(x=predictions['actual'], labels={
    'x': 'actual'
})

# nth percentile difference (Lower number better)
print('Mean: ' + str(predictions['actual'].mean()))
print('Standard deviation: ' + str(predictions['actual'].std()))

fig.show()

Mean: 11.957460524855067
Standard deviation: 7.444188025085954


In [64]:
# compute SHAP values
explainer = shap.DeepExplainer(model, X_train)
shap_values = explainer.shap_values(X_test)

shap.initjs()
shap.force_plot(explainer.expected_value[0], shap_values[0][0], features = X.columns)


Your TensorFlow version is newer than 2.4.0 and so graph support has been removed in eager mode. See PR #1483 for discussion.


You have provided over 5k background samples! For better performance consider using smaller random sample.



In [249]:
# Predict matchup
off_id = 201142
def_id = 1628464

# Get offense stats
data = pd.read_csv('./stats/2021-22_0.2_off_stats.csv')
data = data.add_prefix('OFF_')
data.rename(columns={'OFF_SEASON_ID': 'SEASON_ID'}, inplace=True)
inputs_o = data[data['OFF_PLAYER_ID'] == off_id]

# Get defensive stats
def_data = pd.read_csv('./stats/2021-22_def_stats.csv')
def_data = def_data.fillna(0)
def_data = def_data.add_prefix('DEF_')
def_data.rename(columns={'DEF_SEASON_ID': 'SEASON_ID', 'DEF_DEF_RATING': 'DEF_RATING', 
                     'DEF_DEF_BOXOUTS': 'DEF_BOXOUTS'}, inplace=True)
def_data.drop(columns=['DEF_GP', 'DEF_G', 'DEF_D_FG_PCT', 'DEF_DREB_PCT', 'DEF_PCT_STL', 'DEF_PCT_BLK'], inplace=True)
inputs_d = def_data[def_data['DEF_PLAYER_ID'] == def_id]

# Get both inputs together
inputs_o.drop(columns=['OFF_PLAYER_ID', 'SEASON_ID', 'OFF_PTS_PER_100'], inplace=True)
inputs_d.drop(columns=['DEF_PLAYER_ID', 'DEF_PLAYER_NAME', 'SEASON_ID'], inplace=True)
inputs_o.drop(columns=['OFF_FGM', 'OFF_FGA', 'OFF_EFF_FG_PCT', 'OFF_OREB', 'OFF_PAINT_TOUCH_FGA', 'OFF_PAINT_TOUCH_FGM', 'OFF_PAINT_TOUCH_TOV', 'OFF_PAINT_TOUCH_PASSES', 'OFF_PAINT_TOUCH_PTS', 'OFF_DREB', 'OFF_FG3A', 'OFF_FG3M', 'OFF_FTM', 'OFF_FTA'], inplace=True)
inputs_d.drop(columns=['DEF_D_FGM', 'DEF_OPP_PTS_PAINT', 'DEF_CONTESTED_SHOTS_2PT', 'DEF_DEFLECTIONS', 'DEF_OPP_PTS_FB', 'DEF_OPP_PTS_2ND_CHANCE', 'DEF_OPP_PTS_OFF_TOV'], inplace=True)

inputs_o = inputs_o.reset_index(drop=True)
inputs_d = inputs_d.reset_index(drop=True)
input = pd.concat([inputs_d, inputs_o], axis=1)

X = pd.read_csv('./stats/cleaned/X_combined.csv')
X.drop(columns=['OFF_FGM', 'OFF_FGA', 'OFF_EFF_FG_PCT', 'OFF_OREB', 'OFF_PAINT_TOUCH_FGA', 'OFF_PAINT_TOUCH_FGM', 'OFF_PAINT_TOUCH_TOV', 'OFF_PAINT_TOUCH_PASSES', 'OFF_PAINT_TOUCH_PTS', 'OFF_DREB', 'OFF_FG3A', 'OFF_FG3M', 'OFF_FTM', 'OFF_FTA', 'OFF_PTS_PER_100'], inplace=True)
X.drop(columns=['DEF_D_FGM', 'DEF_OPP_PTS_PAINT', 'DEF_CONTESTED_SHOTS_2PT', 'DEF_DEFLECTIONS', 'DEF_OPP_PTS_FB', 'DEF_OPP_PTS_2ND_CHANCE', 'DEF_OPP_PTS_OFF_TOV'], inplace=True)

input = (input - X.mean()) / X.std()
model.predict(input.to_numpy())



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



array([[21.320356]], dtype=float32)

In [142]:
tot_mins = 37.3 + 32.3 + 14
17.23384 * 37.3/tot_mins + 18.494263 * 32.3/tot_mins + 19.046219 * 14/tot_mins

18.02433005861244