In [24]:
import pandas as pd
import requests
import json
import numpy as np

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.model_selection import train_test_split

import shap
import plotly.express as px

from minisom import MiniSom

# For shap to work with keras, disable v2 behavior
tf.compat.v1.disable_v2_behavior()

In [25]:
season = '2019-20'
percentile = 0.2

df = pd.read_csv('./stats/' + season + '_' + str(percentile) + '_h2h_stats.csv')
# df = pd.read_csv('./stats/h2h_combined.csv')
df = df.fillna(0)

print(df.shape)
df.head()

(114829, 27)


Unnamed: 0,SEASON_ID,OFF_PLAYER_ID,OFF_PLAYER_NAME,DEF_PLAYER_ID,DEF_PLAYER_NAME,GP,MATCHUP_MIN,PARTIAL_POSS,PLAYER_PTS,TEAM_PTS,...,MATCHUP_FG3A,MATCHUP_FG3_PCT,HELP_BLK,HELP_FGM,HELP_FGA,HELP_FG_PERC,MATCHUP_FTM,MATCHUP_FTA,SFL,MATCHUP_TIME_SEC
0,22019,203932,Aaron Gordon,202710,Jimmy Butler,4,25:22,117.3,24,110,...,10,0.4,0,0,0,0,2,2,0,1521.9
1,22019,203932,Aaron Gordon,1627823,Juancho Hernangomez,3,15:56,87.2,10,97,...,5,0.2,0,0,0,0,1,1,1,956.2
2,22019,203932,Aaron Gordon,203114,Khris Middleton,4,17:50,87.1,13,82,...,6,0.333,0,0,0,0,3,5,3,1070.2
3,22019,203932,Aaron Gordon,203933,T.J. Warren,2,16:36,81.6,25,82,...,5,0.6,0,0,0,0,2,4,2,995.6
4,22019,203932,Aaron Gordon,203084,Harrison Barnes,2,17:33,79.4,22,102,...,4,0.75,0,0,0,0,1,1,1,1052.5


# Convert totals to per 100 possessions

Most limited sample size, convert each to per possession and then multiply by 100 (more standard to measure in per 100 possessions rather than per possession, as generally nba teams average [100 possessions per game](https://www.teamrankings.com/nba/stat/possessions-per-game)).

In [81]:
MIN_MATCHUP_MINS = 12

In [82]:
h2h_df = df.copy()
h2h_df = h2h_df[h2h_df['MATCHUP_TIME_SEC'] > MIN_MATCHUP_MINS * 60] # Must have played more than x minutes
h2h_df.drop(columns=['TEAM_PTS', 'MATCHUP_FG_PCT', 'MATCHUP_FG3_PCT'], inplace=True)

def per_100_poss(x):
    return x / h2h_df['PARTIAL_POSS'] * 100

# Set stats to per 100 possessions
h2h_df = h2h_df.apply(lambda x: per_100_poss(x) if x.name not in h2h_df.columns[0:8] else x)

h2h_df.sort_values('DEF_PLAYER_NAME', ascending=True, inplace=True)

# Remove rows with zeros in important columns
check = h2h_df[h2h_df.columns[8:11]] != 0
h2h_df = h2h_df[check['PLAYER_PTS'] & check['MATCHUP_AST']]

print(h2h_df.shape)
h2h_df.head()

(497, 24)


Unnamed: 0,SEASON_ID,OFF_PLAYER_ID,OFF_PLAYER_NAME,DEF_PLAYER_ID,DEF_PLAYER_NAME,GP,MATCHUP_MIN,PARTIAL_POSS,PLAYER_PTS,MATCHUP_AST,...,MATCHUP_FG3M,MATCHUP_FG3A,HELP_BLK,HELP_FGM,HELP_FGA,HELP_FG_PERC,MATCHUP_FTM,MATCHUP_FTA,SFL,MATCHUP_TIME_SEC
6921,22019,1627732,Ben Simmons,203932,Aaron Gordon,2,21:36,102.9,17.492711,7.774538,...,0.0,0.0,0.0,0.0,0.0,0.0,1.943635,1.943635,0.971817,1259.280855
55621,22019,202710,Jimmy Butler,203932,Aaron Gordon,4,23:59,112.9,15.943313,8.857396,...,0.0,1.771479,0.0,0.0,0.0,0.0,3.542958,6.200177,3.542958,1274.4907
71730,22019,2544,LeBron James,203932,Aaron Gordon,2,15:39,80.0,18.75,20.0,...,1.25,7.5,0.0,0.0,0.0,0.0,2.5,2.5,1.25,1173.125
60710,22019,1627823,Juancho Hernangomez,203932,Aaron Gordon,3,17:09,102.8,10.700389,2.918288,...,1.945525,6.809339,0.0,0.0,0.0,0.0,0.972763,0.972763,0.972763,1001.459144
41217,22019,203507,Giannis Antetokounmpo,203932,Aaron Gordon,3,16:38,100.5,29.850746,12.935323,...,1.99005,3.9801,0.0,0.0,0.0,0.0,5.970149,7.960199,3.9801,993.333333


# Sanitize Outputs

The program is only as good as the data you give it

In [83]:
features = ['PLAYER_PTS', 'MATCHUP_AST', 'MATCHUP_TOV']

som_data = h2h_df[features].values # h2h_df.columns[7:10]
som = MiniSom(20, 20, len(features), sigma=5, learning_rate=0.5,
              neighborhood_function='gaussian', random_seed=10)

%time som.train(som_data, 20000, random_order=False)  # random training

quantization_errors = np.linalg.norm(som.quantization(som_data) - som_data, axis=1)

error_threshold = np.percentile(quantization_errors, 
                               100*(1-.25))+(4*(np.percentile(quantization_errors, 
                               100*(1-.25)) - np.percentile(quantization_errors, 100*(1-.75))))

is_outlier = quantization_errors > error_threshold
error_threshold

Wall time: 1.08 s


6.699263473024659

In [84]:
plt = px.scatter(data_frame = h2h_df, x = 'PLAYER_PTS', y = 'MATCHUP_TIME_SEC', color = is_outlier)
plt.show()

In [85]:
out = h2h_df
# [is_outlier == False]
out.shape

(497, 24)

# Retrieve Offensive and Defensive Stats

Retrieves defensive data for each defending player

In [86]:
data = pd.read_csv('./stats/'+ season + '_def_stats.csv')
# data = pd.read_csv('./stats/def_combined.csv')
data = data.fillna(0)

# Get def stats only from selected defenders (rename player_id to def_player_id to merge arrays)
data = data.add_prefix('DEF_')
data.rename(columns={'DEF_SEASON_ID': 'SEASON_ID', 'DEF_DEF_RATING': 'DEF_RATING', 
                     'DEF_DEF_BOXOUTS': 'DEF_BOXOUTS'}, inplace=True)
def_df = pd.merge(data, out[['DEF_PLAYER_ID', 'SEASON_ID']], how ='inner', on=['DEF_PLAYER_ID', 'SEASON_ID'])
def_df.drop(columns=['DEF_GP', 'DEF_G', 'DEF_D_FG_PCT', 'DEF_DREB_PCT', 'DEF_PCT_STL', 'DEF_PCT_BLK'], inplace=True)

# Add offensive player (helps merging offensive stats)
print(def_df.shape[0], out.shape[0])
def_df['OFF_PLAYER_ID'] = out['OFF_PLAYER_ID'].to_numpy()
def_df['OFF_PLAYER_NAME'] = out['OFF_PLAYER_NAME'].to_numpy()

print(def_df.columns)
def_df

497 497
Index(['DEF_PLAYER_ID', 'DEF_PLAYER_NAME', 'DEF_AGE', 'DEF_D_FGM', 'DEF_D_FGA',
       'DEF_NORMAL_FG_PCT', 'DEF_PCT_PLUSMINUS', 'DEF_W', 'DEF_L', 'DEF_MIN',
       'DEF_STL', 'DEF_BLK', 'DEF_DREB', 'DEF_CONTESTED_SHOTS',
       'DEF_CONTESTED_SHOTS_2PT', 'DEF_CONTESTED_SHOTS_3PT', 'DEF_DEFLECTIONS',
       'DEF_CHARGES_DRAWN', 'DEF_BOXOUTS', 'DEF_PCT_BOX_OUTS_REB', 'SEASON_ID',
       'DEF_RATING', 'DEF_OPP_PTS_OFF_TOV', 'DEF_OPP_PTS_2ND_CHANCE',
       'DEF_OPP_PTS_FB', 'DEF_OPP_PTS_PAINT', 'DEF_DEF_WS', 'OFF_PLAYER_ID',
       'OFF_PLAYER_NAME'],
      dtype='object')


Unnamed: 0,DEF_PLAYER_ID,DEF_PLAYER_NAME,DEF_AGE,DEF_D_FGM,DEF_D_FGA,DEF_NORMAL_FG_PCT,DEF_PCT_PLUSMINUS,DEF_W,DEF_L,DEF_MIN,...,DEF_PCT_BOX_OUTS_REB,SEASON_ID,DEF_RATING,DEF_OPP_PTS_OFF_TOV,DEF_OPP_PTS_2ND_CHANCE,DEF_OPP_PTS_FB,DEF_OPP_PTS_PAINT,DEF_DEF_WS,OFF_PLAYER_ID,OFF_PLAYER_NAME
0,203932,Aaron Gordon,24.0,5.31,11.98,0.468,-0.025,22.0,17.0,32.8,...,0.581,22019,107.4,12.8,9.1,11.5,35.8,0.111,1627732,Ben Simmons
1,203932,Aaron Gordon,24.0,5.31,11.98,0.468,-0.025,22.0,17.0,32.8,...,0.581,22019,107.4,12.8,9.1,11.5,35.8,0.111,202710,Jimmy Butler
2,203932,Aaron Gordon,24.0,5.31,11.98,0.468,-0.025,22.0,17.0,32.8,...,0.581,22019,107.4,12.8,9.1,11.5,35.8,0.111,2544,LeBron James
3,203932,Aaron Gordon,24.0,5.31,11.98,0.468,-0.025,22.0,17.0,32.8,...,0.581,22019,107.4,12.8,9.1,11.5,35.8,0.111,1627823,Juancho Hernangomez
4,203932,Aaron Gordon,24.0,5.31,11.98,0.468,-0.025,22.0,17.0,32.8,...,0.581,22019,107.4,12.8,9.1,11.5,35.8,0.111,203507,Giannis Antetokounmpo
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
492,203897,Zach LaVine,25.0,5.02,11.68,0.454,-0.024,15.0,40.0,16.1,...,0.378,22019,117.3,6.4,5.1,4.7,17.9,0.007,1628971,Bruce Brown
493,203897,Zach LaVine,25.0,5.02,11.68,0.454,-0.024,15.0,40.0,16.1,...,0.378,22019,117.3,6.4,5.1,4.7,17.9,0.007,1629628,RJ Barrett
494,203897,Zach LaVine,25.0,5.02,11.68,0.454,-0.024,15.0,40.0,16.1,...,0.378,22019,117.3,6.4,5.1,4.7,17.9,0.007,203078,Bradley Beal
495,203897,Zach LaVine,25.0,5.02,11.68,0.454,-0.024,15.0,40.0,16.1,...,0.378,22019,117.3,6.4,5.1,4.7,17.9,0.007,203087,Jeremy Lamb


In [87]:
data = pd.read_csv('./stats/' + season + '_' + str(percentile) + '_off_stats.csv')
# data = pd.read_csv('./stats/off_combined.csv')
data = data.fillna(0)

# Get off stats only from selected offensive players
data = data.add_prefix('OFF_')
data.rename(columns={'OFF_SEASON_ID': 'SEASON_ID'}, inplace=True)
combine = pd.merge(def_df, data, how='inner', on=['OFF_PLAYER_ID', 'SEASON_ID'])

# Get correct sort
combine.sort_values(['DEF_PLAYER_ID', 'OFF_PLAYER_ID'], inplace=True)
print(combine.shape)
print(combine.columns)
combine.head()

(497, 68)
Index(['DEF_PLAYER_ID', 'DEF_PLAYER_NAME', 'DEF_AGE', 'DEF_D_FGM', 'DEF_D_FGA',
       'DEF_NORMAL_FG_PCT', 'DEF_PCT_PLUSMINUS', 'DEF_W', 'DEF_L', 'DEF_MIN',
       'DEF_STL', 'DEF_BLK', 'DEF_DREB', 'DEF_CONTESTED_SHOTS',
       'DEF_CONTESTED_SHOTS_2PT', 'DEF_CONTESTED_SHOTS_3PT', 'DEF_DEFLECTIONS',
       'DEF_CHARGES_DRAWN', 'DEF_BOXOUTS', 'DEF_PCT_BOX_OUTS_REB', 'SEASON_ID',
       'DEF_RATING', 'DEF_OPP_PTS_OFF_TOV', 'DEF_OPP_PTS_2ND_CHANCE',
       'DEF_OPP_PTS_FB', 'DEF_OPP_PTS_PAINT', 'DEF_DEF_WS', 'OFF_PLAYER_ID',
       'OFF_PLAYER_NAME', 'OFF_AGE', 'OFF_GP', 'OFF_MIN', 'OFF_FGM', 'OFF_FGA',
       'OFF_FG_PCT', 'OFF_FG3M', 'OFF_FG3A', 'OFF_FG3_PCT', 'OFF_FTM',
       'OFF_FTA', 'OFF_FT_PCT', 'OFF_OREB', 'OFF_DREB', 'OFF_REB', 'OFF_AST',
       'OFF_TOV', 'OFF_BLKA', 'OFF_PF', 'OFF_PTS', 'OFF_TOUCHES',
       'OFF_PAINT_TOUCHES', 'OFF_PAINT_TOUCH_FGM', 'OFF_PAINT_TOUCH_FGA',
       'OFF_PAINT_TOUCH_PASSES', 'OFF_PAINT_TOUCH_TOV', 'OFF_DRIVE_PTS',
       'OFF_DRIVE_F

Unnamed: 0,DEF_PLAYER_ID,DEF_PLAYER_NAME,DEF_AGE,DEF_D_FGM,DEF_D_FGA,DEF_NORMAL_FG_PCT,DEF_PCT_PLUSMINUS,DEF_W,DEF_L,DEF_MIN,...,OFF_CATCH_SHOOT_FG_PCT,OFF_PULL_UP_PTS,OFF_PULL_UP_FG_PCT,OFF_PAINT_TOUCH_PTS,OFF_PAINT_TOUCH_FG_PCT,OFF_POST_TOUCH_PTS,OFF_POST_TOUCH_FG_PCT,OFF_ELBOW_TOUCH_PTS,OFF_ELBOW_TOUCH_FG_PCT,OFF_EFF_FG_PCT
386,2544,LeBron James,35.0,3.91,9.31,0.463,-0.043,2.0,1.0,3.8,...,0.451,5.7,0.444,0.8,0.636,0.8,0.457,0.5,0.538,0.536
28,2544,LeBron James,35.0,3.91,9.31,0.463,-0.043,2.0,1.0,3.8,...,0.411,9.6,0.427,1.1,0.571,2.4,0.427,0.8,0.618,0.524
345,2544,LeBron James,35.0,3.91,9.31,0.463,-0.043,2.0,1.0,3.8,...,0.384,0.3,0.257,2.1,0.606,0.1,1.0,0.4,0.8,0.576
263,2546,Carmelo Anthony,36.0,4.67,10.16,0.465,-0.005,47.0,15.0,15.7,...,0.419,3.9,0.364,1.0,0.686,0.8,0.519,0.4,0.765,0.548
266,2546,Carmelo Anthony,36.0,4.67,10.16,0.465,-0.005,47.0,15.0,15.7,...,0.398,0.7,0.308,2.8,0.652,0.8,0.523,0.5,0.538,0.555


In [88]:
# Sort head to head stats by same sort
h2h_df.sort_values(['DEF_PLAYER_ID', 'OFF_PLAYER_ID'], inplace=True)
h2h_df.head()

Unnamed: 0,SEASON_ID,OFF_PLAYER_ID,OFF_PLAYER_NAME,DEF_PLAYER_ID,DEF_PLAYER_NAME,GP,MATCHUP_MIN,PARTIAL_POSS,PLAYER_PTS,MATCHUP_AST,...,MATCHUP_FG3M,MATCHUP_FG3A,HELP_BLK,HELP_FGM,HELP_FGA,HELP_FG_PERC,MATCHUP_FTM,MATCHUP_FTA,SFL,MATCHUP_TIME_SEC
25370,22019,201942,DeMar DeRozan,2544,LeBron James,3,19:58,93.2,15.021459,5.364807,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1285.622318
62531,22019,202695,Kawhi Leonard,2544,LeBron James,4,15:07,74.9,32.042724,4.00534,...,1.335113,4.00534,0.0,0.0,0.0,0.0,1.335113,2.670227,0.0,1211.081442
31394,22019,1627827,Dorian Finney-Smith,2544,LeBron James,4,16:46,84.4,10.663507,1.184834,...,3.554502,5.924171,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1191.587678
8550,22019,202711,Bojan Bogdanovic,2546,Carmelo Anthony,3,15:37,79.5,13.836478,1.257862,...,3.773585,7.54717,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1178.36478
54213,22019,203924,Jerami Grant,2546,Carmelo Anthony,3,15:43,69.8,30.08596,1.432665,...,7.163324,8.595989,0.0,0.0,0.0,0.0,2.86533,5.730659,1.432665,1350.573066


# Keras Model Implementation

Utilizes the keras Model from TensorFlow to predict offensive player's stats.
If there is an error, make sure to first install tensorflow. This can be done through Anaconda, or through the command 

```
!pip install tensorflow
```

In [245]:
# Clean data to use on model
# X = combine.drop(columns=['DEF_PLAYER_ID', 'DEF_PLAYER_NAME', 'OFF_PLAYER_ID', 'OFF_PLAYER_NAME', 'SEASON_ID'])
# y = out['PLAYER_PTS']

# Save to csvs
# X.to_csv('./stats/cleaned/' + season + '_X.csv', index=False)
# y.to_csv('./stats/cleaned/' + season + '_y.csv', index=False)

# Read saved data
X = pd.read_csv('./stats/cleaned/X_combined.csv')
y = pd.read_csv('./stats/cleaned/y_combined.csv')

print(X.shape, y.shape)

# Apply normalization to input
X = (X - X.mean()) / X.std()

# Generate training and verification data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

print('Training Entries')
X_train.info()
X_train.describe()

print('Testing Entries')
X_test.info()
X_test.describe()

X_train = X_train.to_numpy()
X_test = X_test.to_numpy()
y_train = y_train.to_numpy()
y_test = y_test.to_numpy()

(2701, 63) (2701, 1)
Training Entries
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2160 entries, 1836 to 1789
Data columns (total 63 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   DEF_AGE                  2160 non-null   float64
 1   DEF_D_FGM                2160 non-null   float64
 2   DEF_D_FGA                2160 non-null   float64
 3   DEF_NORMAL_FG_PCT        2160 non-null   float64
 4   DEF_PCT_PLUSMINUS        2160 non-null   float64
 5   DEF_W                    2160 non-null   float64
 6   DEF_L                    2160 non-null   float64
 7   DEF_MIN                  2160 non-null   float64
 8   DEF_STL                  2160 non-null   float64
 9   DEF_BLK                  2160 non-null   float64
 10  DEF_DREB                 2160 non-null   float64
 11  DEF_CONTESTED_SHOTS      2160 non-null   float64
 12  DEF_CONTESTED_SHOTS_2PT  2160 non-null   float64
 13  DEF_CONTESTED_SHOTS_3PT  2160 non-nul

# Sequential Model

Creates a simple Keras model composed of single stack layers connected sequentially. 

1. Normalize inputs
2. Dropout layer to prevent overfitting
3. Dense layer

In [246]:
# Create model
model = Sequential()
model.add(Dense(int(X.shape[1] / 2), activation='relu', input_shape=(X.shape[1],)))
model.add(Dropout(0.7))
model.add(Dense(1, activation='relu')) # y.shape[1]

# Compile model
model.compile(optimizer='nadam', loss='mean_squared_error', metrics=['accuracy'])

model.summary()

Model: "sequential_28"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_56 (Dense)            (None, 31)                1984      
                                                                 
 dropout_28 (Dropout)        (None, 31)                0         
                                                                 
 dense_57 (Dense)            (None, 1)                 32        
                                                                 
Total params: 2,016
Trainable params: 2,016
Non-trainable params: 0
_________________________________________________________________


In [247]:
# Fit model
bsize = int(X_train.shape[0] * 0.6) # 30% batch size

%time history = model.fit(X_train, y_train, epochs=6000, validation_split=0.2, batch_size=bsize, verbose=0)


`Model.state_updates` will be removed in a future version. This property should not be used in TensorFlow 2.0, as `updates` are applied automatically.



Wall time: 27.2 s


In [248]:
from math import sqrt

y_pred = model.predict(X_test)
score = model.evaluate(X_test, y_test)
print('RMSE: ' + str(sqrt(score[0])))

diff = abs(y_pred.flatten() - y_test.flatten())
print('Mean difference: ' + str(diff.mean()))
predictions = pd.DataFrame(data={'predicted': y_pred.flatten(), 'actual': y_test.flatten(), 'abs_diff': diff})
predictions.sort_values(by='abs_diff', inplace=True)
predictions


`Model.state_updates` will be removed in a future version. This property should not be used in TensorFlow 2.0, as `updates` are applied automatically.



RMSE: 8.211114713667184
Mean difference: 6.387510245164791


Unnamed: 0,predicted,actual,abs_diff
75,27.770430,27.777778,0.007348
68,18.739408,18.720749,0.018660
532,18.501200,18.544936,0.043736
173,15.385848,15.432099,0.046251
450,27.302437,27.241771,0.060666
...,...,...,...
534,19.786957,43.818466,24.031510
95,28.667068,54.687500,26.020432
356,23.423901,52.443385,29.019484
270,26.523123,56.000000,29.476877


In [249]:
# Plot training and validation loss
fig = px.line()
fig.add_scatter(y=history.history['loss'], name='Train')
fig.add_scatter(y=history.history['val_loss'], name='Validation')

fig.update_layout(title='Train vs Validation Loss',
                   xaxis_title='Epochs',
                   yaxis_title='Loss')

fig.show()

In [250]:
fig = px.histogram(x=predictions['abs_diff'], labels={
    'x': 'difference'
})

# print(predictions.quantile(q=0.5)['abs_diff'])
print('Percent acceptable: ' + str(len(predictions[predictions['abs_diff'] < 5]) / len(predictions) * 100))
print('Mean: ' + str(predictions['abs_diff'].mean()))
print('Standard deviation: ' + str(predictions['abs_diff'].std()))

fig.show()

Percent acceptable: 48.79852125693161
Mean: 6.387510245164798
Standard deviation: 5.164437756563383


# Predicted and Actual Distributions

Shown below are the distributions of the predicted and actual points per 100 possessions of the tested players

In [124]:
fig = px.histogram(x=predictions['predicted'], labels={
    'x': 'predicted'
})

# nth percentile difference (Lower number better)
print(predictions.quantile(q=0.5)['predicted'])
print('Standard deviation: ' + str(predictions['predicted'].std()))

fig.show()

18.041946411132812
Standard deviation: 5.884700298309326


In [125]:
fig = px.histogram(x=predictions['actual'], labels={
    'x': 'actual'
})

# nth percentile difference (Lower number better)
print(predictions.quantile(q=0.5)['actual'])
print('Standard deviation: ' + str(predictions['actual'].std()))

fig.show()

17.59530791788856
Standard deviation: 10.461625618858148


In [101]:
# compute SHAP values
explainer = shap.DeepExplainer(model, X_train)
shap_values = explainer.shap_values(X_test)

shap.initjs()
shap.force_plot(explainer.expected_value[0], shap_values[0][0], features = X.columns)


Your TensorFlow version is newer than 2.4.0 and so graph support has been removed in eager mode and some static graphs may not be supported. See PR #1483 for discussion.



In [75]:
# Predict matchup
off_id = 203999
def_id = 1626158

# Get offense stats
data = pd.read_csv('./stats/2021-22_0.2_off_stats.csv')
data = data.add_prefix('OFF_')
data.rename(columns={'OFF_SEASON_ID': 'SEASON_ID'}, inplace=True)
inputs_o = data[data['OFF_PLAYER_ID'] == off_id]

# Get defensive stats
def_data = pd.read_csv('./stats/'+ season + '_def_stats.csv')
def_data = def_data.fillna(0)
def_data = def_data.add_prefix('DEF_')
def_data.rename(columns={'DEF_SEASON_ID': 'SEASON_ID', 'DEF_DEF_RATING': 'DEF_RATING', 
                     'DEF_DEF_BOXOUTS': 'DEF_BOXOUTS'}, inplace=True)
def_data.drop(columns=['DEF_GP', 'DEF_G', 'DEF_D_FG_PCT', 'DEF_DREB_PCT', 'DEF_PCT_STL', 'DEF_PCT_BLK'], inplace=True)
inputs_d = def_data[def_data['DEF_PLAYER_ID'] == def_id]

# Get both inputs together
inputs_o.drop(columns=['OFF_PLAYER_ID', 'SEASON_ID'], inplace=True)
inputs_d.drop(columns=['DEF_PLAYER_ID', 'DEF_PLAYER_NAME', 'SEASON_ID'], inplace=True)
inputs_o = inputs_o.reset_index(drop=True)
inputs_d = inputs_d.reset_index(drop=True)
input = pd.concat([inputs_d, inputs_o], axis=1)

X = pd.read_csv('./stats/cleaned/X_combined.csv')

input = (input - X.mean()) / X.std()
model.predict(input.to_numpy())



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



array([[32.230007]], dtype=float32)

In [59]:
32.3 / 66.6 * 36.232136 + 18/66.6*27.47274 + 16.3/66.6*31.981586

32.8244319009009