In [491]:
import pandas as pd
import requests
import json

import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split
import numpy as np

In [492]:
season = '2020-21'
player = 'dlillard'
df = pd.read_csv('./stats/player/' + player + '_' + season + '_off_stats.csv')
# df = pd.read_csv('./stats/player/combined.csv')

print(df.shape)
df.head()

(345, 27)


Unnamed: 0,SEASON_ID,OFF_PLAYER_ID,OFF_PLAYER_NAME,DEF_PLAYER_ID,DEF_PLAYER_NAME,GP,MATCHUP_MIN,PARTIAL_POSS,PLAYER_PTS,TEAM_PTS,...,MATCHUP_FG3A,MATCHUP_FG3_PCT,HELP_BLK,HELP_FGM,HELP_FGA,HELP_FG_PERC,MATCHUP_FTM,MATCHUP_FTA,SFL,MATCHUP_TIME_SEC
0,22020,203081,Damian Lillard,1628415,Dillon Brooks,3,23:04,126.9,31,132,...,15,0.4,0,0,0,0,3,3,1,1383.9
1,22020,203081,Damian Lillard,1628969,Mikal Bridges,3,25:02,122.9,24,121,...,9,0.222,0,0,0,0,0,0,0,1501.8
2,22020,203081,Damian Lillard,202709,Cory Joseph,5,22:13,111.7,28,115,...,16,0.375,0,0,0,0,4,4,1,1332.8
3,22020,203081,Damian Lillard,1628366,Lonzo Ball,3,23:45,111.2,47,124,...,20,0.5,0,0,0,0,5,5,2,1425.5
4,22020,203081,Damian Lillard,201950,Jrue Holiday,2,18:31,97.9,24,86,...,7,0.571,0,0,0,0,0,0,0,1111.4


**Convert totals to per 100 possessions**

Most limited sample size, convert each to per possession and then multiply by 100 (more standard to measure in per 100 possessions rather than per possession, as generally nba teams average [100 possessions per game](https://www.teamrankings.com/nba/stat/possessions-per-game)).

In [493]:
MIN_MATCHUP_MINS = 5

In [494]:
off_df = df.copy()
off_df = off_df[off_df['MATCHUP_TIME_SEC'] > MIN_MATCHUP_MINS * 60] # Must have played more than x minutes
off_df.drop(columns=['TEAM_PTS', 'MATCHUP_FG_PCT', 'MATCHUP_FG3_PCT'], inplace=True)

def per_100_poss(x):
    return x / off_df['PARTIAL_POSS'] * 100

# Set stats to per 100 possessions
off_df = off_df.apply(lambda x: per_100_poss(x) if x.name not in off_df.columns[0:8] else x)

off_df.sort_values('DEF_PLAYER_NAME', ascending=True, inplace=True)
print(off_df.shape)
off_df.head()

(45, 24)


Unnamed: 0,SEASON_ID,OFF_PLAYER_ID,OFF_PLAYER_NAME,DEF_PLAYER_ID,DEF_PLAYER_NAME,GP,MATCHUP_MIN,PARTIAL_POSS,PLAYER_PTS,MATCHUP_AST,...,MATCHUP_FG3M,MATCHUP_FG3A,HELP_BLK,HELP_FGM,HELP_FGA,HELP_FG_PERC,MATCHUP_FTM,MATCHUP_FTA,SFL,MATCHUP_TIME_SEC
25,22020,203081,Damian Lillard,1627936,Alex Caruso,2,8:39,44.6,38.116592,4.484305,...,2.242152,6.726457,0.0,0.0,0.0,0.0,13.452915,13.452915,4.484305,1163.004484
22,22020,203081,Damian Lillard,203952,Andrew Wiggins,3,10:15,45.7,45.95186,6.564551,...,10.940919,19.693654,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1345.95186
31,22020,203081,Damian Lillard,1627732,Ben Simmons,1,9:27,37.4,18.716578,10.695187,...,5.347594,5.347594,0.0,0.0,0.0,0.0,2.673797,5.347594,2.673797,1515.775401
21,22020,203081,Damian Lillard,1629632,Coby White,2,8:33,46.6,40.772532,17.167382,...,8.583691,17.167382,0.0,0.0,0.0,0.0,10.729614,10.729614,4.291845,1101.072961
35,22020,203081,Damian Lillard,1629012,Collin Sexton,2,6:48,32.6,18.404908,3.067485,...,6.134969,18.404908,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1252.453988


**Retrieve Defensive Stats**

Retrieves defensive data for each defending player

In [495]:
data = pd.read_csv('./stats/' + season + '_def_stats.csv')
# data = pd.read_csv('./stats/combined.csv')

# Get def stats only from selected defenders
def_df = data[data['PLAYER_ID'].isin(off_df['DEF_PLAYER_ID'].tolist()) & data['SEASON_ID'].isin(off_df['SEASON_ID'].tolist())]

print(def_df.shape)
print(def_df.columns)
def_df.head()

(45, 22)
Index(['PLAYER_ID', 'PLAYER_NAME', 'AGE', 'GP', 'G', 'D_FGM', 'D_FGA',
       'D_FG_PCT', 'NORMAL_FG_PCT', 'PCT_PLUSMINUS', 'W', 'L', 'MIN', 'STL',
       'BLK', 'DREB', 'CONTESTED_SHOTS', 'DEFLECTIONS', 'CHARGES_DRAWN',
       'DEF_BOXOUTS', 'PCT_BOX_OUTS_REB', 'SEASON_ID'],
      dtype='object')


Unnamed: 0,PLAYER_ID,PLAYER_NAME,AGE,GP,G,D_FGM,D_FGA,D_FG_PCT,NORMAL_FG_PCT,PCT_PLUSMINUS,...,MIN,STL,BLK,DREB,CONTESTED_SHOTS,DEFLECTIONS,CHARGES_DRAWN,DEF_BOXOUTS,PCT_BOX_OUTS_REB,SEASON_ID
10,1627936,Alex Caruso,27.0,58,58,3.36,7.64,0.44,0.456,-0.016,...,19.9,1.0,0.5,3.0,4.1,2.13,0.07,0.48,0.286,22020
20,203952,Andrew Wiggins,26.0,71,71,6.59,14.82,0.445,0.466,-0.021,...,32.1,1.1,0.5,3.8,4.6,1.83,0.0,0.08,0.857,22020
35,1627732,Ben Simmons,24.0,58,58,4.91,11.79,0.417,0.462,-0.045,...,28.0,1.3,0.8,3.0,7.38,2.51,0.01,0.46,0.257,22020
81,1629632,Coby White,21.0,69,69,5.78,11.55,0.501,0.458,0.043,...,29.5,0.8,0.5,4.0,5.96,1.17,0.04,1.0,0.235,22020
85,1629012,Collin Sexton,22.0,60,60,5.57,11.45,0.486,0.455,0.032,...,30.6,0.6,1.2,7.2,13.23,1.06,0.0,1.13,0.802,22020


**Machine Learning Implementation**

Utilizes the keras Model from Tensorflow to predict offensive player's stats.
If there is an error, make sure to first install tensorflow. This can be done through Anaconda, or through the command 

```
!pip install tensorflow
```

In [496]:
# Clean data to use on model
X = def_df[def_df.columns[2:]].drop(columns=['GP', 'G', 'W', 'L', 'MIN', 'D_FGM', 'D_FGA', 'NORMAL_FG_PCT'])
y = off_df[off_df.columns[8:12]]
X = X.fillna(0)
y = y.fillna(0)

print(X.shape, y.shape)

# Generate training and verification data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

print('Training Entries')
X_train.info()
X_train.describe()

print('Testing Entries')
X_test.info()
X_test.describe()

X_train = X_train.to_numpy()
X_test = X_test.to_numpy()
y_train = y_train.to_numpy()
y_test = y_test.to_numpy()

(45, 12) (45, 4)
Training Entries
<class 'pandas.core.frame.DataFrame'>
Int64Index: 36 entries, 20 to 303
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   AGE               36 non-null     float64
 1   D_FG_PCT          36 non-null     float64
 2   PCT_PLUSMINUS     36 non-null     float64
 3   STL               36 non-null     float64
 4   BLK               36 non-null     float64
 5   DREB              36 non-null     float64
 6   CONTESTED_SHOTS   36 non-null     float64
 7   DEFLECTIONS       36 non-null     float64
 8   CHARGES_DRAWN     36 non-null     float64
 9   DEF_BOXOUTS       36 non-null     float64
 10  PCT_BOX_OUTS_REB  36 non-null     float64
 11  SEASON_ID         36 non-null     int64  
dtypes: float64(11), int64(1)
memory usage: 3.7 KB
Testing Entries
<class 'pandas.core.frame.DataFrame'>
Int64Index: 9 entries, 298 to 369
Data columns (total 12 columns):
 #   Column            Non-Null

**Keras Model Philosophy**

1. Normalize inputs
2. Dropout layer to prevent overfitting
3. Dense layer

In [497]:
from tensorflow.keras.layers.experimental.preprocessing import Normalization
from tensorflow.keras import layers, Sequential
from tensorflow.keras.layers import Dense, Dropout

# Add normalization to input (using all players to normalize)
normalize = Normalization()
normalize.adapt(X)

# Create model
model = Sequential()
model.add(normalize)
model.add(Dropout(0.6, input_shape=(11,)))
model.add(Dense(y.shape[1], activation='relu'))

# Compile model
model.compile(optimizer='nadam',loss='mean_squared_error', metrics=['accuracy'])

model.summary()

Model: "sequential_79"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 normalization_76 (Normaliza  (None, 12)               25        
 tion)                                                           
                                                                 
 dropout_62 (Dropout)        (None, 12)                0         
                                                                 
 dense_124 (Dense)           (None, 4)                 52        
                                                                 
Total params: 77
Trainable params: 52
Non-trainable params: 25
_________________________________________________________________


In [498]:
# Fit model
%time history = model.fit(X_train, y_train, epochs=1000, validation_split=0.1, batch_size=1, verbose=0)

Wall time: 58.8 s


In [502]:
y_pred = model.predict(X_test)
score = model.evaluate(X_test, y_test)
print(score)

y_pred, y_test

[25.74004554748535, 1.0]


(array([[2.4228458e+01, 7.9698615e+00, 3.6298573e+00, 0.0000000e+00],
        [1.9642200e+01, 9.6433239e+00, 4.1363678e+00, 0.0000000e+00],
        [2.3954271e+01, 8.7839613e+00, 3.9483109e+00, 0.0000000e+00],
        [2.3361513e+01, 8.2385540e+00, 3.4659290e+00, 0.0000000e+00],
        [2.7936497e+01, 1.0285373e+01, 3.3005130e+00, 0.0000000e+00],
        [1.9394684e+01, 8.5968533e+00, 4.5809774e+00, 0.0000000e+00],
        [1.8805105e+01, 8.3362379e+00, 3.2734342e+00, 9.5164776e-03],
        [1.8532431e+01, 9.2277918e+00, 2.1116676e+00, 2.1015584e-01],
        [2.2584305e+01, 8.8210020e+00, 3.0417771e+00, 0.0000000e+00]],
       dtype=float32),
 array([[22.15657312,  5.9084195 ,  1.47710487,  1.47710487],
        [20.92675635,  5.97907324,  4.48430493,  1.49476831],
        [42.34527687,  3.25732899,  0.        ,  0.        ],
        [ 6.36942675,  0.        ,  3.18471338,  0.        ],
        [22.67573696, 13.60544218,  4.53514739,  0.        ],
        [18.71657754, 10.69518717,  

In [503]:
from   plotly.graph_objs import *
import plotly.offline as pyo

df_history = pd.DataFrame(history.history)

# evaluate model
def train_validation_loss(df_history):
    trace = []
    for label, loss in zip(['Train','Validation'],['loss','val_loss']):
        trace0 = {'type':'scatter',
                  'x': df_history.index.tolist(),
                  'y': df_history[loss].tolist(),
                  'name': label,
                  'mode': 'lines'
                 }
        trace.append(trace0)
    data = Data(trace)
    
    layout = {'title': 'Model train vs validation loss',
              'titlefont':{'size':30},
              'xaxis' : {'title':  '<b> Epochs', 'titlefont':{ 'size' : 25}},
              'yaxis' : {'title':  '<b> Loss', 'titlefont':{ 'size' : 25}},
             }
    fig = Figure(data=data,layout=layout)
    
    return pyo.iplot(fig)

train_validation_loss(df_history)