In [1]:
import pandas as pd
import requests
import json
import numpy as np

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.model_selection import train_test_split

import shap
import plotly.express as px

from minisom import MiniSom

# For shap to work with keras, disable v2 behavior
tf.compat.v1.disable_v2_behavior()

Instructions for updating:
non-resource variables are not supported in the long term


In [4]:
season = '2021-22'
percentile = 0.2

df = pd.read_csv('./stats/' + season + '_' + str(percentile) + '_h2h_stats.csv')
# df = pd.read_csv('./stats/h2h_combined.csv')
df = df.fillna(0)

print(df.shape)
df.head()

(111702, 27)


Unnamed: 0,SEASON_ID,OFF_PLAYER_ID,OFF_PLAYER_NAME,DEF_PLAYER_ID,DEF_PLAYER_NAME,GP,MATCHUP_MIN,PARTIAL_POSS,PLAYER_PTS,TEAM_PTS,...,MATCHUP_FG3A,MATCHUP_FG3_PCT,HELP_BLK,HELP_FGM,HELP_FGA,HELP_FG_PERC,MATCHUP_FTM,MATCHUP_FTA,SFL,MATCHUP_TIME_SEC
0,22021,203932,Aaron Gordon,202711,Bojan Bogdanovic,3,17:12,92.0,16,93,...,4,0.25,0,0,0,0,1,2,0,1031.6
1,22021,203932,Aaron Gordon,1628404,Josh Hart,2,15:58,72.0,12,62,...,3,0.0,0,0,0,0,0,0,0,957.9
2,22021,203932,Aaron Gordon,1628991,Jaren Jackson Jr.,3,14:31,68.4,11,67,...,3,0.333,0,0,0,0,4,4,2,870.7
3,22021,203932,Aaron Gordon,1630532,Franz Wagner,2,14:09,67.8,5,66,...,2,0.5,0,0,0,0,0,0,0,849.4
4,22021,203932,Aaron Gordon,1629640,Keldon Johnson,2,13:42,62.2,8,60,...,2,0.0,0,0,0,0,0,0,0,821.9


In [9]:
def run_model(matchup_mins, bpercent, num_epochs, val_split):
    h2h_df = df.copy()
    h2h_df = h2h_df[h2h_df['MATCHUP_TIME_SEC'] > matchup_mins * 60] # Must have played more than x minutes
    h2h_df.drop(columns=['TEAM_PTS', 'MATCHUP_FG_PCT', 'MATCHUP_FG3_PCT'], inplace=True)

    def per_100_poss(x):
        return x / h2h_df['PARTIAL_POSS'] * 100

    # Set stats to per 100 possessions
    h2h_df = h2h_df.apply(lambda x: per_100_poss(x) if x.name not in h2h_df.columns[0:8] else x)

    h2h_df.sort_values('DEF_PLAYER_NAME', ascending=True, inplace=True)

    # Remove rows with zeros in important columns
    check = h2h_df[h2h_df.columns[8:11]] != 0
    h2h_df = h2h_df[check['PLAYER_PTS'] & check['MATCHUP_AST']]

    out = h2h_df

    data = pd.read_csv('./stats/'+ season + '_def_stats.csv')
    data = data.fillna(0)

    # Get def stats only from selected defenders (rename player_id to def_player_id to merge arrays)
    data = data.add_prefix('DEF_')
    data.rename(columns={'DEF_SEASON_ID': 'SEASON_ID', 'DEF_DEF_RATING': 'DEF_RATING', 
                        'DEF_DEF_BOXOUTS': 'DEF_BOXOUTS'}, inplace=True)
    def_df = pd.merge(data, out[['DEF_PLAYER_ID', 'SEASON_ID']], how ='inner', on=['DEF_PLAYER_ID', 'SEASON_ID'])
    def_df.drop(columns=['DEF_GP', 'DEF_G', 'DEF_D_FG_PCT', 'DEF_DREB_PCT', 'DEF_PCT_STL', 'DEF_PCT_BLK'], inplace=True)

    # Add offensive player (helps merging offensive stats)
    def_df['OFF_PLAYER_ID'] = out['OFF_PLAYER_ID'].to_numpy()
    def_df['OFF_PLAYER_NAME'] = out['OFF_PLAYER_NAME'].to_numpy()

    data = pd.read_csv('./stats/' + season + '_' + str(percentile) + '_off_stats.csv')
    data = data.fillna(0)

    # Get off stats only from selected offensive players
    data = data.add_prefix('OFF_')
    data.rename(columns={'OFF_SEASON_ID': 'SEASON_ID'}, inplace=True)
    combine = pd.merge(def_df, data, how='inner', on=['OFF_PLAYER_ID', 'SEASON_ID'])

    # Get correct sort
    combine.sort_values(['DEF_PLAYER_ID', 'OFF_PLAYER_ID'], inplace=True)

    # Sort head to head stats by same sort
    h2h_df.sort_values(['DEF_PLAYER_ID', 'OFF_PLAYER_ID'], inplace=True)

    # Clean data to use on model
    X = combine.drop(columns=['DEF_PLAYER_ID', 'DEF_PLAYER_NAME', 'OFF_PLAYER_ID', 'OFF_PLAYER_NAME', 'SEASON_ID'])
    y = out['PLAYER_PTS']

    # Save to csvs
    X.to_csv('./stats/cleaned/' + season + '_X.csv', index=False)
    y.to_csv('./stats/cleaned/' + season + '_y.csv', index=False)

    # Apply normalization to input
    X = (X - X.mean()) / X.std()

    # Generate training and verification data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    X_train = X_train.to_numpy()
    X_test = X_test.to_numpy()
    y_train = y_train.to_numpy()
    y_test = y_test.to_numpy()

    # Create model
    model = Sequential()
    model.add(Dense(int(X.shape[1] / 2), activation='relu', input_shape=(X.shape[1],)))
    model.add(Dropout(0.7))
    model.add(Dense(1, activation='relu')) # y.shape[1]

    # Compile model
    model.compile(optimizer='nadam', loss='mean_squared_error', metrics=['accuracy'])

    # Fit model
    bsize = int(X_train.shape[0] * bpercent) # 30% batch size

    history = model.fit(X_train, y_train, epochs=num_epochs, validation_split=val_split, batch_size=bsize, verbose=0)

    y_pred = model.predict(X_test)
    score = model.evaluate(X_test, y_test)

    # !!!!!!!!!!!!!! LOSS !!!!!!!!!!!
    

    diff = abs(y_pred.flatten() - y_test.flatten())
    predictions = pd.DataFrame(data={'predicted': y_pred.flatten(), 'actual': y_test.flatten(), 'abs_diff': diff})
    predictions.sort_values(by='abs_diff', inplace=True)

    return score[0], predictions.quantile(q=0.5)['abs_diff'], (len(predictions[predictions['abs_diff'] < 5]) / len(predictions) * 100), (history.history['loss'][len(history.history['loss']) - 1] < history.history['val_loss'][len(history.history['val_loss']) - 1])

In [11]:
cols = ['Matchup time', 'Batch', 'Epochs', 'Loss', '50th Quantile', '%Accept', 'Overfit']

result_df = pd.DataFrame(columns=cols)

for min in range(7, 15):
    for batch in np.arange(0.3, 0.8, 0.1):
        for epochs in range(1000, 4000, 1000):
            loss, q, pa, overfit = run_model(min, round(batch, 1), epochs, 0.2)
            result_df = result_df.append({
                'Matchup time': min,
                'Batch': round(batch, 1),
                'Epochs': epochs,
                'Loss': loss,
                '50th Quantile': q,
                '%Accept': pa,
                'Overfit': overfit
            }, ignore_index=True)
            
            print({
                'Matchup time': min,
                'Batch': round(batch, 1),
                'Epochs': epochs,
                'Loss': loss,
                '50th Quantile': q,
                '%Accept': pa,
                'Overfit': overfit
            })



{'Matchup time': 7, 'Batch': 0.3, 'Epochs': 1000, 'Loss': 93.49863914753901, '50th Quantile': 6.103210773433526, '%Accept': 41.34419551934827, 'Overfit': False}
{'Matchup time': 7, 'Batch': 0.3, 'Epochs': 2000, 'Loss': 99.74887384853635, '50th Quantile': 5.651342011650524, '%Accept': 44.602851323828915, 'Overfit': False}
{'Matchup time': 7, 'Batch': 0.3, 'Epochs': 3000, 'Loss': 98.86570624761329, '50th Quantile': 6.485210421317561, '%Accept': 41.54786150712831, 'Overfit': False}
{'Matchup time': 7, 'Batch': 0.4, 'Epochs': 1000, 'Loss': 97.93888886057432, '50th Quantile': 5.593156213519954, '%Accept': 46.63951120162933, 'Overfit': False}
{'Matchup time': 7, 'Batch': 0.4, 'Epochs': 2000, 'Loss': 87.25802239381126, '50th Quantile': 5.787607125376667, '%Accept': 42.973523421588595, 'Overfit': False}
{'Matchup time': 7, 'Batch': 0.4, 'Epochs': 3000, 'Loss': 78.58613619056592, '50th Quantile': 5.945731629450766, '%Accept': 41.75152749490835, 'Overfit': False}
{'Matchup time': 7, 'Batch': 0.5

In [12]:
result_df.to_csv('./results.csv')