In [1]:
from time import time, sleep
import datetime
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
import tensorflow as tf
from keras.callbacks import TensorBoard, EarlyStopping, ModelCheckpoint

import warnings
# Ignore all warnings
warnings.filterwarnings("ignore")

In [3]:
# Create a basic ML model py file
# Goal : create the blueprint, we will add complicated stuff later
    # Summarize the player stats going back to 2014-2015 season. Summarize the average up until that point. This model will look at current stats and how that feeds into rest of the season.
    # Filter games df for 2014-2015 season. Merge summary stats with games
    # Based on these, create a simple NN model that generates predictions for the current season.

# Create a complex ML model py file
# Goal : Add in more complicated statistics.
    # Add D-LEBRON and O-LEBRON Stats to the mix.
    # Pull in Odds data, predict the winning differential. Measure based on RMSE.
    # Pull in deepers stats if possible (IE, include injury reports, weighted averages for complex stats based on minutes played.)
    # By the end make sure to setup so that it is automated. 
    # Goal is to run a schedule an email report.

# Make an analytical report with major stats against the odds. Add points here at later time.

In [2]:
pathname = "/Users/alecnaidoo/Downloads/nba-data-04-20/"
def read_file(str_name):
    df = pd.read_csv(pathname + str_name + ".csv")
    return df

In [4]:
# Summarize game detail stats
gdf = read_file("games_details")
gdf = gdf.groupby(['GAME_ID','TEAM_ID'],as_index=False)[['FGM', 'FGA', 'FG3M', 'FG3A', 'FTM', 'FTA',
       'OREB', 'DREB', 'REB', 'AST', 'STL', 'BLK', 'TO', 'PF', 'PTS']].sum()

gdf['FT_PCT'] = gdf['FGM'] / gdf['FGA']
gdf['FG_PCT'] = gdf['FG3M'] / gdf['FG3A']
gdf['FG3_PCT'] = gdf['FTM'] / gdf['FTA']

gdf.drop(['FGM','FG3M', 'FTM', 'REB'],axis=1, inplace=True)
print(gdf.columns)
gdf

Index(['GAME_ID', 'TEAM_ID', 'FGA', 'FG3A', 'FTA', 'OREB', 'DREB', 'AST',
       'STL', 'BLK', 'TO', 'PF', 'PTS', 'FT_PCT', 'FG_PCT', 'FG3_PCT'],
      dtype='object')


Unnamed: 0,GAME_ID,TEAM_ID,FGA,FG3A,FTA,OREB,DREB,AST,STL,BLK,TO,PF,PTS,FT_PCT,FG_PCT,FG3_PCT
0,10300001,1610612742,76.0,8.0,30.0,12.0,26.0,20.0,9.0,4.0,18.0,34.0,85.0,0.447368,0.250000,0.500000
1,10300001,1610612762,70.0,7.0,34.0,9.0,32.0,23.0,8.0,4.0,18.0,26.0,90.0,0.457143,0.142857,0.735294
2,10300002,1610612749,75.0,13.0,40.0,11.0,32.0,20.0,9.0,4.0,24.0,26.0,94.0,0.426667,0.153846,0.700000
3,10300002,1610612763,81.0,15.0,34.0,14.0,34.0,25.0,18.0,7.0,25.0,35.0,105.0,0.493827,0.266667,0.617647
4,10300003,1610612739,77.0,6.0,29.0,12.0,40.0,25.0,10.0,7.0,25.0,33.0,100.0,0.493506,0.666667,0.689655
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56194,52200201,1610612748,78.0,30.0,32.0,10.0,41.0,19.0,7.0,2.0,13.0,20.0,102.0,0.410256,0.333333,0.875000
56195,52200211,1610612750,83.0,32.0,26.0,9.0,38.0,29.0,11.0,8.0,16.0,15.0,120.0,0.518072,0.406250,0.807692
56196,52200211,1610612760,89.0,40.0,20.0,12.0,30.0,19.0,9.0,6.0,17.0,20.0,95.0,0.359551,0.325000,0.900000
56197,62300001,1610612747,88.0,13.0,35.0,12.0,43.0,25.0,5.0,10.0,18.0,25.0,123.0,0.534091,0.153846,0.771429


In [5]:
df = read_file("games")

# Convert 'GAME_DATE_EST' to datetime
df['GAME_DATE_EST'] = pd.to_datetime(df['GAME_DATE_EST'])

# Sort DataFrame based on 'GAME_DATE_EST'
df = df.sort_values(by=['SEASON', 'GAME_DATE_EST'])

# Create game meta df
gmeta = df[['GAME_ID','SEASON', 'GAME_DATE_EST']]

# Create a cumulative count column within each group defined by 'SEASON' and 'GAME_DATE_EST'
date_count = df.groupby(['SEASON','GAME_DATE_EST'], as_index=False)['GAME_ID'].count()
date_count['date_rank'] = date_count.groupby('SEASON').cumcount()
date_count.drop(['GAME_ID'],axis=1,inplace=True)

#Merge date_count with gdf
gdf2 = gdf.merge(gmeta, on='GAME_ID', how='left').merge(date_count, on=['SEASON','GAME_DATE_EST'], how='left').sort_values(by=['SEASON', 'GAME_DATE_EST'])

# Identify non-id stat columns
non_id_columns = gdf2.columns.difference(['GAME_ID', 'TEAM_ID', 'SEASON', 'GAME_DATE_EST', 'date_rank'])
gdf2 = gdf2[['GAME_ID', 'TEAM_ID', 'SEASON', 'GAME_DATE_EST', 'date_rank'] + non_id_columns.to_list()]
gdf2

Unnamed: 0,GAME_ID,TEAM_ID,SEASON,GAME_DATE_EST,date_rank,AST,BLK,DREB,FG3A,FG3_PCT,FGA,FG_PCT,FTA,FT_PCT,OREB,PF,PTS,STL,TO
0,10300001,1610612742,2003,2003-10-05,0,20.0,4.0,26.0,8.0,0.500000,76.0,0.250000,30.0,0.447368,12.0,34.0,85.0,9.0,18.0
1,10300001,1610612762,2003,2003-10-05,0,23.0,4.0,32.0,7.0,0.735294,70.0,0.142857,34.0,0.457143,9.0,26.0,90.0,8.0,18.0
2,10300002,1610612749,2003,2003-10-06,1,20.0,4.0,32.0,13.0,0.700000,75.0,0.153846,40.0,0.426667,11.0,26.0,94.0,9.0,24.0
3,10300002,1610612763,2003,2003-10-06,1,25.0,7.0,34.0,15.0,0.617647,81.0,0.266667,34.0,0.493827,14.0,35.0,105.0,18.0,25.0
4,10300003,1610612739,2003,2003-10-07,2,25.0,7.0,40.0,6.0,0.689655,77.0,0.666667,29.0,0.493506,12.0,33.0,100.0,10.0,25.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52802,22300639,1610612749,2023,2024-01-26,104,21.0,4.0,39.0,34.0,0.708333,87.0,0.382353,24.0,0.402299,12.0,24.0,100.0,6.0,18.0
52803,22300640,1610612740,2023,2024-01-26,104,24.0,5.0,33.0,36.0,0.714286,82.0,0.250000,14.0,0.390244,8.0,16.0,83.0,9.0,20.0
52804,22300640,1610612760,2023,2024-01-26,104,22.0,5.0,39.0,33.0,0.928571,88.0,0.303030,14.0,0.477273,6.0,11.0,107.0,11.0,11.0
52805,22300641,1610612757,2023,2024-01-26,104,23.0,3.0,29.0,37.0,0.736842,90.0,0.270270,19.0,0.422222,8.0,20.0,100.0,11.0,13.0


In [7]:
df2 = df.copy()
df2 = df2[['GAME_ID','HOME_TEAM_ID','VISITOR_TEAM_ID', 'HOME_TEAM_WINS']]

# Merge df with home stats
df2 = df2.merge(gdf2, how='left', left_on=['GAME_ID','HOME_TEAM_ID'], right_on=['GAME_ID','TEAM_ID'])
df2 = df2[~df2['TEAM_ID'].isnull()].drop(['TEAM_ID'], axis=1)

# Merge df with away stats
df2 = df2.merge(gdf2, how='left', left_on=['GAME_ID','VISITOR_TEAM_ID'], right_on=['GAME_ID','TEAM_ID'], suffixes=('_HOME','_AWAY'))
df2 = df2[~df2['TEAM_ID'].isnull()].drop(['TEAM_ID'], axis=1)


df2

Unnamed: 0,GAME_ID,HOME_TEAM_ID,VISITOR_TEAM_ID,HOME_TEAM_WINS,SEASON_HOME,GAME_DATE_EST_HOME,date_rank_HOME,AST_HOME,BLK_HOME,DREB_HOME,FG3A_HOME,FG3_PCT_HOME,FGA_HOME,FG_PCT_HOME,FTA_HOME,FT_PCT_HOME,OREB_HOME,PF_HOME,PTS_HOME,STL_HOME,TO_HOME,SEASON_AWAY,GAME_DATE_EST_AWAY,date_rank_AWAY,AST_AWAY,BLK_AWAY,DREB_AWAY,FG3A_AWAY,FG3_PCT_AWAY,FGA_AWAY,FG_PCT_AWAY,FTA_AWAY,FT_PCT_AWAY,OREB_AWAY,PF_AWAY,PTS_AWAY,STL_AWAY,TO_AWAY
0,10300001,1610612762,1610612742,1,2003.0,2003-10-05,0.0,23.0,4.0,32.0,7.0,0.735294,70.0,0.142857,34.0,0.457143,9.0,26.0,90.0,8.0,18.0,2003,2003-10-05,0,20.0,4.0,26.0,8.0,0.500000,76.0,0.250000,30.0,0.447368,12.0,34.0,85.0,9.0,18.0
1,10300002,1610612763,1610612749,1,2003.0,2003-10-06,1.0,25.0,7.0,34.0,15.0,0.617647,81.0,0.266667,34.0,0.493827,14.0,35.0,105.0,18.0,25.0,2003,2003-10-06,1,20.0,4.0,32.0,13.0,0.700000,75.0,0.153846,40.0,0.426667,11.0,26.0,94.0,9.0,24.0
2,10300010,1610612764,1610612752,1,2003.0,2003-10-07,2.0,26.0,7.0,30.0,11.0,0.677419,76.0,0.454545,31.0,0.513158,14.0,24.0,104.0,9.0,18.0,2003,2003-10-07,2,18.0,5.0,23.0,15.0,0.851852,77.0,0.200000,27.0,0.376623,13.0,28.0,84.0,10.0,17.0
3,10300003,1610612765,1610612739,0,2003.0,2003-10-07,2.0,19.0,3.0,25.0,9.0,0.600000,87.0,0.444444,40.0,0.390805,12.0,27.0,96.0,15.0,13.0,2003,2003-10-07,2,25.0,7.0,40.0,6.0,0.689655,77.0,0.666667,29.0,0.493506,12.0,33.0,100.0,10.0,25.0
4,10300004,1610612742,1610612753,1,2003.0,2003-10-07,2.0,20.0,5.0,37.0,6.0,0.666667,82.0,0.166667,27.0,0.487805,15.0,30.0,99.0,9.0,17.0,2003,2003-10-07,2,14.0,5.0,23.0,18.0,0.833333,61.0,0.333333,24.0,0.409836,6.0,24.0,76.0,9.0,19.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28297,22300637,1610612761,1610612746,0,2023.0,2024-01-26,104.0,26.0,3.0,30.0,32.0,0.590909,96.0,0.187500,22.0,0.458333,15.0,18.0,107.0,6.0,11.0,2023,2024-01-26,104,29.0,7.0,38.0,32.0,0.916667,95.0,0.281250,24.0,0.505263,12.0,17.0,127.0,6.0,10.0
28298,22300638,1610612763,1610612753,1,2023.0,2024-01-26,104.0,30.0,7.0,34.0,41.0,0.615385,90.0,0.365854,13.0,0.466667,9.0,21.0,107.0,9.0,13.0,2023,2024-01-26,104,21.0,4.0,31.0,34.0,0.724138,81.0,0.264706,29.0,0.469136,9.0,14.0,106.0,8.0,11.0
28299,22300639,1610612749,1610612739,0,2023.0,2024-01-26,104.0,21.0,4.0,39.0,34.0,0.708333,87.0,0.382353,24.0,0.402299,12.0,24.0,100.0,6.0,18.0,2023,2024-01-26,104,20.0,8.0,36.0,36.0,0.709677,92.0,0.388889,31.0,0.413043,15.0,22.0,112.0,7.0,13.0
28300,22300640,1610612740,1610612760,0,2023.0,2024-01-26,104.0,24.0,5.0,33.0,36.0,0.714286,82.0,0.250000,14.0,0.390244,8.0,16.0,83.0,9.0,20.0,2023,2024-01-26,104,22.0,5.0,39.0,33.0,0.928571,88.0,0.303030,14.0,0.477273,6.0,11.0,107.0,11.0,11.0


In [8]:
df2.columns

Index(['GAME_ID', 'HOME_TEAM_ID', 'VISITOR_TEAM_ID', 'HOME_TEAM_WINS',
       'SEASON_HOME', 'GAME_DATE_EST_HOME', 'date_rank_HOME', 'AST_HOME',
       'BLK_HOME', 'DREB_HOME', 'FG3A_HOME', 'FG3_PCT_HOME', 'FGA_HOME',
       'FG_PCT_HOME', 'FTA_HOME', 'FT_PCT_HOME', 'OREB_HOME', 'PF_HOME',
       'PTS_HOME', 'STL_HOME', 'TO_HOME', 'SEASON_AWAY', 'GAME_DATE_EST_AWAY',
       'date_rank_AWAY', 'AST_AWAY', 'BLK_AWAY', 'DREB_AWAY', 'FG3A_AWAY',
       'FG3_PCT_AWAY', 'FGA_AWAY', 'FG_PCT_AWAY', 'FTA_AWAY', 'FT_PCT_AWAY',
       'OREB_AWAY', 'PF_AWAY', 'PTS_AWAY', 'STL_AWAY', 'TO_AWAY'],
      dtype='object')

In [9]:
df2['HOME_PT_DIFF'] = df2['PTS_HOME'] - df2['PTS_AWAY']

In [20]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

#Drop Unnecessary Fields
drop = ['HOME_TEAM_WINS','SEASON_HOME', 'GAME_DATE_EST_HOME', 'GAME_DATE_EST_AWAY', 'date_rank_HOME',
        'date_rank_AWAY','PTS_HOME', 'PTS_AWAY']

target = ['HOME_PT_DIFF']
IDcol = ['GAME_ID', 'HOME_TEAM_ID', 'VISITOR_TEAM_ID']
predictors = [x for x in df2.columns if x not in drop+target+IDcol]


X = df2[predictors]
y = df2[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [21]:
predictors

['AST_HOME',
 'BLK_HOME',
 'DREB_HOME',
 'FG3A_HOME',
 'FG3_PCT_HOME',
 'FGA_HOME',
 'FG_PCT_HOME',
 'FTA_HOME',
 'FT_PCT_HOME',
 'OREB_HOME',
 'PF_HOME',
 'STL_HOME',
 'TO_HOME',
 'SEASON_AWAY',
 'AST_AWAY',
 'BLK_AWAY',
 'DREB_AWAY',
 'FG3A_AWAY',
 'FG3_PCT_AWAY',
 'FGA_AWAY',
 'FG_PCT_AWAY',
 'FTA_AWAY',
 'FT_PCT_AWAY',
 'OREB_AWAY',
 'PF_AWAY',
 'STL_AWAY',
 'TO_AWAY']

In [22]:
from sklearn.preprocessing import Normalizer, StandardScaler, MinMaxScaler, PowerTransformer, MaxAbsScaler, LabelEncoder

#Scale each variable
sc_X = StandardScaler()
X_train2 = pd.DataFrame(sc_X.fit_transform(X_train))
X_test2 = pd.DataFrame(sc_X.fit_transform(X_test))
X_train2.columns = X_train.columns.values
X_test2.columns = X_test.columns.values
X_train2.index = X_train.index.values
X_test2.index = X_test.index.values
X_train = X_train2
X_test = X_test2

In [23]:
from pandas import DataFrame 
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(random_state = 42)
import math
from sklearn import metrics
from sklearn.metrics import mean_absolute_error
from sklearn.feature_selection import VarianceThreshold # Feature selector
# Various pre-processing steps
from sklearn.preprocessing import Normalizer, StandardScaler, MinMaxScaler, PowerTransformer, MaxAbsScaler, LabelEncoder
from sklearn.model_selection import GridSearchCV # For optimization

def model_predict(model):
    test_mae = []
    test_rmse =  []
    
    for i in range(10):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=i)
        model_fit = model.fit(X_train, y_train)

        #make predictions against the test df
        y_pred = model_fit.predict(X_test)

        #define metrics
        mse = metrics.mean_squared_error(y_test, y_pred)
        rmse = math.sqrt(mse)
        mae = mean_absolute_error(y_test, y_pred)

        test_mae.append(mae)
        test_rmse.append(rmse)
    
    
    print("rmse : %.3f +/- %.3f" % (np.mean(test_rmse),np.std(test_rmse)))
    print("mae : %.3f +/- %.3f" % (np.mean(test_mae),np.std(test_mae)))
        
        

# Linear Regression

In [24]:
model_predict(LinearRegression())

rmse : 2.289 +/- 0.018
mae : 1.712 +/- 0.019


# Sequential Neural Net

In [26]:
X_train.shape[1]

27

In [30]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
import pandas as pd
import numpy as np

# Defining a function to find the best parameters for ANN
def FunctionFindBestParams(X_train, y_train, X_test, y_test):
    
    # Defining the list of hyper parameters to try
    batch_size_list = [10]
    epoch_list = [5]
    
    SearchResultsData = pd.DataFrame(columns=['TrialNumber', 'Parameters', 'Accuracy'])
    
    # initializing the trials
    TrialNumber = 0
    for batch_size_trial in batch_size_list:
        for epochs_trial in epoch_list:
            TrialNumber += 1
            # create ANN model
            model = Sequential()
            # Defining the first layer of the model
            model.add(tf.keras.layers.Dense(units=20, input_dim=X_train.shape[1], kernel_initializer='normal', activation='relu'))

            # Defining the Second layer of the model
            model.add(tf.keras.layers.Dense(units=5, kernel_initializer='normal', activation='relu'))

            # The output neuron is a single fully connected node 
            # Since we will be predicting a single number
            model.add(tf.keras.layers.Dense(1, kernel_initializer='normal'))

            # Compiling the model
            model.compile(loss='mean_squared_error', optimizer='adam')

            # Fitting the ANN to the Training set
            model.fit(X_train, y_train, batch_size=batch_size_trial, epochs=epochs_trial, verbose=0)

            #MAPE = np.mean(100 * (np.abs(y_test - model.predict(X_test)) / y_test))
            mse = metrics.mean_squared_error(y_test, model.predict(X_test))
            rmse = math.sqrt(mse)

            # printing the results of the current iteration
            print(TrialNumber, 'Parameters:', 'batch_size:', batch_size_trial, '-', 'epochs:', epochs_trial, 'RMSE:', rmse)
            
            # Concatenating the new data with the existing DataFrame
            SearchResultsData = pd.concat([SearchResultsData, pd.DataFrame(data=[[TrialNumber, str(batch_size_trial) + '-' + str(epochs_trial), rmse]],
                                                                           columns=['TrialNumber', 'Parameters', 'Accuracy'])], ignore_index=True)
    return SearchResultsData

# Calling the function
FunctionFindBestParams(X_train, y_train, X_test, y_test)


1 Parameters: batch_size: 5 - epochs: 5 RMSE: 1.0953549103396223
2 Parameters: batch_size: 10 - epochs: 5 RMSE: 1.092520403451002
3 Parameters: batch_size: 15 - epochs: 5 RMSE: 1.3446218031063655
4 Parameters: batch_size: 20 - epochs: 5 RMSE: 6.517421850165346


Unnamed: 0,TrialNumber,Parameters,Accuracy
0,1,5-5,1.095355
1,2,10-5,1.09252
2,3,15-5,1.344622
3,4,20-5,6.517422


In [31]:
model

NameError: name 'model' is not defined