# Import Libraries

In [2]:
import sqlite3
from time import time, sleep
import datetime
import joblib
import os


import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
import tensorflow as tf
from keras.callbacks import TensorBoard, EarlyStopping, ModelCheckpoint

import warnings

# Ignore all warnings
warnings.filterwarnings("ignore")

# Create Functions

In [8]:
pathname = "/Users/alecnaidoo/Downloads/nba-data-04-20/"
def read_file(str_name):
    df = pd.read_csv(pathname + str_name + ".csv")
    return df

# Cleanse Data for Model Processing

In [263]:
# Summarize game detail stats
gdf = read_file("games_details")
gdf = gdf.groupby(['GAME_ID','TEAM_ID'],as_index=False)[['FGM', 'FGA', 'FG3M', 'FG3A', 'FTM', 'FTA',
       'OREB', 'DREB', 'REB', 'AST', 'STL', 'BLK', 'TO', 'PF', 'PTS']].sum()

gdf['FT_PCT'] = gdf['FGM'] / gdf['FGA']
gdf['FG_PCT'] = gdf['FG3M'] / gdf['FG3A']
gdf['FG3_PCT'] = gdf['FTM'] / gdf['FTA']

gdf.drop(['FGM','FG3M', 'FTM', 'REB'],axis=1, inplace=True)
print(gdf.columns)
gdf

Index(['GAME_ID', 'TEAM_ID', 'FGA', 'FG3A', 'FTA', 'OREB', 'DREB', 'AST',
       'STL', 'BLK', 'TO', 'PF', 'PTS', 'FT_PCT', 'FG_PCT', 'FG3_PCT'],
      dtype='object')


Unnamed: 0,GAME_ID,TEAM_ID,FGA,FG3A,FTA,OREB,DREB,AST,STL,BLK,TO,PF,PTS,FT_PCT,FG_PCT,FG3_PCT
0,10300001,1610612742,76.0,8.0,30.0,12.0,26.0,20.0,9.0,4.0,18.0,34.0,85.0,0.447368,0.250000,0.500000
1,10300001,1610612762,70.0,7.0,34.0,9.0,32.0,23.0,8.0,4.0,18.0,26.0,90.0,0.457143,0.142857,0.735294
2,10300002,1610612749,75.0,13.0,40.0,11.0,32.0,20.0,9.0,4.0,24.0,26.0,94.0,0.426667,0.153846,0.700000
3,10300002,1610612763,81.0,15.0,34.0,14.0,34.0,25.0,18.0,7.0,25.0,35.0,105.0,0.493827,0.266667,0.617647
4,10300003,1610612739,77.0,6.0,29.0,12.0,40.0,25.0,10.0,7.0,25.0,33.0,100.0,0.493506,0.666667,0.689655
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56318,52200201,1610612748,78.0,30.0,32.0,10.0,41.0,19.0,7.0,2.0,13.0,20.0,102.0,0.410256,0.333333,0.875000
56319,52200211,1610612750,83.0,32.0,26.0,9.0,38.0,29.0,11.0,8.0,16.0,15.0,120.0,0.518072,0.406250,0.807692
56320,52200211,1610612760,89.0,40.0,20.0,12.0,30.0,19.0,9.0,6.0,17.0,20.0,95.0,0.359551,0.325000,0.900000
56321,62300001,1610612747,88.0,13.0,35.0,12.0,43.0,25.0,5.0,10.0,18.0,25.0,123.0,0.534091,0.153846,0.771429


In [264]:
df = read_file("games")

# Convert 'GAME_DATE_EST' to datetime
df['GAME_DATE_EST'] = pd.to_datetime(df['GAME_DATE_EST'])

# Create game meta df
gmeta = df[['GAME_ID','SEASON', 'GAME_DATE_EST']]
#Merge date_count with gdf
gdf2 = gdf.merge(gmeta, on='GAME_ID', how='left').sort_values(by=['SEASON', 'GAME_DATE_EST'])

# Identify non-id stat columns
non_id_columns = gdf2.columns.difference(['GAME_ID', 'TEAM_ID', 'SEASON', 'GAME_DATE_EST', 'date_rank'])

#Re-arragne columns
gdf2 = gdf2[['GAME_ID', 'TEAM_ID', 'SEASON', 'GAME_DATE_EST'] + non_id_columns.to_list()]
gdf2['PTS'] = gdf2.pop('PTS')
gdf2

Unnamed: 0,GAME_ID,TEAM_ID,SEASON,GAME_DATE_EST,AST,BLK,DREB,FG3A,FG3_PCT,FGA,FG_PCT,FTA,FT_PCT,OREB,PF,STL,TO,PTS
0,10300001,1610612742,2003,2003-10-05,20.0,4.0,26.0,8.0,0.500000,76.0,0.250000,30.0,0.447368,12.0,34.0,9.0,18.0,85.0
1,10300001,1610612762,2003,2003-10-05,23.0,4.0,32.0,7.0,0.735294,70.0,0.142857,34.0,0.457143,9.0,26.0,8.0,18.0,90.0
2,10300002,1610612749,2003,2003-10-06,20.0,4.0,32.0,13.0,0.700000,75.0,0.153846,40.0,0.426667,11.0,26.0,9.0,24.0,94.0
3,10300002,1610612763,2003,2003-10-06,25.0,7.0,34.0,15.0,0.617647,81.0,0.266667,34.0,0.493827,14.0,35.0,18.0,25.0,105.0
4,10300003,1610612739,2003,2003-10-07,25.0,7.0,40.0,6.0,0.689655,77.0,0.666667,29.0,0.493506,12.0,33.0,10.0,25.0,100.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52926,22300701,1610612752,2023,2024-02-03,21.0,4.0,33.0,35.0,0.866667,100.0,0.285714,15.0,0.410000,18.0,20.0,1.0,7.0,105.0
52927,22300702,1610612742,2023,2024-02-03,22.0,1.0,34.0,39.0,0.913043,81.0,0.410256,23.0,0.493827,8.0,19.0,9.0,21.0,117.0
52928,22300702,1610612749,2023,2024-02-03,34.0,2.0,31.0,30.0,0.680000,88.0,0.400000,25.0,0.568182,5.0,21.0,13.0,11.0,129.0
52929,22300703,1610612739,2023,2024-02-03,31.0,4.0,41.0,44.0,0.923077,97.0,0.295455,13.0,0.474227,10.0,19.0,6.0,9.0,117.0


In [265]:
# make gdf2 to numpy for efficiency
gdf2 = gdf2.to_numpy()

# Identify non-id stat columns
non_id_columns = np.setdiff1d(np.arange(gdf2.shape[1]), [0, 1, 2, 3]) 

#Create placeholder numpy memory
result_array = np.empty((gdf2.shape[0], 4), dtype=object)

for i, row in enumerate(gdf2):
    curr_team = row[1]  # Assuming 'TEAM_ID' is at index 1
    curr_season = row[2]  # Assuming 'SEASON' is at index 2
    curr_date = row[3]  # Assuming 'GAME_DATE_EST' is at index 17

    # Filter rows based on conditions using boolean indexing
    temp_array = gdf2[(gdf2[:, 1] == curr_team) & (gdf2[:, 2] == curr_season) & (gdf2[:, 3] < curr_date)]

    if temp_array.shape[0] > 0:
        # Calculate mean for each stat using vectorized operations
        mean_values = np.nanmean(temp_array[:, non_id_columns], axis=0)
    else:
        # If tempdf is empty, set mean_values to NaN
        mean_values = np.full(len(non_id_columns), np.nan)

    # Flatten the array and assign values to the result array
    result_array[i, 0] = curr_team
    result_array[i, 1] = curr_season
    result_array[i, 2] = curr_date
    result_array[i, 3] = mean_values.tolist()


In [266]:
model_df = np.hstack((gdf2[:, 0].reshape(-1, 1), result_array))
model_df = np.hstack([model_df[:, :4], np.nan_to_num(np.vstack(model_df[:, 4]))])

rolling_szn_avgs = pd.DataFrame(model_df,columns=['GAME_ID', 'TEAM_ID', 'SEASON', 'GAME_DATE_EST', 'AST', 'BLK', 'DREB',
       'FG3A', 'FG3_PCT', 'FGA', 'FG_PCT', 'FTA', 'FT_PCT', 'OREB', 'PF',
       'STL', 'TO', 'PTS'])

In [267]:
rolling_szn_avgs

Unnamed: 0,GAME_ID,TEAM_ID,SEASON,GAME_DATE_EST,AST,BLK,DREB,FG3A,FG3_PCT,FGA,FG_PCT,FTA,FT_PCT,OREB,PF,STL,TO,PTS
0,10300001,1610612742,2003,2003-10-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,10300001,1610612762,2003,2003-10-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,10300002,1610612749,2003,2003-10-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,10300002,1610612763,2003,2003-10-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,10300003,1610612739,2003,2003-10-07,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56376,22300701,1610612752,2023,2024-02-03,23.886792,3.90566,33.45283,35.943396,0.783366,88.867925,0.369178,23.962264,0.467629,12.54717,18.207547,7.566038,12.641509,114.886792
56377,22300702,1610612742,2023,2024-02-03,25.176471,4.509804,31.666667,40.745098,0.75158,90.078431,0.367667,23.745098,0.469707,10.098039,19.196078,6.666667,12.058824,117.352941
56378,22300702,1610612749,2023,2024-02-03,26.490566,5.45283,35.169811,38.169811,0.772974,89.962264,0.371084,25.943396,0.490134,9.698113,20.169811,6.849057,12.90566,122.377358
56379,22300703,1610612739,2023,2024-02-03,26.714286,4.653061,34.040816,37.122449,0.774683,88.571429,0.357364,20.918367,0.477019,10.673469,19.102041,7.938776,13.408163,113.714286


In [268]:
df2 = df.copy()
df2 = df2[['GAME_ID','HOME_TEAM_ID','VISITOR_TEAM_ID', 'HOME_TEAM_WINS']]

# Merge df with home stats
df2 = df2.merge(rolling_szn_avgs, how='left', left_on=['GAME_ID','HOME_TEAM_ID'], right_on=['GAME_ID','TEAM_ID'])
df2 = df2[~df2['TEAM_ID'].isnull()].drop(['TEAM_ID'], axis=1)

# Merge df with away stats
df2 = df2.merge(rolling_szn_avgs, how='left', left_on=['GAME_ID','VISITOR_TEAM_ID'], right_on=['GAME_ID','TEAM_ID'], suffixes=('_HOME','_AWAY'))
df2 = df2[~df2['TEAM_ID'].isnull()].drop(['TEAM_ID'], axis=1)

# Since we are working with averages, drop all rows where avg was 0 for either home or away team
df2 = df2[~((df2['PTS_HOME']==0)|(df2['PTS_AWAY']==0))]

# Create target variable point differential
df['HOME_PT_DIFF'] = df['PTS_home'] - df['PTS_away']
df2 = df2.merge(df[['GAME_ID','HOME_PT_DIFF']], on='GAME_ID', how='left')

# Model Preparation

In [306]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

#Drop Unnecessary Fields
drop = ['HOME_TEAM_WINS',
        'SEASON_HOME', 'SEASON_AWAY',
        'GAME_DATE_EST_HOME', 'GAME_DATE_EST_AWAY', 
        'date_rank_HOME', 'date_rank_AWAY',
        'PTS_HOME', 'PTS_AWAY']

target = ['HOME_PT_DIFF']
IDcol = ['GAME_ID', 'HOME_TEAM_ID', 'VISITOR_TEAM_ID']
predictors = [x for x in df2.columns if x not in drop+target+IDcol]


X = df2[predictors]
y = df2[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [328]:
import numpy as np
from sklearn.base import is_classifier
from sklearn.preprocessing import Normalizer, StandardScaler, MinMaxScaler, PowerTransformer, MaxAbsScaler, LabelEncoder

def model_predict(model, X, y):
    test_mae = []
    test_rmse =  []
    trained_models = []

    for i in range(100):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=i)
            
        #Scale each variable
        sc_X = StandardScaler()
        X_train2 = pd.DataFrame(sc_X.fit_transform(X_train))
        X_test2 = pd.DataFrame(sc_X.fit_transform(X_test))
        X_train2.columns = X_train.columns.values
        X_test2.columns = X_test.columns.values
        X_train2.index = X_train.index.values
        X_test2.index = X_test.index.values
        X_train = X_train2
        X_test = X_test2

        if isinstance(model, Sequential):  # Check if it's a Keras Sequential model
            
            # Train the model
            model.fit(X_train, y_train, epochs=10, batch_size=32, verbose=0)
            
            # Make predictions against the test data
            y_pred = model.predict(X_test)

        else:  # Handle scikit-learn models
            model_fit = model.fit(X_train, y_train)
            y_pred = model_fit.predict(X_test)

        # Calculate metrics
        mse = metrics.mean_squared_error(y_test, y_pred)
        rmse = math.sqrt(mse)
        mae = mean_absolute_error(y_test, y_pred)

        test_mae.append(mae)
        test_rmse.append(rmse)
        trained_models.append(model)
    
    print("rmse : %.3f +/- %.3f" % (np.mean(test_rmse),np.std(test_rmse)))
    print("mae : %.3f +/- %.3f" % (np.mean(test_mae),np.std(test_mae)))
    
    return trained_models


In [321]:
import numpy as np
from sklearn.base import is_regressor

def model_predict(model, X, y):
    test_mae = []
    test_rmse =  []
    trained_models = []

    for i in range(100):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=i)

        if is_regressor(model):  # Check if it's a regressor
            # Convert data to NumPy arrays (if not already)
            X_train_np = np.array(X_train)
            X_test_np = np.array(X_test)
            y_train_np = np.array(y_train)
            y_test_np = np.array(y_test)
            
            # Train the model
            model.fit(X_train_np, y_train_np, epochs=10, batch_size=32, verbose=0)
            
            # Make predictions against the test data
            y_pred = model.predict(X_test_np)

        else:  # Handle scikit-learn models
            model_fit = model.fit(X_train, y_train)
            y_pred = model_fit.predict(X_test)

        # Calculate metrics
        mse = metrics.mean_squared_error(y_test, y_pred)
        rmse = math.sqrt(mse)
        mae = mean_absolute_error(y_test, y_pred)

        test_mae.append(mae)
        test_rmse.append(rmse)
        trained_models.append(model)
    
    print("rmse : %.3f +/- %.3f" % (np.mean(test_rmse),np.std(test_rmse)))
    print("mae : %.3f +/- %.3f" % (np.mean(test_mae),np.std(test_mae)))
    
    return trained_models


## Rescale Predictor Variables

# Model Predictions / Scores

## Linear Regression

In [329]:
trained_linear_models = model_predict(LinearRegression(), X, y)

rmse : 12.876 +/- 0.115
mae : 10.141 +/- 0.088


## Sequential Neural Net

In [324]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
import pandas as pd
import numpy as np

In [330]:
model = Sequential()

# Defining the layers of the model
model.add(tf.keras.layers.Dense(units=20, input_dim=X_train.shape[1], kernel_initializer='normal', activation='relu'))
model.add(tf.keras.layers.Dense(units=5, kernel_initializer='normal', activation='relu'))
model.add(tf.keras.layers.Dense(1, kernel_initializer='normal'))
model.compile(loss='mean_squared_error', optimizer='adam')

trained_sequential_models = model_predict(model, X, y)
trained_sequential_models

rmse : 12.465 +/- 0.142
mae : 9.782 +/- 0.111


[<keras.src.engine.sequential.Sequential at 0x148aa5600>,
 <keras.src.engine.sequential.Sequential at 0x148aa5600>,
 <keras.src.engine.sequential.Sequential at 0x148aa5600>,
 <keras.src.engine.sequential.Sequential at 0x148aa5600>,
 <keras.src.engine.sequential.Sequential at 0x148aa5600>,
 <keras.src.engine.sequential.Sequential at 0x148aa5600>,
 <keras.src.engine.sequential.Sequential at 0x148aa5600>,
 <keras.src.engine.sequential.Sequential at 0x148aa5600>,
 <keras.src.engine.sequential.Sequential at 0x148aa5600>,
 <keras.src.engine.sequential.Sequential at 0x148aa5600>,
 <keras.src.engine.sequential.Sequential at 0x148aa5600>,
 <keras.src.engine.sequential.Sequential at 0x148aa5600>,
 <keras.src.engine.sequential.Sequential at 0x148aa5600>,
 <keras.src.engine.sequential.Sequential at 0x148aa5600>,
 <keras.src.engine.sequential.Sequential at 0x148aa5600>,
 <keras.src.engine.sequential.Sequential at 0x148aa5600>,
 <keras.src.engine.sequential.Sequential at 0x148aa5600>,
 <keras.src.en

In [None]:
model_predict(model)

In [320]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
import pandas as pd
import numpy as np

# Defining a function to find the best parameters for ANN
def FunctionFindBestParams(X_train, y_train, X_test, y_test):
    
    # Defining the list of hyper parameters to try
    batch_size_list = [10]
    epoch_list = [5]
    
    SearchResultsData = pd.DataFrame(columns=['TrialNumber', 'Parameters', 'Accuracy'])
    
    # initializing the trials
    TrialNumber = 0
    for batch_size_trial in batch_size_list:
        for epochs_trial in epoch_list:
            TrialNumber += 1
            # create ANN model
            model = Sequential()
            # Defining the first layer of the model
            model.add(tf.keras.layers.Dense(units=20, input_dim=X_train.shape[1], kernel_initializer='normal', activation='relu'))

            # Defining the Second layer of the model
            model.add(tf.keras.layers.Dense(units=5, kernel_initializer='normal', activation='relu'))

            # The output neuron is a single fully connected node 
            # Since we will be predicting a single number
            model.add(tf.keras.layers.Dense(1, kernel_initializer='normal'))

            # Compiling the model
            model.compile(loss='mean_squared_error', optimizer='adam')

            # Fitting the ANN to the Training set
            model.fit(X_train, y_train, batch_size=batch_size_trial, epochs=epochs_trial, verbose=0)

            #MAPE = np.mean(100 * (np.abs(y_test - model.predict(X_test)) / y_test))
            mse = metrics.mean_squared_error(y_test, model.predict(X_test))
            rmse = math.sqrt(mse)

            # printing the results of the current iteration
            print(TrialNumber, 'Parameters:', 'batch_size:', batch_size_trial, '-', 'epochs:', epochs_trial, 'RMSE:', rmse)
            
            # Concatenating the new data with the existing DataFrame
            SearchResultsData = pd.concat([SearchResultsData, pd.DataFrame(data=[[TrialNumber, str(batch_size_trial) + '-' + str(epochs_trial), rmse]],
                                                                           columns=['TrialNumber', 'Parameters', 'RMSE'])], ignore_index=True)
    return SearchResultsData

# Calling the function
FunctionFindBestParams(X_train, y_train, X_test, y_test)


1 Parameters: batch_size: 10 - epochs: 5 RMSE: 12.735448920545151


Unnamed: 0,TrialNumber,Parameters,Accuracy,RMSE
0,1,10-5,,12.735449


In [74]:
y_test

Unnamed: 0,HOME_PT_DIFF
9322,-12.0
1569,14.0
21945,16.0
18176,-10.0
24776,10.0
...,...
7794,4.0
26632,-14.0
4418,-30.0
26060,28.0


In [75]:
df2[df2.index == 4388]

Unnamed: 0,GAME_ID,HOME_TEAM_ID,VISITOR_TEAM_ID,HOME_TEAM_WINS,SEASON_HOME,GAME_DATE_EST_HOME,AST_HOME,BLK_HOME,DREB_HOME,FG3A_HOME,FG3_PCT_HOME,FGA_HOME,FG_PCT_HOME,FTA_HOME,FT_PCT_HOME,OREB_HOME,PF_HOME,STL_HOME,TO_HOME,PTS_HOME,SEASON_AWAY,GAME_DATE_EST_AWAY,AST_AWAY,BLK_AWAY,DREB_AWAY,FG3A_AWAY,FG3_PCT_AWAY,FGA_AWAY,FG_PCT_AWAY,FTA_AWAY,FT_PCT_AWAY,OREB_AWAY,PF_AWAY,STL_AWAY,TO_AWAY,PTS_AWAY,HOME_PT_DIFF
4388,22000548,1610612740,1610612741,0,2020,2021-03-03,25.025641,4.025641,36.769231,32.846154,0.712336,90.820513,0.355566,26.205128,0.483816,12.153846,17.794872,7.641026,13.384615,118.205128,2020,2021-03-03,26.974359,4.692308,38.076923,37.025641,0.780274,93.384615,0.361185,21.871795,0.477776,9.358974,20.846154,7.333333,16.615385,119.384615,-4.0


In [None]:
# Next, 
# pull in the odds and predict the accuracy
# pull in averages for the season and predict the line

## What's the accuracy on the 2023-2024 season so far? Need to pull in odds data

In [80]:
team_meta = read_file("games_details")
team_meta = team_meta[['TEAM_ID','TEAM_ABBREVIATION']].drop_duplicates()

# Get ODDS Data

## Import Libraries

In [5]:
!ls

README.md           main.py             [34mmodels[m[m              [34msrc[m[m
[34mdata[m[m                model-build-3.ipynb requirements.txt    [34mutils[m[m


In [15]:
import requests
import pandas as pd
import numpy as np
from time import time, sleep

f = open('utils/API_KEY.txt', 'r')
API_KEY = f.read()
f.close()

SPORT = 'basketball_nba' # use the sport_key from the /sports endpoint below, or use 'upcoming' to see the next 8 games across all sports

REGIONS = 'us' # uk | us | eu | au. Multiple can be specified if comma delimited

MARKETS = 'spreads' # h2h | spreads | totals. Multiple can be specified if comma delimited

ODDS_FORMAT = 'american' # decimal | american

DATE_FORMAT = 'iso' # iso | unix

BOOKMAKERS = 'draftkings' # 'draftkings' | 'fanduel' | 'pointsbetus'| 'williamhill_us'| 'betmgm'|'unibet_us'| 'betrivers'| 'bovada'| 'wynnbet'| 'mybookieag'|'lowvig'| 'betonlineag'| 'betus'| 'superbook'

BET_SIZE = 100


def main():
    print('----- START -----')
    t0 = time()

    # Update this line if the data directory is not in the parent directory
    path = "/Users/alecnaidoo/Downloads/nba-data-04-20/"

    # Get old games data to find out the last date that the script was executed
    try: 
        old_odds = pd.read_csv(path + 'odds_data.csv', index_col=None)
    
    except:
        raise Exception(
            'odds_data.csv should be in the data/ directory'
        )
    
    try:
        # Total Odds Data
        odds_response = requests.get(
        f'https://api.the-odds-api.com/v4/sports/{SPORT}/odds',
        params={
            'api_key': API_KEY,
            'regions': REGIONS,
            'markets': MARKETS,
            'oddsFormat': ODDS_FORMAT,
            'dateFormat': DATE_FORMAT,
        }).json()

        # Events Data
        event_response = requests.get(
            f'https://api.the-odds-api.com/v4/sports/{SPORT}/events',
            params={
                'api_key': API_KEY,
                'regions': REGIONS,
                'markets': MARKETS,
                'oddsFormat': ODDS_FORMAT,
                'dateFormat': DATE_FORMAT,
            }).json()

        # Reformat the Odds Data
        events = pd.DataFrame(event_response)
        data_list = []

        for odds_response_entry in odds_response:
            for bookmaker_entry in odds_response_entry['bookmakers']:
                for market_entry in bookmaker_entry['markets']:
                    id = odds_response_entry['id']
                    key = bookmaker_entry['key']
                    title = bookmaker_entry['title']
                    last_update = market_entry['last_update']
                    team1_name = market_entry['outcomes'][0]['name']
                    team1_price = market_entry['outcomes'][0]['price']
                    team1_point = market_entry['outcomes'][0]['point']
                    team2_name = market_entry['outcomes'][1]['name']
                    team2_price = market_entry['outcomes'][1]['price']
                    team2_point = market_entry['outcomes'][1]['point']

                    data_list.append([id, key, title, last_update, team1_name, team1_price, team1_point, team2_name, team2_price, team2_point])

        event_data = pd.DataFrame(data_list, columns=['id', 'key', 'title', 'last_update', 'team1_name', 'team1_price', 'team1_point', 'team2_name', 'team2_price', 'team2_point'])

        def join_odds_data(bookmaker):
            newdata = events.merge(event_data[event_data['key']==bookmaker], on='id', how='left').dropna() #drop games with no odds data
            return newdata
        
        # change bookmaker if needed
        new_odds = join_odds_data('draftkings')

        # Exclude duplicates from the old data
        df_old_no_duplicates = old_odds[~old_odds['id'].isin(new_odds['id'])]

        # Concatenate old data without duplicates and the new data
        full_odds = pd.concat([df_old_no_duplicates, new_odds]).reset_index(drop=True)
        full_odds.to_csv(path + r'odds_data.csv', index=False)
        print('-----  END  ----- execution time : %.2fs' % (time() - t0))
        
        return full_odds, new_odds

    except requests.exceptions.HTTPError as http_err:
        print(f"HTTP error occurred: {http_err}")
    except Exception as err:
        print(f"An error occurred: {err}")
    
    

if __name__ == "__main__":
    try:
        odds_data, new_odds = main()

    except:
        print("Failed to retrieve NBA odds data.")


----- START -----
-----  END  ----- execution time : 0.91s


In [17]:
new_odds

Unnamed: 0,id,sport_key,sport_title,commence_time,home_team,away_team,key,title,last_update,team1_name,team1_price,team1_point,team2_name,team2_price,team2_point
0,3873201e0f40ce422145c71be5e1a61f,basketball_nba,NBA,2024-02-15T00:10:00Z,Charlotte Hornets,Atlanta Hawks,draftkings,DraftKings,2024-02-14T06:07:54Z,Atlanta Hawks,-105,-7.0,Charlotte Hornets,-115,7.0
1,e77f5314a1a805736fd7247d871e3c59,basketball_nba,NBA,2024-02-15T00:10:00Z,Philadelphia 76ers,Miami Heat,draftkings,DraftKings,2024-02-14T06:07:54Z,Miami Heat,-110,3.0,Philadelphia 76ers,-110,-3.0
2,93b5425c6c4f1b6703cb9119eaeb8abe,basketball_nba,NBA,2024-02-15T00:10:00Z,Orlando Magic,New York Knicks,draftkings,DraftKings,2024-02-14T06:07:54Z,New York Knicks,-105,2.5,Orlando Magic,-115,-2.5
3,9ea896d4f598255427899a99e5774f04,basketball_nba,NBA,2024-02-15T00:40:00Z,Boston Celtics,Brooklyn Nets,draftkings,DraftKings,2024-02-14T06:07:54Z,Boston Celtics,-110,-12.5,Brooklyn Nets,-110,12.5
4,086bdadf2294ca0d8eba86e1d8f951d8,basketball_nba,NBA,2024-02-15T00:40:00Z,Cleveland Cavaliers,Chicago Bulls,draftkings,DraftKings,2024-02-14T06:07:54Z,Chicago Bulls,-110,9.0,Cleveland Cavaliers,-110,-9.0
5,b2247224ce2b85d0b92bc742b5c2be94,basketball_nba,NBA,2024-02-15T00:40:00Z,Toronto Raptors,Indiana Pacers,draftkings,DraftKings,2024-02-14T06:07:54Z,Indiana Pacers,-110,-3.5,Toronto Raptors,-110,3.5
6,a219e8a170388085b42533c4f9098a75,basketball_nba,NBA,2024-02-15T01:10:00Z,Memphis Grizzlies,Houston Rockets,draftkings,DraftKings,2024-02-14T06:07:54Z,Houston Rockets,-105,-4.0,Memphis Grizzlies,-115,4.0
7,5c13792dff5b758592c8f11a96e66cfd,basketball_nba,NBA,2024-02-15T01:10:00Z,New Orleans Pelicans,Washington Wizards,draftkings,DraftKings,2024-02-14T06:07:54Z,New Orleans Pelicans,-110,-12.5,Washington Wizards,-110,12.5
8,66c8edc37c95ce7dc6b3f5059903b586,basketball_nba,NBA,2024-02-15T01:40:00Z,Dallas Mavericks,San Antonio Spurs,draftkings,DraftKings,2024-02-14T06:07:54Z,Dallas Mavericks,-110,-11.0,San Antonio Spurs,-110,11.0
9,83fa94fe55795402814cc80e29fdefd2,basketball_nba,NBA,2024-02-15T02:10:00Z,Denver Nuggets,Sacramento Kings,draftkings,DraftKings,2024-02-14T06:07:54Z,Denver Nuggets,-110,-6.0,Sacramento Kings,-110,6.0


In [16]:
odds_data

Unnamed: 0,id,sport_key,sport_title,commence_time,home_team,away_team,key,title,last_update,team1_name,team1_price,team1_point,team2_name,team2_price,team2_point
0,916e37accc708c4e76fd2f4e14a02d42,basketball_nba,NBA,2024-02-04T23:10:00Z,Charlotte Hornets,Indiana Pacers,draftkings,DraftKings,2024-02-05T01:25:20Z,Charlotte Hornets,-125.0,18.5,Indiana Pacers,-105.0,-18.5
1,6405ed37b375739beca42069272d8e2a,basketball_nba,NBA,2024-02-04T23:17:02Z,Miami Heat,Los Angeles Clippers,draftkings,DraftKings,2024-02-05T01:30:10Z,Los Angeles Clippers,-115.0,-5.5,Miami Heat,-115.0,5.5
2,2eaf2bc151d58288cfed10dcda63d9fd,basketball_nba,NBA,2024-02-05T00:10:00Z,Minnesota Timberwolves,Houston Rockets,draftkings,DraftKings,2024-02-05T01:30:10Z,Houston Rockets,-105.0,10.5,Minnesota Timberwolves,-125.0,-10.5
3,f660b1395a23d754eae81194c1712a2f,basketball_nba,NBA,2024-02-05T00:10:00Z,Oklahoma City Thunder,Toronto Raptors,draftkings,DraftKings,2024-02-05T01:30:10Z,Oklahoma City Thunder,-115.0,8.5,Toronto Raptors,-115.0,-8.5
4,964828ad63abbe09a1ce39b33e183ac0,basketball_nba,NBA,2024-02-05T01:10:00Z,Utah Jazz,Milwaukee Bucks,draftkings,DraftKings,2024-02-05T01:30:10Z,Milwaukee Bucks,-115.0,1.5,Utah Jazz,-115.0,-1.5
5,baf4676539c18a08af7b65c01d7a4c68,basketball_nba,NBA,2024-02-05T01:40:00Z,Denver Nuggets,Portland Trail Blazers,draftkings,DraftKings,2024-02-05T01:30:10Z,Denver Nuggets,-112.0,-15.0,Portland Trail Blazers,-108.0,15.0
6,c57c67cfb539cf4c05362a523bddb5c6,basketball_nba,NBA,2024-02-06T00:10:00Z,Charlotte Hornets,Los Angeles Lakers,draftkings,DraftKings,2024-02-05T04:03:01Z,Charlotte Hornets,-110.0,10.5,Los Angeles Lakers,-110.0,-10.5
7,8186cd1bf9532be759bb8eb7bde84e4b,basketball_nba,NBA,2024-02-06T00:10:00Z,Cleveland Cavaliers,Sacramento Kings,draftkings,DraftKings,2024-02-05T04:03:01Z,Cleveland Cavaliers,-110.0,-5.0,Sacramento Kings,-110.0,5.0
8,32f291b23ce6df0f9fe15c0a58c99b42,basketball_nba,NBA,2024-02-06T00:10:00Z,Philadelphia 76ers,Dallas Mavericks,draftkings,DraftKings,2024-02-05T04:03:01Z,Dallas Mavericks,-110.0,-1.5,Philadelphia 76ers,-110.0,1.5
9,3bcf1d078ea54ebd637faf85b1c467dd,basketball_nba,NBA,2024-02-06T00:40:00Z,Atlanta Hawks,Los Angeles Clippers,draftkings,DraftKings,2024-02-05T04:03:01Z,Atlanta Hawks,-110.0,3.0,Los Angeles Clippers,-110.0,-3.0


In [18]:
np.concatenate([new_odds['home_team'].unique(), new_odds['away_team'].unique()]).tolist()

['Charlotte Hornets',
 'Philadelphia 76ers',
 'Orlando Magic',
 'Boston Celtics',
 'Cleveland Cavaliers',
 'Toronto Raptors',
 'Memphis Grizzlies',
 'New Orleans Pelicans',
 'Dallas Mavericks',
 'Denver Nuggets',
 'Phoenix Suns',
 'Utah Jazz',
 'Golden State Warriors',
 'Atlanta Hawks',
 'Miami Heat',
 'New York Knicks',
 'Brooklyn Nets',
 'Chicago Bulls',
 'Indiana Pacers',
 'Houston Rockets',
 'Washington Wizards',
 'San Antonio Spurs',
 'Sacramento Kings',
 'Detroit Pistons',
 'Los Angeles Lakers',
 'Los Angeles Clippers']

In [19]:
# Identify the Team ID for each of the teams in odds
team_list = np.concatenate([new_odds['home_team'].unique(), new_odds['away_team'].unique()]).tolist()

# Clean up Team ID
team_meta = read_file('teams')
team_meta['CITY-TEAM'] = team_meta['CITY'] + ' ' + team_meta['NICKNAME']
team_meta = team_meta[['TEAM_ID','CITY-TEAM']]

#Merge in Team ID
odds_data = odds_data.merge(team_meta, left_on='home_team', right_on='CITY-TEAM', how='left').rename(columns={'TEAM_ID':'home_team_id'}).drop('CITY-TEAM',axis=1)
odds_data = odds_data.merge(team_meta, left_on='away_team', right_on='CITY-TEAM', how='left').rename(columns={'TEAM_ID':'away_team_id'}).drop('CITY-TEAM',axis=1)

In [11]:
team_list

['Charlotte Hornets',
 'Miami Heat',
 'Minnesota Timberwolves',
 'Oklahoma City Thunder',
 'Utah Jazz',
 'Denver Nuggets',
 'Cleveland Cavaliers',
 'Philadelphia 76ers',
 'Atlanta Hawks',
 'Brooklyn Nets',
 'New Orleans Pelicans',
 'Los Angeles Lakers',
 'Orlando Magic',
 'Boston Celtics',
 'Toronto Raptors',
 'Memphis Grizzlies',
 'Dallas Mavericks',
 'Phoenix Suns',
 'Golden State Warriors',
 'Indiana Pacers',
 'Los Angeles Clippers',
 'Houston Rockets',
 'Toronto Raptors',
 'Milwaukee Bucks',
 'Portland Trail Blazers',
 'Los Angeles Lakers',
 'Sacramento Kings',
 'Dallas Mavericks',
 'Golden State Warriors',
 'Detroit Pistons',
 'Atlanta Hawks',
 'Miami Heat',
 'New York Knicks',
 'Brooklyn Nets',
 'Chicago Bulls',
 'Washington Wizards',
 'San Antonio Spurs']

In [20]:
odds_data

Unnamed: 0,id,sport_key,sport_title,commence_time,home_team,away_team,key,title,last_update,team1_name,team1_price,team1_point,team2_name,team2_price,team2_point,home_team_id,away_team_id
0,916e37accc708c4e76fd2f4e14a02d42,basketball_nba,NBA,2024-02-04T23:10:00Z,Charlotte Hornets,Indiana Pacers,draftkings,DraftKings,2024-02-05T01:25:20Z,Charlotte Hornets,-125.0,18.5,Indiana Pacers,-105.0,-18.5,1610612766,1610612754
1,6405ed37b375739beca42069272d8e2a,basketball_nba,NBA,2024-02-04T23:17:02Z,Miami Heat,Los Angeles Clippers,draftkings,DraftKings,2024-02-05T01:30:10Z,Los Angeles Clippers,-115.0,-5.5,Miami Heat,-115.0,5.5,1610612748,1610612746
2,2eaf2bc151d58288cfed10dcda63d9fd,basketball_nba,NBA,2024-02-05T00:10:00Z,Minnesota Timberwolves,Houston Rockets,draftkings,DraftKings,2024-02-05T01:30:10Z,Houston Rockets,-105.0,10.5,Minnesota Timberwolves,-125.0,-10.5,1610612750,1610612745
3,f660b1395a23d754eae81194c1712a2f,basketball_nba,NBA,2024-02-05T00:10:00Z,Oklahoma City Thunder,Toronto Raptors,draftkings,DraftKings,2024-02-05T01:30:10Z,Oklahoma City Thunder,-115.0,8.5,Toronto Raptors,-115.0,-8.5,1610612760,1610612761
4,964828ad63abbe09a1ce39b33e183ac0,basketball_nba,NBA,2024-02-05T01:10:00Z,Utah Jazz,Milwaukee Bucks,draftkings,DraftKings,2024-02-05T01:30:10Z,Milwaukee Bucks,-115.0,1.5,Utah Jazz,-115.0,-1.5,1610612762,1610612749
5,baf4676539c18a08af7b65c01d7a4c68,basketball_nba,NBA,2024-02-05T01:40:00Z,Denver Nuggets,Portland Trail Blazers,draftkings,DraftKings,2024-02-05T01:30:10Z,Denver Nuggets,-112.0,-15.0,Portland Trail Blazers,-108.0,15.0,1610612743,1610612757
6,c57c67cfb539cf4c05362a523bddb5c6,basketball_nba,NBA,2024-02-06T00:10:00Z,Charlotte Hornets,Los Angeles Lakers,draftkings,DraftKings,2024-02-05T04:03:01Z,Charlotte Hornets,-110.0,10.5,Los Angeles Lakers,-110.0,-10.5,1610612766,1610612747
7,8186cd1bf9532be759bb8eb7bde84e4b,basketball_nba,NBA,2024-02-06T00:10:00Z,Cleveland Cavaliers,Sacramento Kings,draftkings,DraftKings,2024-02-05T04:03:01Z,Cleveland Cavaliers,-110.0,-5.0,Sacramento Kings,-110.0,5.0,1610612739,1610612758
8,32f291b23ce6df0f9fe15c0a58c99b42,basketball_nba,NBA,2024-02-06T00:10:00Z,Philadelphia 76ers,Dallas Mavericks,draftkings,DraftKings,2024-02-05T04:03:01Z,Dallas Mavericks,-110.0,-1.5,Philadelphia 76ers,-110.0,1.5,1610612755,1610612742
9,3bcf1d078ea54ebd637faf85b1c467dd,basketball_nba,NBA,2024-02-06T00:40:00Z,Atlanta Hawks,Los Angeles Clippers,draftkings,DraftKings,2024-02-05T04:03:01Z,Atlanta Hawks,-110.0,3.0,Los Angeles Clippers,-110.0,-3.0,1610612737,1610612746


# Function that prepares data and predicts with the current model

In [213]:
team_id_list = np.concatenate([odds_data['home_team_id'].unique(), odds_data['away_team_id'].unique()]).tolist()
team_id_list

[1610612766,
 1610612748,
 1610612750,
 1610612760,
 1610612762,
 1610612743,
 1610612739,
 1610612755,
 1610612737,
 1610612751,
 1610612740,
 1610612754,
 1610612746,
 1610612745,
 1610612761,
 1610612749,
 1610612757,
 1610612747,
 1610612758,
 1610612742,
 1610612744]

2023

In [217]:
gdf2[gdf2['SEASON']==gdf2['SEASON'].max()]

Unnamed: 0,GAME_ID,TEAM_ID,SEASON,GAME_DATE_EST,AST,BLK,DREB,FG3A,FG3_PCT,FGA,FG_PCT,FTA,FT_PCT,OREB,PF,STL,TO,PTS
3351,12300001,1610612742,2023,2023-10-05,18.0,8.0,35.0,46.0,0.566667,92.0,0.304348,30.0,0.369565,7.0,25.0,11.0,13.0,99.0
3352,12300001,1610612750,2023,2023-10-05,28.0,12.0,45.0,35.0,0.727273,95.0,0.314286,22.0,0.442105,11.0,20.0,7.0,16.0,111.0
3353,12300002,1610612742,2023,2023-10-07,19.0,6.0,36.0,46.0,0.705882,94.0,0.304348,17.0,0.372340,15.0,15.0,8.0,20.0,96.0
3354,12300002,1610612750,2023,2023-10-07,25.0,8.0,33.0,31.0,0.750000,90.0,0.451613,16.0,0.433333,13.0,23.0,9.0,18.0,104.0
3355,12300003,1610612744,2023,2023-10-07,22.0,7.0,42.0,42.0,0.666667,79.0,0.357143,21.0,0.468354,7.0,16.0,8.0,16.0,103.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52926,22300701,1610612752,2023,2024-02-03,21.0,4.0,33.0,35.0,0.866667,100.0,0.285714,15.0,0.410000,18.0,20.0,1.0,7.0,105.0
52927,22300702,1610612742,2023,2024-02-03,22.0,1.0,34.0,39.0,0.913043,81.0,0.410256,23.0,0.493827,8.0,19.0,9.0,21.0,117.0
52928,22300702,1610612749,2023,2024-02-03,34.0,2.0,31.0,30.0,0.680000,88.0,0.400000,25.0,0.568182,5.0,21.0,13.0,11.0,129.0
52929,22300703,1610612739,2023,2024-02-03,31.0,4.0,41.0,44.0,0.923077,97.0,0.295455,13.0,0.474227,10.0,19.0,6.0,9.0,117.0


In [218]:
gdf = read_file("games_details")

gdf = gdf.groupby(['GAME_ID','TEAM_ID'],as_index=False)[['FGM', 'FGA', 'FG3M', 'FG3A', 'FTM', 'FTA',
    'OREB', 'DREB', 'REB', 'AST', 'STL', 'BLK', 'TO', 'PF', 'PTS']].sum()

gdf['FT_PCT'] = gdf['FGM'] / gdf['FGA']
gdf['FG_PCT'] = gdf['FG3M'] / gdf['FG3A']
gdf['FG3_PCT'] = gdf['FTM'] / gdf['FTA']

gdf.drop(['FGM','FG3M', 'FTM', 'REB'],axis=1, inplace=True)

df = read_file("games")

# Convert 'GAME_DATE_EST' to datetime
df['GAME_DATE_EST'] = pd.to_datetime(df['GAME_DATE_EST'])

# Create game meta df
gmeta = df[['GAME_ID','SEASON', 'GAME_DATE_EST']]
#Merge date_count with gdf
gdf2 = gdf.merge(gmeta, on='GAME_ID', how='left').sort_values(by=['SEASON', 'GAME_DATE_EST'])

# Identify non-id stat columns
non_id_columns = gdf2.columns.difference(['GAME_ID', 'TEAM_ID', 'SEASON', 'GAME_DATE_EST', 'date_rank'])

#Re-arragne columns
gdf2 = gdf2[['GAME_ID', 'TEAM_ID', 'SEASON', 'GAME_DATE_EST'] + non_id_columns.to_list()]
gdf2['PTS'] = gdf2.pop('PTS')

# Filter for current season
gdf2 = gdf2[gdf2['SEASON']==gdf2['SEASON'].max()]

# Filter for only upcoming teams
gdf2[gdf2['TEAM_ID'].isin(team_id_list)]

Unnamed: 0,GAME_ID,TEAM_ID,SEASON,GAME_DATE_EST,AST,BLK,DREB,FG3A,FG3_PCT,FGA,FG_PCT,FTA,FT_PCT,OREB,PF,STL,TO,PTS
3351,12300001,1610612742,2023,2023-10-05,18.0,8.0,35.0,46.0,0.566667,92.0,0.304348,30.0,0.369565,7.0,25.0,11.0,13.0,99.0
3352,12300001,1610612750,2023,2023-10-05,28.0,12.0,45.0,35.0,0.727273,95.0,0.314286,22.0,0.442105,11.0,20.0,7.0,16.0,111.0
3353,12300002,1610612742,2023,2023-10-07,19.0,6.0,36.0,46.0,0.705882,94.0,0.304348,17.0,0.372340,15.0,15.0,8.0,20.0,96.0
3354,12300002,1610612750,2023,2023-10-07,25.0,8.0,33.0,31.0,0.750000,90.0,0.451613,16.0,0.433333,13.0,23.0,9.0,18.0,104.0
3355,12300003,1610612744,2023,2023-10-07,22.0,7.0,42.0,42.0,0.666667,79.0,0.357143,21.0,0.468354,7.0,16.0,8.0,16.0,103.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52924,22300700,1610612758,2023,2024-02-03,33.0,1.0,36.0,39.0,0.827586,84.0,0.384615,29.0,0.500000,4.0,18.0,12.0,16.0,123.0
52925,22300701,1610612747,2023,2024-02-03,28.0,11.0,35.0,31.0,0.851852,80.0,0.387097,27.0,0.487500,3.0,16.0,4.0,5.0,113.0
52927,22300702,1610612742,2023,2024-02-03,22.0,1.0,34.0,39.0,0.913043,81.0,0.410256,23.0,0.493827,8.0,19.0,9.0,21.0,117.0
52928,22300702,1610612749,2023,2024-02-03,34.0,2.0,31.0,30.0,0.680000,88.0,0.400000,25.0,0.568182,5.0,21.0,13.0,11.0,129.0


In [277]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

def prepare_data():
    # Summarize game detail stats
    gdf = read_file("games_details")
    
    gdf = gdf.groupby(['GAME_ID','TEAM_ID'],as_index=False)[['FGM', 'FGA', 'FG3M', 'FG3A', 'FTM', 'FTA',
        'OREB', 'DREB', 'REB', 'AST', 'STL', 'BLK', 'TO', 'PF', 'PTS']].sum()

    gdf['FT_PCT'] = gdf['FGM'] / gdf['FGA']
    gdf['FG_PCT'] = gdf['FG3M'] / gdf['FG3A']
    gdf['FG3_PCT'] = gdf['FTM'] / gdf['FTA']

    gdf.drop(['FGM','FG3M', 'FTM', 'REB'],axis=1, inplace=True)
    
    df = read_file("games")

    # Convert 'GAME_DATE_EST' to datetime
    df['GAME_DATE_EST'] = pd.to_datetime(df['GAME_DATE_EST'])

    # Create game meta df
    gmeta = df[['GAME_ID','SEASON', 'GAME_DATE_EST']]
    #Merge date_count with gdf
    gdf2 = gdf.merge(gmeta, on='GAME_ID', how='left').sort_values(by=['SEASON', 'GAME_DATE_EST'])

    # Identify non-id stat columns
    non_id_columns = gdf2.columns.difference(['GAME_ID', 'TEAM_ID', 'SEASON', 'GAME_DATE_EST', 'date_rank'])

    #Re-arragne columns
    gdf2 = gdf2[['GAME_ID', 'TEAM_ID', 'SEASON', 'GAME_DATE_EST'] + non_id_columns.to_list()]
    gdf2['PTS'] = gdf2.pop('PTS')

    # Filter for current season
    gdf2 = gdf2[gdf2['SEASON']==gdf2['SEASON'].max()]

    # Filter for only upcoming teams
    gdf2 = gdf2[gdf2['TEAM_ID'].isin(team_id_list)]

    gdf3 = gdf2.copy()
    
    # make gdf2 to numpy for efficiency
    gdf2 = gdf2.to_numpy()

    # Identify non-id stat columns
    non_id_columns = np.setdiff1d(np.arange(gdf2.shape[1]), [0, 1, 2, 3]) 

    #Create placeholder numpy memory
    result_array = np.empty((gdf2.shape[0], 4), dtype=object)

    for i, row in enumerate(gdf2):
        curr_team = row[1]  # Assuming 'TEAM_ID' is at index 1
        curr_season = row[2]  # Assuming 'SEASON' is at index 2
        curr_date = row[3]  # Assuming 'GAME_DATE_EST' is at index 17

        # Filter rows based on conditions using boolean indexing
        temp_array = gdf2[(gdf2[:, 1] == curr_team) & (gdf2[:, 2] == curr_season) & (gdf2[:, 3] < curr_date)]

        if temp_array.shape[0] > 0:
            # Calculate mean for each stat using vectorized operations
            mean_values = np.nanmean(temp_array[:, non_id_columns], axis=0)
        else:
            # If tempdf is empty, set mean_values to NaN
            mean_values = np.full(len(non_id_columns), np.nan)

        # Flatten the array and assign values to the result array
        result_array[i, 0] = curr_team
        result_array[i, 1] = curr_season
        result_array[i, 2] = curr_date
        result_array[i, 3] = mean_values.tolist()

    model_df = np.hstack((gdf2[:, 0].reshape(-1, 1), result_array))
    model_df = np.hstack([model_df[:, :4], np.nan_to_num(np.vstack(model_df[:, 4]))])

    rolling_szn_avgs = pd.DataFrame(model_df,columns=['GAME_ID', 'TEAM_ID', 'SEASON', 'GAME_DATE_EST', 'AST', 'BLK', 'DREB',
        'FG3A', 'FG3_PCT', 'FGA', 'FG_PCT', 'FTA', 'FT_PCT', 'OREB', 'PF',
        'STL', 'TO', 'PTS'])
    
    # Group by 'TEAM_ID' and find the index of the row with the maximum 'GAME_DATE_EST' value in each group
    max_indices = rolling_szn_avgs.groupby('TEAM_ID')['GAME_DATE_EST'].idxmax()

    # Use the indices to extract the corresponding rows
    current_day_avg_data = rolling_szn_avgs.loc[max_indices]

    odd2 = odds_data.copy()

    # Merge df with home stats
    odd2 = odd2.merge(current_day_avg_data, how='left', left_on=['home_team_id'], right_on=['TEAM_ID']).drop('GAME_ID',axis=1)
    odd2 = odd2[~odd2['TEAM_ID'].isnull()].drop(['TEAM_ID'], axis=1)

    # Merge df with away stats
    odd2 = odd2.merge(current_day_avg_data, how='left', left_on=['away_team_id'], right_on=['TEAM_ID'],suffixes=('_HOME','_AWAY')).drop('GAME_ID',axis=1)
    #odd2 = odd2.merge(rolling_szn_avgs, how='left', left_on=['GAME_ID','VISITOR_TEAM_ID'], right_on=['TEAM_ID'], suffixes=('_HOME','_AWAY'))
    odd2 = odd2[~odd2['TEAM_ID'].isnull()].drop(['TEAM_ID'], axis=1)
    return odd2

def predict_with_model():
    pass

In [333]:
data = prepare_data()
data

Unnamed: 0,id,sport_key,sport_title,commence_time,home_team,away_team,key,title,last_update,team1_name,team1_price,team1_point,team2_name,team2_price,team2_point,home_team_id,away_team_id,SEASON_HOME,GAME_DATE_EST_HOME,AST_HOME,BLK_HOME,DREB_HOME,FG3A_HOME,FG3_PCT_HOME,FGA_HOME,FG_PCT_HOME,FTA_HOME,FT_PCT_HOME,OREB_HOME,PF_HOME,STL_HOME,TO_HOME,PTS_HOME,SEASON_AWAY,GAME_DATE_EST_AWAY,AST_AWAY,BLK_AWAY,DREB_AWAY,FG3A_AWAY,FG3_PCT_AWAY,FGA_AWAY,FG_PCT_AWAY,FTA_AWAY,FT_PCT_AWAY,OREB_AWAY,PF_AWAY,STL_AWAY,TO_AWAY,PTS_AWAY
0,916e37accc708c4e76fd2f4e14a02d42,basketball_nba,NBA,2024-02-04T23:10:00Z,Charlotte Hornets,Indiana Pacers,draftkings,DraftKings,2024-02-05T01:25:20Z,Charlotte Hornets,-125.0,18.5,Indiana Pacers,-105.0,-18.5,1610612766,1610612754,2023,2024-02-02,24.78,5.1,31.34,33.18,0.787222,89.1,0.347539,19.72,0.456932,10.64,19.38,6.68,13.52,108.24,2023,2024-02-02,30.722222,6.148148,31.148148,36.981481,0.786241,93.055556,0.370014,21.722222,0.498278,10.444444,22.814815,7.740741,13.12963,123.240741
1,6405ed37b375739beca42069272d8e2a,basketball_nba,NBA,2024-02-04T23:17:02Z,Miami Heat,Los Angeles Clippers,draftkings,DraftKings,2024-02-05T01:30:10Z,Los Angeles Clippers,-115.0,-5.5,Miami Heat,-115.0,5.5,1610612748,1610612746,2023,2024-02-02,25.698113,3.415094,32.471698,33.415094,0.820285,85.792453,0.375036,23.0,0.46147,9.679245,18.735849,7.207547,12.886792,110.54717,2023,2024-02-02,25.76,5.48,33.1,32.8,0.812601,87.76,0.383951,22.78,0.489541,10.68,19.72,8.18,12.3,116.74
2,2eaf2bc151d58288cfed10dcda63d9fd,basketball_nba,NBA,2024-02-05T00:10:00Z,Minnesota Timberwolves,Houston Rockets,draftkings,DraftKings,2024-02-05T01:30:10Z,Houston Rockets,-105.0,10.5,Minnesota Timberwolves,-125.0,-10.5,1610612750,1610612745,2023,2024-02-02,26.615385,5.788462,34.75,32.038462,0.774676,84.211538,0.394279,23.346154,0.49068,9.134615,19.634615,7.461538,14.480769,113.057692,2023,2024-02-02,25.019231,4.461538,35.115385,34.307692,0.779075,89.653846,0.345525,24.307692,0.459954,10.692308,21.634615,7.980769,12.480769,113.173077
3,f660b1395a23d754eae81194c1712a2f,basketball_nba,NBA,2024-02-05T00:10:00Z,Oklahoma City Thunder,Toronto Raptors,draftkings,DraftKings,2024-02-05T01:30:10Z,Oklahoma City Thunder,-115.0,8.5,Toronto Raptors,-115.0,-8.5,1610612760,1610612761,2023,2024-02-02,27.113208,6.301887,32.981132,33.698113,0.82379,89.075472,0.384167,22.924528,0.498419,8.849057,19.735849,8.169811,11.90566,120.433962,2023,2024-02-02,29.56,5.0,32.72,32.88,0.74218,89.14,0.356375,23.18,0.480959,11.28,19.06,7.66,13.28,114.66
4,964828ad63abbe09a1ce39b33e183ac0,basketball_nba,NBA,2024-02-05T01:10:00Z,Utah Jazz,Milwaukee Bucks,draftkings,DraftKings,2024-02-05T01:30:10Z,Milwaukee Bucks,-115.0,1.5,Utah Jazz,-115.0,-1.5,1610612762,1610612749,2023,2024-02-01,28.056604,5.867925,33.962264,36.716981,0.812491,90.433962,0.35533,23.54717,0.470933,12.660377,19.320755,6.830189,15.358491,117.132075,2023,2024-02-03,26.490566,5.45283,35.169811,38.169811,0.772974,89.962264,0.371084,25.943396,0.490134,9.698113,20.169811,6.849057,12.90566,122.377358
5,baf4676539c18a08af7b65c01d7a4c68,basketball_nba,NBA,2024-02-05T01:40:00Z,Denver Nuggets,Portland Trail Blazers,draftkings,DraftKings,2024-02-05T01:30:10Z,Denver Nuggets,-112.0,-15.0,Portland Trail Blazers,-108.0,15.0,1610612743,1610612757,2023,2024-02-02,28.648148,5.407407,32.962963,31.814815,0.747823,88.425926,0.367209,21.074074,0.493052,10.722222,18.981481,7.092593,11.981481,114.5,2023,2024-02-02,22.803922,4.647059,29.627451,34.313725,0.79099,90.176471,0.354901,21.686275,0.442991,11.823529,20.666667,8.078431,14.294118,108.980392
6,c57c67cfb539cf4c05362a523bddb5c6,basketball_nba,NBA,2024-02-06T00:10:00Z,Charlotte Hornets,Los Angeles Lakers,draftkings,DraftKings,2024-02-05T04:03:01Z,Charlotte Hornets,-110.0,10.5,Los Angeles Lakers,-110.0,-10.5,1610612766,1610612747,2023,2024-02-02,24.78,5.1,31.34,33.18,0.787222,89.1,0.347539,19.72,0.456932,10.64,19.38,6.68,13.52,108.24,2023,2024-02-03,27.578947,5.684211,34.754386,30.912281,0.76005,87.421053,0.361294,24.824561,0.488009,8.438596,17.140351,8.122807,14.157895,115.421053
7,8186cd1bf9532be759bb8eb7bde84e4b,basketball_nba,NBA,2024-02-06T00:10:00Z,Cleveland Cavaliers,Sacramento Kings,draftkings,DraftKings,2024-02-05T04:03:01Z,Cleveland Cavaliers,-110.0,-5.0,Sacramento Kings,-110.0,5.0,1610612739,1610612758,2023,2024-02-03,26.714286,4.653061,34.040816,37.122449,0.774683,88.571429,0.357364,20.918367,0.477019,10.673469,19.102041,7.938776,13.408163,113.714286,2023,2024-02-03,28.557692,4.192308,32.903846,40.711538,0.726535,91.173077,0.364004,21.884615,0.475772,10.423077,20.346154,7.211538,12.980769,117.538462
8,32f291b23ce6df0f9fe15c0a58c99b42,basketball_nba,NBA,2024-02-06T00:10:00Z,Philadelphia 76ers,Dallas Mavericks,draftkings,DraftKings,2024-02-05T04:03:01Z,Dallas Mavericks,-110.0,-1.5,Philadelphia 76ers,-110.0,1.5,1610612755,1610612742,2023,2024-02-03,25.019608,6.039216,31.901961,32.294118,0.82882,89.254902,0.361696,26.686275,0.475211,11.039216,21.117647,9.215686,11.607843,118.627451,2023,2024-02-03,25.176471,4.509804,31.666667,40.745098,0.75158,90.078431,0.367667,23.745098,0.469707,10.098039,19.196078,6.666667,12.058824,117.352941
9,3bcf1d078ea54ebd637faf85b1c467dd,basketball_nba,NBA,2024-02-06T00:40:00Z,Atlanta Hawks,Los Angeles Clippers,draftkings,DraftKings,2024-02-05T04:03:01Z,Atlanta Hawks,-110.0,3.0,Los Angeles Clippers,-110.0,-3.0,1610612737,1610612746,2023,2024-02-03,25.90566,4.283019,32.056604,37.811321,0.806614,92.660377,0.359251,24.641509,0.465628,12.773585,18.830189,8.188679,13.679245,119.54717,2023,2024-02-02,25.76,5.48,33.1,32.8,0.812601,87.76,0.383951,22.78,0.489541,10.68,19.72,8.18,12.3,116.74


In [334]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

#Drop Unnecessary Fields
drop = ['SEASON_HOME', 'SEASON_AWAY','GAME_DATE_EST_HOME', 
        'GAME_DATE_EST_AWAY', 'date_rank_HOME',
        'date_rank_AWAY','PTS_HOME', 'PTS_AWAY',
        'team1_name', 'team1_price','team1_point',
        'team2_name','team2_price','team2_point',
        'home_team_id', 'away_team_id']

target = ['HOME_PT_DIFF']
IDcol = ['id','sport_key','sport_title',
         'commence_time','home_team','away_team',
         'key','title','last_update']

predictors = [x for x in data.columns if x not in drop+target+IDcol]


X = data[predictors]
from sklearn.preprocessing import Normalizer, StandardScaler, MinMaxScaler, PowerTransformer, MaxAbsScaler, LabelEncoder

#Scale each variable
sc_X = StandardScaler()
X2 = pd.DataFrame(sc_X.fit_transform(X))

X2.columns = X.columns.values
X2.index = X.index.values
X = X2



In [335]:
predictors

['AST_HOME',
 'BLK_HOME',
 'DREB_HOME',
 'FG3A_HOME',
 'FG3_PCT_HOME',
 'FGA_HOME',
 'FG_PCT_HOME',
 'FTA_HOME',
 'FT_PCT_HOME',
 'OREB_HOME',
 'PF_HOME',
 'STL_HOME',
 'TO_HOME',
 'AST_AWAY',
 'BLK_AWAY',
 'DREB_AWAY',
 'FG3A_AWAY',
 'FG3_PCT_AWAY',
 'FGA_AWAY',
 'FG_PCT_AWAY',
 'FTA_AWAY',
 'FT_PCT_AWAY',
 'OREB_AWAY',
 'PF_AWAY',
 'STL_AWAY',
 'TO_AWAY']

In [336]:
data.columns

Index(['id', 'sport_key', 'sport_title', 'commence_time', 'home_team',
       'away_team', 'key', 'title', 'last_update', 'team1_name', 'team1_price',
       'team1_point', 'team2_name', 'team2_price', 'team2_point',
       'home_team_id', 'away_team_id', 'SEASON_HOME', 'GAME_DATE_EST_HOME',
       'AST_HOME', 'BLK_HOME', 'DREB_HOME', 'FG3A_HOME', 'FG3_PCT_HOME',
       'FGA_HOME', 'FG_PCT_HOME', 'FTA_HOME', 'FT_PCT_HOME', 'OREB_HOME',
       'PF_HOME', 'STL_HOME', 'TO_HOME', 'PTS_HOME', 'SEASON_AWAY',
       'GAME_DATE_EST_AWAY', 'AST_AWAY', 'BLK_AWAY', 'DREB_AWAY', 'FG3A_AWAY',
       'FG3_PCT_AWAY', 'FGA_AWAY', 'FG_PCT_AWAY', 'FTA_AWAY', 'FT_PCT_AWAY',
       'OREB_AWAY', 'PF_AWAY', 'STL_AWAY', 'TO_AWAY', 'PTS_AWAY'],
      dtype='object')

In [337]:
data.head()

Unnamed: 0,id,sport_key,sport_title,commence_time,home_team,away_team,key,title,last_update,team1_name,team1_price,team1_point,team2_name,team2_price,team2_point,home_team_id,away_team_id,SEASON_HOME,GAME_DATE_EST_HOME,AST_HOME,BLK_HOME,DREB_HOME,FG3A_HOME,FG3_PCT_HOME,FGA_HOME,FG_PCT_HOME,FTA_HOME,FT_PCT_HOME,OREB_HOME,PF_HOME,STL_HOME,TO_HOME,PTS_HOME,SEASON_AWAY,GAME_DATE_EST_AWAY,AST_AWAY,BLK_AWAY,DREB_AWAY,FG3A_AWAY,FG3_PCT_AWAY,FGA_AWAY,FG_PCT_AWAY,FTA_AWAY,FT_PCT_AWAY,OREB_AWAY,PF_AWAY,STL_AWAY,TO_AWAY,PTS_AWAY
0,916e37accc708c4e76fd2f4e14a02d42,basketball_nba,NBA,2024-02-04T23:10:00Z,Charlotte Hornets,Indiana Pacers,draftkings,DraftKings,2024-02-05T01:25:20Z,Charlotte Hornets,-125.0,18.5,Indiana Pacers,-105.0,-18.5,1610612766,1610612754,2023,2024-02-02,24.78,5.1,31.34,33.18,0.787222,89.1,0.347539,19.72,0.456932,10.64,19.38,6.68,13.52,108.24,2023,2024-02-02,30.722222,6.148148,31.148148,36.981481,0.786241,93.055556,0.370014,21.722222,0.498278,10.444444,22.814815,7.740741,13.12963,123.240741
1,6405ed37b375739beca42069272d8e2a,basketball_nba,NBA,2024-02-04T23:17:02Z,Miami Heat,Los Angeles Clippers,draftkings,DraftKings,2024-02-05T01:30:10Z,Los Angeles Clippers,-115.0,-5.5,Miami Heat,-115.0,5.5,1610612748,1610612746,2023,2024-02-02,25.698113,3.415094,32.471698,33.415094,0.820285,85.792453,0.375036,23.0,0.46147,9.679245,18.735849,7.207547,12.886792,110.54717,2023,2024-02-02,25.76,5.48,33.1,32.8,0.812601,87.76,0.383951,22.78,0.489541,10.68,19.72,8.18,12.3,116.74
2,2eaf2bc151d58288cfed10dcda63d9fd,basketball_nba,NBA,2024-02-05T00:10:00Z,Minnesota Timberwolves,Houston Rockets,draftkings,DraftKings,2024-02-05T01:30:10Z,Houston Rockets,-105.0,10.5,Minnesota Timberwolves,-125.0,-10.5,1610612750,1610612745,2023,2024-02-02,26.615385,5.788462,34.75,32.038462,0.774676,84.211538,0.394279,23.346154,0.49068,9.134615,19.634615,7.461538,14.480769,113.057692,2023,2024-02-02,25.019231,4.461538,35.115385,34.307692,0.779075,89.653846,0.345525,24.307692,0.459954,10.692308,21.634615,7.980769,12.480769,113.173077
3,f660b1395a23d754eae81194c1712a2f,basketball_nba,NBA,2024-02-05T00:10:00Z,Oklahoma City Thunder,Toronto Raptors,draftkings,DraftKings,2024-02-05T01:30:10Z,Oklahoma City Thunder,-115.0,8.5,Toronto Raptors,-115.0,-8.5,1610612760,1610612761,2023,2024-02-02,27.113208,6.301887,32.981132,33.698113,0.82379,89.075472,0.384167,22.924528,0.498419,8.849057,19.735849,8.169811,11.90566,120.433962,2023,2024-02-02,29.56,5.0,32.72,32.88,0.74218,89.14,0.356375,23.18,0.480959,11.28,19.06,7.66,13.28,114.66
4,964828ad63abbe09a1ce39b33e183ac0,basketball_nba,NBA,2024-02-05T01:10:00Z,Utah Jazz,Milwaukee Bucks,draftkings,DraftKings,2024-02-05T01:30:10Z,Milwaukee Bucks,-115.0,1.5,Utah Jazz,-115.0,-1.5,1610612762,1610612749,2023,2024-02-01,28.056604,5.867925,33.962264,36.716981,0.812491,90.433962,0.35533,23.54717,0.470933,12.660377,19.320755,6.830189,15.358491,117.132075,2023,2024-02-03,26.490566,5.45283,35.169811,38.169811,0.772974,89.962264,0.371084,25.943396,0.490134,9.698113,20.169811,6.849057,12.90566,122.377358


In [338]:
trained_sequential_models[0]

<keras.src.engine.sequential.Sequential at 0x148aa5600>

In [348]:
# make predictions
y_pred = trained_sequential_models[0].predict(X)

# Line up predictions with the dataset
data['predictions'] = np.round(y_pred)

# Create new dataset with just the date, the teams, the bet spread, and predicted spread
d2 = data[['commence_time','last_update','home_team','away_team','key', 'team1_price','team1_point', 'predictions']]
d2['hometeam-cover-predictions'] = np.where(d2.predictions < d2.team1_point, "Fail to Cover", "Cover")

# Decline bet if it's within 2 points of odds
#d2['should_you_bet'] = abs(d2['team1_price']) + 100
d2
# send the data to email
client_id = '601583171279-9foq2l1ibhrio6422hk98pspf2g7plht.apps.googleusercontent.com'
client_secret= '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'
# keep track of all models / date run / current RMSE / 
# come up with performance tracker?
# cleanup



Unnamed: 0,commence_time,last_update,home_team,away_team,key,team1_price,team1_point,predictions,hometeam-cover-predictions,winnings
0,2024-02-04T23:10:00Z,2024-02-05T01:25:20Z,Charlotte Hornets,Indiana Pacers,draftkings,-125.0,18.5,-1.0,Fail to Cover,225.0
1,2024-02-04T23:17:02Z,2024-02-05T01:30:10Z,Miami Heat,Los Angeles Clippers,draftkings,-115.0,-5.5,-3.0,Cover,215.0
2,2024-02-05T00:10:00Z,2024-02-05T01:30:10Z,Minnesota Timberwolves,Houston Rockets,draftkings,-105.0,10.5,9.0,Fail to Cover,205.0
3,2024-02-05T00:10:00Z,2024-02-05T01:30:10Z,Oklahoma City Thunder,Toronto Raptors,draftkings,-115.0,8.5,8.0,Fail to Cover,215.0
4,2024-02-05T01:10:00Z,2024-02-05T01:30:10Z,Utah Jazz,Milwaukee Bucks,draftkings,-115.0,1.5,-4.0,Fail to Cover,215.0
5,2024-02-05T01:40:00Z,2024-02-05T01:30:10Z,Denver Nuggets,Portland Trail Blazers,draftkings,-112.0,-15.0,18.0,Cover,212.0
6,2024-02-06T00:10:00Z,2024-02-05T04:03:01Z,Charlotte Hornets,Los Angeles Lakers,draftkings,-110.0,10.5,-20.0,Fail to Cover,210.0
7,2024-02-06T00:10:00Z,2024-02-05T04:03:01Z,Cleveland Cavaliers,Sacramento Kings,draftkings,-110.0,-5.0,8.0,Cover,210.0
8,2024-02-06T00:10:00Z,2024-02-05T04:03:01Z,Philadelphia 76ers,Dallas Mavericks,draftkings,-110.0,-1.5,6.0,Cover,210.0
9,2024-02-06T00:40:00Z,2024-02-05T04:03:01Z,Atlanta Hawks,Los Angeles Clippers,draftkings,-110.0,3.0,-1.0,Fail to Cover,210.0


In [350]:
import os
import base64
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from email.mime.base import MIMEBase
from email import encoders
import pandas as pd
from google.oauth2.credentials import Credentials
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request
from googleapiclient.discovery import build

# Define your email parameters
sender_email = 'alecarnassi@gmail.com'
receiver_email = 'alecarnassi@gmail.com'
subject = 'DataFrame Attachment Test'
message_text = 'Please find the attached Sports Bets.'

# Convert DataFrame to CSV
csv_content = d2.to_csv(index=False)

# Create a multipart message
message = MIMEMultipart()
message['From'] = sender_email
message['To'] = receiver_email
message['Subject'] = subject

# Attach the message text
message.attach(MIMEText(message_text, 'plain'))

# Attach the CSV file
attachment = MIMEBase('application', 'octet-stream')
attachment.set_payload(csv_content)
encoders.encode_base64(attachment)
attachment.add_header('Content-Disposition', 'attachment', filename='d2.csv')
message.attach(attachment)

# Load Gmail API credentials
creds = None
token_file = 'token.json'
if os.path.exists(token_file):
    creds = Credentials.from_authorized_user_file(token_file)
if not creds or not creds.valid:
    if creds and creds.expired and creds.refresh_token:
        creds.refresh(Request())
    else:
        flow = InstalledAppFlow.from_client_secrets_file('/Users/alecnaidoo/Downloads/client_secret_601583171279-9foq2l1ibhrio6422hk98pspf2g7plht.apps.googleusercontent.com.json', ['https://www.googleapis.com/auth/gmail.send'])
        creds = flow.run_local_server(port=0)
    with open(token_file, 'w') as token:
        token.write(creds.to_json())

# Build the Gmail service
service = build('gmail', 'v1', credentials=creds)

# Send the email
raw_message = base64.urlsafe_b64encode(message.as_bytes()).decode()
service.users().messages().send(userId='me', body={'raw': raw_message}).execute()


Please visit this URL to authorize this application: https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=601583171279-9foq2l1ibhrio6422hk98pspf2g7plht.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A51078%2F&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fgmail.send&state=jIUWWWVb09n3AhJ8p8NpZDREXcJn9l&access_type=offline


{'id': '18da1417d262ad63',
 'threadId': '18da1417d262ad63',
 'labelIds': ['UNREAD', 'SENT', 'INBOX']}

In [None]:
# Organize code
    # create new libraries