In [33]:
import difflib
import functools
import json
import time

import numpy as np
import pandas as pd
import requests
from nba_api.stats.endpoints import cumestatsteamgames, cumestatsteam, gamerotation
from nba_api.stats.static import teams, players

In [35]:
def retry(func, retries=3, delay=30):
    """Decorator that calls ``func`` again when it raises a ``requests`` error.

    Parameters
    ----------
    func : callable
        Function to wrap.
    retries : int, default 3
        Maximum number of times ``func`` is called.
    delay : float, default 30
        Seconds to sleep between a failed attempt and the next one.

    Returns
    -------
    callable
        Wrapper that forwards its arguments to ``func`` and returns its result.

    Raises
    ------
    requests.exceptions.RequestException
        The error raised by the last attempt once all ``retries`` attempts have
        failed. The wrapper surfaces the failure instead of returning ``None``,
        because a ``None`` result is either dropped silently by ``pd.concat``
        or fails later with an unrelated ``AttributeError``.
    """
    @functools.wraps(func)
    def retry_wrapper(*args, **kwargs):
        for attempt in range(1, retries + 1):
            try:
                return func(*args, **kwargs)
            except requests.exceptions.RequestException as e:
                print(e)
                if attempt == retries:
                    # Out of attempts: re-raise rather than sleeping once more
                    # and handing the caller an implicit None.
                    raise
                time.sleep(delay)

    return retry_wrapper

In [37]:
# Get Season Schedule Function 

def getSeasonScheduleFrame(seasons,seasonType):
    """Download every team's game list for the given seasons, one row per game.

    Parameters
    ----------
    seasons : iterable of int
        Season start years; 2021 is sent to the API as '2021-22'.
    seasonType : str
        Forwarded as ``season_type_all_star`` (e.g. 'Regular Season').

    Returns
    -------
    pandas.DataFrame
        The CumeStatsTeamGames rows plus SEASON, GAME_DATE, HOME_TEAM_NICKNAME,
        HOME_TEAM_ID, AWAY_TEAM_NICKNAME and AWAY_TEAM_ID, de-duplicated and
        re-indexed from 0.

    Raises
    ------
    ValueError
        If no schedule rows were retrieved, or a nickname parsed from MATCHUP
        cannot be matched to any known team.

    Notes
    -----
    One request is made per team per season, with a one second pause before
    each. MATCHUP is parsed positionally: the first 10 characters are taken as
    the date, the rest up to ' at' as the away nickname and everything after
    ' at' as the home nickname (presumably 'MM/DD/YYYY Away at Home' -- confirm
    against the API output).
    """
    teamLookup = pd.DataFrame(teams.get_teams())
    knownNicknames = list(teamLookup['nickname'])
    idByNickname = {}
    for nickname, teamID in zip(teamLookup['nickname'], teamLookup['id']):
        idByNickname.setdefault(nickname, teamID)  # first occurrence wins
    resolvedIDs = {}  # parsed nickname -> team id, so difflib runs once per distinct nickname

    def getGameDate(matchup):
        return matchup.partition(' at')[0][:10]

    def getHomeTeam(matchup):
        # partition leaves the separating space attached to the nickname
        return matchup.partition(' at')[2].strip()

    def getAwayTeam(matchup):
        return matchup.partition(' at')[0][10:].strip()

    def getTeamIDFromNickname(nickname):
        if nickname not in resolvedIDs:
            closest = difflib.get_close_matches(nickname, knownNicknames, 1)
            if not closest:
                raise ValueError(f"Could not match nickname {nickname!r} to a known team")
            resolvedIDs[nickname] = idByNickname[closest[0]]
        return resolvedIDs[nickname]

    @retry
    def getRegularSeasonSchedule(season,teamID,seasonType):
        season = str(season) + "-" + str(season+1)[-2:]
        teamGames = cumestatsteamgames.CumeStatsTeamGames(league_id = '00',season = season ,
                                                                      season_type_all_star=seasonType,
                                                                      team_id = teamID).get_normalized_json()

        teamGames = pd.DataFrame(json.loads(teamGames)['CumeStatsTeamGames'])
        teamGames['SEASON'] = season
        return teamGames

    # Collect the per-team frames and concatenate once; growing a DataFrame
    # inside the loop re-copies every earlier row on each iteration.
    teamFrames = []
    for season in seasons:
        for teamID in teamLookup['id']:
            time.sleep(1)
            teamGames = getRegularSeasonSchedule(season, teamID, seasonType)
            if teamGames is None:
                # Guard against a missing result (e.g. a retry wrapper that
                # returns None after giving up) instead of dropping it silently.
                print(f"No schedule returned for team {teamID}, season {season}; skipping")
                continue
            teamFrames.append(teamGames)

    if not teamFrames:
        raise ValueError("No schedule data was retrieved for the requested seasons")

    scheduleFrame = pd.concat(teamFrames, ignore_index=True)

    scheduleFrame['GAME_DATE'] = pd.to_datetime(scheduleFrame['MATCHUP'].map(getGameDate))
    scheduleFrame['HOME_TEAM_NICKNAME'] = scheduleFrame['MATCHUP'].map(getHomeTeam)
    scheduleFrame['HOME_TEAM_ID'] = scheduleFrame['HOME_TEAM_NICKNAME'].map(getTeamIDFromNickname)
    scheduleFrame['AWAY_TEAM_NICKNAME'] = scheduleFrame['MATCHUP'].map(getAwayTeam)
    scheduleFrame['AWAY_TEAM_ID'] = scheduleFrame['AWAY_TEAM_NICKNAME'].map(getTeamIDFromNickname)
    scheduleFrame = scheduleFrame.drop_duplicates() # There's a row for both teams, only need 1
    scheduleFrame = scheduleFrame.reset_index(drop=True)

    return scheduleFrame





In [38]:
# Get Single Game aggregation columns

def getSingleGameMetrics(gameID,homeTeamID,awayTeamID,awayTeamNickname,seasonYear,gameDate,seasonType="Regular Season"):
    """Fetch the team totals for one game and add per-game derived columns.

    Parameters
    ----------
    gameID : str
        Game identifier, forwarded as ``game_ids``.
    homeTeamID : int
        Team whose perspective is requested; its totals are read from row 0.
    awayTeamID : int
        Written into TEAM_ID of row 1.
    awayTeamNickname : str
        Written into NICKNAME of row 1.
    seasonYear : str
        Season value forwarded as ``season``. Callers in this notebook pass the
        schedule frame's SEASON value, e.g. '2021-22'.
    gameDate : datetime-like
        Stored in the GAME_DATE column of both rows.
    seasonType : str, default "Regular Season"
        Forwarded as ``season_type_all_star``.

    Returns
    -------
    pandas.DataFrame
        The 'TotalTeamStats' rows (row 0 = requested team, row 1 relabelled as
        the away team) with OFFENSIVE_EFFICIENCY, SCORING_MARGIN, SEASON,
        GAME_DATE and GAME_ID added.

    Raises
    ------
    RuntimeError
        If the request yields no result or fewer than the two rows needed.
    """
    @retry
    def getGameStats(teamID,gameID,seasonYear):
        gameStats = cumestatsteam.CumeStatsTeam(game_ids=gameID,league_id ="00",
                                               season=seasonYear,season_type_all_star=seasonType,
                                               team_id = teamID).get_normalized_json()

        return pd.DataFrame(json.loads(gameStats)['TotalTeamStats'])

    data = getGameStats(homeTeamID,gameID,seasonYear)
    if data is None or len(data) < 2:
        # Fail with the game id rather than an AttributeError/KeyError further down.
        raise RuntimeError(f"No complete TotalTeamStats (team row and opponent row) returned for game {gameID}")

    homeRow, awayRow = 0, 1

    # Row 1 is the opponent aggregate; label it with the actual away team.
    data.at[awayRow,'NICKNAME'] = awayTeamNickname
    data.at[awayRow,'TEAM_ID'] = awayTeamID

    def offensiveEfficiency(row):
        # (FG + AST) / (FGA - OFF_REB + AST + TOTAL_TURNOVERS)
        made = data.at[row,'FG'] + data.at[row,'AST']
        used = data.at[row,'FGA'] - data.at[row,'OFF_REB'] + data.at[row,'AST'] + data.at[row,'TOTAL_TURNOVERS']
        return made/used

    for row, otherRow in ((awayRow, homeRow), (homeRow, awayRow)):
        data.at[row,'OFFENSIVE_EFFICIENCY'] = offensiveEfficiency(row)
        data.at[row,'SCORING_MARGIN'] = data.at[row,'PTS'] - data.at[otherRow,'PTS']

    data['SEASON'] = seasonYear
    data['GAME_DATE'] = gameDate
    data['GAME_ID'] = gameID

    return data

In [39]:
def getGameLogs(gameLogs,scheduleFrame):
    """Fetch per-game team rows for the schedule and add season-to-date columns.

    Parameters
    ----------
    gameLogs : pandas.DataFrame
        Rows already collected (two per game), or an empty frame. Fetching
        resumes at schedule position ``int(len(gameLogs)/2)``, so a previously
        completed data set can be passed in to avoid re-downloading it.
    scheduleFrame : pandas.DataFrame
        One row per game with GAME_ID, HOME_TEAM_ID, AWAY_TEAM_ID,
        AWAY_TEAM_NICKNAME, SEASON and GAME_DATE, indexed 0..n-1.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) holding the existing and the
        newly fetched rows plus HOME_FLAG, AWAY_FLAG, HOME_WIN_PCTG,
        AWAY_WIN_PCTG, TOTAL_WIN_PCTG, ROLLING_SCORING_MARGIN, ROLLING_OE and
        NUM_REST_DAYS. Cumulative and rolling values are computed per
        (TEAM_ID, SEASON) in GAME_DATE order and include the current game.
        If there are no rows at all, the empty frame is returned unchanged.
    """

    def addTableLevelColumns(gameDF):
        # Flag each row as a home or an away appearance.
        gameDF['HOME_FLAG'] = np.where((gameDF['W_HOME']==1) | (gameDF['L_HOME']==1),1,0)
        gameDF['AWAY_FLAG'] = np.where((gameDF['W_ROAD']==1) | (gameDF['L_ROAD']==1),1,0)

        # Sort and group once; results carry the original row labels, so
        # assigning them back aligns on the index.
        teamSeason = ['TEAM_ID','SEASON']
        byTeamSeason = gameDF.sort_values(by='GAME_DATE').groupby(teamSeason)

        # 0/0 before a team's first home (or away) game yields NaN.
        gameDF['HOME_WIN_PCTG'] = byTeamSeason['W_HOME'].cumsum()/byTeamSeason['HOME_FLAG'].cumsum()
        gameDF['AWAY_WIN_PCTG'] = byTeamSeason['W_ROAD'].cumsum()/byTeamSeason['AWAY_FLAG'].cumsum()

        gamesPlayed = gameDF.groupby(teamSeason)['GAME_DATE'].rank(ascending=True)
        gameDF['TOTAL_WIN_PCTG'] = byTeamSeason['W'].cumsum()/gamesPlayed

        # Mean over the last 3 games, with as few as 1 game early in a season.
        gameDF['ROLLING_SCORING_MARGIN'] = byTeamSeason['SCORING_MARGIN'].transform(lambda x: x.rolling(3, 1).mean())
        gameDF['ROLLING_OE'] = byTeamSeason['OFFENSIVE_EFFICIENCY'].transform(lambda x: x.rolling(3, 1).mean())

        # Days since the team's previous game in the same season (NaN for the first).
        lastGameDate = byTeamSeason['GAME_DATE'].shift(1)
        gameDF['NUM_REST_DAYS'] = (gameDF['GAME_DATE'] - lastGameDate)/np.timedelta64(1,'D')
        return gameDF

    start = time.perf_counter_ns()

    firstNewGame = int(len(gameLogs)/2) # Can use a previously completed gameLog dataset

    # Accumulate frames and concatenate once instead of re-copying the whole
    # log for every game.
    frames = [gameLogs] if len(gameLogs) else []

    for i in range(firstNewGame, len(scheduleFrame)):
        time.sleep(1)
        frames.append(getSingleGameMetrics(scheduleFrame.at[i,'GAME_ID'], scheduleFrame.at[i,'HOME_TEAM_ID'],
                    scheduleFrame.at[i,'AWAY_TEAM_ID'], scheduleFrame.at[i,'AWAY_TEAM_NICKNAME'],
                    scheduleFrame.at[i,'SEASON'], scheduleFrame.at[i,'GAME_DATE']))

        if i%100 == 0:
            mins = ((time.perf_counter_ns()-start)/1e9)/60
            print(i,mins)

    # pd.concat always builds a new object, so the caller's frame is never mutated.
    combined = pd.concat(frames, ignore_index=True) if frames else gameLogs.copy()
    if combined.empty:
        return combined.reset_index(drop=True)

    # Get Table Level Aggregation Columns
    combined = addTableLevelColumns(combined)

    return combined.reset_index(drop=True)

In [40]:
# Build the schedule frame for the chosen seasons and report how long the download took

seasons = [2021,2022,2023]  # season start years; getSeasonScheduleFrame turns 2021 into '2021-22'
seasonType = 'Regular Season'

start = time.perf_counter_ns()
scheduleFrame = getSeasonScheduleFrame(seasons,seasonType)
end = time.perf_counter_ns()

secs = (end-start)/1e9  # nanoseconds -> seconds
mins = secs/60
print(mins)  # elapsed minutes

2.1942397099999997


In [41]:
# Example output of getSingleGameMetrics for the schedule row labelled 104
getSingleGameMetrics(*(
    scheduleFrame.at[104, column]
    for column in ('GAME_ID', 'HOME_TEAM_ID', 'AWAY_TEAM_ID',
                   'AWAY_TEAM_NICKNAME', 'SEASON', 'GAME_DATE')
))

Unnamed: 0,CITY,NICKNAME,TEAM_ID,W,L,W_HOME,L_HOME,W_ROAD,L_ROAD,TEAM_TURNOVERS,...,BLK,PTS,AVG_REB,AVG_PTS,DQ,OFFENSIVE_EFFICIENCY,SCORING_MARGIN,SEASON,GAME_DATE,GAME_ID
0,Philadelphia,76ers,1610612755,0,1,0,1,0,0,0,...,2,87,47.0,87.0,0,0.425532,-48.0,2021-22,2022-02-15,22100869
1,OPPONENTS,Celtics,1610612738,1,0,0,0,1,0,2,...,10,135,56.0,135.0,0,0.649123,48.0,2021-22,2022-02-15,22100869


In [42]:
# Create the gameLogs DataFrame.
# Starting from an empty frame makes getGameLogs fetch every game in scheduleFrame
# (one request per game, each preceded by a one second sleep); the progress lines
# printed below are "<schedule row> <elapsed minutes>".
gameLogs = pd.DataFrame()
gameLogs = getGameLogs(gameLogs,scheduleFrame)
# Persist the result so the download does not have to be repeated.
gameLogs.to_csv('gameLogs.csv')

0 0.023312443333333332
100 2.3302757566666665
200 4.644133106666667
300 6.973164101666667
400 9.476188136666668
500 11.807434160000001
600 14.232009790000001
700 16.624499919999998
800 19.548943889999997
900 22.792827385
1000 25.152003891666666
1100 27.66130936833333
1200 30.16098841333333
1300 32.50934163833333
1400 34.993115655
1500 37.69880959166667
1600 40.078039546666666
1700 42.585358325
1800 45.34956570333333
1900 48.15469531666667
2000 50.90540516666667
2100 53.67682325166666
2200 56.48881420666667
2300 59.556433184999996
2400 62.35484034666667
2500 65.52744783
2600 68.79345996833334
2700 72.80204430666666
2800 75.76351211666666
2900 78.843760265
3000 81.83229975166667


In [50]:
# Example output of the game logs: 2023-24 rows in chronological order.
# NOTE(review): the saved output shows LAST_GAME_* columns, which in this notebook are only
# added by getGameLogFeatureSet in a later cell -- presumably the cells were run out of order;
# confirm with Restart & Run All.
gameLogs[(gameLogs['SEASON'] == '2023-24')].sort_values('GAME_DATE')

Unnamed: 0,CITY,NICKNAME,TEAM_ID,W,L,W_HOME,L_HOME,W_ROAD,L_ROAD,TEAM_TURNOVERS,...,TOTAL_WIN_PCTG,ROLLING_SCORING_MARGIN,ROLLING_OE,NUM_REST_DAYS,LAST_GAME_OE,LAST_GAME_HOME_WIN_PCTG,LAST_GAME_AWAY_WIN_PCTG,LAST_GAME_TOTAL_WIN_PCTG,LAST_GAME_ROLLING_SCORING_MARGIN,LAST_GAME_ROLLING_OE
5456,Golden State,Warriors,1610612744,0,1,0,1,0,0,0,...,0.000000,-4.000000,0.486726,,,,,,,
5401,OPPONENTS,Lakers,1610612747,0,1,0,0,0,1,1,...,0.000000,-12.000000,0.571429,,,,,,,
5400,Denver,Nuggets,1610612743,1,0,1,0,0,0,1,...,1.000000,12.000000,0.626016,,,,,,,
5457,OPPONENTS,Suns,1610612756,1,0,0,0,1,0,0,...,1.000000,4.000000,0.541667,,,,,,,
5062,New York,Knicks,1610612752,0,1,0,1,0,0,2,...,0.000000,-4.000000,0.521739,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5953,OPPONENTS,Kings,1610612758,1,0,0,0,1,0,1,...,0.621622,4.333333,0.617259,1.0,0.659574,0.650000,0.562500,0.611111,-2.333333,0.611825
5832,Indiana,Pacers,1610612754,1,0,1,0,0,0,1,...,0.594595,-2.333333,0.580474,2.0,0.620968,0.600000,0.562500,0.583333,6.333333,0.626580
4920,Atlanta,Hawks,1610612737,1,0,1,0,0,0,1,...,0.416667,-11.333333,0.561412,3.0,0.538462,0.357143,0.428571,0.400000,-12.666667,0.569232
5065,OPPONENTS,Nets,1610612751,0,1,0,0,0,1,1,...,0.421053,-2.333333,0.564802,4.0,0.590551,0.526316,0.333333,0.432432,-3.000000,0.574990


In [150]:
def getGameLogFeatureSet(gameDF):
    """Build one model row per game from the two per-team rows in ``gameDF``.

    Parameters
    ----------
    gameDF : pandas.DataFrame
        Game log with CITY, TEAM_ID, GAME_ID, SEASON, GAME_DATE, W,
        NUM_REST_DAYS, OFFENSIVE_EFFICIENCY, HOME_WIN_PCTG, AWAY_WIN_PCTG,
        TOTAL_WIN_PCTG, ROLLING_SCORING_MARGIN and ROLLING_OE. Rows whose CITY
        is 'OPPONENTS' are treated as the away team, all others as the home team.

    Returns
    -------
    pandas.DataFrame
        Home columns prefixed 'HOME_' (including HOME_W and HOME_TEAM_ID),
        SEASON, GAME_DATE, and away columns prefixed 'AWAY_'. Home and away rows
        are inner-joined on GAME_ID and SEASON; GAME_ID is dropped afterwards.

    Notes
    -----
    The LAST_GAME_* columns are added to ``gameDF`` itself. Each holds the
    value from the same team's previous game in that season (NaN for a team's
    first game), so the features only use information available before tip-off.
    """
    teamSeason = ['TEAM_ID','SEASON']

    # lagged column -> column it is shifted from
    lagSources = {
        'LAST_GAME_OE': 'OFFENSIVE_EFFICIENCY',
        'LAST_GAME_HOME_WIN_PCTG': 'HOME_WIN_PCTG',
        'LAST_GAME_AWAY_WIN_PCTG': 'AWAY_WIN_PCTG',
        'LAST_GAME_TOTAL_WIN_PCTG': 'TOTAL_WIN_PCTG',
        'LAST_GAME_ROLLING_SCORING_MARGIN': 'ROLLING_SCORING_MARGIN',
        'LAST_GAME_ROLLING_OE': 'ROLLING_OE',
    }
    featureCols = ['LAST_GAME_OE','LAST_GAME_HOME_WIN_PCTG','NUM_REST_DAYS','LAST_GAME_AWAY_WIN_PCTG',
                   'LAST_GAME_TOTAL_WIN_PCTG','LAST_GAME_ROLLING_SCORING_MARGIN','LAST_GAME_ROLLING_OE']
    unprefixed = ('GAME_ID','SEASON','GAME_DATE')  # join keys / shared columns keep their names

    def shiftGameLogRecords(gameDF):
        # Read every source column from the frame that was passed in,
        # not from a notebook-level global.
        byTeamSeason = gameDF.sort_values('GAME_DATE').groupby(teamSeason)
        for lagCol, sourceCol in lagSources.items():
            gameDF[lagCol] = byTeamSeason[sourceCol].shift(1)

    def buildTeamFrame(rows, columns, prefix):
        # rename() returns a new frame, so no in-place edit of a slice is needed
        renamed = {col: prefix + col for col in columns if col not in unprefixed}
        return rows[columns].rename(columns=renamed)

    shiftGameLogRecords(gameDF)

    isAway = gameDF['CITY'] == 'OPPONENTS'
    homeTeamFrame = buildTeamFrame(gameDF[~isAway], featureCols + ['W','TEAM_ID','GAME_ID','SEASON','GAME_DATE'], 'HOME_')
    awayTeamFrame = buildTeamFrame(gameDF[isAway], featureCols + ['TEAM_ID','GAME_ID','SEASON'], 'AWAY_')

    return pd.merge(homeTeamFrame, awayTeamFrame, how="inner", on=[ "GAME_ID","SEASON"]).drop(['GAME_ID'],axis=1)

In [151]:
# Pair each game's home and away rows into a single feature row
modelData = getGameLogFeatureSet(gameLogs)

In [152]:
# Final data set before the train / test / validation split
modelData

Unnamed: 0,HOME_LAST_GAME_OE,HOME_LAST_GAME_HOME_WIN_PCTG,HOME_NUM_REST_DAYS,HOME_LAST_GAME_AWAY_WIN_PCTG,HOME_LAST_GAME_TOTAL_WIN_PCTG,HOME_LAST_GAME_ROLLING_SCORING_MARGIN,HOME_LAST_GAME_ROLLING_OE,HOME_W,HOME_TEAM_ID,SEASON,GAME_DATE,AWAY_LAST_GAME_OE,AWAY_LAST_GAME_HOME_WIN_PCTG,AWAY_NUM_REST_DAYS,AWAY_LAST_GAME_AWAY_WIN_PCTG,AWAY_LAST_GAME_TOTAL_WIN_PCTG,AWAY_LAST_GAME_ROLLING_SCORING_MARGIN,AWAY_LAST_GAME_ROLLING_OE,AWAY_TEAM_ID
0,0.572650,0.275000,2.0,0.219512,0.246914,-7.333333,0.567699,0,1610612745,2021-22,2022-04-10,0.551402,0.658537,2.0,0.375000,0.518519,0.333333,0.582568,1610612737
1,0.641667,0.700000,3.0,0.600000,0.650000,17.333333,0.624980,1,1610612748,2021-22,2022-04-08,0.589744,0.658537,2.0,0.384615,0.525000,4.000000,0.576985,1610612737
2,0.606557,0.650000,1.0,0.384615,0.518987,7.000000,0.589099,1,1610612737,2021-22,2022-04-06,0.671429,0.525000,1.0,0.358974,0.443038,2.666667,0.626338,1610612764
3,0.495413,0.552632,2.0,0.600000,0.576923,10.333333,0.538786,1,1610612761,2021-22,2022-04-05,0.534653,0.650000,3.0,0.394737,0.525641,16.333333,0.590617,1610612737
4,0.626087,0.641026,2.0,0.394737,0.519481,17.000000,0.631844,1,1610612737,2021-22,2022-04-02,0.546763,0.447368,2.0,0.589744,0.519481,-1.000000,0.576689,1610612751
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3012,0.580357,0.142857,3.0,0.111111,0.125000,-18.666667,0.548555,0,1610612765,2023-24,2023-11-27,0.571429,0.142857,2.0,0.111111,0.125000,-11.333333,0.585437,1610612764
3013,0.581395,0.250000,2.0,0.400000,0.307692,-14.000000,0.549156,1,1610612766,2023-24,2023-11-22,0.661417,0.166667,2.0,0.142857,0.153846,-15.666667,0.591414,1610612764
3014,0.603306,0.500000,2.0,0.200000,0.285714,-3.000000,0.599744,0,1610612764,2023-24,2023-11-10,0.542056,0.250000,2.0,0.333333,0.285714,-7.000000,0.594748,1610612766
3015,0.617188,0.333333,3.0,0.333333,0.333333,-4.666667,0.610491,0,1610612766,2023-24,2023-11-08,0.588235,0.500000,2.0,0.000000,0.166667,-11.333333,0.599126,1610612764


In [154]:
# Save the model-ready data set; with the default settings the DataFrame index is written as the first CSV column
modelData.to_csv('nbaHomeWinLossModelDataset.csv')