In [125]:
from nba_api.stats.static import teams, players
from nba_api.stats.endpoints import cumestatsteamgames, cumestatsteam, gamerotation
import pandas as pd
import numpy as np
import json
import difflib
import time
import requests

In [126]:
# Retry Wrapper 
def retry(func, retries=3):
    def retry_wrapper(*args, **kwargs):
        attempts = 0
        while attempts < retries:
            try:
                return func(*args, **kwargs)
            except requests.exceptions.RequestException as e:
                print(e)
                time.sleep(30)
                attempts += 1

    return retry_wrapper

In [127]:
# Get Season Schedule Function


def getSeasonScheduleFrame(seasons,seasonType):


   # Get date from string
   def getGameDate(matchup):
       return matchup.partition(' at')[0][:10]


   # Get Home team from string
   def getHomeTeam(matchup):
       return matchup.partition(' at')[2]


   # Get Away team from string
   def getAwayTeam(matchup):
       return matchup.partition(' at')[0][10:]


   # Match nickname from schedule to team table to find ID
   def getTeamIDFromNickname(nickname):
       return teamLookup.loc[teamLookup['nickname'] == difflib.get_close_matches(nickname,teamLookup['nickname'],1)[0]].values[0][0]
  
   @retry
   def getRegularSeasonSchedule(season,teamID,seasonType):
       season = str(season) + "-" + str(season+1)[-2:] # Convert year to season format ie. 2020 -> 2020-21
       teamGames = cumestatsteamgames.CumeStatsTeamGames(league_id = '00',season = season ,
                                                                     season_type_all_star=seasonType,
                                                                     team_id = teamID).get_normalized_json()


       teamGames = pd.DataFrame(json.loads(teamGames)['CumeStatsTeamGames'])
       teamGames['SEASON'] = season
       return teamGames   
  
   # Get team lookup table
   teamLookup = pd.DataFrame(teams.get_teams())
  
   # Get teams schedule for each team for each season
   scheduleFrame = pd.DataFrame()
  
   for season in seasons:
       for id in teamLookup['id']:
           time.sleep(1)
           scheduleFrame = pd.concat([scheduleFrame, getRegularSeasonSchedule(season, id, seasonType)], ignore_index=True)
          
   scheduleFrame['GAME_DATE'] = pd.to_datetime(scheduleFrame['MATCHUP'].map(getGameDate))
   scheduleFrame['HOME_TEAM_NICKNAME'] = scheduleFrame['MATCHUP'].map(getHomeTeam)
   scheduleFrame['HOME_TEAM_ID'] = scheduleFrame['HOME_TEAM_NICKNAME'].map(getTeamIDFromNickname)
   scheduleFrame['AWAY_TEAM_NICKNAME'] = scheduleFrame['MATCHUP'].map(getAwayTeam)
   scheduleFrame['AWAY_TEAM_ID'] = scheduleFrame['AWAY_TEAM_NICKNAME'].map(getTeamIDFromNickname)
   scheduleFrame = scheduleFrame.drop_duplicates() # There's a row for both teams, only need 1
   scheduleFrame = scheduleFrame.reset_index(drop=True)
          
   return scheduleFrame


In [128]:
def getSingleGameMetrics(gameID, homeTeamID, awayTeamID, awayTeamNickname, seasonYear, gameDate):
    
    @retry
    def getGameStats(teamID,gameID,seasonYear):
        gameStats = cumestatsteam.CumeStatsTeam(game_ids=gameID,league_id ="00",
                                               season=seasonYear,season_type_all_star="Regular Season",
                                               team_id = teamID).get_normalized_json()

        gameStats = pd.DataFrame(json.loads(gameStats)['TotalTeamStats'])

        return gameStats

    data = getGameStats(homeTeamID, gameID, seasonYear)
    # Check if data is None before checking if it's empty
    #if data is None or data.empty:
        #print(f"Warning: No data for home team {homeTeamID} in game {gameID}")
        #return None  # Early exit if no data or if data is empty


    data.at[1, 'NICKNAME'] = awayTeamNickname
    data.at[1, 'TEAM_ID'] = awayTeamID
    data.at[1, 'OFFENSIVE_EFFICIENCY'] = (data.at[1, 'FG'] + data.at[1, 'AST']) / (data.at[1, 'FGA'] - data.at[1, 'OFF_REB'] + data.at[1, 'AST'] + data.at[1, 'TOTAL_TURNOVERS'])
    data.at[1, 'SCORING_MARGIN'] = data.at[1, 'PTS'] - data.at[0, 'PTS']

    data.at[0, 'OFFENSIVE_EFFICIENCY'] = (data.at[0, 'FG'] + data.at[0, 'AST']) / (data.at[0, 'FGA'] - data.at[0, 'OFF_REB'] + data.at[0, 'AST'] + data.at[0, 'TOTAL_TURNOVERS'])
    data.at[0, 'SCORING_MARGIN'] = data.at[0, 'PTS'] - data.at[1, 'PTS']

    data['SEASON'] = seasonYear
    data['GAME_DATE'] = gameDate
    data['GAME_ID'] = gameID

    return data


In [129]:
def getGameLogs(gameLogs, scheduleFrame, start_idx, end_idx, output_file='gameLogs1.csv'):
    # Functions to prepare additional columns after gameLogs table loads
    def getHomeAwayFlag(gameDF):
        gameDF['HOME_FLAG'] = np.where((gameDF['W_HOME'] == 1) | (gameDF['L_HOME'] == 1), 1, 0)
        gameDF['AWAY_FLAG'] = np.where((gameDF['W_ROAD'] == 1) | (gameDF['L_ROAD'] == 1), 1, 0)

    def getTotalWinPctg(gameDF):
        gameDF['TOTAL_GAMES_PLAYED'] = gameDF.groupby(['TEAM_ID', 'SEASON'])['GAME_DATE'].rank(ascending=True)
        gameDF['TOTAL_WINS'] = gameDF.sort_values(by='GAME_DATE').groupby(['TEAM_ID', 'SEASON'])['W'].cumsum()
        gameDF['TOTAL_WIN_PCTG'] = gameDF['TOTAL_WINS'] / gameDF['TOTAL_GAMES_PLAYED']
        return gameDF.drop(['TOTAL_GAMES_PLAYED', 'TOTAL_WINS'], axis=1)

    def getHomeWinPctg(gameDF):
        gameDF['HOME_GAMES_PLAYED'] = gameDF.sort_values(by='GAME_DATE').groupby(['TEAM_ID', 'SEASON'])['HOME_FLAG'].cumsum()
        gameDF['HOME_WINS'] = gameDF.sort_values(by='GAME_DATE').groupby(['TEAM_ID', 'SEASON'])['W_HOME'].cumsum()
        gameDF['HOME_WIN_PCTG'] = gameDF['HOME_WINS'] / gameDF['HOME_GAMES_PLAYED']
        return gameDF.drop(['HOME_GAMES_PLAYED', 'HOME_WINS'], axis=1)

    def getAwayWinPctg(gameDF):
        gameDF['AWAY_GAMES_PLAYED'] = gameDF.sort_values(by='GAME_DATE').groupby(['TEAM_ID', 'SEASON'])['AWAY_FLAG'].cumsum()
        gameDF['AWAY_WINS'] = gameDF.sort_values(by='GAME_DATE').groupby(['TEAM_ID', 'SEASON'])['W_ROAD'].cumsum()
        gameDF['AWAY_WIN_PCTG'] = gameDF['AWAY_WINS'] / gameDF['AWAY_GAMES_PLAYED']
        return gameDF.drop(['AWAY_GAMES_PLAYED', 'AWAY_WINS'], axis=1)

    def getRollingOE(gameDF):
        gameDF['ROLLING_OE'] = gameDF.sort_values(by='GAME_DATE').groupby(['TEAM_ID', 'SEASON'])['OFFENSIVE_EFFICIENCY'].transform(lambda x: x.rolling(3, 1).mean())

    def getRollingScoringMargin(gameDF):
        gameDF['ROLLING_SCORING_MARGIN'] = gameDF.sort_values(by='GAME_DATE').groupby(['TEAM_ID', 'SEASON'])['SCORING_MARGIN'].transform(lambda x: x.rolling(3, 1).mean())

    def getRestDays(gameDF):
        gameDF['LAST_GAME_DATE'] = gameDF.sort_values(by='GAME_DATE').groupby(['TEAM_ID', 'SEASON'])['GAME_DATE'].shift(1)
        gameDF['NUM_REST_DAYS'] = (gameDF['GAME_DATE'] - gameDF['LAST_GAME_DATE']) / np.timedelta64(1, 'D')
        return gameDF.drop('LAST_GAME_DATE', axis=1)
    
    start = time.perf_counter_ns()

    i = start_idx  # Start from the given index
    new_data = []

    while i < end_idx:  # Loop through the given range of indices

        time.sleep(1)
        
        # Get the new game metrics
        new_game_metrics = getSingleGameMetrics(
            scheduleFrame.at[i, 'GAME_ID'],
            scheduleFrame.at[i, 'HOME_TEAM_ID'],
            scheduleFrame.at[i, 'AWAY_TEAM_ID'],
            scheduleFrame.at[i, 'AWAY_TEAM_NICKNAME'],
            scheduleFrame.at[i, 'SEASON'],
            scheduleFrame.at[i, 'GAME_DATE']
        )
    
        if isinstance(new_game_metrics, pd.DataFrame):
            new_data.append(new_game_metrics)  # Append DataFrame to list
        else:
            new_data.append(pd.DataFrame([new_game_metrics]))  # Convert to DataFrame first

        end = time.perf_counter_ns()

        # Output time it took to load x amount of records
        if i % 100 == 0:
            mins = ((end - start) / 1e9) / 60
            print(i, str(mins) + ' minutes')

        i += 1

    # After the loop, convert the list of DataFrames to a single DataFrame
    if new_data:
        new_data_df = pd.concat(new_data, ignore_index=True)
        gameLogs = pd.concat([gameLogs, new_data_df], ignore_index=True)
    #print(gameLogs['GAME_DATE'].apply(type).value_counts())
    
    # Get Table Level Aggregation Columns
    getHomeAwayFlag(gameLogs)
    gameLogs = getHomeWinPctg(gameLogs)
    gameLogs = getAwayWinPctg(gameLogs)
    gameLogs = getTotalWinPctg(gameLogs)
    getRollingScoringMargin(gameLogs)
    getRollingOE(gameLogs)
    gameLogs = getRestDays(gameLogs)

    # Save to CSV
    gameLogs.to_csv(output_file, mode='a', header=not bool(i), index=False)  # Append to CSV file (add header only once)

    return gameLogs.reset_index(drop=True)



In [172]:
#Get ScheduleFrame

seasons = [2024]
seasonType = 'Regular Season'

start = time.perf_counter_ns() # Track cell's runtime
scheduleFrame = getSeasonScheduleFrame(seasons,seasonType)
end = time.perf_counter_ns()



secs = (end-start)/1e9
mins = secs/60
print(mins)

0.6406191382


In [173]:
print(len(scheduleFrame))

1173


In [175]:
#1054 = length of 2020
#1230 = length of 2021
#1230 = length of 2022
#1230 = length of 2023
# Convert GAME_DATE to datetime, coercing errors to NaT
scheduleFrame['GAME_DATE'] = pd.to_datetime(scheduleFrame['GAME_DATE'], errors='coerce')

# Drop rows with NaT in GAME_DATE column only, if necessary
scheduleFrame = scheduleFrame.dropna(subset=['GAME_DATE'])

# Now check if there are any NaT values left
print(scheduleFrame[scheduleFrame['GAME_DATE'].isna()])

Empty DataFrame
Columns: [MATCHUP, GAME_ID, SEASON, GAME_DATE, HOME_TEAM_NICKNAME, HOME_TEAM_ID, AWAY_TEAM_NICKNAME, AWAY_TEAM_ID]
Index: []


In [137]:
# Create the gameLogs DataFrame
gamelogs = pd.read_csv('gamelogs1.csv')



In [178]:
getGameLogs(gamelogs, scheduleFrame, 1042, 1100)

TypeError: '<' not supported between instances of 'Timestamp' and 'str'

In [None]:
#Example Output of Game Logs
gamelogs[(gamelogs['TEAM_ID'] == 1610612737 ) & (gamelogs['SEASON'] == '2024-25')].sort_values('GAME_DATE')

In [147]:
#This function produces the dataset with all current feature variables for modeling and additional columns
# to perform a training set/validation set split.
def getGameLogFeatureSet(gameDF):

    def shiftGameLogRecords(gameDF):
        gameDF['LAST_GAME_OE'] = gameDF.sort_values('GAME_DATE').groupby(['TEAM_ID','SEASON'])['OFFENSIVE_EFFICIENCY'].shift(1)
        gameDF['LAST_GAME_HOME_WIN_PCTG'] = gameDF.sort_values('GAME_DATE').groupby(['TEAM_ID','SEASON'])['HOME_WIN_PCTG'].shift(1)
        gameDF['LAST_GAME_AWAY_WIN_PCTG'] = gameDF.sort_values('GAME_DATE').groupby(['TEAM_ID','SEASON'])['AWAY_WIN_PCTG'].shift(1)
        gameDF['LAST_GAME_TOTAL_WIN_PCTG'] = gameDF.sort_values('GAME_DATE').groupby(['TEAM_ID','SEASON'])['TOTAL_WIN_PCTG'].shift(1)
        gameDF['LAST_GAME_ROLLING_SCORING_MARGIN'] = gameDF.sort_values('GAME_DATE').groupby(['TEAM_ID','SEASON'])['ROLLING_SCORING_MARGIN'].shift(1)
        gameDF['LAST_GAME_ROLLING_OE'] = gameDF.sort_values('GAME_DATE').groupby(['TEAM_ID','SEASON'])['ROLLING_OE'].shift(1)
    
    
    def getHomeTeamFrame(gameDF):
        homeTeamFrame = gameDF[gameDF['CITY'] != 'OPPONENTS']
        homeTeamFrame = homeTeamFrame[['LAST_GAME_OE','LAST_GAME_HOME_WIN_PCTG','NUM_REST_DAYS','LAST_GAME_AWAY_WIN_PCTG','LAST_GAME_TOTAL_WIN_PCTG','LAST_GAME_ROLLING_SCORING_MARGIN','LAST_GAME_ROLLING_OE','W','TEAM_ID','GAME_ID','SEASON']]

        colRenameDict = {}
        for col in homeTeamFrame.columns:
            if (col != 'GAME_ID') & (col != 'SEASON') :
                colRenameDict[col] = 'HOME_' + col 

        homeTeamFrame.rename(columns=colRenameDict,inplace=True)

        return homeTeamFrame

    def getAwayTeamFrame(gameDF):
        awayTeamFrame = gameDF[gameDF['CITY'] == 'OPPONENTS']
        awayTeamFrame = awayTeamFrame[['LAST_GAME_OE','LAST_GAME_HOME_WIN_PCTG','NUM_REST_DAYS','LAST_GAME_AWAY_WIN_PCTG','LAST_GAME_TOTAL_WIN_PCTG','LAST_GAME_ROLLING_SCORING_MARGIN','LAST_GAME_ROLLING_OE','TEAM_ID','GAME_ID','SEASON']]

        colRenameDict = {}
        for col in awayTeamFrame.columns:
            if (col != 'GAME_ID') & (col != 'SEASON'):
                colRenameDict[col] = 'AWAY_' + col 

        awayTeamFrame.rename(columns=colRenameDict,inplace=True)

        return awayTeamFrame
    
    shiftGameLogRecords(gamelogs)
    awayTeamFrame = getAwayTeamFrame(gamelogs)
    homeTeamFrame = getHomeTeamFrame(gamelogs)
    
    return pd.merge(homeTeamFrame, awayTeamFrame, how="inner", on=[ "GAME_ID","SEASON"]).drop(['GAME_ID','AWAY_TEAM_ID','HOME_TEAM_ID'],axis=1)

In [148]:
modelData = getGameLogFeatureSet(gamelogs)

In [149]:
modelData

Unnamed: 0,HOME_LAST_GAME_OE,HOME_LAST_GAME_HOME_WIN_PCTG,HOME_NUM_REST_DAYS,HOME_LAST_GAME_AWAY_WIN_PCTG,HOME_LAST_GAME_TOTAL_WIN_PCTG,HOME_LAST_GAME_ROLLING_SCORING_MARGIN,HOME_LAST_GAME_ROLLING_OE,HOME_W,SEASON,AWAY_LAST_GAME_OE,AWAY_LAST_GAME_HOME_WIN_PCTG,AWAY_NUM_REST_DAYS,AWAY_LAST_GAME_AWAY_WIN_PCTG,AWAY_LAST_GAME_TOTAL_WIN_PCTG,AWAY_LAST_GAME_ROLLING_SCORING_MARGIN,AWAY_LAST_GAME_ROLLING_OE
0,0.677419,0.361111,3.0,0.277778,0.319444,-4.000000,0.583732,1,2020-21,0.593750,0.583333,2.0,0.583333,0.583333,2.333333,0.585312
1,0.677419,0.361111,3.0,0.277778,0.319444,-4.000000,0.583732,1,2020-21,0.593750,0.583333,2.0,0.583333,0.583333,2.333333,0.585312
2,0.677419,0.361111,3.0,0.277778,0.319444,-4.000000,0.583732,1,2020-21,0.593750,0.583333,2.0,0.583333,0.583333,2.333333,0.585312
3,0.677419,0.361111,3.0,0.277778,0.319444,-4.000000,0.583732,1,2020-21,0.593750,0.583333,2.0,0.583333,0.583333,2.333333,0.585312
4,0.677419,0.361111,3.0,0.277778,0.319444,-4.000000,0.583732,1,2020-21,0.593750,0.583333,2.0,0.583333,0.583333,2.333333,0.585312
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
182849,0.518248,0.250000,4.0,0.500000,0.333333,-4.000000,0.560119,0,2024-25,0.529412,0.400000,2.0,0.500000,0.454545,-0.333333,0.559753
182850,0.617886,0.555556,2.0,0.600000,0.578947,-3.000000,0.581690,1,2024-25,0.620155,0.333333,2.0,0.200000,0.277778,-6.333333,0.574002
182851,0.554622,0.454545,2.0,0.600000,0.538462,4.333333,0.566473,1,2024-25,0.503876,0.000000,6.0,0.000000,0.000000,-11.666667,0.505304
182852,0.538462,0.800000,2.0,0.200000,0.500000,-8.333333,0.531471,1,2024-25,0.584746,0.333333,4.0,0.000000,0.250000,-5.000000,0.580165


In [150]:
modelData.to_csv('nbaHomeWinLossModelDataset.csv')