# Getting and filtering data for each game for seasons from 2011-2022 (12 seasons)

In [19]:
from nba_api.stats.endpoints import leaguegamelog
import pandas as pd

# Season start year used by the later cells: it selects the season-stat rows,
# the game logs that are fetched, and the name of the CSV that is written.
YEAR = 2010

# Arguments passed positionally to leaguegamelog.LeagueGameLog, listed in call
# order (the season string is inserted between PLAYER_OR_TEAM and SEASON_TYPE).
COUNTER = 1
DIRECTION = "ASC"
LEAGUE = "00"
PLAYER_OR_TEAM = "T"
SEASON_TYPE = "Regular Season"
SORTER = "DATE"

def fetch_season_game_logs(season):
    """Fetch one season's league game log and keep the team/game identity columns.

    Args:
        season: Season string forwarded to ``LeagueGameLog`` (the caller in
            this file builds it as e.g. ``'2010-11'``).

    Returns:
        A copy of the first data frame returned by the endpoint, restricted to
        TEAM_ID, TEAM_NAME, GAME_ID, MATCHUP, WL and GAME_DATE, with two extra
        columns: ``HOME/AWAY`` (0 when MATCHUP contains '@', otherwise 1) and
        ``TEAM_2_HOME/AWAY`` (1 minus ``HOME/AWAY``).
    """
    endpoint = leaguegamelog.LeagueGameLog(
        COUNTER, DIRECTION, LEAGUE, PLAYER_OR_TEAM, season, SEASON_TYPE, SORTER
    )
    raw_log = endpoint.get_data_frames()[0]

    kept_columns = ["TEAM_ID", "TEAM_NAME", "GAME_ID", "MATCHUP", "WL", "GAME_DATE"]
    season_log = raw_log[kept_columns].copy()

    def home_flag(matchup):
        # '@' in the matchup string marks the row's team as the visitor.
        if '@' in matchup:
            return 0
        return 1

    season_log['HOME/AWAY'] = season_log['MATCHUP'].apply(home_flag)
    season_log['TEAM_2_HOME/AWAY'] = 1 - season_log['HOME/AWAY']
    return season_log

def _attach_opponents(games):
    """Add TEAM_2_ID / TEAM_2_NAME from the other row of the same game.

    ``games`` must already be sorted by GAME_ID and carry a 0..n-1 index.
    Pairing is positional (row 0 with row 1, row 2 with row 3, ...), so it is
    only valid when every GAME_ID occupies exactly two rows; that is checked
    first so a missing row cannot silently shift every later pairing.
    """
    rows_per_game = games["GAME_ID"].value_counts()
    unpaired = rows_per_game[rows_per_game != 2]
    if not unpaired.empty:
        raise ValueError(
            "expected exactly two rows per GAME_ID, but "
            f"{len(unpaired)} game(s) differ, e.g. {list(unpaired.index[:5])}"
        )

    # Position of each row's partner: even rows look one down, odd rows one up.
    partner = [pos + 1 if pos % 2 == 0 else pos - 1 for pos in range(len(games))]
    games["TEAM_2_ID"] = games["TEAM_ID"].iloc[partner].to_numpy()
    games["TEAM_2_NAME"] = games["TEAM_NAME"].iloc[partner].to_numpy()
    return games


def fetch_multiple_seasons(start_year, end_year):
    """Fetch game logs for a range of seasons and pair each row with its opponent.

    Args:
        start_year: First season start year (e.g. 2010 for season '2010-11').
        end_year: Last season start year, inclusive.

    Returns:
        One DataFrame with two rows per game (one per team), sorted by
        GAME_ID, with columns GAME_ID, GAME_DATE, MATCHUP, TEAM_NAME, TEAM_ID,
        TEAM_2_NAME, TEAM_2_ID, WL, HOME/AWAY and TEAM_2_HOME/AWAY.

    Raises:
        ValueError: If the year range is empty (raised by ``pd.concat``) or if
            some GAME_ID does not have exactly two rows.
    """
    # Season strings look like '2010-11': start year plus the last two digits
    # of the following year.
    season_frames = [
        fetch_season_game_logs(f"{year}-{str(year + 1)[-2:]}")
        for year in range(start_year, end_year + 1)
    ]

    combined_data = pd.concat(season_frames, ignore_index=True)
    combined_data = combined_data.sort_values(by='GAME_ID', ascending=True)
    combined_data = combined_data.reset_index(drop=True)

    combined_data = _attach_opponents(combined_data)

    columns_order = ["GAME_ID", "GAME_DATE", "MATCHUP", "TEAM_NAME", "TEAM_ID", "TEAM_2_NAME", "TEAM_2_ID", "WL", "HOME/AWAY", "TEAM_2_HOME/AWAY"]
    return combined_data[columns_order]

# Preview a single season (start year 2023); the notebook displays the result.
fetch_multiple_seasons(start_year=2023, end_year=2023)

Unnamed: 0,GAME_ID,GAME_DATE,MATCHUP,TEAM_NAME,TEAM_ID,TEAM_2_NAME,TEAM_2_ID,WL,HOME/AWAY,TEAM_2_HOME/AWAY
0,0022300001,2023-11-03,CLE @ IND,Cleveland Cavaliers,1610612739,Indiana Pacers,1610612754,L,0,1
1,0022300001,2023-11-03,IND vs. CLE,Indiana Pacers,1610612754,Cleveland Cavaliers,1610612739,W,1,0
2,0022300002,2023-11-03,MIL vs. NYK,Milwaukee Bucks,1610612749,New York Knicks,1610612752,W,1,0
3,0022300002,2023-11-03,NYK @ MIL,New York Knicks,1610612752,Milwaukee Bucks,1610612749,L,0,1
4,0022300003,2023-11-03,WAS @ MIA,Washington Wizards,1610612764,Miami Heat,1610612748,L,0,1
...,...,...,...,...,...,...,...,...,...,...
2455,0022301228,2023-12-08,PHX vs. SAC,Phoenix Suns,1610612756,Sacramento Kings,1610612758,L,1,0
2456,0022301229,2023-12-07,MIL vs. IND,Milwaukee Bucks,1610612749,Indiana Pacers,1610612754,L,1,0
2457,0022301229,2023-12-07,IND @ MIL,Indiana Pacers,1610612754,Milwaukee Bucks,1610612749,W,0,1
2458,0022301230,2023-12-07,NOP @ LAL,New Orleans Pelicans,1610612740,Los Angeles Lakers,1610612747,L,0,1


# Acquiring Season Stats for each team

In [20]:
from api_helpers.team_stats_helpers import load_dataframe

# Per-team season attributes requested from the project helper.
attribute_list = ["PTS","FGA", "FGM", "FTA", "FG3M", "FG_PCT", "OREB","DREB","AST", "TOV", "WIN_PCT"]
nba_dataframe = load_dataframe(attribute_list)

# Discard rows that recorded no made field goals.
nba_dataframe = nba_dataframe.drop(nba_dataframe[nba_dataframe["FGM"] == 0].index)

# YEAR arrives as a hyphenated string; keep the part before the first hyphen
# as a number, then order the rows by it.
nba_dataframe['YEAR'] = pd.to_numeric(nba_dataframe['YEAR'].str.split('-').str[0])
nba_dataframe = nba_dataframe.sort_values(by="YEAR")

# Encode the finals column numerically: 0 = none, 1 = finals appearance,
# 2 = league champion. The result is assigned back to the frame rather than
# using chained `inplace=True` calls on a column selection, which do not
# update the frame under pandas Copy-on-Write.
# NOTE(review): "FINALS APPEARANCE" was previously mapped to 0.0, the same as
# "no appearance"; 1.0 assumes that was unintended -- confirm.
finals_codes = {"FINALS APPEARANCE": 1.0, "LEAGUE CHAMPION": 2.0}
nba_dataframe["NBA_FINALS_APPEARANCE"] = (
    nba_dataframe["NBA_FINALS_APPEARANCE"]
    .fillna(0.0)
    .replace(finals_codes)
    .infer_objects()
)

# Keep only the season selected by YEAR.
nba_dataframe = nba_dataframe[nba_dataframe['YEAR'] == YEAR]
season_stat_dataframe = nba_dataframe.reset_index(drop=True)

season_stat_dataframe



Unnamed: 0,TEAM_ID,YEAR,WIN_PCT,NBA_FINALS_APPEARANCE,FGM,FGA,FG_PCT,FG3M,FTA,OREB,DREB,AST,TOV,PTS
0,1610612745,2010,0.524,0.0,3170,6975,0.454,677,2083,962,2549,1955,1110,8685
1,1610612737,2010,0.537,0.0,2971,6429,0.462,502,1728,762,2460,1802,1118,7790
2,1610612739,2010,0.232,0.0,2886,6647,0.434,509,2075,856,2449,1720,1166,7827
3,1610612759,2010,0.744,0.0,3148,6628,0.475,685,1984,829,2603,1836,1101,8502
4,1610612749,2010,0.427,0.0,2814,6544,0.43,483,1881,862,2480,1545,1103,7534
5,1610612751,2010,0.293,0.0,2918,6638,0.44,459,1881,909,2440,1723,1152,7722
6,1610612744,2010,0.439,0.0,3251,7047,0.461,685,1695,955,2370,1847,1198,8477
7,1610612758,2010,0.293,0.0,3134,6979,0.449,428,1981,1071,2526,1675,1324,8151
8,1610612750,2010,0.207,0.0,3090,7014,0.441,589,1977,1085,2556,1650,1398,8288
9,1610612762,2010,0.476,0.0,3064,6590,0.465,435,2061,898,2338,1921,1175,8153


# Combining Dataframes
For each game, listing the stats of both teams for comparison

In [21]:
# As a reminder: attribute_list = ["PTS","FGA", "FGM", "FTA", "FG3M", "FG_PCT", "OREB","DREB","AST", "TOV", "WIN_PCT"]

def update_game_data(game_data, season_team_data, attribute_list):
    """Keep one row per game and attach both teams' season attributes.

    Args:
        game_data: Frame with ``TEAM_ID`` and ``TEAM_2_ID`` columns. Every
            second row (positions 0, 2, 4, ...) is kept, so each game is
            expected to occupy two consecutive rows. The frame is not
            modified and may carry any index.
        season_team_data: Frame with a ``TEAM_ID`` column and one column per
            entry of ``attribute_list``. If a team appears more than once,
            its first row is used.
        attribute_list: Names of the ``season_team_data`` columns to attach.

    Returns:
        A new frame, indexed from 0, holding the kept rows plus
        ``TEAM_1_<attr>`` and ``TEAM_2_<attr>`` columns (in ``attribute_list``
        order) for the teams in ``TEAM_ID`` and ``TEAM_2_ID`` respectively.

    Raises:
        IndexError: If a team in the kept rows has no row in
            ``season_team_data``.
    """
    # Slice first: the discarded rows never need a stat lookup, and working on
    # the fresh frame leaves the caller's ``game_data`` untouched.
    games = game_data.iloc[::2].reset_index(drop=True)

    # Look each team up once instead of scanning the season table per row.
    team_rows = {}
    for team_id in set(games["TEAM_ID"]) | set(games["TEAM_2_ID"]):
        matches = season_team_data[season_team_data["TEAM_ID"] == team_id]
        if matches.empty:
            raise IndexError(f"no season stats found for TEAM_ID {team_id}")
        team_rows[team_id] = matches.iloc[0]

    for attr in attribute_list:
        games[f"TEAM_1_{attr}"] = [team_rows[team][attr] for team in games["TEAM_ID"]]
        games[f"TEAM_2_{attr}"] = [team_rows[team][attr] for team in games["TEAM_2_ID"]]

    return games


# Build the one-row-per-game table for YEAR and attach each team's season stats.
season_team_data = season_stat_dataframe
game_data = update_game_data(
    fetch_multiple_seasons(YEAR, YEAR),
    season_team_data,
    attribute_list,
)

game_data





Unnamed: 0,GAME_ID,GAME_DATE,MATCHUP,TEAM_NAME,TEAM_ID,TEAM_2_NAME,TEAM_2_ID,WL,HOME/AWAY,TEAM_2_HOME/AWAY,...,TEAM_1_OREB,TEAM_2_OREB,TEAM_1_DREB,TEAM_2_DREB,TEAM_1_AST,TEAM_2_AST,TEAM_1_TOV,TEAM_2_TOV,TEAM_1_WIN_PCT,TEAM_2_WIN_PCT
0,0021000001,2010-10-26,MIA @ BOS,Miami Heat,1610612748,Boston Celtics,1610612738,L,0,1,...,790.0,639.0,2666.0,2542.0,1639.0,1921.0,1142.0,1195.0,0.707,0.683
1,0021000002,2010-10-26,PHX @ POR,Phoenix Suns,1610612756,Portland Trail Blazers,1610612757,L,0,1,...,821.0,996.0,2478.0,2230.0,1945.0,1736.0,1169.0,1070.0,0.488,0.585
2,0021000003,2010-10-26,LAL vs. HOU,Los Angeles Lakers,1610612747,Houston Rockets,1610612745,W,1,0,...,989.0,962.0,2616.0,2549.0,1801.0,1955.0,1073.0,1110.0,0.695,0.524
3,0021000004,2010-10-27,CLE vs. BOS,Cleveland Cavaliers,1610612739,Boston Celtics,1610612738,W,1,0,...,856.0,639.0,2449.0,2542.0,1720.0,1921.0,1166.0,1195.0,0.232,0.683
4,0021000005,2010-10-27,DET @ NJN,Detroit Pistons,1610612765,New Jersey Nets,1610612751,L,0,1,...,931.0,909.0,2236.0,2440.0,1730.0,1723.0,1067.0,1152.0,0.366,0.293
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1225,0021001226,2011-04-13,DEN @ UTA,Denver Nuggets,1610612743,Utah Jazz,1610612762,L,0,1,...,791.0,898.0,2652.0,2338.0,1813.0,1921.0,1157.0,1175.0,0.610,0.476
1226,0021001227,2011-04-13,GSW vs. POR,Golden State Warriors,1610612744,Portland Trail Blazers,1610612757,W,1,0,...,955.0,996.0,2370.0,2230.0,1847.0,1736.0,1198.0,1070.0,0.439,0.585
1227,0021001228,2011-04-13,MEM @ LAC,Memphis Grizzlies,1610612763,Los Angeles Clippers,1610612746,L,0,1,...,970.0,955.0,2391.0,2501.0,1691.0,1813.0,1145.0,1343.0,0.561,0.390
1228,0021001229,2011-04-13,PHX vs. SAS,Phoenix Suns,1610612756,San Antonio Spurs,1610612759,W,1,0,...,821.0,829.0,2478.0,2603.0,1945.0,1836.0,1169.0,1101.0,0.488,0.744


# More data processing/redefining teams as winners and losers   

In [22]:
# Possession estimate for each team: FGA + 0.44 * FTA - OREB + TOV.
for side in ("TEAM_1", "TEAM_2"):
    game_data[f"{side}_POSS"] = (
        game_data[f"{side}_FGA"]
        + 0.44 * game_data[f"{side}_FTA"]
        - game_data[f"{side}_OREB"]
        + game_data[f"{side}_TOV"]
    )

# PPP: a team's points divided by its own possessions.
for side in ("TEAM_1", "TEAM_2"):
    game_data[f"{side}_PPP"] = game_data[f"{side}_PTS"] / game_data[f"{side}_POSS"]

# DEF_PPP: the other team's points divided by this team's possessions.
for side, other in (("TEAM_1", "TEAM_2"), ("TEAM_2", "TEAM_1")):
    game_data[f"{side}_DEF_PPP"] = game_data[f"{other}_PTS"] / game_data[f"{side}_POSS"]

# True Shooting Percentage (TS%): PTS / (2 * (FGA + 0.44 * FTA)).
for side in ("TEAM_1", "TEAM_2"):
    game_data[f"{side}_TS%"] = game_data[f"{side}_PTS"] / (
        2 * (game_data[f"{side}_FGA"] + 0.44 * game_data[f"{side}_FTA"])
    )

# Effective Field Goal Percentage (eFG%): (FGM + 0.5 * FG3M) / FGA.
for side in ("TEAM_1", "TEAM_2"):
    game_data[f"{side}_eFG%"] = (
        game_data[f"{side}_FGM"] + 0.5 * game_data[f"{side}_FG3M"]
    ) / game_data[f"{side}_FGA"]

# Final layout: game identity columns, then the same metric sequence for
# team 1 and for team 2 (team 2's block is led by its home/away flag).
per_team_metrics = [
    "PTS", "POSS", "PPP", "DEF_PPP", "TS%", "eFG%", "FGA", "FGM", "FG3M",
    "FTA", "FG_PCT", "OREB", "DREB", "AST", "TOV", "WIN_PCT",
]
new_column_order = (
    ["GAME_ID", "GAME_DATE", "MATCHUP", "TEAM_NAME", "TEAM_ID", "TEAM_2_NAME", "TEAM_2_ID", "WL", "HOME/AWAY"]
    + [f"TEAM_1_{metric}" for metric in per_team_metrics]
    + ["TEAM_2_HOME/AWAY"]
    + [f"TEAM_2_{metric}" for metric in per_team_metrics]
)

# Give the first team's columns an explicit TEAM_1_ prefix to mirror TEAM_2_.
sorted_game_data = game_data[new_column_order].rename(
    columns={
        'TEAM_NAME': 'TEAM_1_NAME',
        'TEAM_ID': 'TEAM_1_ID',
        'HOME/AWAY': 'TEAM_1_HOME/AWAY',
        'WL': 'TEAM_1_WIN/LOSS',
    }
)

sorted_game_data


KeyError: 'TEAM_1_3PM'

# Saving data... One year at a time 


In [None]:
from pathlib import Path

# Write this season's table to its own CSV. The output folder is created on
# first use so the save does not fail on a fresh checkout.
output_path = Path('data/game_by_game_season_stats') / f'game_data_{YEAR}.csv'
output_path.parent.mkdir(parents=True, exist_ok=True)
sorted_game_data.to_csv(output_path, index=False)