# Getting and filtering data for each game for seasons from 2011-2022 (12 seasons)

In [4]:
from nba_api.stats.endpoints import leaguegamelog
import pandas as pd

# Season start year. Used further down to filter the season stats table,
# to fetch the game logs, and to name the output CSV file.
YEAR = 2023

# The six values below are passed positionally to
# leaguegamelog.LeagueGameLog inside fetch_season_game_logs, in this order,
# with the season string inserted between PLAYER_OR_TEAM and SEASON_TYPE.
# NOTE(review): "T" presumably selects team-level (not player-level) logs,
# since TEAM_ID/TEAM_NAME are read from the result -- confirm against nba_api.
COUNTER =  1
DIRECTION = "ASC"
LEAGUE = "00"
PLAYER_OR_TEAM = "T"
SEASON_TYPE = "Regular Season"
SORTER = "DATE"

def fetch_season_game_logs(season):
    """
    Request the league game log for one season and trim it down to the
    columns needed to describe each matchup.

    Argument:
        season: season string handed to LeagueGameLog (the caller in
        this file builds it as e.g. '2010-11').

    Returns:
        DataFrame (a copy, independent of the raw response) with the
        columns TEAM_ID, TEAM_NAME, GAME_ID, MATCHUP, WL and GAME_DATE,
        plus 'HOME/AWAY' (0 when MATCHUP contains '@', otherwise 1) and
        'TEAM_2_HOME/AWAY' (1 minus 'HOME/AWAY').
    """

    def _home_flag(matchup):
        # '@' in the matchup text marks the row's team as the visitor.
        return 0 if '@' in matchup else 1

    # Build the endpoint request; arguments are positional, in the order
    # the module-level constants are listed.
    request = leaguegamelog.LeagueGameLog(
        COUNTER,
        DIRECTION,
        LEAGUE,
        PLAYER_OR_TEAM,
        season,
        SEASON_TYPE,
        SORTER,
    )

    # The first returned frame holds the game log rows.
    raw_log = request.get_data_frames()[0]

    wanted = ["TEAM_ID", "TEAM_NAME", "GAME_ID", "MATCHUP", "WL", "GAME_DATE"]
    trimmed = raw_log[wanted].copy()

    home_flag = trimmed['MATCHUP'].apply(_home_flag)
    trimmed['HOME/AWAY'] = home_flag
    trimmed['TEAM_2_HOME/AWAY'] = 1 - home_flag

    return trimmed

def fetch_multiple_seasons(start_year, end_year):
    """
    Fetch the game logs for a range of seasons and attach, to every row,
    the ID and name of the other team that played in the same game.

    Arguments:
        start_year: start year of the first season (inclusive); 2010
        produces the season string '2010-11'.
        end_year: start year of the last season (inclusive).

    Returns:
        DataFrame sorted by GAME_ID with a fresh 0..n-1 index and the
        columns GAME_ID, GAME_DATE, MATCHUP, TEAM_NAME, TEAM_ID,
        TEAM_2_NAME, TEAM_2_ID, WL, HOME/AWAY, TEAM_2_HOME/AWAY. Rows
        2k and 2k+1 belong to the same game and reference each other
        through the TEAM_2_* columns.

    Raises:
        ValueError: if start_year is greater than end_year, or if some
        GAME_ID does not occur on exactly two rows (opponent pairing
        would otherwise be silently misaligned).
    """
    if start_year > end_year:
        raise ValueError(
            f"start_year ({start_year}) must not be greater than end_year ({end_year})"
        )

    # One frame per season; the season string looks like '2010-11'.
    all_seasons_data = [
        fetch_season_game_logs(f"{year}-{str(year + 1)[-2:]}")
        for year in range(start_year, end_year + 1)
    ]

    combined_data = pd.concat(all_seasons_data, ignore_index=True)
    # A stable sort keeps the two rows of a game in the order the API
    # returned them, so which team ends up first is deterministic.
    combined_data = combined_data.sort_values(
        by='GAME_ID', ascending=True, kind='mergesort'
    ).reset_index(drop=True)

    # The pairing below relies on every game occupying exactly two
    # adjacent rows; verify that instead of trusting row parity blindly.
    rows_per_game = combined_data['GAME_ID'].value_counts()
    unpaired = rows_per_game[rows_per_game != 2]
    if not unpaired.empty:
        raise ValueError(
            f"{len(unpaired)} GAME_ID(s) do not have exactly two rows, "
            f"e.g. {list(unpaired.index[:5])}"
        )

    # XOR with 1 swaps each even position with the following odd one
    # (0<->1, 2<->3, ...), i.e. it points every row at its opponent's row.
    opponent_rows = [position ^ 1 for position in range(len(combined_data))]
    combined_data['TEAM_2_ID'] = combined_data['TEAM_ID'].take(opponent_rows).to_numpy()
    combined_data['TEAM_2_NAME'] = combined_data['TEAM_NAME'].take(opponent_rows).to_numpy()

    columns_order = [
        "GAME_ID", "GAME_DATE", "MATCHUP", "TEAM_NAME", "TEAM_ID",
        "TEAM_2_NAME", "TEAM_2_ID", "WL", "HOME/AWAY", "TEAM_2_HOME/AWAY",
    ]
    return combined_data[columns_order]

# Preview the game logs for the configured season. Uses YEAR rather than a
# hard-coded literal so this cell follows the setting at the top of the file.
fetch_multiple_seasons(YEAR, YEAR)

# Acquiring Season Stats for each team

In [2]:
def load_dataframe(columns=None, csv_path=None):
    """
    Load selected columns of the team-by-year statistics CSV into a
    pandas DataFrame.

    The columns YEAR, TEAM_ID and NBA_FINALS_APPEARANCE are always
    loaded; the requested statistic columns are added to them.

    Arguments:
        columns: list (or other iterable) of column names of the
        statistics to load in addition to the three fixed columns.
        Defaults to ['CONF_RANK'] when omitted.
        csv_path: path of the CSV file to read. Defaults to the
        project's team_year_stats.csv location when omitted.

    Returns:
        data_frame: DataFrame holding only the selected columns, one
        row per row of the CSV file.
    """

    # None stands in for the defaults so no mutable list is shared
    # between calls.
    if columns is None:
        columns = ["CONF_RANK"]
    if csv_path is None:
        csv_path = "/home/swisnoski/nba_predictor_development/models/data/team_year_stats.csv"

    # fixed columns first, then the requested ones; dict.fromkeys drops
    # repeated names while keeping their order
    column_list = list(
        dict.fromkeys(["YEAR", "TEAM_ID", "NBA_FINALS_APPEARANCE", *columns])
    )

    # data frame restricted to the selected columns
    data_frame = pd.read_csv(csv_path, usecols=column_list)

    return data_frame

# Season statistics loaded for every team.
attribute_list = ["PTS","FGA", "FGM", "FTA", "FG3M", "FG_PCT", "OREB","DREB","AST", "TOV", "WIN_PCT"]
nba_dataframe = load_dataframe(attribute_list)
# Discard rows that report zero made field goals.
nba_dataframe = nba_dataframe.drop(nba_dataframe[nba_dataframe["FGM"] == 0].index)

# YEAR is read as text and split on '-' (presumably a season string such as
# '2023-24'); keep the part before the dash as a number, then order by it.
nba_dataframe['YEAR'] = pd.to_numeric(nba_dataframe['YEAR'].str.split('-').str[0])
nba_dataframe = nba_dataframe.sort_values(by="YEAR")

# Encode the finals column numerically. The result is assigned back to the
# column: calling fillna/replace with inplace=True on a column selection
# modifies a temporary object and is not guaranteed to update the frame.
# NOTE(review): "FINALS APPEARANCE" is mapped to 0.0, the same value as
# "no appearance"; 1 may have been intended -- confirm before changing.
nba_dataframe["NBA_FINALS_APPEARANCE"] = (
    nba_dataframe["NBA_FINALS_APPEARANCE"]
    .fillna(0.0)
    .replace({"FINALS APPEARANCE": 0.0, "LEAGUE CHAMPION": 2})
)

# Keep only the configured season.
nba_dataframe = nba_dataframe[nba_dataframe['YEAR'] == YEAR]
season_stat_dataframe = nba_dataframe.reset_index(drop=True)

season_stat_dataframe



Unnamed: 0,TEAM_ID,YEAR,WIN_PCT,NBA_FINALS_APPEARANCE,FGM,FGA,FG_PCT,FG3M,FTA,OREB,DREB,AST,TOV,PTS
0,1610612737,2023,0.439,0.0,3529,7584,0.465,1125,1906,1024,2639,2180,1110,9703
1,1610612738,2023,0.78,2.0,3601,7396,0.487,1351,1654,876,2923,2207,979,9887
2,1610612761,2023,0.305,0.0,3466,7356,0.471,942,1772,891,2607,2335,1147,9213
3,1610612763,2023,0.329,0.0,3145,7230,0.435,1071,1722,896,2598,2025,1236,8677
4,1610612762,2023,0.378,0.0,3443,7371,0.467,1060,1853,1002,2725,2232,1285,9484
5,1610612758,2023,0.561,0.0,3553,7455,0.477,1178,1710,888,2719,2324,1078,9558
6,1610612759,2023,0.268,0.0,3438,7436,0.462,1036,1637,849,2778,2449,1242,9192
7,1610612760,2023,0.695,0.0,3653,7324,0.499,1090,1759,722,2725,2223,1039,9847
8,1610612747,2023,0.573,0.0,3580,7177,0.499,969,1983,676,2859,2340,1146,9679
9,1610612757,2023,0.256,0.0,3227,7356,0.439,939,1680,1036,2469,1894,1249,8722


# Combining Dataframes
For each game, listing the stats of both teams for comparison

In [3]:
# As a reminder: attribute_list = ["PTS","FGA", "FGM", "FTA", "FG3M", "FG_PCT", "OREB","DREB","AST", "TOV", "WIN_PCT"]

def update_game_data(game_data, season_team_data, attribute_list):
    """
    Attach the season statistics of both teams to every game row and
    keep one row per game.

    Arguments:
        game_data: DataFrame with the columns TEAM_ID and TEAM_2_ID,
        laid out so that every second row (0, 2, 4, ...) represents a
        distinct game. It is not modified.
        season_team_data: DataFrame with a TEAM_ID column and one
        column per entry of attribute_list. If a TEAM_ID occurs more
        than once, its first row is used.
        attribute_list: names of the statistic columns to copy.

    Returns:
        New DataFrame holding every second row of game_data (index
        reset) with, for each attribute, the added columns
        'TEAM_1_<attr>' (statistic of TEAM_ID) and 'TEAM_2_<attr>'
        (statistic of TEAM_2_ID).

    Raises:
        IndexError: if a team referenced by game_data has no row in
        season_team_data.
        KeyError: if an attribute is not a column of season_team_data.
    """
    # One row per team, addressable by TEAM_ID, so that statistics can be
    # looked up for the whole column at once instead of filtering the
    # season table once per game row.
    lookup = season_team_data.drop_duplicates(subset='TEAM_ID', keep='first').set_index('TEAM_ID')

    # Fail early, naming the offending teams, rather than part-way
    # through with an out-of-bounds error.
    referenced = pd.concat([game_data['TEAM_ID'], game_data['TEAM_2_ID']], ignore_index=True)
    missing = referenced[~referenced.isin(lookup.index)].unique()
    if len(missing) > 0:
        raise IndexError(
            f"No season data for TEAM_ID(s): {list(missing)}"
        )

    # Work on a copy so the caller's frame does not gain columns.
    result = game_data.copy()
    for attr in attribute_list:
        stat_by_team = lookup[attr]
        result[f'TEAM_1_{attr}'] = result['TEAM_ID'].map(stat_by_team)
        result[f'TEAM_2_{attr}'] = result['TEAM_2_ID'].map(stat_by_team)

    # Select every second row and reset the index
    return result.iloc[::2].reset_index(drop=True)


# Fetch the game logs for the configured season only (start and end year
# are both YEAR).
game_data = fetch_multiple_seasons(YEAR, YEAR)
# Season statistics table built in the previous cell, under the name
# expected by update_game_data.
season_team_data = season_stat_dataframe
# Add both teams' season statistics to each game; the returned frame holds
# every second row of the game log.
game_data = update_game_data(game_data, season_team_data, attribute_list)

# Display the combined frame.
game_data





KeyboardInterrupt: 

# More data processing/redefining teams as winners and losers   

In [None]:
# Possessions estimate for each team: FGA + 0.44 * FTA - OREB + TOV
for side in ("TEAM_1", "TEAM_2"):
    game_data[f'{side}_POSS'] = (
        game_data[f'{side}_FGA']
        + 0.44 * game_data[f'{side}_FTA']
        - game_data[f'{side}_OREB']
        + game_data[f'{side}_TOV']
    )

# Points per possession (PPP) for each team
for side in ("TEAM_1", "TEAM_2"):
    game_data[f'{side}_PPP'] = game_data[f'{side}_PTS'] / game_data[f'{side}_POSS']

# Defensive PPP: the opponent's points divided by the team's own possessions
for side, opponent in (("TEAM_1", "TEAM_2"), ("TEAM_2", "TEAM_1")):
    game_data[f'{side}_DEF_PPP'] = game_data[f'{opponent}_PTS'] / game_data[f'{side}_POSS']

# True Shooting Percentage (TS%): PTS / (2 * (FGA + 0.44 * FTA))
for side in ("TEAM_1", "TEAM_2"):
    shooting_attempts = game_data[f'{side}_FGA'] + 0.44 * game_data[f'{side}_FTA']
    game_data[f'{side}_TS%'] = game_data[f'{side}_PTS'] / (2 * shooting_attempts)

# Effective Field Goal Percentage (eFG%): (FGM + 0.5 * FG3M) / FGA
for side in ("TEAM_1", "TEAM_2"):
    weighted_makes = game_data[f'{side}_FGM'] + 0.5 * game_data[f'{side}_FG3M']
    game_data[f'{side}_eFG%'] = weighted_makes / game_data[f'{side}_FGA']

# Final layout: shared game columns, then the TEAM_1 statistics, then the
# TEAM_2 home/away flag followed by the TEAM_2 statistics in the same order.
shared_columns = [
    "GAME_ID", "GAME_DATE", "MATCHUP", "TEAM_NAME", "TEAM_ID",
    "TEAM_2_NAME", "TEAM_2_ID", "WL", "HOME/AWAY",
]
stat_suffixes = [
    "PTS", "POSS", "PPP", "DEF_PPP", "TS%", "eFG%", "FGA", "FGM", "FG3M",
    "FTA", "FG_PCT", "OREB", "DREB", "AST", "TOV", "WIN_PCT",
]
new_column_order = (
    shared_columns
    + [f"TEAM_1_{suffix}" for suffix in stat_suffixes]
    + ["TEAM_2_HOME/AWAY"]
    + [f"TEAM_2_{suffix}" for suffix in stat_suffixes]
)

# Reorder, then give the first team's columns explicit TEAM_1_* names.
team_1_names = {
    'TEAM_NAME': 'TEAM_1_NAME',
    'TEAM_ID': 'TEAM_1_ID',
    'HOME/AWAY': 'TEAM_1_HOME/AWAY',
    'WL': 'TEAM_1_WIN/LOSS',
}
sorted_game_data = game_data[new_column_order].rename(columns=team_1_names)

sorted_game_data


Unnamed: 0,GAME_ID,GAME_DATE,MATCHUP,TEAM_1_NAME,TEAM_1_ID,TEAM_2_NAME,TEAM_2_ID,TEAM_1_WIN/LOSS,TEAM_1_HOME/AWAY,TEAM_1_PTS,...,TEAM_2_FGA,TEAM_2_FGM,TEAM_2_FG3M,TEAM_2_FTA,TEAM_2_FG_PCT,TEAM_2_OREB,TEAM_2_DREB,TEAM_2_AST,TEAM_2_TOV,TEAM_2_WIN_PCT
0,0022300001,2023-11-03,IND vs. CLE,Indiana Pacers,1610612754,Cleveland Cavaliers,1610612739,W,1,10110.0,...,7148.0,3425.0,1108.0,1671.0,0.479,807.0,2742.0,2299.0,1112.0,0.585
1,0022300002,2023-11-03,NYK @ MIL,New York Knicks,1610612752,Milwaukee Bucks,1610612749,L,0,9249.0,...,7258.0,3538.0,1163.0,1960.0,0.487,772.0,2850.0,2176.0,1059.0,0.598
2,0022300003,2023-11-03,MIA vs. WAS,Miami Heat,1610612748,Washington Wizards,1610612764,W,1,9032.0,...,7493.0,3523.0,1015.0,1657.0,0.470,755.0,2613.0,2288.0,1147.0,0.183
3,0022300004,2023-11-03,CHI vs. BKN,Chicago Bulls,1610612741,Brooklyn Nets,1610612751,L,1,9206.0,...,7307.0,3334.0,1089.0,1711.0,0.456,938.0,2675.0,2102.0,1076.0,0.390
4,0022300005,2023-11-03,OKC vs. GSW,Oklahoma City Thunder,1610612760,Golden State Warriors,1610612744,L,1,9847.0,...,7515.0,3582.0,1211.0,1644.0,0.477,996.0,2834.0,2402.0,1174.0,0.561
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1225,0022301226,2023-12-08,DAL @ POR,Dallas Mavericks,1610612742,Portland Trail Blazers,1610612757,W,0,9664.0,...,7356.0,3227.0,939.0,1680.0,0.439,1036.0,2469.0,1894.0,1249.0,0.256
1226,0022301227,2023-12-08,NYK @ BOS,New York Knicks,1610612752,Boston Celtics,1610612738,L,0,9249.0,...,7396.0,3601.0,1351.0,1654.0,0.487,876.0,2923.0,2207.0,979.0,0.780
1227,0022301228,2023-12-08,SAC @ PHX,Sacramento Kings,1610612758,Phoenix Suns,1610612756,W,0,9558.0,...,7063.0,3482.0,1020.0,1915.0,0.493,831.0,2783.0,2218.0,1221.0,0.598
1228,0022301229,2023-12-07,IND @ MIL,Indiana Pacers,1610612754,Milwaukee Bucks,1610612749,W,0,10110.0,...,7258.0,3538.0,1163.0,1960.0,0.487,772.0,2850.0,2176.0,1059.0,0.598


# Saving data... One year at a time 


In [None]:
from nba_api.stats.endpoints import boxscoretraditionalv2
from nba_api.stats.static import teams, players
from concurrent.futures import ThreadPoolExecutor, as_completed

# Replace this with your actual dataframe

# Frame whose GAME_ID column drives the box-score requests below.
df = sorted_game_data
# First 10 rows of df.
# NOTE(review): df_subset is not referenced anywhere else in the visible
# code -- the request loop iterates df['GAME_ID'], i.e. every game. Confirm
# whether the subset was meant to be used there.
df_subset = df.head(10)

# Function to fetch player stats for a given game_id
def fetch_player_stats(game_id):
    """
    Request the traditional box score of one game and return its
    player statistics.

    Argument:
        game_id: identifier of the game, passed to
        BoxScoreTraditionalV2 as game_id.

    Returns:
        DataFrame of the box score's player statistics, with the
        GAME_ID column set to game_id on every row.
    """
    response = boxscoretraditionalv2.BoxScoreTraditionalV2(game_id=game_id)
    stats_frame = response.player_stats.get_data_frame()
    # Tag every row with the game it was requested for.
    return stats_frame.assign(GAME_ID=game_id)

# List to store player stats dataframes
all_player_stats = []

# Using ThreadPoolExecutor for parallel requests
with ThreadPoolExecutor(max_workers=10) as executor:
    # Map each future back to its game so a failure can be attributed
    futures = {
        executor.submit(fetch_player_stats, game_id): game_id
        for game_id in df['GAME_ID']
    }

    for future in as_completed(futures):
        game_id = futures[future]
        try:
            player_stats = future.result()
            all_player_stats.append(player_stats)
        except Exception as e:
            # Best effort: report the failed game and continue with the rest
            print(f"An error occurred for game {game_id}: {e}")

# Concatenate all player stats into a single dataframe. pd.concat raises on
# an empty list, so fall back to an empty frame when every request failed.
if all_player_stats:
    player_stats_df = pd.concat(all_player_stats, ignore_index=True)
else:
    player_stats_df = pd.DataFrame()

# Display the player statistics dataframe
print(player_stats_df)


# Display the player statistics dataframe
player_stats_df


An error occurred: HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out. (read timeout=30)
An error occurred: HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out. (read timeout=30)
An error occurred: HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out. (read timeout=30)
An error occurred: HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out. (read timeout=30)
An error occurred: HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out. (read timeout=30)
An error occurred: HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out. (read timeout=30)
An error occurred: HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out. (read timeout=30)
An error occurred: HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out. (read timeout=30)
An error occurred: HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out. (read timeout=30)
An error occurred: HTTPSConnectionPool(host='s

In [None]:
from pathlib import Path

# Write the season's game table to its own CSV file. The target directory is
# created first because to_csv does not create missing directories.
output_path = Path('data/game_by_game_season_stats') / f'game_data_{YEAR}.csv'
output_path.parent.mkdir(parents=True, exist_ok=True)
sorted_game_data.to_csv(output_path, index=False)