## Import Required Libraries

In [53]:
!pip install --q -r requirements.txt

In [54]:
from nba_api.stats.endpoints import teamdetails, commonteamyears, teamgamelogs, playergamelogs, commonplayerinfo, boxscoreadvancedv3, boxscoretraditionalv3, boxscoremiscv3, scoreboardv2
from nba_api.live.nba.endpoints import playbyplay
import pandas as pd
from tqdm import tqdm
import os
import pandavro as pdx
import numpy as np
import time
from datetime import datetime, date
from warnings import filterwarnings
import json
import multiprocessing as mp
filterwarnings('ignore')

## Get All Team IDs

In [55]:
# Cache one row per NBA franchise so later cells can iterate over team IDs
# without calling the API again.
if 'all_teams.parquet' not in os.listdir('Static Files'):
    teams = commonteamyears.CommonTeamYears()
    print('Established connection')
    teams = teams.get_dict()
    print('Retrieved list of dicts')
    # Rows are read positionally: fields 1-4 become team_id, year_founded,
    # year_depreciated and abbreviation (presumably TEAM_ID, MIN_YEAR, MAX_YEAR,
    # ABBREVIATION in the endpoint's header order -- verify against nba_api docs).
    # The payload is already in memory, so no per-row rate-limit sleep is needed.
    team_ids = [
        (team[1], team[2], team[3], team[4])
        for team in tqdm(teams['resultSets'][0]['rowSet'])
    ]
    print('Established list of tuples')
    # Column names are kept exactly as before because the parquet file is read
    # by later cells.
    teams = pd.DataFrame(team_ids, columns=['team_id', 'year_founded', 'year_depreciated', 'abbreviation'])
    print('Created DataFrame')
    teams.to_parquet('Static Files/all_teams.parquet', index=False)

## Get All Team Details

In [56]:
# Download the first TeamDetails data frame for every cached team ID and store
# the concatenated result as team_details.parquet.
if 'team_details.parquet' not in os.listdir('Static Files'):
    teams = pd.read_parquet('Static Files/all_teams.parquet')
    detail_frames = []
    for franchise_id in tqdm(teams['team_id']):
        endpoint = teamdetails.TeamDetails(team_id=franchise_id, timeout=60)
        detail_frames.append(endpoint.get_data_frames()[0])
        # Pause between requests to stay under the API rate limit.
        time.sleep(0.5)
    output_df = pd.concat(detail_frames)
    output_df.to_parquet('Static Files/team_details.parquet')

## Get All Team Game Logs

In [57]:
# Download team game logs for every season from 2000-01 through 2024-25 and
# cache them as a single parquet file.
if 'team_game_logs.parquet' not in os.listdir('Static Files'):
    # Build the season labels directly ('2000-01', ..., '2009-10', ..., '2024-25').
    # The previous counter-based logic incremented before its first two-digit
    # request and therefore never asked for the 2009-10 season.
    seasons = [f'{year}-{(year + 1) % 100:02d}' for year in range(2000, 2025)]
    initial_df = []
    for season in tqdm(seasons):
        get_game_stats = teamgamelogs.TeamGameLogs(season_nullable=season)
        initial_df.append(get_game_stats.get_data_frames()[0])
        # Pause between requests to stay under the API rate limit.
        time.sleep(0.5)

    output_df = pd.concat(initial_df)
    output_df.to_parquet('Static Files/team_game_logs.parquet')

## Get All Current Player Injury Reports

In [58]:
# Keep a cached copy of ESPN's injury tables and re-download it only when the
# cached copy was not written today.
injury_report_path = 'Static Files/player_injury_reports.avro'
report_is_stale = True
if 'player_injury_reports.avro' in os.listdir('Static Files'):
    player_injuries = pdx.read_avro(injury_report_path)
    # Compare calendar days, not full timestamps. The old check compared the bool
    # returned by Series.all() with datetime.today(), which is never equal, so the
    # report was re-downloaded on every run.
    # NOTE(review): days are compared in UTC; confirm how pandavro round-trips the
    # naive timestamp written below if local-day semantics matter.
    last_updated = pd.to_datetime(player_injuries['date_updated'], utc=True)
    today_utc = pd.Timestamp.now(tz='UTC').normalize()
    report_is_stale = last_updated.empty or not (last_updated.dt.normalize() == today_utc).all()

if report_is_stale:
    player_injuries = pd.concat(pd.read_html('https://www.espn.com/nba/injuries')).fillna('No Info Available')
    player_injuries['date_updated'] = datetime.today()
    pdx.to_avro(injury_report_path, player_injuries)

## Get All Player Game Logs

In [59]:
# Download player game logs for every season from 2000-01 through 2024-25 and
# cache them as a single parquet file.
if 'player_game_logs.parquet' not in os.listdir('Static Files'):
    # Build the season labels directly ('2000-01', ..., '2009-10', ..., '2024-25').
    # The previous counter-based logic incremented before its first two-digit
    # request and therefore never asked for the 2009-10 season.
    seasons = [f'{year}-{(year + 1) % 100:02d}' for year in range(2000, 2025)]
    initial_df = []
    for season in tqdm(seasons):
        get_game_stats = playergamelogs.PlayerGameLogs(season_nullable=season)
        initial_df.append(get_game_stats.get_data_frames()[0])
        # Pause between requests to stay under the API rate limit.
        time.sleep(0.5)

    output_df = pd.concat(initial_df)
    output_df.to_parquet('Static Files/player_game_logs.parquet')

## Get Player Statuses

In [60]:
# Maintain 'player_statuses.avro': one (PERSON_ID, ROSTERSTATUS) row per player
# that appears in the cached player game logs. Only players without a stored
# status are requested, so an interrupted run resumes where it stopped.
statuses_path = 'Static Files/player_statuses.avro'
players = pd.read_parquet('Static Files/player_game_logs.parquet')[['PLAYER_ID']].drop_duplicates()

if 'player_statuses.avro' in os.listdir('Static Files'):
    statuses = pdx.read_avro(statuses_path)
    # Anti-join: keep the players that have no stored status yet.
    missing_ids = players.loc[~players['PLAYER_ID'].isin(statuses['PERSON_ID']), 'PLAYER_ID']
else:
    missing_ids = players['PLAYER_ID']

# The previous condition `(file exists) & len(missing) > 0` parsed as
# `((file exists) & len(missing)) > 0`, so an even number of missing players
# silently skipped the download. Iterating over the (possibly empty) series of
# missing IDs needs no such condition.
for player in tqdm(missing_ids):
    player_status = commonplayerinfo.CommonPlayerInfo(player_id=player).get_data_frames()[0][['PERSON_ID', 'ROSTERSTATUS']]
    pdx.to_avro(statuses_path, player_status, append=True)
    # Pause between requests to stay under the API rate limit.
    time.sleep(0.5)

# Re-read once after appending so `statuses` (used by the following cell)
# includes the rows that were just written.
if len(missing_ids) > 0:
    statuses = pdx.read_avro(statuses_path)

## Get Current Player Game Logs

In [61]:
# Keep only the game logs of players whose roster status is "Active" and cache
# them. Relies on `statuses` having been loaded by the player-status cell.
if 'current_player_game_logs.parquet' not in os.listdir('Static Files'):
    player_game_logs = pd.read_parquet('Static Files/player_game_logs.parquet')
    # Statuses are appended to the avro file over several runs; collapse any
    # repeated PERSON_ID so the merge cannot duplicate game-log rows.
    roster_statuses = statuses.drop_duplicates(subset='PERSON_ID', keep='last')
    # Hold the filtered frame itself; the name used to be bound to the None
    # returned by to_parquet().
    current_player_game_logs = player_game_logs.merge(roster_statuses, left_on='PLAYER_ID', right_on='PERSON_ID').query('ROSTERSTATUS == "Active"')
    current_player_game_logs.to_parquet('Static Files/current_player_game_logs.parquet')

## Test Out Live Play-By-Play Endpoint

In [62]:
# Build an example table that pairs every play of one game with the box-score
# line (traditional + advanced + misc) of the player involved, print each team's
# top player by PIE, and cache the table as play_by_play_example.parquet.
if 'play_by_play_example.parquet' not in os.listdir('Static Files'):
    # Game ID for the live game you are tracking (replace with actual game ID)
    game_id = "0022301196"

    pbp_data = playbyplay.PlayByPlay(game_id).get_dict()
    plays = pbp_data['game']['actions']

    # Sort plays by actionNumber to process them in order
    plays.sort(key=lambda x: x['actionNumber'])

    # Print the total number of plays
    print(f"Total number of plays: {len(plays)}")

    # The three box-score requests depend only on game_id, so they are made once
    # here instead of once per play (which cost three API calls per play).
    # NOTE(review): for a game still in progress this records one snapshot rather
    # than a fresh box score per play -- confirm that is acceptable.
    box_scores = []
    try:
        for endpoint in (
            boxscoretraditionalv3.BoxScoreTraditionalV3,
            boxscoreadvancedv3.BoxScoreAdvancedV3,
            boxscoremiscv3.BoxScoreMiscV3,
        ):
            box_scores.append(endpoint(game_id=game_id).get_data_frames()[0])
    except Exception as e:
        print(f"Error fetching box scores for game {game_id}: {e}")
        box_scores = []

    # Rows are collected in a list and turned into a DataFrame once, avoiding a
    # quadratic pd.concat inside the loop.
    player_rows = []
    # Track the last processed play to avoid duplicates
    last_processed_play = None

    for play in tqdm(plays if box_scores else [], desc="Processing plays"):
        action_number = play['actionNumber']

        # Only process the play if it's new (not processed yet)
        if last_processed_play is not None and action_number <= last_processed_play:
            continue
        last_processed_play = action_number

        player_id = play.get('personId', None)

        # Find the stats for the player involved in the play; plays whose player
        # is missing from any of the three box scores are skipped.
        player_lines = [stats[stats['personId'] == player_id] for stats in box_scores]
        if any(line.empty for line in player_lines):
            continue

        # Start from the play-level fields ...
        player_data = {
            'player_id': player_id,
            'action_type': play.get('actionType', ''),
            'game_id': game_id,
            'clock': play.get('clock', ''),
            'description': play.get("description", ""),
            'descriptor': play.get("descriptor", ""),
            'period': play.get("period", ""),
            'period_type': play.get("periodType", ""),
            'qualifiers': play.get("qualifiers", ""),
            'shot_distance': play.get("shotDistance", ""),
            'shot_result': play.get("shotResult", ""),
            'side': play.get("side", ""),
        }
        # ... then add every traditional, advanced and misc column (in that
        # order, so later sources overwrite earlier ones on a name clash).
        for line in player_lines:
            player_data.update(dict(zip(line.columns.tolist(), line.iloc[0].tolist())))

        player_rows.append(player_data)

    player_stats_df = pd.DataFrame(player_rows)

    if player_stats_df.empty:
        print(f"No player stats collected for game {game_id}; nothing saved.")
    else:
        player_stats_df['team_name'] = player_stats_df['teamCity'] + " " + player_stats_df['teamName']
        # errors='ignore' tolerates columns the API did not return. Previously a
        # KeyError here was the only path that reached the rename/save code
        # below, so a successful drop meant the file was never written.
        player_stats_df.drop(
            columns=[
                'gameId', 'teamTricode', 'teamSlug', 'personId', 'firstName',
                'familyName', 'playerSlug', 'estimatedOffensiveRating',
                'estimatedDefensiveRating', 'estimatedNetRating',
                'offensiveReboundPercentage', 'defensiveReboundPercentage',
                'turnoverRatio', 'estimatedUsagePercentage', 'estimatedPace',
                'jerseyNum', 'teamCity', 'teamName',
            ],
            errors='ignore',
            inplace=True,
        )
        player_stats_df.rename(
            columns={
                'teamId': 'team_id',
                'nameI': 'player_name',
                'fieldGoalsMade': 'fg_made',
                'fieldGoalsAttempted': 'fg_attempts',
                'fieldGoalPercentage': 'fg_percentage',
                'threePointersMade': 'fg3_made',
                'threePointersAttempted': 'fg3_attempts',
                'threePointersPercentage': 'fg3_percentage',
                'freeThrowsMade': 'ft_made',
                'freeThrowsAttempted': 'ft_attempts',
                'freeThrowPercentage': 'ft_percentage',
                'reboundsOffensive': 'offensive_rebounds',
                'reboundsDefensive': 'defensive_rebounds',
                'reboundsTotal': 'total_rebounds',
                'foulsPersonal': 'personal_fouls',
                'plusMinusPoints': 'plus_minus',
                'offensiveRating': 'offensive_rating',
                'defensiveRating': 'defensive_rating',
                'netRating': 'net_rating',
                'assistPercentage': 'assist_percentage',
                'assistToTurnover': 'assist_turnover_ratio',
                'assistRatio': 'assist_ratio',
                'reboundPercentage': 'rebound_percentage',
                'effectiveFieldGoalPercentage': 'eff_fg_percentage',
                'trueShootingPercentage': 'true_shooting_percentage',
                'usagePercentage': 'usage_percentage',
                'pointsOffTurnovers': 'turnover_points',
                'pointsSecondChance': 'second_chance_points',
                'pointsFastBreak': 'fast_break_points',
                'pointsPaint': 'paint_points',
                'blocksAgainst': 'blocks_against',
                'foulsDrawn': 'fouls_drawn',
            },
            inplace=True,
        )

        # Series.replace only swaps whole cell values, so the old chained call
        # never changed the clock strings; .str.replace strips the substrings.
        player_stats_df['clock'] = (
            player_stats_df['clock']
            .str.replace('PT', '', regex=False)
            .str.replace('M', '', regex=False)
            .str.replace('.00S', '', regex=False)
        )
        pd.set_option("display.max_columns", 0)

        # Row with the highest PIE for each team (assumes two teams are present).
        best_players = player_stats_df.groupby('team_name')['PIE'].idxmax()
        best_players = player_stats_df.loc[best_players, ['player_name', 'team_name', 'PIE']]
        team_1 = best_players['team_name'].iloc[0]
        player_1 = best_players['player_name'].iloc[0]
        player_1_pie = best_players['PIE'].iloc[0]
        team_2 = best_players['team_name'].iloc[1]
        player_2 = best_players['player_name'].iloc[1]
        player_2_pie = best_players['PIE'].iloc[1]
        display(player_stats_df.head())
        print(f'{team_1}: {player_1} ({player_1_pie}) | {team_2}: {player_2} ({player_2_pie})')
        player_stats_df.replace(value=np.nan, to_replace='').to_parquet('Static Files/play_by_play_example.parquet')


In [63]:
# Collect the game IDs scheduled on every day of the 2024-25 date window and
# cache them as one (game_date, game_id) row per game.
if '2024-25_game_ids.parquet' not in os.listdir('Static Files'):
    start = datetime.strptime("2024-10-04", "%Y-%m-%d")
    end = datetime.strptime("2025-04-15", "%Y-%m-%d")

    date_range = pd.date_range(start=start, end=end).to_list()

    games = {}
    # The loop variable is deliberately not called `date`: that name is the class
    # imported from `datetime` at the top of the file and must not be clobbered.
    for game_day in tqdm(date_range):
        scoreboard = json.loads(scoreboardv2.ScoreboardV2(game_date=game_day).get_json())
        day_key = game_day.strftime('%Y-%m-%d')

        # resultSets[0] is the 'GameHeader' set; index 2 of each row is the 'GAME_ID'.
        for game in scoreboard['resultSets'][0]['rowSet']:
            games.setdefault(day_key, []).append(game[2])

        time.sleep(0.5)  # Delay to avoid rate limits

    # Days without games never get a key, so they produce no rows.
    games_2025 = pd.DataFrame(list(games.items()), columns=['game_date', 'game_id'])
    games_2025 = games_2025.explode(column='game_id')
    games_2025.to_parquet('Static Files/2024-25_game_ids.parquet')

In [64]:
# Load the cached game logs used by the advanced-stat functions below:
# game_log_df holds the current-player game logs, game_log_df2 the team game logs.
game_log_df = pd.read_parquet('Static Files/current_player_game_logs.parquet')
game_log_df2 = pd.read_parquet('Static Files/team_game_logs.parquet')

def simulation(game_id):
    """Return the play-by-play feed of one game as a DataFrame.

    Fetches every action of ``game_id`` from the live play-by-play endpoint and
    keeps the period, remaining period time, description and running score.

    Args:
        game_id: NBA game identifier passed to ``playbyplay.PlayByPlay``.

    Returns:
        A DataFrame with the columns ``game_id``, ``period``,
        ``period_time_remaining``, ``description``, ``home_score`` and
        ``away_score``, one row per usable play. The frame is empty (but still
        has those columns) when the feed has no usable plays.
    """
    print(f"Starting mock live updates for game: {game_id}")
    plays = playbyplay.PlayByPlay(game_id).get_dict()['game']['actions']
    columns = ['game_id', 'period', 'period_time_remaining', 'description', 'home_score', 'away_score']

    if not plays:
        # Previously this message sat inside the loop and was unreachable, and
        # pd.concat([]) then raised ValueError.
        print("no plays yet...")
        return pd.DataFrame(columns=columns)

    rows = []
    for play in plays:
        try:
            rows.append({
                'game_id': game_id,
                'period': play['period'],
                # 'PT11M43.00S' -> '11:43'
                'period_time_remaining': play['clock'].replace('PT', '').replace('M', ':')[:5],
                'description': play['description'],
                'home_score': play['scoreHome'],
                'away_score': play['scoreAway'],
            })
        except KeyError:
            # Skip actions that lack any of the required fields.
            continue

    return pd.DataFrame(rows, columns=columns)
current_game = simulation('0022000196')

Starting mock live updates for game: 0022000196


## Get Advanced Player Game Stats

In [65]:

def fetch_advanced_stats():
    """Cache per-player advanced box-score stats for the 2023-24 season.

    Does nothing when 'current_player_advanced_game_stats.parquet' already
    exists in 'Static Files'. Otherwise requests the first BoxScoreAdvancedV3
    data frame for every 2023-24 game ID found in the module-level
    ``game_log_df``, flattens it to one dict per row, and writes the result to
    that parquet file. Games whose request or parsing fails are reported and
    skipped.
    """
    if 'current_player_advanced_game_stats.parquet' in os.listdir('Static Files'):
        return

    # Game IDs of the 2023-24 season
    season_mask = game_log_df['SEASON_YEAR'] == '2023-24'
    games = game_log_df[season_mask]['GAME_ID'].unique()
    rows = []

    for game_id in tqdm(games):
        try:
            game = boxscoreadvancedv3.BoxScoreAdvancedV3(game_id).get_data_frames()[0]

            # Build this game's rows separately so a failure part-way through
            # contributes nothing to the final table.
            game_rows = []
            for player_stats in game.itertuples():
                game_rows.append({
                    'game_id': game_id,
                    'team_id': player_stats.teamId,
                    'team_name': f"{player_stats.teamCity} {player_stats.teamName}",
                    'player_id': player_stats.personId,
                    'player_name': f"{player_stats.firstName} {player_stats.familyName}",
                    'position': player_stats.position,
                    'comment': player_stats.comment,
                    'offensive_rating': player_stats.offensiveRating,
                    'defensive_rating': player_stats.defensiveRating,
                    'net_rating': player_stats.netRating,
                    'assist_percentage': player_stats.assistPercentage,
                    'assist_turnover_ratio': player_stats.assistToTurnover,
                    'assist_ratio': player_stats.assistRatio,
                    'usage_percentage': player_stats.usagePercentage,
                    'possessions': player_stats.possessions,
                    'effective_fg_percentage': player_stats.effectiveFieldGoalPercentage,
                    'true_shooting_percentage': player_stats.trueShootingPercentage,
                    'player_impact_score': player_stats.PIE,
                })
            rows.extend(game_rows)

            # Pause after a successful request to avoid hitting the rate limit
            time.sleep(0.5)
        except Exception as e:
            print(f"Error processing game {game_id}: {e}")

    # One DataFrame for all games, written in a single pass
    df = pd.DataFrame(rows)
    df.to_parquet('Static Files/current_player_advanced_game_stats.parquet')

# Run the function
fetch_advanced_stats()


## Get Advanced Team Game Stats

In [66]:
def fetch_team_advanced_stats():
    """Cache per-team advanced box-score stats for the 2023-24 season.

    Does nothing when 'current_team_advanced_game_stats.parquet' already exists
    in 'Static Files'. Otherwise requests the second BoxScoreAdvancedV3 data
    frame (presumably the team-level rows) for every 2023-24 game ID found in
    the module-level ``game_log_df2`` and writes the flattened rows to that
    parquet file. Games whose request or parsing fails are reported and skipped.
    """
    if 'current_team_advanced_game_stats.parquet' not in os.listdir('Static Files'):
        # Filter games for the 2023-24 season
        games = game_log_df2[game_log_df2['SEASON_YEAR'] == '2023-24']['GAME_ID'].unique()
        rows = []

        # Iterate over each game
        for game_id in tqdm(games):
            try:
                # Fetch box score data for the current game
                game = boxscoreadvancedv3.BoxScoreAdvancedV3(game_id).get_data_frames()[1]

                # Extract relevant fields and append rows
                rows.extend([{
                    'game_id': game_id,
                    'team_id': team_stats.teamId,
                    'team_name': f"{team_stats.teamCity} {team_stats.teamName}",
                    'offensive_rating': team_stats.offensiveRating,
                    'defensive_rating': team_stats.defensiveRating,
                    'net_rating': team_stats.netRating,
                    'assist_percentage': team_stats.assistPercentage,
                    'assist_turnover_ratio': team_stats.assistToTurnover,
                    'assist_ratio': team_stats.assistRatio,
                    'possessions': team_stats.possessions,
                    'effective_fg_percentage': team_stats.effectiveFieldGoalPercentage,
                    'true_shooting_percentage': team_stats.trueShootingPercentage
                } for team_stats in game.itertuples()])

                # Sleep to avoid hitting the rate limit
                time.sleep(1)
            except Exception as e:
                # `except Exception` (not a bare except) so Ctrl-C still stops
                # the loop. No partial file is written here: a partial file at
                # the final path would make the existence check above treat an
                # incomplete download as finished on the next run.
                print(f"Error processing game {game_id}: {e}")
                continue

        # Convert list of rows to DataFrame
        df = pd.DataFrame(rows)

        # Save to Parquet
        df.to_parquet('Static Files/current_team_advanced_game_stats.parquet')

# Run the function
fetch_team_advanced_stats()
