## Import Required Libraries

In [None]:
!pip install --q -r requirements.txt

In [None]:
from nba_api.stats.endpoints import teamdetails, commonteamyears, teamgamelogs, playergamelogs, commonplayerinfo, boxscoreadvancedv3, boxscoretraditionalv3, boxscoremiscv3, scoreboardv2
from nba_api.live.nba.endpoints import playbyplay
import pandas as pd
from tqdm import tqdm
import os
import pandavro as pdx
import numpy as np
import time
from datetime import datetime, date
from warnings import filterwarnings
import json
import multiprocessing as mp
filterwarnings('ignore')

## Get All Team IDs

In [None]:
# Build the cached list of teams once; the team-details cell reads it back from parquet.
if 'all_teams.parquet' not in os.listdir('Static Files'):
    response = commonteamyears.CommonTeamYears()
    print('Established connection')
    payload = response.get_dict()
    print('Retrieved list of dicts')
    # Rows are read by position: index 1 -> team_id, 2 -> year_founded,
    # 3 -> year_depreciated, 4 -> abbreviation (index 0 is not used).
    # NOTE(review): this matches the nba_api CommonTeamYears column order
    # (LEAGUE_ID, TEAM_ID, MIN_YEAR, MAX_YEAR, ABBREVIATION) — confirm against the endpoint docs.
    # No per-row delay: the single request above already returned every row.
    team_rows = [tuple(row[1:5]) for row in tqdm(payload['resultSets'][0]['rowSet'])]
    print('Established list of tuples')
    teams = pd.DataFrame(team_rows, columns=['team_id', 'year_founded', 'year_depreciated', 'abbreviation'])
    print('Created DataFrame')
    teams.to_parquet('Static Files/all_teams.parquet', index=False)

## Get All Team Details

In [None]:
# Fetch the first TeamDetails table for every cached team id and persist the combined frame.
if 'team_details.parquet' not in os.listdir('Static Files'):
    teams = pd.read_parquet('Static Files/all_teams.parquet')
    detail_frames = []
    for current_team_id in tqdm(teams['team_id']):
        details_endpoint = teamdetails.TeamDetails(team_id=current_team_id, timeout=60)
        detail_frames.append(details_endpoint.get_data_frames()[0])
        # Pause between requests.
        time.sleep(0.5)
    output_df = pd.concat(detail_frames)
    output_df.to_parquet('Static Files/team_details.parquet')

## Get All Team Game Logs

In [None]:
# Download team game logs for every season from 2000-01 through 2024-25 and cache them.
if 'team_game_logs.parquet' not in os.listdir('Static Files'):
    # Season labels look like '2000-01', '2009-10', '2024-25'. Deriving them from the
    # starting year keeps 2009-10 in the list (the two-counter version jumped from
    # 2008-09 straight to 2010-11).
    season_labels = [f'{year}-{(year + 1) % 100:02d}' for year in range(2000, 2025)]
    initial_df = []
    for season in tqdm(season_labels):
        get_game_stats = teamgamelogs.TeamGameLogs(season_nullable=season)
        initial_df.append(get_game_stats.get_data_frames()[0])
        # Pause between requests.
        time.sleep(0.5)

    output_df = pd.concat(initial_df)
    output_df.to_parquet('Static Files/team_game_logs.parquet')

## Get All Current Player Injury Reports

In [None]:
# Keep a local copy of the ESPN injury tables, refreshed at most once per calendar day.
injury_report_path = 'Static Files/player_injury_reports.avro'
needs_refresh = True
if 'player_injury_reports.avro' in os.listdir('Static Files'):
    player_injuries = pdx.read_avro(injury_report_path)
    # Compare calendar dates. A full timestamp never equals "now", and
    # Series.all() yields a bool, so the previous check always refreshed.
    # NOTE(review): assumes the Avro round-trip yields values pd.to_datetime can parse;
    # naive values are treated as UTC here before converting to local time — confirm.
    last_updated = pd.to_datetime(player_injuries['date_updated'], utc=True).max()
    needs_refresh = (
        pd.isna(last_updated)
        or last_updated.to_pydatetime().astimezone().date() != datetime.today().date()
    )

if needs_refresh:
    player_injuries = pd.concat(pd.read_html('https://www.espn.com/nba/injuries')).fillna('No Info Available')
    player_injuries['date_updated'] = datetime.today()
    pdx.to_avro(injury_report_path, player_injuries)

## Get All Player Game Logs

In [None]:
# Download player game logs for every season from 2000-01 through 2024-25 and cache them.
if 'player_game_logs.parquet' not in os.listdir('Static Files'):
    # Season labels look like '2000-01', '2009-10', '2024-25'. Deriving them from the
    # starting year keeps 2009-10 in the list (the two-counter version jumped from
    # 2008-09 straight to 2010-11).
    season_labels = [f'{year}-{(year + 1) % 100:02d}' for year in range(2000, 2025)]
    initial_df = []
    for season in tqdm(season_labels):
        get_game_stats = playergamelogs.PlayerGameLogs(season_nullable=season)
        initial_df.append(get_game_stats.get_data_frames()[0])
        # Pause between requests.
        time.sleep(0.5)

    output_df = pd.concat(initial_df)
    output_df.to_parquet('Static Files/player_game_logs.parquet')

## Get All Games Next Season

In [None]:
# Collect the game ids scheduled on each day of the 2024-25 window and cache them.
if '2024-25_game_ids.parquet' not in os.listdir('Static Files'):
    start = datetime.strptime("2024-10-04", "%Y-%m-%d")
    end = datetime.strptime("2025-04-15", "%Y-%m-%d")

    date_range = pd.date_range(start=start, end=end).to_list()

    games = {}
    # The loop variable is deliberately not called `date`: that name is the class
    # imported from `datetime` at the top of the notebook and must not be rebound.
    for game_day in tqdm(date_range):
        scoreboard = json.loads(scoreboardv2.ScoreboardV2(game_date=game_day).get_json())

        # resultSets[0] is the 'GameHeader' table; index 2 of each row is the 'GAME_ID'.
        game_header_rows = scoreboard['resultSets'][0]['rowSet']

        # Days without games get no entry at all.
        if game_header_rows:
            games[game_day.strftime('%Y-%m-%d')] = [row[2] for row in game_header_rows]

        time.sleep(0.5)  # Delay to avoid rate limits

    # One row per date holding a list of ids, then one row per (date, id) pair.
    games_2025 = pd.DataFrame(list(games.items()), columns=['game_date', 'game_id'])
    games_2025 = games_2025.explode(column='game_id')
    games_2025.to_parquet('Static Files/2024-25_game_ids.parquet')

## Test Play by Play Endpoint

In [None]:
# Load the cached game logs; `game_log_df` and `game_log_df2` are read by the
# advanced-stats cells further down.
# NOTE(review): no visible cell writes 'current_player_game_logs.parquet' (the player
# game log cell saves 'player_game_logs.parquet') — confirm this file is produced elsewhere.
game_log_df = pd.read_parquet('Static Files/current_player_game_logs.parquet')
game_log_df2 = pd.read_parquet('Static Files/team_game_logs.parquet')

def simulation(game_id):
    """Return the play-by-play actions of one game as a flat DataFrame.

    Args:
        game_id: Game identifier passed to the live ``PlayByPlay`` endpoint.

    Returns:
        A DataFrame with the columns ``game_id``, ``period``,
        ``period_time_remaining``, ``description``, ``home_score`` and
        ``away_score``, one row per action that carries all of those fields.
        The frame is empty (but keeps its columns) when the game has no
        usable actions.
    """
    print(f"Starting mock live updates for game: {game_id}")
    plays = playbyplay.PlayByPlay(game_id).get_dict()['game']['actions']
    columns = ['game_id', 'period', 'period_time_remaining', 'description', 'home_score', 'away_score']

    if not plays:
        print("no plays yet...")
        return pd.DataFrame(columns=columns)

    records = []
    for play in plays:
        try:
            records.append({
                'game_id': game_id,
                'period': play['period'],
                # 'PT11M58.00S' -> '11:58'
                'period_time_remaining': play['clock'].replace('PT', '').replace('M', ':')[:5],
                'description': play['description'],
                'home_score': play['scoreHome'],
                'away_score': play['scoreAway'],
            })
        except KeyError:
            # Actions missing any of the fields above are left out of the result.
            continue

    # Passing `columns` keeps the schema stable even when every action was skipped.
    return pd.DataFrame(records, columns=columns)


current_game = simulation('0022000196')

## Get Advanced Player Game Stats

In [None]:
# Collect per-player advanced box score stats for every 2023-24 game and cache them.
if 'current_player_advanced_game_stats.parquet' not in os.listdir('Static Files'):
    # Filter games for the 2023-24 season
    games = game_log_df[game_log_df['SEASON_YEAR'] == '2023-24']['GAME_ID'].unique()
    rows = []

    for game_id in tqdm(games):
        try:
            # First data frame of the endpoint: one row per player.
            game = boxscoreadvancedv3.BoxScoreAdvancedV3(game_id).get_data_frames()[0]

            # Extract relevant fields and append rows
            rows.extend([{
                'game_id': game_id,
                'team_id': player_stats.teamId,
                'team_name': f"{player_stats.teamCity} {player_stats.teamName}",
                'player_id': player_stats.personId,
                'player_name': f"{player_stats.firstName} {player_stats.familyName}",
                'position': player_stats.position,
                'comment': player_stats.comment,
                'offensive_rating': player_stats.offensiveRating,
                'defensive_rating': player_stats.defensiveRating,
                'net_rating': player_stats.netRating,
                'assist_percentage': player_stats.assistPercentage,
                'assist_turnover_ratio': player_stats.assistToTurnover,
                'assist_ratio': player_stats.assistRatio,
                'usage_percentage': player_stats.usagePercentage,
                'possessions': player_stats.possessions,
                'effective_fg_percentage': player_stats.effectiveFieldGoalPercentage,
                'true_shooting_percentage': player_stats.trueShootingPercentage,
                'player_impact_score': player_stats.PIE
            } for player_stats in game.itertuples()])
        except Exception as e:
            # Best effort: report the failing game and move on to the next one.
            print(f"Error processing game {game_id}: {e}")
        finally:
            # Pause after every request, including failed ones, so an error
            # does not turn the loop into back-to-back requests.
            time.sleep(0.5)

    # Convert list of rows to DataFrame
    df = pd.DataFrame(rows)

    # Save to Parquet
    df.to_parquet('Static Files/current_player_advanced_game_stats.parquet')


## Get Advanced Team Game Stats

In [None]:
def fetch_team_advanced_stats():
    """Download team-level advanced box score stats for the 2023-24 season.

    Does nothing when 'current_team_advanced_game_stats.parquet' already
    exists in 'Static Files'. Otherwise it requests the advanced box score of
    every 2023-24 game id found in the module-level ``game_log_df2`` and
    writes the collected rows to that parquet file once the loop finishes.

    A game whose request or parsing fails is reported and skipped. The file
    is written only at the end, so an aborted run cannot leave behind a
    partial file that a later run would mistake for a complete cache.
    """
    if 'current_team_advanced_game_stats.parquet' in os.listdir('Static Files'):
        return

    # Filter games for the 2023-24 season
    games = game_log_df2[game_log_df2['SEASON_YEAR'] == '2023-24']['GAME_ID'].unique()
    rows = []

    for game_id in tqdm(games):
        try:
            # Second data frame of the endpoint: one row per team.
            game = boxscoreadvancedv3.BoxScoreAdvancedV3(game_id).get_data_frames()[1]

            # Extract relevant fields and append rows
            rows.extend([{
                'game_id': game_id,
                'team_id': team_stats.teamId,
                'team_name': f"{team_stats.teamCity} {team_stats.teamName}",
                'offensive_rating': team_stats.offensiveRating,
                'defensive_rating': team_stats.defensiveRating,
                'net_rating': team_stats.netRating,
                'assist_percentage': team_stats.assistPercentage,
                'assist_turnover_ratio': team_stats.assistToTurnover,
                'assist_ratio': team_stats.assistRatio,
                'possessions': team_stats.possessions,
                'effective_fg_percentage': team_stats.effectiveFieldGoalPercentage,
                'true_shooting_percentage': team_stats.trueShootingPercentage
            } for team_stats in game.itertuples()])
        except Exception as e:
            # `Exception` rather than a bare except, so Ctrl-C still stops the run.
            print(f"Error processing game {game_id}: {e}")
        finally:
            # Pause after every request, including failed ones.
            time.sleep(1)

    # Convert list of rows to DataFrame
    df = pd.DataFrame(rows)

    # Save to Parquet
    df.to_parquet('Static Files/current_team_advanced_game_stats.parquet')


# Run the function
fetch_team_advanced_stats()


In [None]:
# Count the distinct game dates per (year, month) in the cached 2024-25 schedule.
next_games = pd.read_parquet('Static Files/2024-25_game_ids.parquet').assign(
    game_date=lambda frame: pd.to_datetime(frame['game_date'])
)
game_dates = next_games['game_date']
game_dates.groupby([game_dates.dt.year, game_dates.dt.month]).nunique()

In [None]:
# Show the GAME_ID column of the 2023-24 rows in the cached team game logs.
team_logs = pd.read_parquet('Static Files/team_game_logs.parquet')
games = team_logs.loc[team_logs['SEASON_YEAR'] == '2023-24', 'GAME_ID']
games

In [223]:
import duckdb
# Number the games within each date (ordered by game_id) and keep at most the first five per date.
# The SQL's `FROM game_ids` names the DataFrame variable below, so the variable and the
# table name in the query have to stay in sync.
game_ids = pd.read_parquet('Static Files/2024-25_game_ids.parquet')
duckdb.sql('SELECT game_id, game_date, ROW_NUMBER() OVER (PARTITION BY game_date ORDER BY game_id) AS game_number FROM game_ids QUALIFY game_number <= 5').to_df()


Unnamed: 0,game_id,game_date,game_number
0,0012400050,2024-10-15,1
1,0012400051,2024-10-15,2
2,0012400052,2024-10-15,3
3,0012400053,2024-10-15,4
4,0012400054,2024-10-15,5
...,...,...,...
800,0022401156,2025-04-09,1
801,0022401157,2025-04-09,2
802,0022401158,2025-04-09,3
803,0022401159,2025-04-09,4


In [None]:
from nba_api.live.nba.endpoints import boxscore
from nba_api.stats.endpoints import boxscoreadvancedv3
import json
import pandas as pd
import numpy as np

# Build a per-player stat table for the home team of one game.
target_game_id = '0022301198'

# Fetch boxscore data
boxscore_data = boxscore.BoxScore(game_id=target_game_id).get_json()
game_info = json.loads(boxscore_data)['game']
home_team_info = game_info['homeTeam']
away_team_info = game_info['awayTeam']
player_info = home_team_info['players']

# The advanced box score does not depend on the player, so request it once
# instead of once per loop iteration.
advanced_stats = boxscoreadvancedv3.BoxScoreAdvancedV3(game_id=target_game_id).get_data_frames()[0]

# Initialize list to store player data
players_data = []

# Extract player information and statistics
for player in player_info:
    player_data = {
        'name': player.get('name'),
        'not_playing_reason': player.get('notPlayingDescription'),
        'position': player.get('position'),
        'is_starter': player.get('starter'),
        'on_court': player.get('oncourt')
    }
    # Add every raw statistic under a lower-cased, abbreviated key.
    stats = player['statistics']
    player_data.update({k.lower().replace('percentage', 'pct').replace('calculated', 'calc'): v for k, v in stats.items()})

    # PIE comes from the advanced box score; NaN when the player has no row there.
    player_advanced_stats = advanced_stats[advanced_stats['personId'] == player.get('personId')]
    player_data['pie'] = float(player_advanced_stats['PIE'].values[0]) if not player_advanced_stats.empty else np.nan

    # True Shooting Percentage: points / (2 * (FGA + 0.44 * FTA)), 0 when there were no attempts.
    fga = stats.get('fieldGoalsAttempted', 0)
    fta = stats.get('freeThrowsAttempted', 0)
    shooting_attempts = fga + 0.44 * fta
    player_data['true_shooting_pct'] = (stats.get('points', 0) / (2 * shooting_attempts)) if shooting_attempts != 0 else 0

    # Effective Field Goal Percentage: (FGM + 0.5 * 3PM) / FGA, 0 when there were no attempts.
    fgm = stats.get('fieldGoalsMade', 0)
    tpm = stats.get('threePointersMade', 0)
    player_data['effective_fg_pct'] = ((fgm + 0.5 * tpm) / fga) if fga != 0 else 0

    players_data.append(player_data)

# Create DataFrame from players_data
df_players = pd.DataFrame(players_data)

# Add game information
df_players['home_team'] = f"{home_team_info['teamCity']} {home_team_info['teamName']}"
df_players['away_team'] = f"{away_team_info['teamCity']} {away_team_info['teamName']}"
df_players['venue'] = game_info['arena']['arenaName']
df_players['home_team_score'] = home_team_info['score']
df_players['away_team_score'] = away_team_info['score']
# The home team is labelled the winner only on a strictly higher score.
home_won = home_team_info['score'] > away_team_info['score']
df_players['game_result'] = (df_players['home_team'] if home_won else df_players['away_team']) + " Win"

# Display the DataFrame
df_players

In [None]:
# Count the play-by-play actions recorded for every 2023-24 game.
games = pd.read_parquet('Static Files/team_game_logs.parquet').query('SEASON_YEAR == "2023-24"')['GAME_ID'].unique()
game_plays = {}
for game in tqdm(games):
    plays = playbyplay.PlayByPlay(game).get_json()
    game_plays[game] = len(json.loads(plays)['game']['actions'])
    # Pause between requests.
    time.sleep(0.5)
# `game_plays` maps game id -> count. pd.DataFrame() rejects a dict of plain scalars
# without an index, so go through a Series keyed by game id instead.
pd.Series(game_plays, name='play_count').rename_axis('game_id').to_frame()

In [None]:
# Summary statistics of the per-game play counts gathered in `game_plays`.
play_counts = pd.DataFrame.from_dict(game_plays, orient='index')
play_counts = play_counts.reset_index()
play_counts.describe()

In [4]:
from nba_api.live.nba.endpoints import playbyplay
import json

# Fetch the raw play-by-play JSON for one game and display its list of actions.
play_by_play_endpoint = playbyplay.PlayByPlay('0022301198')
plays = play_by_play_endpoint.get_json()
actions = json.loads(plays)['game']['actions']
actions

[{'actionNumber': 2,
  'clock': 'PT12M00.00S',
  'timeActual': '2024-04-14T19:41:19.2Z',
  'period': 1,
  'periodType': 'REGULAR',
  'actionType': 'period',
  'subType': 'start',
  'qualifiers': [],
  'personId': 0,
  'x': None,
  'y': None,
  'possession': 0,
  'scoreHome': '0',
  'scoreAway': '0',
  'edited': '2024-04-14T19:41:19Z',
  'orderNumber': 20000,
  'isTargetScoreLastPeriod': False,
  'xLegacy': None,
  'yLegacy': None,
  'isFieldGoal': 0,
  'side': None,
  'description': 'Period Start',
  'personIdsFilter': []},
 {'actionNumber': 4,
  'clock': 'PT11M58.00S',
  'timeActual': '2024-04-14T19:41:20.9Z',
  'period': 1,
  'periodType': 'REGULAR',
  'teamId': 1610612762,
  'teamTricode': 'UTA',
  'actionType': 'jumpball',
  'subType': 'recovered',
  'descriptor': 'startperiod',
  'qualifiers': [],
  'personId': 1630548,
  'x': None,
  'y': None,
  'possession': 1610612762,
  'scoreHome': '0',
  'scoreAway': '0',
  'edited': '2024-04-14T19:41:20Z',
  'orderNumber': 40000,
  'isTarg