In [2]:
import pandas as pd
import time
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import os
import yaml
from nba_api.stats.endpoints import teamgamelog, boxscoretraditionalv3, boxscoresummaryv2, leaguegamefinder, \
    BoxScoreTraditionalV3
from nba_api.stats.static import teams, players

In [3]:
# Load the project settings from config.yaml (looked up in the working directory).
with open("config.yaml", "r") as f:
    # An empty YAML file parses to None; fall back to an empty mapping so the
    # lookups below report the missing key instead of failing on None.get.
    config = yaml.safe_load(f) or {}

# DATA_PATH is joined with "get_data/..." by the later cells to read and write CSVs.
DATA_PATH = config.get("data_path")
if DATA_PATH is None:
    # Stop here with a clear message: continuing would only fail later inside
    # os.path.join with an unrelated TypeError.
    raise ValueError("ERROR: No data path provided")

# Optional flag; when true, a later cell mounts Google Drive (Colab only).
USE_DRIVE = bool(config.get("use_drive", False))

print(os.path.join(DATA_PATH, "get_data/"))

C:/Users/trist/OneDrive/Dokumente/UZH/Semester_6/Intro_Data_Science/Intro_Data_Science_Project/data\get_data/


In [4]:
# Mount Google Drive at /content/drive when USE_DRIVE is set.
# The google.colab import stays inside the branch, so it is only attempted
# when the flag is true.
if USE_DRIVE:
    from google.colab import drive
    drive.mount('/content/drive')

# Get Data

In [10]:
# Season labels in 'YYYY-YY' form, from 1996-97 through 2024-25.
seasons = []
for year in range(1996, 2025):
    # The suffix is the following year's last two digits, zero-padded (1999 -> "1999-00").
    seasons.append(f"{year}-{(year+1) % 100:02d}")

Found 1189 game IDs for the 1996-97 season and a total of 2378 entries.
Found 1189 game IDs for the 1997-98 season and a total of 2378 entries.
Found 725 game IDs for the 1998-99 season and a total of 1450 entries.
Found 1189 game IDs for the 1999-00 season and a total of 2378 entries.
Found 1189 game IDs for the 2000-01 season and a total of 2378 entries.
Found 1189 game IDs for the 2001-02 season and a total of 2378 entries.
Found 1189 game IDs for the 2002-03 season and a total of 2378 entries.
Found 1189 game IDs for the 2003-04 season and a total of 2378 entries.
Found 1230 game IDs for the 2004-05 season and a total of 2460 entries.
Found 1230 game IDs for the 2005-06 season and a total of 2460 entries.
Found 1230 game IDs for the 2006-07 season and a total of 2460 entries.
Found 1230 game IDs for the 2007-08 season and a total of 2460 entries.
Found 1230 game IDs for the 2008-09 season and a total of 2460 entries.
Found 1230 game IDs for the 2009-10 season and a total of 2460 en

In [22]:
def get_season_game_ids(season):
    """
    Look up the IDs of all regular-season NBA games for one season.

    Args:
        season (str): Season label in 'YYYY-YY' form (e.g., '2023-24')

    Returns:
        numpy.ndarray: The distinct GAME_ID values found for that season
    """
    query = {
        'season_nullable': season,
        'season_type_nullable': 'Regular Season',
        'league_id_nullable': '00',  # NBA league ID
    }

    # Only the first result frame is used; duplicate GAME_ID values in it are
    # collapsed by unique().
    season_games = leaguegamefinder.LeagueGameFinder(**query).get_data_frames()[0]
    unique_ids = season_games['GAME_ID'].unique()

    print(f"Found {len(unique_ids)} unique regular season game IDs for {season}")
    return unique_ids

In [19]:
def _opponent_fields(opponent):
    """Return the opponent* identity columns of a row, in output column order."""
    return {
        'opponentTeamId': int(opponent['teamId']),
        'opponentTeamCity': opponent['teamCity'],
        'opponentTeamName': opponent['teamName'],
        'opponentTeamTricode': opponent['teamTricode'],
        'opponentTeamSlug': opponent['teamSlug'],
    }


def _build_team_row(game_id, team, opponent, is_home, game_date):
    """Build one team-level row: identity columns followed by the team's statistics."""
    row = {
        'gameId': int(game_id),
        'teamId': int(team['teamId']),
        'teamCity': team['teamCity'],
        'teamName': team['teamName'],
        'teamTricode': team['teamTricode'],
        'teamSlug': team['teamSlug'],
        **_opponent_fields(opponent),
        'home': is_home,
        'gameDate': game_date,
    }
    # Statistics keys become columns after the identity columns.
    row.update(team['statistics'])
    return row


def _build_player_rows(game_id, team, opponent, is_home, game_date, won):
    """Build one row per player of `team`: identity columns followed by the player's statistics."""
    rows = []
    for player in team['players']:
        # Note the team column order here (name, tricode, city, slug) differs
        # from the team-level rows; it is kept as-is so the CSV layout is stable.
        row = {
            'gameId': int(game_id),
            'teamId': int(team['teamId']),
            'teamName': team['teamName'],
            'teamTricode': team['teamTricode'],
            'teamCity': team['teamCity'],
            'teamSlug': team['teamSlug'],
            **_opponent_fields(opponent),
            'personId': int(player['personId']),
            'firstName': player['firstName'],
            'familyName': player['familyName'],
            'nameI': player['nameI'],
            'playerSlug': player['playerSlug'],
            'position': player['position'],
            'comment': player['comment'],
            'jerseyNum': player['jerseyNum'],
            'home': is_home,
            'gameDate': game_date,
            'win': won,
        }
        row.update(player['statistics'])
        rows.append(row)
    return rows


def get_game_boxscore_v2(game_id, max_retries=5, delay=2):
    """
    Get boxscore statistics for a specific game ID using dictionary access.

    Two endpoints are queried per attempt: BoxScoreTraditionalV3 for the team
    and player statistics, and BoxScoreSummaryV2 for the game date
    (GAME_DATE_EST of the first header row). Any exception raised while
    fetching or assembling the data counts as a failed attempt; the wait
    between attempts starts at `delay` seconds and doubles after each failure.

    Args:
        game_id (str): NBA game ID. Stored as int in the output, so leading
            zeros are dropped.
        max_retries (int): Maximum number of attempts
        delay (float): Initial delay between retries in seconds

    Returns:
        tuple: (team_stats, player_stats) DataFrames. team_stats has two rows
        (home first, then away) with the added columns 'opponentPoints' and
        'win'; player_stats has one row per listed player. Returns
        (None, None) if every attempt failed.
    """
    per_call_delay = 0.5
    for attempt in range(max_retries):
        try:
            # Pause before every attempt to space out the API calls.
            time.sleep(per_call_delay)
            boxscore = boxscoretraditionalv3.BoxScoreTraditionalV3(game_id=game_id)
            box_dict = boxscore.get_dict()['boxScoreTraditional']

            # Get game date from summary endpoint
            summary = boxscoresummaryv2.BoxScoreSummaryV2(game_id=game_id)
            header_df = summary.get_data_frames()[0]
            game_date = header_df['GAME_DATE_EST'].iloc[0]

            home_team = box_dict['homeTeam']
            away_team = box_dict['awayTeam']
            home_points = home_team['statistics']['points']
            away_points = away_team['statistics']['points']

            # ----- TEAM STATISTICS -----
            team_stats = pd.DataFrame([
                _build_team_row(game_id, home_team, away_team, 1, game_date),
                _build_team_row(game_id, away_team, home_team, 0, game_date),
            ])

            # Add opponent points (assigned per row through .loc, as before,
            # so the resulting column keeps the same dtype).
            team_stats.loc[team_stats['home'] == 1, 'opponentPoints'] = away_points
            team_stats.loc[team_stats['home'] == 0, 'opponentPoints'] = home_points

            # Add win column (1 if team won, 0 if lost)
            team_stats['win'] = (team_stats['points'] > team_stats['opponentPoints']).astype(int)

            # ----- PLAYER STATISTICS -----
            home_won = 1 if home_points > away_points else 0
            away_won = 1 if away_points > home_points else 0
            player_rows = (
                _build_player_rows(game_id, home_team, away_team, 1, game_date, home_won)
                + _build_player_rows(game_id, away_team, home_team, 0, game_date, away_won)
            )
            player_stats = pd.DataFrame(player_rows)

            return team_stats, player_stats

        except Exception as e:
            # Deliberately broad: any failure in this attempt triggers a retry.
            print(f"Error on attempt {attempt+1}: {e}")
            # Back off only when another attempt follows; sleeping after the
            # final failure would just delay the caller.
            if attempt < max_retries - 1:
                time.sleep(delay)
                delay *= 2

    print(f"Failed to retrieve boxscore for game {game_id} after {max_retries} attempts")
    return None, None

In [20]:
# Fetch the box score of a single game (presumably a smoke test of
# get_game_boxscore_v2 before the full download loop).
game_id = "0029600370"
team_stats, player_stats = get_game_boxscore_v2(game_id)

In [None]:
# Seasons to download in this run: 2009-10 through 2024-25.
seasons = [f"{year}-{(year+1) % 100:02d}" for year in range(2009, 2025)]

# Make sure the output folder exists before any downloading starts; otherwise
# to_csv would fail only after a whole season had already been fetched.
os.makedirs(os.path.join(DATA_PATH, "get_data"), exist_ok=True)

for season in seasons:
    game_ids = get_season_game_ids(season)
    team_frames = []
    player_frames = []
    failed_game_ids = []  # games for which no box score could be retrieved

    for game_id in game_ids:
        team_stats, player_stats = get_game_boxscore_v2(game_id)
        # get_game_boxscore_v2 returns (None, None) once its retries are exhausted.
        if team_stats is None and player_stats is None:
            failed_game_ids.append(game_id)
        if team_stats is not None:
            team_frames.append(team_stats)
        if player_stats is not None:
            player_frames.append(player_stats)

    # One CSV per season and level; a season with no retrieved games writes nothing.
    if team_frames:
        team_df = pd.concat(team_frames, ignore_index=True)
        team_df.to_csv(os.path.join(DATA_PATH, f"get_data/game_logs_teams_{season}.csv"), index=False)
        print(f"Team data for season {season} saved.")
    if player_frames:
        player_df = pd.concat(player_frames, ignore_index=True)
        player_df.to_csv(os.path.join(DATA_PATH, f"get_data/game_logs_players_{season}.csv"), index=False)
        print(f"Player data for season {season} saved.")

    # Summarise the gaps so an incomplete season file is not mistaken for a full one.
    if failed_game_ids:
        print(f"WARNING: {len(failed_game_ids)} game(s) missing from season {season}: {list(failed_game_ids)}")

Found 1189 unique regular season game IDs for 1996-97
Team data for season 1996-97 saved.
Player data for season 1996-97 saved.
Found 1189 unique regular season game IDs for 1997-98
Team data for season 1997-98 saved.
Player data for season 1997-98 saved.
Found 725 unique regular season game IDs for 1998-99
Team data for season 1998-99 saved.
Player data for season 1998-99 saved.
Found 1189 unique regular season game IDs for 1999-00
Team data for season 1999-00 saved.
Player data for season 1999-00 saved.
Found 1189 unique regular season game IDs for 2000-01
Team data for season 2000-01 saved.
Player data for season 2000-01 saved.
Found 1189 unique regular season game IDs for 2001-02
Team data for season 2001-02 saved.
Player data for season 2001-02 saved.
Found 1189 unique regular season game IDs for 2002-03
Team data for season 2002-03 saved.
Player data for season 2002-03 saved.
Found 1189 unique regular season game IDs for 2003-04
Team data for season 2003-04 saved.
Player data for

# Transform Team Data

In [7]:
# Read the 1996-97 team game log as a sample for inspecting the columns.
# (Plain string: the path has no placeholders, so no f-string is needed.)
season_df = pd.read_csv(os.path.join(DATA_PATH, "get_data/game_logs_teams_1996-97.csv"))

In [8]:
# Show the column names of season_df to see which ones need renaming
print(season_df.columns)

Index(['gameId', 'teamId', 'teamCity', 'teamName', 'teamTricode', 'teamSlug',
       'opponentTeamId', 'opponentTeamCity', 'opponentTeamName',
       'opponentTeamTricode', 'opponentTeamSlug', 'home', 'gameDate',
       'minutes', 'fieldGoalsMade', 'fieldGoalsAttempted',
       'fieldGoalsPercentage', 'threePointersMade', 'threePointersAttempted',
       'threePointersPercentage', 'freeThrowsMade', 'freeThrowsAttempted',
       'freeThrowsPercentage', 'reboundsOffensive', 'reboundsDefensive',
       'reboundsTotal', 'assists', 'steals', 'blocks', 'turnovers',
       'foulsPersonal', 'points', 'plusMinusPoints', 'opponentPoints', 'win'],
      dtype='object')


In [9]:
# Old team-log column name -> new column name, used as the rename mapping.
column_mapping = dict(
    points='teamScore',
    opponentPoints='opponentScore',
    minutes='numMinutes',
)

In [12]:
# Load every team game log from 1996-97 through 2008-09 and rename its
# columns according to column_mapping.
seasons = [f"{year}-{(year+1) % 100:02d}" for year in range(1996, 2009)]
season_dfs = []
for season in seasons:
    # rename() returns a renamed frame, so the freshly read frame is never mutated in place
    season_df = pd.read_csv(
        os.path.join(DATA_PATH, f"get_data/game_logs_teams_{season}.csv")
    ).rename(columns=column_mapping)
    season_dfs.append(season_df)

In [13]:
# Concatenate all seasons into one CSV: stack the per-season frames collected
# in season_dfs under a fresh 0..n-1 index, then write the result without
# the index column.
all_team_df = pd.concat(season_dfs, ignore_index=True)
all_team_df.to_csv(os.path.join(DATA_PATH, "get_data/game_logs_teams_all.csv"), index=False)

# Transform Player Data

In [15]:
# Read the 1996-97 player game log as a sample for inspecting the columns.
# (Plain string: the path has no placeholders, so no f-string is needed.)
season_df = pd.read_csv(os.path.join(DATA_PATH, "get_data/game_logs_players_1996-97.csv"))

In [16]:
# Show the column names of season_df to see which ones need renaming
print(season_df.columns)

Index(['gameId', 'teamId', 'teamName', 'teamTricode', 'teamCity', 'teamSlug',
       'opponentTeamId', 'opponentTeamCity', 'opponentTeamName',
       'opponentTeamTricode', 'opponentTeamSlug', 'personId', 'firstName',
       'familyName', 'nameI', 'playerSlug', 'position', 'comment', 'jerseyNum',
       'home', 'gameDate', 'win', 'minutes', 'fieldGoalsMade',
       'fieldGoalsAttempted', 'fieldGoalsPercentage', 'threePointersMade',
       'threePointersAttempted', 'threePointersPercentage', 'freeThrowsMade',
       'freeThrowsAttempted', 'freeThrowsPercentage', 'reboundsOffensive',
       'reboundsDefensive', 'reboundsTotal', 'assists', 'steals', 'blocks',
       'turnovers', 'foulsPersonal', 'points', 'plusMinusPoints'],
      dtype='object')


In [18]:
# Old player-log column name -> new column name, used as the rename mapping.
player_column_mapping = dict(
    familyName='lastName',
    teamCity='playerteamCity',
    teamName='playerteamName',
    teamTricode='playerteamTricode',
    teamSlug='playerteamSlug',
    minutes='numMinutes',
)

In [19]:
# Load every player game log from 1996-97 through 2008-09, rename its columns
# according to player_column_mapping and append three constant columns.
# NOTE(review): the saved output of this cell echoes the read_csv line, which
# looks like a pandas warning raised while parsing - confirm whether explicit
# dtypes are needed for these files.
seasons = [f"{year}-{(year+1) % 100:02d}" for year in range(1996, 2009)]
player_dfs = []
for season in seasons:
    player_df = (
        pd.read_csv(os.path.join(DATA_PATH, f"get_data/game_logs_players_{season}.csv"))
        .rename(columns=player_column_mapping)
        # Add missing columns: constant game type, empty label and sub-label
        .assign(gameType='Regular Season', gameLabel='', gameSubLabel='')
    )
    player_dfs.append(player_df)

  player_df = pd.read_csv(os.path.join(DATA_PATH, f"get_data/game_logs_players_{season}.csv"))


In [20]:
# Concatenate all seasons into one CSV: stack the per-season frames collected
# in player_dfs under a fresh 0..n-1 index, then write the result without
# the index column.
all_player_df = pd.concat(player_dfs, ignore_index=True)
all_player_df.to_csv(os.path.join(DATA_PATH, "get_data/game_logs_players_all.csv"), index=False)