In [1]:
# Sanity check: show which Python interpreter is backing this kernel.
import sys

interpreter_path = sys.executable
print(interpreter_path)


/Users/vlachaki/Desktop/Ironhack_projects/final_project/transfer-fit-nba/.venv/bin/python


In [2]:
# Smoke test: the import raises if nba_api is not installed in this kernel's environment.
import nba_api

import_status = "nba_api ok"
print(import_status)




nba_api ok


## Ingesting teams & players

In [3]:
import pandas as pd
from nba_api.stats.static import teams, players

# Map nba_api's static field names onto the project's canonical column names.
# The dict order doubles as the output column order.
team_column_map = {
    "id": "team_id",
    "full_name": "team_name",
    "abbreviation": "team_abbr",
}
player_column_map = {
    "id": "player_id",
    "full_name": "player_name",
}

teams_df = (
    pd.DataFrame(teams.get_teams())
    .rename(columns=team_column_map)
    .loc[:, list(team_column_map.values())]
)

players_df = (
    pd.DataFrame(players.get_players())
    .rename(columns=player_column_map)
    .loc[:, list(player_column_map.values())]
)

# Preview both tables and report their row counts.
teams_df.head(), players_df.head(), (len(teams_df), len(players_df))



(      team_id             team_name team_abbr
 0  1610612737         Atlanta Hawks       ATL
 1  1610612738        Boston Celtics       BOS
 2  1610612739   Cleveland Cavaliers       CLE
 3  1610612740  New Orleans Pelicans       NOP
 4  1610612741         Chicago Bulls       CHI,
    player_id          player_name
 0      76001       Alaa Abdelnaby
 1      76002      Zaid Abdul-Aziz
 2      76003  Kareem Abdul-Jabbar
 3         51   Mahmoud Abdul-Rauf
 4       1505    Tariq Abdul-Wahad,
 (30, 5103))

In [4]:
# Persist the two lookup tables under data/raw/ (relative to the current working directory).
import os

os.makedirs("data/raw", exist_ok=True)

for lookup_frame, csv_path in (
    (teams_df, "data/raw/teams.csv"),
    (players_df, "data/raw/players.csv"),
):
    lookup_frame.to_csv(csv_path, index=False)

print("wrote:", "data/raw/teams.csv", "and", "data/raw/players.csv")


wrote: data/raw/teams.csv and data/raw/players.csv


In [5]:
# Show up to ten entries of data/raw to confirm the CSVs were written.
import os

raw_dir_entries = os.listdir("data/raw")
raw_dir_entries[:10]


['teams.csv',
 'team_game_fact_2023_24_regular.csv',
 'pbp',
 'players.csv',
 'pbp_event_sample_0022300061.csv',
 'player_game_fact_2023_24_regular.csv']

## Ingesting Games + box score (currently only season 2023-2024)

In [6]:
# We use only one season for now (2023-24) to check that the ingestion works well.

In [7]:
import pandas as pd
from nba_api.stats.endpoints import leaguegamelog

# Pull the team-level game log for one regular season.
# Each game appears once per participating team.
season = "2023-24"

lg = leaguegamelog.LeagueGameLog(
    season=season,
    season_type_all_star="Regular Season",
)
team_log_frames = lg.get_data_frames()
team_games_raw = team_log_frames[0]

# Inspect the schema, the first rows and the overall size.
team_games_raw.columns, team_games_raw.head(3), team_games_raw.shape


(Index(['SEASON_ID', 'TEAM_ID', 'TEAM_ABBREVIATION', 'TEAM_NAME', 'GAME_ID',
        'GAME_DATE', 'MATCHUP', 'WL', 'MIN', 'FGM', 'FGA', 'FG_PCT', 'FG3M',
        'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB', 'REB', 'AST',
        'STL', 'BLK', 'TOV', 'PF', 'PTS', 'PLUS_MINUS', 'VIDEO_AVAILABLE'],
       dtype='object'),
   SEASON_ID     TEAM_ID TEAM_ABBREVIATION              TEAM_NAME     GAME_ID  \
 0     22023  1610612743               DEN         Denver Nuggets  0022300061   
 1     22023  1610612744               GSW  Golden State Warriors  0022300062   
 2     22023  1610612747               LAL     Los Angeles Lakers  0022300061   
 
     GAME_DATE      MATCHUP WL  MIN  FGM  ...  DREB  REB  AST  STL  BLK  TOV  \
 0  2023-10-24  DEN vs. LAL  W  240   48  ...    33   42   29    9    6   12   
 1  2023-10-24  GSW vs. PHX  L  240   36  ...    31   49   19   11    6   11   
 2  2023-10-24    LAL @ DEN  L  240   41  ...    31   44   23    5    4   12   
 
    PF  PTS  PLUS

In [8]:
# transform it into our canonical columns and add opponent_team_id + home_away

In [9]:
import pandas as pd

season = "2023-24"
league = "NBA"

# Raw API headers whose canonical name is simply the lower-cased header.
lowercased_headers = [
    "TEAM_ID", "TEAM_NAME", "GAME_ID", "GAME_DATE", "MATCHUP", "WL",
    "FGM", "FGA", "FG_PCT", "FG3M", "FG3A", "FG3_PCT", "FTM", "FTA", "FT_PCT",
    "OREB", "DREB", "REB", "AST", "STL", "BLK", "TOV", "PF", "PTS",
    "PLUS_MINUS", "VIDEO_AVAILABLE",
]
canonical_names = {header: header.lower() for header in lowercased_headers}
# Headers that get a project-specific name instead.
canonical_names.update({
    "SEASON_ID": "season_id_raw",
    "TEAM_ABBREVIATION": "team_abbr",
    "MIN": "minutes",
})

# Lookup from team abbreviation to team_id, used to resolve the opponent.
abbr_to_id = {
    abbr: team_id
    for abbr, team_id in zip(teams_df["team_abbr"], teams_df["team_id"])
}

# Matchup strings look like "DEN vs. LAL" (home) or "LAL @ DEN" (away);
# the last token is always the opponent's abbreviation.
df = (
    team_games_raw
    .rename(columns=canonical_names)
    .assign(
        league=league,
        season=season,
        game_date=lambda frame: pd.to_datetime(frame["game_date"]),
        home_away=lambda frame: frame["matchup"].map(
            lambda text: "H" if "vs." in text else "A"
        ),
        opp_abbr=lambda frame: frame["matchup"].str.split().str[-1],
        opponent_team_id=lambda frame: frame["opp_abbr"].map(abbr_to_id),
    )
)

# Canonical column order of the team-level fact table.
team_fact_columns = [
    "league", "season", "season_id_raw",
    "game_id", "game_date",
    "team_id", "team_abbr", "team_name",
    "opponent_team_id", "home_away", "matchup", "wl",
    "pts", "minutes",
    "fgm", "fga", "fg_pct",
    "fg3m", "fg3a", "fg3_pct",
    "ftm", "fta", "ft_pct",
    "oreb", "dreb", "reb",
    "ast", "tov", "stl", "blk", "pf",
    "plus_minus",
    "video_available",
]
team_game_fact = df.loc[:, team_fact_columns].copy()

# Preview, plus the count of opponents that could not be resolved to a team_id.
team_game_fact.head(), team_game_fact["opponent_team_id"].isna().sum()


(  league   season season_id_raw     game_id  game_date     team_id team_abbr  \
 0    NBA  2023-24         22023  0022300061 2023-10-24  1610612743       DEN   
 1    NBA  2023-24         22023  0022300062 2023-10-24  1610612744       GSW   
 2    NBA  2023-24         22023  0022300061 2023-10-24  1610612747       LAL   
 3    NBA  2023-24         22023  0022300062 2023-10-24  1610612756       PHX   
 4    NBA  2023-24         22023  0022300071 2023-10-25  1610612740       NOP   
 
                team_name  opponent_team_id home_away  ... oreb dreb  reb  ast  \
 0         Denver Nuggets        1610612747         H  ...    9   33   42   29   
 1  Golden State Warriors        1610612756         H  ...   18   31   49   19   
 2     Los Angeles Lakers        1610612743         A  ...   13   31   44   23   
 3           Phoenix Suns        1610612744         A  ...   17   43   60   23   
 4   New Orleans Pelicans        1610612763         A  ...   11   41   52   22   
 
    tov  stl  blk 

In [10]:
# Save team_game_fact to disk. The file name is derived from `season` so that
# re-running the notebook for another season cannot overwrite this season's
# file under a misleading name ("2023-24" -> "2023_24").
season_tag = season.replace("-", "_")
team_game_fact.to_csv(f"data/raw/team_game_fact_{season_tag}_regular.csv", index=False)


In [11]:
# Show up to ten entries of data/raw to confirm the team fact table was written.
import os

raw_dir_entries = os.listdir("data/raw")
raw_dir_entries[:10]

['teams.csv',
 'team_game_fact_2023_24_regular.csv',
 'pbp',
 'players.csv',
 'pbp_event_sample_0022300061.csv',
 'player_game_fact_2023_24_regular.csv']

## Ingesting player game logs for 2023-24 regular season

In [12]:
from nba_api.stats.endpoints import leaguegamelog
import pandas as pd

# Same endpoint as the team log, switched to player granularity via "P".
season = "2023-24"

player_log_params = {
    "season": season,
    "season_type_all_star": "Regular Season",
    "player_or_team_abbreviation": "P",
}
lg_players = leaguegamelog.LeagueGameLog(**player_log_params)
player_log_frames = lg_players.get_data_frames()
player_games_raw = player_log_frames[0]

# Inspect the schema, the first rows and the overall size.
player_games_raw.columns, player_games_raw.head(3), player_games_raw.shape


(Index(['SEASON_ID', 'PLAYER_ID', 'PLAYER_NAME', 'TEAM_ID', 'TEAM_ABBREVIATION',
        'TEAM_NAME', 'GAME_ID', 'GAME_DATE', 'MATCHUP', 'WL', 'MIN', 'FGM',
        'FGA', 'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT',
        'OREB', 'DREB', 'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS',
        'PLUS_MINUS', 'FANTASY_PTS', 'VIDEO_AVAILABLE'],
       dtype='object'),
   SEASON_ID  PLAYER_ID     PLAYER_NAME     TEAM_ID TEAM_ABBREVIATION  \
 0     22023     202704  Reggie Jackson  1610612743               DEN   
 1     22023     203932    Aaron Gordon  1610612743               DEN   
 2     22023     203999    Nikola Jokić  1610612743               DEN   
 
         TEAM_NAME     GAME_ID   GAME_DATE      MATCHUP WL  ...  REB  AST  STL  \
 0  Denver Nuggets  0022300061  2023-10-24  DEN vs. LAL  W  ...    3    1    1   
 1  Denver Nuggets  0022300061  2023-10-24  DEN vs. LAL  W  ...    7    5    2   
 2  Denver Nuggets  0022300061  2023-10-24  DEN vs. LAL  W  ...   13   11 

In [13]:
import pandas as pd

season = "2023-24"
league = "NBA"

# Raw API headers whose canonical name is simply the lower-cased header.
lowercased_headers = [
    "PLAYER_ID", "PLAYER_NAME",
    "TEAM_ID", "TEAM_NAME", "GAME_ID", "GAME_DATE", "MATCHUP", "WL",
    "FGM", "FGA", "FG_PCT", "FG3M", "FG3A", "FG3_PCT", "FTM", "FTA", "FT_PCT",
    "OREB", "DREB", "REB", "AST", "STL", "BLK", "TOV", "PF", "PTS",
    "PLUS_MINUS", "FANTASY_PTS", "VIDEO_AVAILABLE",
]
canonical_names = {header: header.lower() for header in lowercased_headers}
# Headers that get a project-specific name instead.
canonical_names.update({
    "SEASON_ID": "season_id_raw",
    "TEAM_ABBREVIATION": "team_abbr",
    "MIN": "minutes",
})

# Lookup from team abbreviation to team_id, used to resolve the opponent.
abbr_to_id = {
    abbr: team_id
    for abbr, team_id in zip(teams_df["team_abbr"], teams_df["team_id"])
}

# Matchup strings look like "DEN vs. LAL" (home) or "LAL @ DEN" (away);
# the last token is always the opponent's abbreviation.
df = (
    player_games_raw
    .rename(columns=canonical_names)
    .assign(
        league=league,
        season=season,
        game_date=lambda frame: pd.to_datetime(frame["game_date"]),
        home_away=lambda frame: frame["matchup"].map(
            lambda text: "H" if "vs." in text else "A"
        ),
        opp_abbr=lambda frame: frame["matchup"].str.split().str[-1],
        opponent_team_id=lambda frame: frame["opp_abbr"].map(abbr_to_id),
        # Position is not part of this endpoint; placeholder to be filled
        # later from roster/season info.
        position=None,
    )
)

# Canonical column order of the player-level fact table.
player_fact_columns = [
    "league", "season", "season_id_raw",
    "game_id", "game_date",
    "player_id", "player_name",
    "team_id", "team_abbr", "team_name",
    "opponent_team_id", "home_away", "matchup", "wl",
    "position",
    "pts", "minutes",
    "fgm", "fga", "fg_pct",
    "fg3m", "fg3a", "fg3_pct",
    "ftm", "fta", "ft_pct",
    "oreb", "dreb", "reb",
    "ast", "tov", "stl", "blk", "pf",
    "plus_minus", "fantasy_pts",
    "video_available",
]
player_game_fact = df.loc[:, player_fact_columns].copy()

# Preview, plus the count of opponents that could not be resolved to a team_id.
player_game_fact.head(), player_game_fact["opponent_team_id"].isna().sum()


(  league   season season_id_raw     game_id  game_date  player_id  \
 0    NBA  2023-24         22023  0022300061 2023-10-24     202704   
 1    NBA  2023-24         22023  0022300061 2023-10-24     203932   
 2    NBA  2023-24         22023  0022300061 2023-10-24     203999   
 3    NBA  2023-24         22023  0022300061 2023-10-24    1629008   
 4    NBA  2023-24         22023  0022300061 2023-10-24    1629618   
 
           player_name     team_id team_abbr       team_name  ...  dreb reb  \
 0      Reggie Jackson  1610612743       DEN  Denver Nuggets  ...     3   3   
 1        Aaron Gordon  1610612743       DEN  Denver Nuggets  ...     5   7   
 2        Nikola Jokić  1610612743       DEN  Denver Nuggets  ...    10  13   
 3  Michael Porter Jr.  1610612743       DEN  Denver Nuggets  ...    10  12   
 4       Jalen Pickett  1610612743       DEN  Denver Nuggets  ...     0   0   
 
   ast tov stl  blk  pf  plus_minus  fantasy_pts  video_available  
 0   1   2   1    0   0          1

In [14]:
# Save player_game_fact to disk. The file name is derived from `season` so that
# re-running the notebook for another season cannot overwrite this season's
# file under a misleading name ("2023-24" -> "2023_24").
season_tag = season.replace("-", "_")
player_game_fact.to_csv(f"data/raw/player_game_fact_{season_tag}_regular.csv", index=False)


In [15]:
# Show up to ten entries of data/raw to confirm the player fact table was written.
import os

raw_dir_entries = os.listdir("data/raw")
raw_dir_entries[:10]

['teams.csv',
 'team_game_fact_2023_24_regular.csv',
 'pbp',
 'players.csv',
 'pbp_event_sample_0022300061.csv',
 'player_game_fact_2023_24_regular.csv']

## Ingesting Play-by-Play stats (single-game smoke pull first, then generalize)

What we’ll use:
- nba_api.stats.endpoints.playbyplayv3 (newer, structured). If it fails, we fall back to playbyplay.

In [16]:
# Use the game in the first row of team_game_fact as the play-by-play sample.
game_id_column = team_game_fact["game_id"]
sample_game_id = game_id_column.iloc[0]
sample_game_id


'0022300061'

In [17]:
from nba_api.stats.endpoints import playbyplayv3
import pandas as pd

# Same game as the sample ID displayed by the previous cell.
game_id = "0022300061"

# Fetch the play-by-play for that single game; the first frame holds the events.
pbp = playbyplayv3.PlayByPlayV3(game_id=game_id)
pbp_frames = pbp.get_data_frames()
pbp_raw = pbp_frames[0]

# Inspect the schema, the first rows and the overall size.
pbp_raw.columns, pbp_raw.head(5), pbp_raw.shape


(Index(['gameId', 'actionNumber', 'clock', 'period', 'teamId', 'teamTricode',
        'personId', 'playerName', 'playerNameI', 'xLegacy', 'yLegacy',
        'shotDistance', 'shotResult', 'isFieldGoal', 'scoreHome', 'scoreAway',
        'pointsTotal', 'location', 'description', 'actionType', 'subType',
        'videoAvailable', 'shotValue', 'actionId'],
       dtype='object'),
        gameId  actionNumber        clock  period      teamId teamTricode  \
 0  0022300061             2  PT12M00.00S       1           0               
 1  0022300061             4  PT12M00.00S       1  1610612743         DEN   
 2  0022300061             7  PT11M42.00S       1  1610612747         LAL   
 3  0022300061            10  PT11M15.00S       1  1610612743         DEN   
 4  0022300061            11  PT10M57.00S       1  1610612747         LAL   
 
    personId playerName playerNameI  xLegacy  ...  scoreHome  scoreAway  \
 0         0                               0  ...          0          0   
 1    2

In [18]:
# we now create pbp_event for that game

In [19]:
import pandas as pd

league = "NBA"
season = "2023-24"
game_id = "0022300061"

df = pbp_raw.copy()

# shotValue is the worth of the attempt, so copying it straight into `points`
# would credit missed shots too. Only count it when the attempt went in.
# NOTE(review): assumes shotResult uses the literal "Made" for scored attempts;
# confirm with pbp_raw["shotResult"].unique().
shot_was_made = df["shotResult"].eq("Made")
points_scored = df["shotValue"].fillna(0).astype(int).where(shot_was_made, 0)

# Canonical per-event table for this one game.
pbp_event = pd.DataFrame({
    "league": league,
    "season": season,
    "game_id": df["gameId"].astype(str),
    "event_id": df["actionNumber"].astype(int),
    "period": df["period"].astype(int),
    "clock": df["clock"],                      # keep raw clock string for now
    "team_id": df["teamId"],                   # 0 when the event has no team
    "player_id": df["personId"],               # 0 when the event has no player
    "player_name": df["playerName"],           # display/debug
    "event_type": df["actionType"],
    "event_subtype": df["subType"],
    "points": points_scored,                   # points actually scored on the event
    "description": df["description"],
    "score_home": df["scoreHome"],
    "score_away": df["scoreAway"],
})

pbp_event.head(10), pbp_event.shape


(  league   season     game_id  event_id  period        clock     team_id  \
 0    NBA  2023-24  0022300061         2       1  PT12M00.00S           0   
 1    NBA  2023-24  0022300061         4       1  PT12M00.00S  1610612743   
 2    NBA  2023-24  0022300061         7       1  PT11M42.00S  1610612747   
 3    NBA  2023-24  0022300061        10       1  PT11M15.00S  1610612743   
 4    NBA  2023-24  0022300061        11       1  PT10M57.00S  1610612747   
 5    NBA  2023-24  0022300061        13       1  PT10M40.00S  1610612743   
 6    NBA  2023-24  0022300061        14       1  PT10M33.00S  1610612747   
 7    NBA  2023-24  0022300061        16       1  PT10M16.00S  1610612743   
 8    NBA  2023-24  0022300061        18       1  PT10M03.00S  1610612747   
 9    NBA  2023-24  0022300061        19       1  PT10M01.00S  1610612747   
 
    player_id player_name   event_type                   event_subtype  points  \
 0          0                   period                           star

In [20]:
# Name the sample file after the game that was actually pulled, so changing
# `game_id` cannot silently store another game's events under the old file name.
pbp_event.to_csv(f"data/raw/pbp_event_sample_{game_id}.csv", index=False)


In [21]:
# Show up to ten entries of data/raw to confirm the sample play-by-play file exists.
import os

raw_dir_entries = os.listdir("data/raw")
raw_dir_entries[:10]

['teams.csv',
 'team_game_fact_2023_24_regular.csv',
 'pbp',
 'players.csv',
 'pbp_event_sample_0022300061.csv',
 'player_game_fact_2023_24_regular.csv']

In [23]:
import time
import random
import pandas as pd
from pathlib import Path
from nba_api.stats.endpoints import playbyplayv3

season = "2023-24"
league = "NBA"

# Per-game raw play-by-play files go under <project>/data/raw/pbp/season=<season>/.
# NOTE(review): earlier cells write to the cwd-relative "data/raw", while this
# cell resolves paths from Path.cwd().parent; confirm both point at the same
# directory.
PROJECT_ROOT = Path.cwd().parent
out_dir = PROJECT_ROOT.joinpath("data", "raw", "pbp", f"season={season}")
out_dir.mkdir(parents=True, exist_ok=True)

# Every game appears once per team in team_game_fact, so de-duplicate the IDs.
game_ids = team_game_fact["game_id"].astype(str).unique().tolist()
print("games to fetch:", len(game_ids))
print("output dir:", out_dir)

def fetch_one_game(game_id: str) -> pd.DataFrame:
    """Download one game's play-by-play and tag it with league and season.

    Args:
        game_id: Game identifier passed to the PlayByPlayV3 endpoint.

    Returns:
        The endpoint's first data frame with "league" and "season" inserted as
        the first two columns. Their values come from the module-level
        ``league`` and ``season`` variables.
    """
    endpoint = playbyplayv3.PlayByPlayV3(game_id=game_id, timeout=20)
    events = endpoint.get_data_frames()[0]
    tag_columns = (("league", league), ("season", season))
    for position, (column, value) in enumerate(tag_columns):
        events.insert(position, column, value)
    return events

# Resumable bulk download: one CSV per game, skipping games already on disk.
MAX_ATTEMPTS = 5        # tries per game before it is recorded as failed
PROGRESS_EVERY = 25     # print a progress line every N games

failed = []             # game IDs that exhausted all attempts
failure_reasons = {}    # game_id -> repr of the last exception seen
skipped = 0
saved = 0

for i, gid in enumerate(game_ids, start=1):
    out_path = out_dir / f"game_id={gid}.csv"
    if out_path.exists():
        skipped += 1
    else:
        # Write to a temp file and rename it, so an interrupted write can never
        # leave a truncated CSV that later runs would skip as already done.
        tmp_path = out_path.with_name(out_path.name + ".tmp")
        ok = False
        last_error = None
        for attempt in range(1, MAX_ATTEMPTS + 1):
            try:
                df = fetch_one_game(gid)
                df.to_csv(tmp_path, index=False)
                tmp_path.replace(out_path)
                saved += 1
                ok = True
                break
            except Exception as exc:
                # Keep the reason instead of discarding it silently.
                last_error = repr(exc)
                if attempt < MAX_ATTEMPTS:
                    # Exponential backoff capped at 30s, plus jitter; there is
                    # no point sleeping after the final attempt.
                    time.sleep(min(30, 2 ** attempt) + random.uniform(0.1, 0.7))

        if not ok:
            failed.append(gid)
            failure_reasons[gid] = last_error
            print(f"failed: game_id={gid} after {MAX_ATTEMPTS} attempts | last error: {last_error}")

        # Short randomized pause between network requests.
        time.sleep(0.25 + random.uniform(0.05, 0.2))

    # Report on every PROGRESS_EVERY-th game, including cached ones.
    if i % PROGRESS_EVERY == 0:
        print(f"progress: {i}/{len(game_ids)} | saved={saved} skipped={skipped} failed={len(failed)}")

if failed:
    # Persist the failures (with their reasons) so they can be retried later.
    pd.DataFrame({
        "game_id": failed,
        "error": [failure_reasons[gid] for gid in failed],
    }).to_csv(out_dir / "_failed_game_ids.csv", index=False)

print("done:", {"saved": saved, "skipped": skipped, "failed": len(failed), "out_dir": str(out_dir)})


games to fetch: 1230
output dir: /Users/vlachaki/Desktop/Ironhack_projects/final_project/transfer-fit-nba/data/raw/pbp/season=2023-24
progress: 600/1230 | saved=1 skipped=599 failed=0
progress: 725/1230 | saved=6 skipped=719 failed=0
progress: 750/1230 | saved=31 skipped=719 failed=0
progress: 775/1230 | saved=56 skipped=719 failed=0
progress: 800/1230 | saved=81 skipped=719 failed=0
progress: 825/1230 | saved=106 skipped=719 failed=0
progress: 850/1230 | saved=131 skipped=719 failed=0
progress: 875/1230 | saved=156 skipped=719 failed=0
progress: 900/1230 | saved=181 skipped=719 failed=0
progress: 925/1230 | saved=206 skipped=719 failed=0
progress: 950/1230 | saved=231 skipped=719 failed=0
progress: 975/1230 | saved=256 skipped=719 failed=0
progress: 1000/1230 | saved=281 skipped=719 failed=0
progress: 1025/1230 | saved=306 skipped=719 failed=0
progress: 1050/1230 | saved=331 skipped=719 failed=0
progress: 1075/1230 | saved=356 skipped=719 failed=0
progress: 1100/1230 | saved=381 skipp

In [24]:
# building the canonical pbp_event_2023_24 by reading those per-game raw CSVs and mapping columns.

In [25]:
import pandas as pd
from pathlib import Path
import glob

season = "2023-24"
league = "NBA"

# Input: the per-game raw CSVs; output: one canonical event table for the season.
PROJECT_ROOT = Path.cwd().parent
in_dir = PROJECT_ROOT / "data" / "raw" / "pbp" / f"season={season}"
out_path = PROJECT_ROOT / "data" / "processed" / f"pbp_event_{season}_regular.csv"
out_path.parent.mkdir(parents=True, exist_ok=True)

files = sorted(glob.glob(str(in_dir / "game_id=*.csv")))
print("files:", len(files))
if not files:
    # pd.concat on an empty list fails with an unhelpful message; fail clearly.
    raise FileNotFoundError(f"no per-game play-by-play CSVs found in {in_dir}")

chunks = []
for f in files:
    # Read gameId as text. Left to type inference it becomes an integer and
    # loses its leading zeros ("0022300061" -> 22300061), after which it no
    # longer matches game_id in the team/player fact tables.
    df = pd.read_csv(f, dtype={"gameId": str})
    chunks.append(pd.DataFrame({
        "league": df["league"],
        "season": df["season"],
        "game_id": df["gameId"],
        "event_id": df["actionNumber"].astype(int),
        "period": df["period"].astype(int),
        "clock": df["clock"],
        "team_id": df["teamId"],
        "player_id": df["personId"],
        "event_type": df["actionType"],
        "event_subtype": df["subType"],
        "description": df["description"],
        "score_home": df["scoreHome"],
        "score_away": df["scoreAway"],
    }))

pbp_event = pd.concat(chunks, ignore_index=True)
# When reading this file back, pass dtype={"game_id": str} for the same reason.
pbp_event.to_csv(out_path, index=False)

pbp_event.shape, out_path


files: 1230


((598705, 13),
 PosixPath('/Users/vlachaki/Desktop/Ironhack_projects/final_project/transfer-fit-nba/data/processed/pbp_event_2023-24_regular.csv'))