# Import functions from .py file

In [1]:
from pathlib import Path
import pandas as pd
import numpy as np

# The notebook lives in <project>/notebooks, so the project root is one level up.
ROOT = Path.cwd().parent

# Raw downloads and derived tables sit side by side under <project>/data.
RAW = ROOT.joinpath("data", "raw")
PROCESSED = ROOT.joinpath("data", "processed")

# Create both folders up front; this is a no-op when they already exist.
for _data_dir in (RAW, PROCESSED):
    _data_dir.mkdir(parents=True, exist_ok=True)

ROOT, RAW, PROCESSED


(PosixPath('/Users/vlachaki/Desktop/Ironhack_projects/final_project/transfer-fit-nba'),
 PosixPath('/Users/vlachaki/Desktop/Ironhack_projects/final_project/transfer-fit-nba/data/raw'),
 PosixPath('/Users/vlachaki/Desktop/Ironhack_projects/final_project/transfer-fit-nba/data/processed'))

In [2]:
from pathlib import Path
import sys

ROOT = Path.cwd().parent  # because notebook is in /notebooks

# Put the project root on sys.path so that `src` can be imported as a package.
# Guarded so that re-running this cell does not append a duplicate entry each time.
if str(ROOT) not in sys.path:
    sys.path.append(str(ROOT))

from src.ingest import ProjectPaths, fetch_teams_players, fetch_team_games_raw

# ensure() returns the ProjectPaths object itself (see the displayed repr);
# presumably it also creates the project's data folders — confirm in src/ingest.py.
paths = ProjectPaths(ROOT).ensure()
paths


ProjectPaths(root=PosixPath('/Users/vlachaki/Desktop/Ironhack_projects/final_project/transfer-fit-nba'))

In [3]:
import importlib
import src.ingest as ingest
# Re-execute src/ingest.py so edits made since the kernel first imported it take effect.
importlib.reload(ingest)

# Import after the reload so these names bind to the freshly reloaded definitions.
from src.ingest import ProjectPaths, fetch_teams_players, fetch_team_game_fact
from pathlib import Path
import sys

ROOT = Path.cwd().parent
paths = ProjectPaths(ROOT).ensure()

fetch_teams_players(paths)  # ensures teams.csv exists
# Team-level game table for the 2023-24 season.
tgf = fetch_team_game_fact("2023-24", paths)

# Sanity check: table dimensions and the number of rows with no opponent team id.
tgf.shape, tgf["opponent_team_id"].isna().sum()


((2460, 33), np.int64(0))

In [6]:
# Reload src.ingest first so the from-import below picks up the latest edits to the module.
import importlib, src.ingest as ingest
importlib.reload(ingest)

from src.ingest import ProjectPaths, fetch_team_game_fact, fetch_player_game_fact
from pathlib import Path

ROOT = Path.cwd().parent
paths = ProjectPaths(ROOT).ensure()

# Team-level and player-level game tables for the same season.
tgf = fetch_team_game_fact("2023-24", paths)
pgf = fetch_player_game_fact("2023-24", paths)

# Eyeball game_id in both tables (the saved output shows object dtype with leading zeros kept)
# and count player rows that have no opponent team id.
tgf["game_id"].head(), pgf["game_id"].head(), pgf["opponent_team_id"].isna().sum()


(0    0022301188
 1    0022301192
 2    0022301191
 3    0022301189
 4    0022301200
 Name: game_id, dtype: object,
 0    0022301195
 1    0022301199
 2    0022301199
 3    0022301189
 4    0022301191
 Name: game_id, dtype: object,
 np.int64(0))

In [7]:
# Plain-Python view: first five game ids from the player and team tables,
# plus the number of player rows with no opponent team id.
(
    pgf["game_id"].head().tolist(),
    tgf["game_id"].head().tolist(),
    int(pgf["opponent_team_id"].isna().sum()),
)


(['0022301195', '0022301199', '0022301199', '0022301189', '0022301191'],
 ['0022301188', '0022301192', '0022301191', '0022301189', '0022301200'],
 0)

In [8]:
from pathlib import Path
import sys

ROOT = Path.cwd().parent

# Put the project root on sys.path so that `src` can be imported as a package.
# Guarded so that re-running this cell does not append a duplicate entry each time.
if str(ROOT) not in sys.path:
    sys.path.append(str(ROOT))

from src.ingest import ProjectPaths, fetch_teams_players, fetch_team_game_fact, fetch_player_game_fact

paths = ProjectPaths(ROOT).ensure()
fetch_teams_players(paths)

# 2023-24 team-level and player-level game tables, bound to descriptive names.
team_game_fact = fetch_team_game_fact("2023-24", paths)
player_game_fact = fetch_player_game_fact("2023-24", paths)

# Dimensions of both tables as a quick sanity check.
team_game_fact.shape, player_game_fact.shape


((2460, 33), (26401, 36))

In [9]:
# Reload src.ingest first so the from-import below picks up the latest edits to the module.
import importlib, src.ingest as ingest
importlib.reload(ingest)

from src.ingest import ProjectPaths, fetch_shots_season
from pathlib import Path

ROOT = Path.cwd().parent
paths = ProjectPaths(ROOT).ensure()

# Shot-level table for the 2023-24 season, using the function's default output name.
shots = fetch_shots_season("2023-24", paths)
# NOTE(review): the saved output still lists an upper-case 'GRID_TYPE' column next to the
# snake_case ones — presumably a raw API column that was not renamed; confirm in src/ingest.py.
shots.shape, shots.columns[:10]


((218700, 26),
 Index(['GRID_TYPE', 'game_id', 'game_event_id', 'player_id', 'player_name',
        'team_id', 'team_name', 'period', 'minutes_remaining',
        'seconds_remaining'],
       dtype='object'))

In [10]:
# Reload src.ingest before importing from it so the newest definitions are used.
import importlib, src.ingest as ingest
importlib.reload(ingest)

from src.ingest import ProjectPaths, fetch_shots_season
from pathlib import Path

ROOT = Path.cwd().parent
paths = ProjectPaths(ROOT).ensure()

shots = fetch_shots_season(
    "2023-24",
    paths,
    out_name="shots_2023-24_regular_canonical.csv",
)

# Column names the shot table is expected to be limited to.
canonical_shot_columns = {
    "league", "season", "game_id", "game_event_id", "game_date",
    "team_id", "team_name", "player_id", "player_name",
    "period", "minutes_remaining", "seconds_remaining",
    "shot_type", "action_type",
    "shot_zone_basic", "shot_zone_area", "shot_zone_range", "shot_distance",
    "loc_x", "loc_y", "shot_attempted", "shot_made", "htm", "vtm",
}

# An empty set in the second slot means the table has no columns outside that list.
shots.shape, set(shots.columns) - canonical_shot_columns


((218700, 24), set())

In [12]:
# Reload src.ingest first so the from-import below picks up the latest edits to the module.
import importlib, src.ingest as ingest
importlib.reload(ingest)

from pathlib import Path
from src.ingest import ProjectPaths, build_player_shot_profile_season

ROOT = Path.cwd().parent
paths = ProjectPaths(ROOT).ensure()

# Input: the shot CSV named "shots_2023-24_regular_canonical.csv" in the processed folder.
shots_path = paths.processed / "shots_2023-24_regular_canonical.csv"
player_shot = build_player_shot_profile_season(shots_path, "2023-24", paths)

# Table dimensions plus NaN counts for the first few "*_pctile" columns.
# NOTE(review): the saved output shows 168 NaN values in each percentile column —
# presumably players below a minimum-attempts cutoff; confirm in src/ingest.py.
player_shot.shape, player_shot.filter(like="_pctile").isna().sum().head()


((649, 29),
 rim_rate_pctile        168
 mid_rate_pctile        168
 three_rate_pctile      168
 corner3_rate_pctile    168
 rim_fg_pct_pctile      168
 dtype: int64)

In [13]:
# Reload src.ingest first so the from-import below picks up the latest edits to the module.
import importlib, src.ingest as ingest
importlib.reload(ingest)

from pathlib import Path
from src.ingest import ProjectPaths, build_team_shot_profile_season

ROOT = Path.cwd().parent
paths = ProjectPaths(ROOT).ensure()

# Input: the shot CSV named "shots_2023-24_regular_canonical.csv" in the processed folder.
shots_path = paths.processed / "shots_2023-24_regular_canonical.csv"
team_shot = build_team_shot_profile_season(shots_path, "2023-24", paths)

# Table dimensions plus the total number of NaN values across all "*_pctile" columns.
team_shot.shape, team_shot.filter(like="_pctile").isna().sum().sum()


((30, 28), np.int64(0))

In [18]:
from pathlib import Path
import time, random
import pandas as pd
from nba_api.stats.endpoints import synergyplaytypes

# -----------------------------
# CONFIG
# -----------------------------
# Season string passed to the Synergy endpoint and embedded in every output file name.
SEASON = "2023-24"
ROOT = Path.cwd().parent  # notebook is in /notebooks
PROCESSED = ROOT / "data" / "processed"
# CSV listing the play types to fetch; the loop below reads its columns
# type_grouping, playtype_api, side and playtype_label.
PLAYTYPE_DICT = PROCESSED / "playtype_dict.csv"

# One CSV "part" per play type is cached under these folders, so an interrupted
# run can resume without re-fetching parts that already exist.
OUT_BASE = PROCESSED / "playtypes" / f"season={SEASON}"
OUT_PLAYER = OUT_BASE / "player_parts"
OUT_TEAM = OUT_BASE / "team_parts"
OUT_PLAYER.mkdir(parents=True, exist_ok=True)
OUT_TEAM.mkdir(parents=True, exist_ok=True)

# Combined season-level outputs, and the log of fetches that failed.
PLAYER_OUT = PROCESSED / f"player_playtype_season_{SEASON}.csv"
TEAM_OUT   = PROCESSED / f"team_playtype_season_{SEASON}.csv"
FAILS_OUT  = PROCESSED / f"failed_playtypes_{SEASON}.csv"

# Fail-fast settings (still with retries + backoff)
TIMEOUT = 60  # passed as timeout= to the endpoint (the logged errors read "read timeout=60")
MAX_RETRIES = 4  # attempts per request
BASE_SLEEP = 1.5  # backoff base: sleep is BASE_SLEEP * 2**(attempt - 1) plus up to 1.0 of jitter
COURTESY_SLEEP_SUCCESS = (2.0, 4.0)  # random pause on success

# Request headers passed as headers= on every SynergyPlayTypes call.
# NOTE(review): presumably these mimic a browser so stats.nba.com accepts the request — confirm.
NBA_HEADERS = {
    "Host": "stats.nba.com",
    "Connection": "keep-alive",
    "Accept": "application/json, text/plain, */*",
    "Origin": "https://www.nba.com",
    "Referer": "https://www.nba.com/",
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
    ),
    "Accept-Language": "en-US,en;q=0.9",
    "x-nba-stats-origin": "stats",
    "x-nba-stats-token": "true",
}

def fetch_synergy_df(season: str, player_or_team: str, play_type: str, type_grouping: str) -> pd.DataFrame:
    """Fetch one Synergy play-type table, retrying with exponential backoff.

    Makes up to ``MAX_RETRIES`` attempts. After a failed attempt that is not the
    last one, it sleeps ``BASE_SLEEP * 2**(attempt - 1)`` seconds plus up to one
    second of random jitter. The final failure is re-raised immediately, because
    backing off before giving up only wastes time. After a successful request it
    pauses for a random duration drawn from ``COURTESY_SLEEP_SUCCESS``.

    Args:
        season: Season string, e.g. "2023-24".
        player_or_team: "P" for player rows or "T" for team rows.
        play_type: Play-type name understood by the endpoint.
        type_grouping: "Offensive" or "Defensive".

    Returns:
        A copy of the first data frame returned by the endpoint.

    Raises:
        RuntimeError: If ``MAX_RETRIES`` is less than 1, so no attempt can be made.
        Exception: Whatever the last failed attempt raised.
    """
    if MAX_RETRIES < 1:
        # Without this guard the loop body never runs and there is no error to report.
        raise RuntimeError(f"MAX_RETRIES must be >= 1 (got {MAX_RETRIES})")

    target = f"{player_or_team}/{type_grouping}/{play_type}"
    for attempt in range(1, MAX_RETRIES + 1):
        try:
            ep = synergyplaytypes.SynergyPlayTypes(
                league_id="00",
                season=season,
                season_type_all_star="Regular Season",
                per_mode_simple="Per Possession",
                player_or_team_abbreviation=player_or_team,  # "P" or "T"
                play_type_nullable=play_type,
                type_grouping_nullable=type_grouping,        # "Offensive" / "Defensive"
                headers=NBA_HEADERS,
                timeout=TIMEOUT,
            )
            df = ep.get_data_frames()[0].copy()
        except Exception as e:
            if attempt == MAX_RETRIES:
                # Out of attempts: surface the error now instead of sleeping first.
                print(f"retry {attempt}/{MAX_RETRIES} | {target} | {type(e).__name__}: {e} | giving up")
                raise
            sleep_s = (BASE_SLEEP * (2 ** (attempt - 1))) + random.uniform(0.0, 1.0)
            print(f"retry {attempt}/{MAX_RETRIES} | {target} | {type(e).__name__}: {e} | sleep {sleep_s:.1f}s")
            time.sleep(sleep_s)
        else:
            # Be polite to the API between successful calls.
            time.sleep(random.uniform(*COURTESY_SLEEP_SUCCESS))
            return df

def part_filename(type_grouping: str, side: str, playtype_api: str) -> str:
    """Build the cache file name for one play-type part.

    The name is ``<type_grouping>_<side>_<playtype_api>.csv`` with spaces removed.
    Path separators and other characters that are reserved in file names
    (``/ \\ : * ? " < > |`` and NUL) are replaced with ``-``, so a value read from
    the play-type CSV can never point outside the parts folder. Inputs without
    such characters produce the same name as before.

    Args:
        type_grouping: Grouping label, e.g. "Offensive".
        side: Side label taken from the play-type dictionary.
        playtype_api: Play-type name as sent to the API.

    Returns:
        The file name (not a full path), ending in ".csv".
    """
    # stable and filesystem-safe
    unsafe = set('/\\:*?"<>|\0')
    stem = f"{type_grouping}_{side}_{playtype_api}".replace(" ", "")
    safe_stem = "".join("-" if ch in unsafe else ch for ch in stem)
    return f"{safe_stem}.csv"

def _fetch_and_save_part(who, entity_code, out_dir, progress, grouping, api, side, label, failures):
    """Fetch one play-type part for players or teams and cache it as a CSV.

    Skips the fetch when the part file already exists. The CSV is written to a
    temporary ".tmp" file and then renamed into place, so an interrupted run
    cannot leave a truncated part that later runs would skip as "exists".

    Args:
        who: "player" or "team"; used in log lines and failure records.
        entity_code: "P" or "T", passed to ``fetch_synergy_df``.
        out_dir: Folder holding the cached parts for this entity.
        progress: Prefix for log lines, e.g. "[3/20]".
        grouping: Value of the ``type_grouping`` column.
        api: Value of the ``playtype_api`` column.
        side: Value of the ``side`` column.
        label: Value of the ``playtype_label`` column.
        failures: List that receives one dict per failed fetch (mutated in place).
    """
    part_path = out_dir / part_filename(grouping, side, api)
    tag = who.upper().ljust(6)  # "PLAYER" / "TEAM  " keeps the log columns aligned

    if part_path.exists():
        print(f"{progress} {tag} skip {grouping}/{api} (exists)")
        return

    print(f"{progress} {tag} fetch {grouping}/{api}")
    try:
        part = fetch_synergy_df(SEASON, entity_code, api, grouping)
        part["league"] = "NBA"
        part["season"] = SEASON
        part["playtype_label"] = label
        part["playtype_api"] = api
        part["type_grouping"] = grouping
        part["side"] = side
        # The ".tmp" suffix keeps unfinished files out of the later "*.csv" glob.
        tmp_path = part_path.with_name(part_path.name + ".tmp")
        part.to_csv(tmp_path, index=False)
        tmp_path.replace(part_path)
    except Exception as e:
        # Best effort: record the failure and carry on with the remaining parts.
        failures.append({"who":who,"type_grouping":grouping,"side":side,"playtype_api":api,"error":repr(e)})


ptd = pd.read_csv(PLAYTYPE_DICT)
fails = []

total = len(ptd)
for idx, row in ptd.iterrows():
    grouping = str(row["type_grouping"])
    api = str(row["playtype_api"])
    side = str(row["side"])
    label = str(row["playtype_label"])
    progress = f"[{idx+1}/{total}]"

    # Same steps for both entities: player rows first, then team rows.
    for who, entity_code, out_dir in (("player", "P", OUT_PLAYER), ("team", "T", OUT_TEAM)):
        _fetch_and_save_part(who, entity_code, out_dir, progress, grouping, api, side, label, fails)

# -----------------------------
# COMBINE PARTS
# -----------------------------
# Gather every cached part in name order so the combined row order is stable across runs.
player_parts = sorted(OUT_PLAYER.glob("*.csv"))
team_parts = sorted(OUT_TEAM.glob("*.csv"))

# Stack the parts into one frame per entity; fall back to an empty frame when nothing is cached.
if player_parts:
    player_all = pd.concat([pd.read_csv(part) for part in player_parts], ignore_index=True)
else:
    player_all = pd.DataFrame()

if team_parts:
    team_all = pd.concat([pd.read_csv(part) for part in team_parts], ignore_index=True)
else:
    team_all = pd.DataFrame()

# Write a combined file only when there is at least one row to write.
if len(player_all) > 0:
    player_all.to_csv(PLAYER_OUT, index=False)
if len(team_all) > 0:
    team_all.to_csv(TEAM_OUT, index=False)

# The failure log is rewritten on every run, even when it is empty.
fails_df = pd.DataFrame(fails)
fails_df.to_csv(FAILS_OUT, index=False)

run_summary = {
    "player_parts": len(player_parts),
    "team_parts": len(team_parts),
    "player_rows": int(len(player_all)),
    "team_rows": int(len(team_all)),
    "fails": int(len(fails_df)),
    "player_out": str(PLAYER_OUT),
    "team_out": str(TEAM_OUT),
    "fails_out": str(FAILS_OUT),
    "parts_dir": str(OUT_BASE),
}
print(run_summary)


[1/20] PLAYER fetch Offensive/Isolation
retry 1/4 | P/Offensive/Isolation | ReadTimeout: HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out. (read timeout=60) | sleep 2.2s
retry 2/4 | P/Offensive/Isolation | ReadTimeout: HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out. (read timeout=60) | sleep 3.9s
retry 3/4 | P/Offensive/Isolation | ReadTimeout: HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out. (read timeout=60) | sleep 6.2s
retry 4/4 | P/Offensive/Isolation | ReadTimeout: HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out. (read timeout=60) | sleep 12.9s
[1/20] TEAM   fetch Offensive/Isolation
retry 1/4 | T/Offensive/Isolation | ReadTimeout: HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out. (read timeout=60) | sleep 1.6s
retry 2/4 | T/Offensive/Isolation | ReadTimeout: HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out. (read timeout=60) | sleep 3.8s


KeyboardInterrupt: 