In [None]:
import sys
from pathlib import Path
sys.path.insert(0, str(Path.cwd().parent))
import pandas as pd
import numpy as np
# Load the raw parquet extracts; paths are relative to the notebook's working directory.
seasonal = pd.read_parquet('../data/raw/seasonal_20_25.parquet')
pbp = pd.read_parquet('../data/raw/pbp_20_25.parquet')
roster = pd.read_parquet('../data/raw/seasonal_roster_20_25.parquet')
# NOTE(review): the name `id` shadows Python's builtin id() for the rest of the
# notebook; later cells reference this name, so it is left unchanged here.
id = pd.read_parquet('../data/raw/ids.parquet')


In [None]:
# Column names available in the seasonal table
list(seasonal.columns)

In [None]:
# Only the last expression of a cell is rendered automatically, so the
# position counts must be displayed explicitly or they are silently discarded.
display(roster["position"].value_counts())
roster.columns.tolist()

In [None]:
# Column names available in the ids table
list(id.columns)

In [None]:
# Preview the first five roster rows
roster.head(n=5)

In [None]:
# Distinct player identifiers found in each table
seasonal_ids, roster_ids = (
    set(frame['player_id']) for frame in (seasonal, roster)
)


In [None]:
# Count of player IDs present in both the seasonal and roster tables
len(seasonal_ids.intersection(roster_ids))


In [None]:
# Rows whose player_id already appeared in an earlier row
repeat_mask = seasonal['player_id'].duplicated(keep='first')
repeat_mask.sum()



In [None]:
# Number of distinct (non-null) players in the seasonal table
seasonal['player_id'].nunique(dropna=True)

In [None]:
# Total number of rows in the seasonal table
seasonal.shape[0]


In [None]:
# Roster attributes carried over when roster rows are joined to season stats.
ROSTER_KEEP_COLS = [
    # Join keys
    "player_id", "season",
    # Team and role
    "team", "position",
    # Physical profile and experience
    "age", "height", "weight", "years_exp",
    # College, rookie season and draft details
    "college", "rookie_year", "draft_number", "draft_club",
]
# Seasonal stat columns to keep, organised by category. The flat list built
# below preserves the category order (dicts keep insertion order).
_SEASON_COLS_BY_CATEGORY = {
    "keys": ["player_id", "season", "season_type", "games"],
    "passing": [
        "completions", "attempts", "passing_yards", "passing_tds",
        "interceptions", "sacks", "passing_first_downs", "passing_epa",
    ],
    "rushing": [
        "carries", "rushing_yards", "rushing_tds",
        "rushing_first_downs", "rushing_epa",
    ],
    "receiving": [
        "targets", "receptions", "receiving_yards", "receiving_tds",
        "receiving_first_downs", "receiving_epa",
    ],
    "efficiency": ["pacr", "racr", "dakota", "yptmpa"],
    "opportunity_role": [
        "target_share", "air_yards_share", "wopr_x", "dom", "w8dom",
    ],
    "fantasy_outputs": ["fantasy_points", "fantasy_points_ppr", "ppr_sh"],
}
SEASON_KEEP_COLS = [
    column
    for category_columns in _SEASON_COLS_BY_CATEGORY.values()
    for column in category_columns
]

# Keep only the skill positions of interest and the selected stat columns.
filtered_roster = roster[roster["position"].isin(["QB", "RB", "WR", "TE"])]
season_filtered = seasonal[SEASON_KEEP_COLS]

# Collapse the roster to one row per (player_id, season), ordered by week.
# NOTE: GroupBy.last() takes the last non-null value of each column, so a
# value missing in the latest week is taken from an earlier week of that season.
end_roster = (
    filtered_roster[ROSTER_KEEP_COLS + ["week"]]
    .sort_values(["season", "week"])
    .groupby(["player_id", "season"], as_index=False)
    .last()
    .drop(columns=["week"], errors="ignore")
)
# duplicated() also holds for an empty frame, where size().max() would be NaN
# and make the equality check fail spuriously.
assert not end_roster.duplicated(["player_id", "season"]).any()

# A single left merge with an indicator yields both counts: every row is a
# left-merge row, and the rows flagged "both" are exactly the inner-merge rows.
merge_check = season_filtered.merge(
    end_roster,
    on=["player_id", "season"],
    how="left",
    validate="many_to_one",
    indicator=True,
)
left_count = len(merge_check)
inner_count = int((merge_check["_merge"] == "both").sum())

print(f"Left: {left_count}, Inner: {inner_count}, Dropped: {left_count - inner_count}")


In [None]:
# Project-local pipeline helpers (resolved through the sys.path entry added in the first cell).
from src.data.pipeline import load_data, clean_data
# load_data() returns two objects, unpacked here as (seasonal, roster); this
# rebinds the `seasonal` and `roster` names loaded from parquet in the first cell.
seasonal, roster = load_data()
# Note the argument order: roster first, then seasonal.
df = clean_data(roster, seasonal)

# Per-column count of missing values in the cleaned frame.
df.isnull().sum()


In [3]:
# Reload the processed dataset and order it chronologically within each player.
df = pd.read_parquet('../data/processed/processed_data.parquet')
df = df.sort_values(['player_id', 'season'])

# Select the first player's rows in a single .loc call and take an explicit
# copy, so adding a column below does not write into a slice of `df`
# (the chained-indexing form triggers SettingWithCopyWarning).
first_player = df['player_id'].iloc[0]
sample = df.loc[
    df['player_id'] == first_player,
    ['player_id', 'season', 'fantasy_points_ppr'],
].copy()

# Previous row's PPR total for the same player; NaN on the player's first row.
sample['prev_ppr'] = sample.groupby('player_id')['fantasy_points_ppr'].shift(1)
sample


Unnamed: 0,player_id,season,fantasy_points_ppr,prev_ppr
1,00-0007091,2015,16.94,


In [4]:
# Season-total counting stats that get a per-game ("<name>_pg") counterpart.
PER_GAME_COLS = [
    # Passing volume
    "completions",
    "attempts",
    "passing_yards",
    "passing_tds",
    "interceptions",
    "sacks",
    "passing_first_downs",

    # Rushing volume
    "carries",
    "rushing_yards",
    "rushing_tds",
    "rushing_first_downs",

    # Receiving volume
    "targets",
    "receptions",
    "receiving_yards",
    "receiving_tds",
    "receiving_first_downs",
]

# Columns that are replaced by their lagged (previous-row) value.
LAG_COLS = [
    # Availability
    "games",

    # Per-game volume features: derived from PER_GAME_COLS so the two lists
    # cannot drift apart when a stat is added or removed.
    *[f"{stat}_pg" for stat in PER_GAME_COLS],

    # EPA & efficiency (RAW, NOT per-game)
    "passing_epa",
    "rushing_epa",
    "receiving_epa",
    "pacr",
    "racr",
    "dakota",
    "yptmpa",

    # Opportunity / role shares
    "target_share",
    "air_yards_share",
    "wopr_x",
    "dom",
    "w8dom",

    # Prior fantasy output (RAW totals)
    "fantasy_points",
    "fantasy_points_ppr",
    "ppr_sh",
]

# The lag below means "previous row of the same player", so the chronological
# order is enforced here rather than relying on a sort done in another cell.
df = df.sort_values(["player_id", "season"])

# Convert season totals into per-game rates.
pg_cols = [f"{col}_pg" for col in PER_GAME_COLS]
for col, pg_col in zip(PER_GAME_COLS, pg_cols):
    df[pg_col] = df[col] / df["games"]
# Zero-game rows divide by zero (inf/NaN); report a rate of 0 for them instead.
df.loc[df["games"] == 0, pg_cols] = 0

# Preserve the current-row PPR total as the target before the column is lagged.
df["target_ppr"] = df["fantasy_points_ppr"]

# Replace each feature with the value from the player's previous row.
# NOTE(review): shift(1) uses the previous available row, which is not
# necessarily the immediately preceding season if a player has a gap — confirm
# this is intended.
# NOTE: this cell overwrites LAG_COLS (including "games") in `df`; re-running
# it without reloading `df` would lag already-lagged values again.
df[LAG_COLS] = df.groupby("player_id")[LAG_COLS].shift(1)

# Rows without a complete set of lagged features cannot be used.
before = len(df)
df = df.dropna(subset=LAG_COLS)
after = len(df)
# Guard the percentage against an empty input frame.
dropped_pct = (before - after) / before * 100 if before else 0.0
print(f"Dropped {before - after} rows ({dropped_pct:.1f}%)")


Dropped 1058 rows (31.3%)


In [None]:
# Read the processed dataset again under a separate name and list its columns
df_fresh = pd.read_parquet('../data/processed/processed_data.parquet')
list(df_fresh.columns)