In [1]:
import pandas as pd
from pathlib import Path

# -------------------------
# 0) Paths
# -------------------------
# The project root is derived from the current working directory: when the
# working directory is itself named "notebooks", the root is its parent;
# otherwise the working directory is taken as the root.
HERE = Path.cwd().resolve()
if HERE.name == "notebooks":
    PROJECT_ROOT = HERE.parent
else:
    PROJECT_ROOT = HERE

# Folder holding the per-season inputs and the combined output file.
DATA_PROCESSED = PROJECT_ROOT.joinpath("data_processed")
OUT_PATH = DATA_PROCESSED.joinpath("pl_master_21-25_v1.csv")

# Per-season input files, in the order they are loaded and concatenated.
FILES_V4 = [
    "pl_matchlist_21-22_v4.csv",
    "pl_matchlist_22-23_v4.csv",
    "pl_matchlist_23-24_v4.csv",
    "pl_matchlist_24-25_v4.csv",
]

# -------------------------
# 1) Load all season files (v4)
# -------------------------
# Each file listed in FILES_V4 is read from DATA_PROCESSED into `dfs`.
# The first file defines the reference schema (`base_cols`); every later
# file must have exactly the same columns in exactly the same order.
# Raises FileNotFoundError for a missing file and ValueError on any
# schema mismatch.
dfs = []
base_cols = None

for fname in FILES_V4:
    path = DATA_PROCESSED / fname
    if not path.exists():
        raise FileNotFoundError(f"Missing file: {path}")

    df = pd.read_csv(path)
    cols = list(df.columns)

    # Establish base schema from the first file
    if base_cols is None:
        base_cols = cols

    # Hard schema check: same columns and same order
    if cols != base_cols:
        missing = [c for c in base_cols if c not in cols]
        extra = [c for c in cols if c not in base_cols]

        # Compare only the columns both schemas share, so this flag is True
        # when the shared columns are arranged differently and False when
        # the mismatch is purely due to missing/extra columns. (Re-testing
        # `cols != base_cols` here would always report True.)
        shared_in_base = [c for c in base_cols if c in cols]
        shared_in_file = [c for c in cols if c in base_cols]
        order_differs = shared_in_base != shared_in_file

        raise ValueError(
            f"Schema mismatch in {fname}\n"
            f"Missing vs base: {missing}\n"
            f"Extra vs base: {extra}\n"
            f"Order differs: {order_differs}"
        )

    dfs.append(df)

# -------------------------
# 2) Concatenate into master
# -------------------------
# Stack the per-season frames row-wise; the original per-file row labels are
# discarded and replaced by a fresh 0..n-1 index.
master = pd.concat(objs=dfs, axis=0, ignore_index=True)

# -------------------------
# 3) Core validations
# -------------------------
# Checks raise ValueError explicitly rather than using `assert`, because
# assert statements are removed when Python runs with -O and the validation
# would then be skipped silently.
EXPECTED_ROWS_PER_SEASON = 760
EXPECTED_MATCHES_PER_SEASON = 380
n_seasons = len(FILES_V4)  # one input file per season

expected_rows = n_seasons * EXPECTED_ROWS_PER_SEASON
if len(master) != expected_rows:
    raise ValueError(f"Expected {expected_rows} rows, got {len(master)}")

# Useful extra sanity checks
# One season label per input file
seasons = sorted(master["Season"].dropna().unique().tolist())
print("Seasons in master:", seasons)

# Rows per season should be 760
rows_by_season = master.groupby("Season").size().sort_index()
print("\nRows per season:")
print(rows_by_season)

# Match IDs: should be 380 per season, 380 * n_seasons total unique
unique_match_by_season = master.groupby("Season")["match_id"].nunique().sort_index()
print("\nUnique match_id per season:")
print(unique_match_by_season)

total_unique_matches = master["match_id"].nunique()
print("\nTotal rows:", len(master))
print("Total unique match_id:", total_unique_matches)

# Enforce the expectations only after the diagnostics above have been
# printed, so a failure still shows the full per-season breakdown.
if len(seasons) != n_seasons:
    raise ValueError(f"Expected {n_seasons} seasons, got {len(seasons)}: {seasons}")

bad_rows = rows_by_season[rows_by_season != EXPECTED_ROWS_PER_SEASON]
if not bad_rows.empty:
    raise ValueError(
        f"Expected {EXPECTED_ROWS_PER_SEASON} rows per season, got {bad_rows.to_dict()}"
    )

bad_matches = unique_match_by_season[
    unique_match_by_season != EXPECTED_MATCHES_PER_SEASON
]
if not bad_matches.empty:
    raise ValueError(
        f"Expected {EXPECTED_MATCHES_PER_SEASON} unique match_id per season, "
        f"got {bad_matches.to_dict()}"
    )

expected_matches = n_seasons * EXPECTED_MATCHES_PER_SEASON
if total_unique_matches != expected_matches:
    raise ValueError(
        f"Expected {expected_matches} unique match_id in total, "
        f"got {total_unique_matches}"
    )

# -------------------------
# 4) Save master CSV
# -------------------------
# Written comma-separated, UTF-8 encoded, without the DataFrame index column.
csv_options = {"index": False, "sep": ",", "encoding": "utf-8"}
master.to_csv(OUT_PATH, **csv_options)
print(f"\n✅ Saved master dataset to: {OUT_PATH}")


Seasons in master: ['21-22', '22-23', '23-24', '24-25']

Rows per season:
Season
21-22    760
22-23    760
23-24    760
24-25    760
dtype: int64

Unique match_id per season:
Season
21-22    380
22-23    380
23-24    380
24-25    380
Name: match_id, dtype: int64

Total rows: 3040
Total unique match_id: 1520

✅ Saved master dataset to: C:\Users\tozes\Documents\IronHack\pl_21-25_analysis\data_processed\pl_master_21-25_v1.csv
