In [1]:
from pathlib import Path
import pandas as pd
import numpy as np

# -------------------------
# 0) Paths (robust if you run from /notebooks)
# -------------------------
# Resolve the project root from the kernel's working directory: when the
# notebook is launched from inside a folder named "notebooks" (any letter
# case) the root is that folder's parent, otherwise the working directory
# itself is taken as the root.
cwd = Path.cwd()
if cwd.name.lower() == "notebooks":
    PROJECT_ROOT = cwd.parent
else:
    PROJECT_ROOT = cwd

# All inputs and outputs of this cell live under <root>/data_processed.
DATA_PROCESSED = PROJECT_ROOT / "data_processed"
OUT_DIR = DATA_PROCESSED / "sql_model"
MASTER_PATH = DATA_PROCESSED / "pl_master_21-25_v1.csv"

# Create the output folder up front (no error if it already exists).
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Stop immediately, with the fully resolved path in the message, when the
# master CSV is not where it is expected.
if not MASTER_PATH.exists():
    raise FileNotFoundError(f"Master file not found: {MASTER_PATH.resolve()}")

# -------------------------
# 1) Load master
# -------------------------
# Read the master table that every dimension/fact table below is derived from.
df = pd.read_csv(MASTER_PATH)

# CORRECTION: replace the vertical bar with an underscore in match_id.
# regex=False makes "|" a literal character instead of regex alternation.
df["match_id"] = df["match_id"].str.replace("|", "_", regex=False)

# Parse Date with an explicit yyyy-mm-dd format. errors="coerce" turns every
# value that does not match the format into NaT, which would otherwise flow
# silently into dim_match.match_date. Values that were present in the file but
# could not be parsed are therefore reported and abort the run; values that
# were already missing in the file simply stay NaT, as before.
raw_dates = df["Date"]
parsed_dates = pd.to_datetime(raw_dates, format="%Y-%m-%d", errors="coerce")
bad_date_mask = parsed_dates.isna() & raw_dates.notna()
if bad_date_mask.any():
    bad_examples = raw_dates[bad_date_mask].unique()[:5].tolist()
    raise ValueError(
        f"{int(bad_date_mask.sum())} Date value(s) do not match %Y-%m-%d, "
        f"e.g. {bad_examples}"
    )
df["Date"] = parsed_dates

# -------------------------
# 2) dim_season
# -------------------------
# Distinct, non-null season labels in ascending order. A season's id is its
# 1-based position in that ordering.
seasons = sorted(df["Season"].dropna().unique())

# Lookup used further down to translate a season label into its surrogate key.
season_map = {label: season_id for season_id, label in enumerate(seasons, start=1)}

# Dimension table with the key column first.
dim_season = pd.DataFrame(
    {
        "season_id": range(1, len(seasons) + 1),
        "season": seasons,
    }
)

# -------------------------
# 3) dim_team (union across all relevant team columns)
# -------------------------
team_cols = ["Team", "home_team", "away_team", "opponent"]

# Pool the names found in every team-related column (missing values skipped,
# everything compared as text) so that a team gets exactly one id no matter
# which column it shows up in.
team_names = set()
for col in team_cols:
    team_names.update(df[col].dropna().astype(str))
all_teams = sorted(team_names)

# Lookup used further down to translate a team name into its surrogate key;
# ids are 1-based positions in the alphabetically sorted name list.
team_map = {team_name: team_id for team_id, team_name in enumerate(all_teams, start=1)}

# Dimension table with the key column first.
dim_team = pd.DataFrame(
    {
        "team_id": range(1, len(all_teams) + 1),
        "team_name": all_teams,
    }
)

# -------------------------
# 4) dim_match (1 row per match_id)
# -------------------------
# Match-level attributes only; the first row seen for each match_id is kept.
match_source_cols = ["match_id", "Season", "Date", "home_team", "away_team", "home_goals", "away_goals"]
matches = df[match_source_cols].drop_duplicates("match_id").copy()

# Text versions of the pieces that make up the human-readable label.
home_name = matches["home_team"].astype(str)
away_name = matches["away_team"].astype(str)
# NOTE(review): goals are rendered with astype(str); if these columns are read
# as floats the label would show "2.0-1.0" rather than "2-1" - confirm dtype.
score = matches["home_goals"].astype(str) + "-" + matches["away_goals"].astype(str)

matches = matches.assign(
    # Surrogate keys from the dimension lookups. The int64 cast raises if any
    # label/name has no entry in its map (the lookup yields NaN in that case).
    season_id=matches["Season"].map(season_map).astype("int64"),
    home_team_id=home_name.map(team_map).astype("int64"),
    away_team_id=away_name.map(team_map).astype("int64"),
    # Calendar date without a time component.
    match_date=matches["Date"].dt.date,
    # "<home team> <home goals>-<away goals> <away team>"
    match_label=home_name + " " + score + " " + away_name,
)

# Final column selection, ordered chronologically within each season.
dim_match_cols = [
    "match_id",
    "season_id",
    "match_date",
    "home_team_id",
    "away_team_id",
    "home_goals",
    "away_goals",
    "match_label",
]
dim_match = matches[dim_match_cols].sort_values(["season_id", "match_date", "match_id"])

# -------------------------
# 5) fact_team_match (1 row per team per match)
# -------------------------
# Start from every master row and attach the surrogate keys. assign() returns
# a new frame, so the master frame itself is left untouched.
fact = df.assign(
    season_id=df["Season"].map(season_map).astype("int64"),
    team_id=df["Team"].astype(str).map(team_map).astype("int64"),
    opponent_team_id=df["opponent"].astype(str).map(team_map).astype("int64"),
    # Stored as an integer flag instead of a boolean (SQL-friendly).
    is_home=df["is_home"].astype(int),
)

# Descriptive text columns are replaced by the keys above, so they are removed
# here; names from this list that are absent from the frame are skipped.
drop_cols = ["Season", "Team", "opponent", "home_team", "away_team", "Match", "Date"]
fact = fact.drop(columns=drop_cols, errors="ignore")

# Keys first, then every remaining column in its existing order.
key_cols = ["match_id", "season_id", "team_id", "opponent_team_id", "is_home"]
other_cols = [col for col in fact.columns if col not in key_cols]
fact = fact.loc[:, [*key_cols, *other_cols]]

# -------------------------
# 6) Quick validations vs master expectations
# -------------------------
# Row-count expectations for this specific master extract.
# NOTE(review): presumably 4 seasons x 380 matches = 1520 matches, with two
# team rows per match = 3040 - confirm before reusing with another extract.
assert len(df) == 3040, f"Expected 3040 rows in master, got {len(df)}"
assert len(dim_match) == 1520, f"Expected 1520 unique matches, got {len(dim_match)}"

# Building the fact table must neither add nor lose rows.
assert len(fact) == 3040, f"Fact should have same rows as master, got {len(fact)}"

# (match_id, team_id) must identify a fact row uniquely.
repeated_keys = fact[["match_id", "team_id"]].duplicated()
assert not repeated_keys.any(), "Duplicate (match_id, team_id) found in fact!"

# -------------------------
# 7) Save outputs (CSV) to load into SQL easily
# -------------------------
# (label, file name, frame) for every table of the model, in output order.
model_tables = [
    ("dim_season", "dim_season.csv", dim_season),
    ("dim_team", "dim_team.csv", dim_team),
    ("dim_match", "dim_match.csv", dim_match),
    ("fact_team_match", "fact_team_match.csv", fact),
]

# Write every table as UTF-8 CSV without the pandas index column.
for _, file_name, frame in model_tables:
    frame.to_csv(OUT_DIR / file_name, index=False, encoding="utf-8")

# Report the absolute location of each written file.
print("✅ Saved:")
for _, file_name, _ in model_tables:
    print(" -", (OUT_DIR / file_name).resolve())

# Report the number of rows per table.
print("\nCounts:")
for label, _, frame in model_tables:
    print(label + ":", len(frame))


✅ Saved:
 - C:\Users\tozes\Documents\IronHack\pl_21-25_analysis\data_processed\sql_model\dim_season.csv
 - C:\Users\tozes\Documents\IronHack\pl_21-25_analysis\data_processed\sql_model\dim_team.csv
 - C:\Users\tozes\Documents\IronHack\pl_21-25_analysis\data_processed\sql_model\dim_match.csv
 - C:\Users\tozes\Documents\IronHack\pl_21-25_analysis\data_processed\sql_model\fact_team_match.csv

Counts:
dim_season: 4
dim_team: 26
dim_match: 1520
fact_team_match: 3040


In [2]:
import pandas as pd
from pathlib import Path

# Derive the project root from the working directory instead of hard-coding a
# user-specific absolute path, so the cell runs on any machine / checkout.
# Same rule as the model-building cell: when the kernel runs inside a folder
# named "notebooks" (any letter case) the root is its parent, otherwise the
# working directory itself.
_cwd = Path.cwd()
BASE = _cwd.parent if _cwd.name.lower() == "notebooks" else _cwd

# Input: the CSVs written by the model-building cell.
IN_DIR = BASE / "data_processed" / "sql_model"
# Output: the same tables with renamed columns.
# NOTE: this rebinds OUT_DIR, which the model-building cell also defines
# (pointing at "sql_model"); that cell sets it again itself when re-run.
OUT_DIR = BASE / "data_processed" / "sql_model_snake"

OUT_DIR.mkdir(parents=True, exist_ok=True)

# --- files ---
# Table name -> CSV file name; the same file name is used in IN_DIR and OUT_DIR.
FILES = {
    "dim_season": "dim_season.csv",
    "dim_team": "dim_team.csv",
    "dim_match": "dim_match.csv",
    "fact_team_match": "fact_team_match.csv",
}

# --- mapping only for columns that need changing (spaces, hyphens, case) ---
# Source column name -> snake_case column name. The mapping is passed to
# DataFrame.rename(columns=...) in save_snake for every file: entries whose
# source name is not present in a given file have no effect, and columns that
# are not listed here are written out with their name unchanged.
RENAME = {
    # xG family (overall, open play, set piece, non-penalty)
    "xG": "xg",
    "xGA": "xga",
    "xGD": "xgd",
    "Open Play xG": "open_play_xg",
    "Open Play xGA": "open_play_xga",
    "Open Play xGD": "open_play_xgd",
    "Set Piece xG": "set_piece_xg",
    "Set Piece xGA": "set_piece_xga",
    "Set Piece xGD": "set_piece_xgd",
    "npxG": "npxg",
    "npxGA": "npxga",
    "npxGD": "npxgd",
    # goals (the hyphen in "GD-xGD" is spelled out as "minus")
    "Goals": "goals",
    "Goals Conceded": "goals_conceded",
    "GD": "gd",
    "GD-xGD": "gd_minus_xgd",
    # possession, territory and passing
    "Possession": "possession",
    "Field Tilt": "field_tilt",
    "Avg Pass Height": "avg_pass_height",
    "xT": "xt",
    "xT Against": "xt_against",
    "Passes in Opposition Half": "passes_in_opposition_half",
    "Passes into Box": "passes_into_box",
    # shots (the "1.0" in the source name becomes "1_0")
    "Shots": "shots",
    "Shots Faced": "shots_faced",
    "Shots per 1.0 xT": "shots_per_1_0_xt",
    "Shots Faced per 1.0 xT Against": "shots_faced_per_1_0_xt_against",
    # pressing, recoveries and other event counts
    "PPDA": "ppda",
    "High Recoveries": "high_recoveries",
    "High Recoveries Against": "high_recoveries_against",
    "Crosses": "crosses",
    "Corners": "corners",
    "Fouls": "fouls",
    # pressure and game-control metrics with their "Share" counterparts
    "On-Ball Pressure": "on_ball_pressure",
    "On-Ball Pressure Share": "on_ball_pressure_share",
    "Off-Ball Pressure": "off_ball_pressure",
    "Off-Ball Pressure Share": "off_ball_pressure_share",
    "Game Control": "game_control",
    "Game Control Share": "game_control_share",
    # note: the target name drops the word "the"
    "Throw-Ins into the Box": "throw_ins_into_box",
}

def save_snake(name: str):
    """Copy one model table from IN_DIR to OUT_DIR with renamed columns.

    The file name is looked up in FILES, the CSV is read from IN_DIR, its
    columns are renamed according to RENAME (columns without an entry keep
    their current name; no blanket lower-casing is applied) and the result is
    written under the same file name in OUT_DIR as UTF-8 without the index.
    The destination path is printed once the file is written.

    Parameters
    ----------
    name : str
        Table key; must be one of the keys of FILES.

    Raises
    ------
    KeyError
        If ``name`` is not a key of FILES.
    """
    file_name = FILES[name]

    # Read the source table and apply the explicit column mapping in one go.
    table = pd.read_csv(IN_DIR / file_name).rename(columns=RENAME)

    out_path = OUT_DIR / file_name
    table.to_csv(out_path, index=False, encoding="utf-8")

    print(f"✅ saved: {out_path}")

# Convert every table registered in FILES, in declaration order.
for table_name in FILES.keys():
    save_snake(table_name)

# Point the reader at the folder holding the converted files.
print("\nDone. Use the CSVs from:", OUT_DIR)


✅ saved: C:\Users\tozes\Documents\IronHack\pl_21-25_analysis\data_processed\sql_model_snake\dim_season.csv
✅ saved: C:\Users\tozes\Documents\IronHack\pl_21-25_analysis\data_processed\sql_model_snake\dim_team.csv
✅ saved: C:\Users\tozes\Documents\IronHack\pl_21-25_analysis\data_processed\sql_model_snake\dim_match.csv
✅ saved: C:\Users\tozes\Documents\IronHack\pl_21-25_analysis\data_processed\sql_model_snake\fact_team_match.csv

Done. Use the CSVs from: C:\Users\tozes\Documents\IronHack\pl_21-25_analysis\data_processed\sql_model_snake
