In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
import re

# -------------------------
# Locations on disk.
# The notebook lives in <project>/notebooks, so the project root is taken
# to be one level above the current working directory.
# -------------------------
PROJECT_ROOT = Path.cwd().parent
DATA_PROCESSED = PROJECT_ROOT.joinpath("data_processed")

# Echo the resolved root and confirm the processed-data folder is present.
print(f"PROJECT_ROOT: {PROJECT_ROOT}")
print(f"DATA_PROCESSED exists: {DATA_PROCESSED.exists()}")


PROJECT_ROOT: C:\Users\tozes\Documents\IronHack\pl_21-25_analysis
DATA_PROCESSED exists: True


In [2]:
# -------------------------
# Input files: one v2 match-list CSV per season, keyed by season label.
# Insertion order (oldest season first) is the processing order.
# -------------------------
FILES = {
    label: f"pl_matchlist_{label}_v2.csv"
    for label in ("21-22", "22-23", "23-24", "24-25")
}


In [3]:
# -------------------------
# Match-label parsing
# A label is expected to look like: "Home Team 1-2 Away Team"
# -------------------------
SCORE_RE = re.compile(r"^(.*?)\s+(\d+)\s*[-–]\s*(\d+)\s+(.*)$")  # score separator may be '-' or '–'

def parse_match_string(match_str: str):
    """
    Split a match label into its two teams and the score.

    Returns a 4-tuple (home_team, home_goals, away_goals, away_team): team
    names are whitespace-trimmed strings and goals are ints.

    When the label is missing (NaN/None) or does not fit SCORE_RE, all four
    elements are NaN so that unparsed rows can be counted afterwards.
    """
    unparsed = (np.nan, np.nan, np.nan, np.nan)

    # Missing labels never reach the regex; everything else is stringified and trimmed first.
    found = None if pd.isna(match_str) else SCORE_RE.match(str(match_str).strip())
    if found is None:
        return unparsed

    home_raw, home_score, away_score, away_raw = found.groups()
    return (home_raw.strip(), int(home_score), int(away_score), away_raw.strip())


In [7]:
# One QA record (dict) per processed season; turned into a DataFrame at the end.
qa_summary = []

# Columns filled from parse_match_string's return tuple, in tuple order.
PARSED_COLS = ["home_team", "home_goals", "away_goals", "away_team"]
# League points awarded for each result letter.
POINTS_BY_RESULT = {"W": 3, "D": 1, "L": 0}

for season, fname in FILES.items():
    path = DATA_PROCESSED / fname
    if not path.exists():
        raise FileNotFoundError(f"Missing file: {path}")

    # If the name does not contain "_v2.csv", replace() is a no-op and the save
    # step below would overwrite the input file. Fail before doing any work.
    out_name = fname.replace("_v2.csv", "_v4.csv")
    if out_name == fname:
        raise ValueError(f"Input file name does not contain '_v2.csv': {fname}")

    df = pd.read_csv(path)

    # -------------------------
    # 1) Parse Match into home/away teams and goals
    # Expected format: "Home Team 1-2 Away Team"
    # -------------------------
    parsed = df["Match"].apply(parse_match_string)
    # Naming the columns keeps the frame 4 columns wide even when the input has no rows.
    df[PARSED_COLS] = pd.DataFrame(parsed.tolist(), index=df.index, columns=PARSED_COLS)

    # QA: parsing failures (should be 0). Unparsed rows have NaN in every parsed column.
    parse_failures = int(df["home_team"].isna().sum())

    # -------------------------
    # 2) Team perspective (is_home, opponent, goals_for/against)
    # NOTE: a row whose Team equals neither parsed side is treated as the away side.
    # -------------------------
    df["is_home"] = df["Team"] == df["home_team"]
    df["opponent"] = df["away_team"].where(df["is_home"], df["home_team"])

    # Series.where keeps the pandas index, so no NumPy round trip is needed before
    # casting to the nullable integer dtype. Unparsed rows become <NA>.
    df["goals_for"] = pd.to_numeric(
        df["home_goals"].where(df["is_home"], df["away_goals"]), errors="coerce"
    ).astype("Int64")
    df["goals_against"] = pd.to_numeric(
        df["away_goals"].where(df["is_home"], df["home_goals"]), errors="coerce"
    ).astype("Int64")

    # -------------------------
    # 3) Result + points (W/D/L)
    # Comparing nullable ints yields <NA> for unparsed rows, and <NA> cannot be used
    # as an np.where condition. Resolve the masks to plain booleans first, then blank
    # out the result for rows without a score so they are not reported as draws.
    # -------------------------
    has_score = df["goals_for"].notna() & df["goals_against"].notna()
    won = (df["goals_for"] > df["goals_against"]).fillna(False).astype(bool)
    lost = (df["goals_for"] < df["goals_against"]).fillna(False).astype(bool)

    df["result"] = pd.Series(
        np.where(won, "W", np.where(lost, "L", "D")), index=df.index
    ).where(has_score)
    df["points"] = df["result"].map(POINTS_BY_RESULT).astype("Int64")

    # -------------------------
    # 4) Stable match_id (season + date + home + away)
    # Note: stable for league matches; each match_id should appear exactly twice (home & away team rows).
    # -------------------------
    df["match_id"] = (
        df["Season"].astype(str) + "|" +
        df["Date"].astype(str) + "|" +
        df["home_team"].astype(str) + "|" +
        df["away_team"].astype(str)
    )

    # -------------------------
    # 5) Derived metrics
    # -------------------------
    df["xT_diff"] = df["xT"] - df["xT Against"]

    # Avoid division by zero: rows with no shots get NaN instead of a ratio.
    df["shot_quality"] = (df["xG"] / df["Shots"]).where(df["Shots"] > 0)
    df["shot_quality_conceded"] = (df["xGA"] / df["Shots Faced"]).where(df["Shots Faced"] > 0)

    # -------------------------
    # 6) Sanity check vs existing Goals columns (should match)
    # A comparison involving <NA> is itself <NA> and would be skipped by sum();
    # count those rows as mismatches so they cannot pass silently.
    # -------------------------
    goals_mismatch = int(
        (df["Goals"].astype("Int64") != df["goals_for"]).fillna(True).sum()
    )
    conceded_mismatch = int(
        (df["Goals Conceded"].astype("Int64") != df["goals_against"]).fillna(True).sum()
    )

    # -------------------------
    # 7) Save v4
    # -------------------------
    out_path = DATA_PROCESSED / out_name
    df.to_csv(out_path, index=False, sep=",", encoding="utf-8")

    # -------------------------
    # 8) Collect QA summary for this season
    # -------------------------
    qa_summary.append({
        "season": season,
        "input_file": fname,
        "output_file": out_name,
        "rows": int(len(df)),
        "parse_failures": parse_failures,
        "unique_match_id": int(df["match_id"].nunique()),
        "match_id_count_eq_2_rows": int((df["match_id"].value_counts() == 2).sum()),
        "goals_mismatch_rows": goals_mismatch,
        "goals_conceded_mismatch_rows": conceded_mismatch,
    })

qa = pd.DataFrame(qa_summary)
qa


Unnamed: 0,season,input_file,output_file,rows,parse_failures,unique_match_id,match_id_count_eq_2_rows,goals_mismatch_rows,goals_conceded_mismatch_rows
0,21-22,pl_matchlist_21-22_v2.csv,pl_matchlist_21-22_v4.csv,760,0,380,380,0,0
1,22-23,pl_matchlist_22-23_v2.csv,pl_matchlist_22-23_v4.csv,760,0,380,380,0,0
2,23-24,pl_matchlist_23-24_v2.csv,pl_matchlist_23-24_v4.csv,760,0,380,380,0,0
3,24-25,pl_matchlist_24-25_v2.csv,pl_matchlist_24-25_v4.csv,760,0,380,380,0,0


In [8]:
import pandas as pd
from pathlib import Path

# NOTE(review): this rebinds DATA_PROCESSED, which the first cell derives from
# Path.cwd().parent, to a path relative to the working directory.
DATA_PROCESSED = Path("..", "data_processed")

# Reload the 21-22 v4 file from disk.
df = pd.read_csv(DATA_PROCESSED.joinpath("pl_matchlist_21-22_v4.csv"))

# Report which of the derived metric columns are present in the saved file.
derived_cols = ["xT_diff", "shot_quality", "shot_quality_conceded"]
print([col for col in derived_cols if col in df.columns])

# Preview each derived metric alongside the raw columns it is computed from.
preview_cols = [
    "xT", "xT Against", "xT_diff",
    "xG", "Shots", "shot_quality",
    "xGA", "Shots Faced", "shot_quality_conceded",
]
df[preview_cols].head()


['xT_diff', 'shot_quality', 'shot_quality_conceded']


Unnamed: 0,xT,xT Against,xT_diff,xG,Shots,shot_quality,xGA,Shots Faced,shot_quality_conceded
0,1.16,0.51,0.65,4.51,19,0.237368,0.68,9,0.075556
1,0.51,1.16,-0.65,0.68,9,0.075556,4.51,19,0.237368
2,0.21,2.11,-1.9,0.41,4,0.1025,3.71,24,0.154583
3,2.11,0.21,1.9,3.71,24,0.154583,0.41,4,0.1025
4,0.5,1.8,-1.3,1.11,7,0.158571,4.74,29,0.163448
