<a href="https://colab.research.google.com/github/Ashvin7/pl-xg-ml/blob/main/02_feature_engineering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os, glob

# Diagnostic cell: report where the EPL CSV exports currently live.
print("CWD:", os.getcwd())

# Recursive search for the EPL csvs anywhere below /content
epl_anywhere = sorted(glob.glob("/content/**/epl_*.csv", recursive=True))
print("EPL CSVs found anywhere under /content:", len(epl_anywhere))
print("Sample:", epl_anywhere[:10])

# Count matches in each location the later cells might expect.
expected_locations = [
    ("\nIn /content:", "/content/epl_*.csv"),
    ("In /content/sample_data:", "/content/sample_data/epl_*.csv"),
    ("In data/raw:", "data/raw/epl_*.csv"),
]
for label, pattern in expected_locations:
    print(label, len(glob.glob(pattern)))


CWD: /content
EPL CSVs found anywhere under /content: 16
Sample: ['/content/sample_data/epl_2017-18_league_table.csv', '/content/sample_data/epl_2017-18_squad_standard.csv', '/content/sample_data/epl_2018-19_league_table.csv', '/content/sample_data/epl_2018-19_squad_standard.csv', '/content/sample_data/epl_2019-20_league_table.csv', '/content/sample_data/epl_2019-20_squad_standard.csv', '/content/sample_data/epl_2020-21_league_table.csv', '/content/sample_data/epl_2020-21_squad_standard.csv', '/content/sample_data/epl_2021-22_league_table.csv', '/content/sample_data/epl_2021-22_squad_standard.csv']

In /content: 0
In /content/sample_data: 16
In data/raw: 0


In [2]:
import os, shutil, glob

# Collect every EPL CSV found under /content into data/raw (relative to the
# current working directory) without overwriting files that are already there.
raw_target = "data/raw"
os.makedirs(raw_target, exist_ok=True)
raw_target_abs = os.path.abspath(raw_target)

# The recursive search also walks into data/raw when the working directory is
# under /content, so on a re-run the copies themselves would be counted as
# sources. Exclude anything that already sits in the destination directory so
# the "Found" count stays meaningful across re-runs.
found = sorted(
    path
    for path in glob.glob("/content/**/epl_*.csv", recursive=True)
    if os.path.dirname(os.path.abspath(path)) != raw_target_abs
)
print("Found:", len(found))

# NOTE: despite its name, `moved` counts copies; the source files are left in place.
moved = 0
for src in found:
    dst = os.path.join(raw_target, os.path.basename(src))
    # don't overwrite if already there
    if not os.path.exists(dst):
        shutil.copy2(src, dst)
        moved += 1

print("Copied into data/raw:", moved)
print("Now in data/raw:", len(glob.glob("data/raw/epl_*.csv")))
print("League tables:", len(glob.glob("data/raw/epl_*_league_table.csv")))
print("Squad standard:", len(glob.glob("data/raw/epl_*_squad_standard.csv")))


Found: 16
Copied into data/raw: 16
Now in data/raw: 16
League tables: 8
Squad standard: 8


# Phase 2 — Model Prep + Feature Engineering (2017–18 to 2024–25)

**Goal:** Take the Phase 1 raw tables (league table + squad standard stats) and build a single modeling dataset:
- One row per **team-season**
- Clean numeric features
- Train/test split keys (time-based)
- Saved to `data/processed/phase2_model_dataset.csv`

This dataset will be used in Phase 3 for baseline + ML models.


In [3]:
import os, re, glob
import numpy as np
import pandas as pd

# Input / output locations used by every later cell.
RAW_DIR = "data/raw"
OUT_DIR = "data/processed"
os.makedirs(OUT_DIR, exist_ok=True)  # make sure the output folder exists before saving

# Widen pandas' console display so the merged tables are shown in full.
for option_name, option_value in (("display.max_columns", 200), ("display.width", 120)):
    pd.set_option(option_name, option_value)

In [4]:
# Glob patterns for the two per-season exports stored in RAW_DIR.
league_pattern = f"{RAW_DIR}/epl_*_league_table.csv"
squad_pattern = f"{RAW_DIR}/epl_*_squad_standard.csv"

# Sorted so the files are processed in a stable (lexicographic) order.
league_files = sorted(glob.glob(league_pattern))
squad_files = sorted(glob.glob(squad_pattern))

print("League files:", len(league_files))
print("Squad files:", len(squad_files))
print("Sample league:", league_files[:2])
print("Sample squad:", squad_files[:2])

# Fail early if a season is missing: one file of each kind per season is expected.
n_league, n_squad = len(league_files), len(squad_files)
assert n_league == 8, "Expected 8 league table CSVs (2017–18 to 2024–25)"
assert n_squad == 8, "Expected 8 squad standard CSVs (2017–18 to 2024–25)"

League files: 8
Squad files: 8
Sample league: ['data/raw/epl_2017-18_league_table.csv', 'data/raw/epl_2018-19_league_table.csv']
Sample squad: ['data/raw/epl_2017-18_squad_standard.csv', 'data/raw/epl_2018-19_squad_standard.csv']


## Helpers

- Extract season from filename (`epl_2017-18_...`)
- Standardize column names
- Coerce numeric columns safely


In [5]:
def season_from_filename(path: str) -> str:
    """Return the season label (e.g. "2017-18") embedded in an EPL CSV filename.

    Only the basename of ``path`` is inspected; it must contain
    ``epl_<YYYY>-<YY>_``.

    Raises:
        ValueError: if the basename does not contain that pattern.
    """
    filename = os.path.basename(path)
    match = re.search(r"epl_(\d{4}-\d{2})_", filename)
    if match is None:
        raise ValueError(f"Could not parse season from filename: {path}")
    return match.group(1)

def clean_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with normalized column names.

    Each label is converted to text, stripped of surrounding whitespace,
    lower-cased, and has its spaces replaced by underscores. The input frame
    is not modified.
    """
    df = df.copy()
    # str(c) keeps this working for non-string labels (e.g. an integer header),
    # which would otherwise fail on .strip().
    df.columns = [str(c).strip().lower().replace(" ", "_") for c in df.columns]
    return df

def to_numeric_safe(s):
    """Convert a Series to numbers, turning unparseable entries into NaN.

    Values are first rendered as text, commas are removed, and surrounding
    whitespace is trimmed before the numeric conversion.
    """
    as_text = s.astype(str)
    without_commas = as_text.str.replace(",", "")
    trimmed = without_commas.str.strip()
    return pd.to_numeric(trimmed, errors="coerce")

## Load League Table (team-season outcomes + league-level xG/xGA)

From each season's **League Table**, we keep:
- team, points, matches (MP), goals_for, goals_against, goal_diff
- xg, xga (league-level)
- derived:
  - points_per_match
  - gd_per_match
  - xgd_league = xg - xga
  - xgd_per_match


In [6]:
league_rows = []

# League table columns needed from every season (names after clean_columns):
# squad, mp, pts, gf, ga, gd, xg, xga
required = ["squad", "mp", "pts", "gf", "ga", "gd", "xg", "xga"]

# Raw column name -> name used in the modeling dataset.
league_rename = {
    "squad": "team",
    "mp": "matches",
    "pts": "points",
    "gf": "goals_for",
    "ga": "goals_against",
    "gd": "goal_diff",
    "xg": "xg_league",
    "xga": "xga_league",
}

# Columns coerced to numbers after renaming.
num_cols = ["matches","points","goals_for","goals_against","goal_diff","xg_league","xga_league"]

for fn in league_files:
    season = season_from_filename(fn)

    df = None
    # header row index -> required columns that were absent with that header,
    # kept so a failure can report what was actually missing.
    missing_by_header = {}
    # Try header=1 first, then header=0 to account for inconsistent file structures
    for header_row_idx in [1, 0]:
        temp_df = clean_columns(pd.read_csv(fn, header=header_row_idx))
        current_missing = [c for c in required if c not in temp_df.columns]
        if not current_missing:  # If no missing columns, we found the right header
            df = temp_df
            break
        missing_by_header[header_row_idx] = current_missing

    if df is None:
        # Reached only if neither header=1 nor header=0 exposes all required columns.
        print(f"\nColumns for {season} when missing cols (after trying headers 0 and 1): {temp_df.columns.tolist()}")
        raise ValueError(
            f"Could not find required columns in {season} after trying header=0 and header=1. "
            f"Missing by header row: {missing_by_header}"
        )

    out = df[required].rename(columns=league_rename)

    # numeric coercion (unparseable values become NaN)
    for c in num_cols:
        out[c] = to_numeric_safe(out[c])

    out["season"] = season

    # derived per-match and expected-goal-difference metrics
    out["points_per_match"] = out["points"] / out["matches"]
    out["gd_per_match"] = out["goal_diff"] / out["matches"]
    out["xgd_league"] = out["xg_league"] - out["xga_league"]
    out["xgd_per_match"] = out["xgd_league"] / out["matches"]

    league_rows.append(out)

# One frame for all seasons, with a fresh 0..n-1 index.
league_df = pd.concat(league_rows, ignore_index=True)
league_df.head(), league_df.shape

(              team  matches  points  goals_for  goals_against  goal_diff  xg_league  xga_league   season  \
 0  Manchester City       38     100        106             27         79       78.6        23.8  2017-18   
 1   Manchester Utd       38      81         68             28         40       55.7        40.7  2017-18   
 2        Tottenham       38      77         74             36         38       64.7        33.9  2017-18   
 3        Liverpool       38      75         84             38         46       72.9        33.8  2017-18   
 4          Chelsea       38      70         62             38         24       54.4        33.8  2017-18   
 
    points_per_match  gd_per_match  xgd_league  xgd_per_match  
 0          2.631579      2.078947        54.8       1.442105  
 1          2.131579      1.052632        15.0       0.394737  
 2          2.026316      1.000000        30.8       0.810526  
 3          1.973684      1.210526        39.1       1.028947  
 4          1.842105    

## Load Squad Standard Stats (team-season style & shot-quality xG)

From each season's **Squad Standard Stats**, we keep:
- team, mp, xg (squad-level)
- derived:
  - xg_per_match_squad

In [7]:
squad_rows = []

# Squad standard-stats columns needed from every season (names after clean_columns).
required = ["squad", "mp", "xg"]

for fn in squad_files:
    season = season_from_filename(fn)

    df = None
    # header row index -> required columns that were absent with that header,
    # kept so a failure can report what was actually missing.
    missing_by_header = {}
    # Try header=2, then header=1, then header=0 to account for inconsistent file structures
    for header_row_idx in [2, 1, 0]:
        temp_df = clean_columns(pd.read_csv(fn, header=header_row_idx))
        current_missing = [c for c in required if c not in temp_df.columns]
        if not current_missing:  # If no missing columns, we found the right header
            df = temp_df
            break
        missing_by_header[header_row_idx] = current_missing

    if df is None:
        # Reached only if none of header=2, 1 or 0 exposes all required columns.
        print(f"\nColumns for {season} when missing cols (after trying headers 0, 1 and 2): {temp_df.columns.tolist()}")
        raise ValueError(
            f"Could not find required columns in {season} after trying header=0, 1 and 2. "
            f"Missing by header row: {missing_by_header}"
        )

    out = df[required].rename(columns={"squad":"team","mp":"matches","xg":"xg_squad"})

    # numeric coercion (unparseable values become NaN)
    out["matches"] = to_numeric_safe(out["matches"])
    out["xg_squad"] = to_numeric_safe(out["xg_squad"])
    out["season"] = season

    out["xg_per_match_squad"] = out["xg_squad"] / out["matches"]

    squad_rows.append(out)

# One frame for all seasons, with a fresh 0..n-1 index.
squad_df = pd.concat(squad_rows, ignore_index=True)
squad_df.head(), squad_df.shape

(          team  matches  xg_squad   season  xg_per_match_squad
 0      Arsenal       38      68.3  2017-18            1.797368
 1  Bournemouth       38      38.8  2017-18            1.021053
 2     Brighton       38      37.0  2017-18            0.973684
 3      Burnley       38      32.3  2017-18            0.850000
 4      Chelsea       38      54.4  2017-18            1.431579,
 (160, 5))

## Merge + Sanity Checks

Expected:
- 8 seasons × 20 teams = **160 rows**
- One row per team-season

In [8]:
# Keys that identify one team-season in both tables.
merge_keys = ["season", "team", "matches"]

# validate="one_to_one" makes pandas raise MergeError if a key is duplicated on
# either side, so duplicated rows cannot mask dropped ones in the row count.
df = league_df.merge(
    squad_df,
    on=merge_keys,
    how="inner",
    validate="one_to_one",
)

teams_per_season = df.groupby("season")["team"].nunique()

print("Merged shape:", df.shape)
print("Seasons:", sorted(df["season"].unique()))
print("Teams per season (min/max):", teams_per_season.min(), teams_per_season.max())

if df.shape[0] != 160:
    # The inner join drops non-matching rows silently; list the keys present on
    # only one side so a naming or match-count mismatch is easy to spot.
    key_check = league_df[merge_keys].merge(
        squad_df[merge_keys], on=merge_keys, how="outer", indicator=True
    )
    print("Unmatched merge keys (left_only = league table, right_only = squad stats):")
    print(key_check[key_check["_merge"] != "both"].to_string(index=False))

assert df.shape[0] == 160, "Expected 160 team-season rows (20 teams × 8 seasons). Check merges."
df.head()


Merged shape: (160, 15)
Seasons: ['2017-18', '2018-19', '2019-20', '2020-21', '2021-22', '2022-23', '2023-24', '2024-25']
Teams per season (min/max): 20 20


Unnamed: 0,team,matches,points,goals_for,goals_against,goal_diff,xg_league,xga_league,season,points_per_match,gd_per_match,xgd_league,xgd_per_match,xg_squad,xg_per_match_squad
0,Manchester City,38,100,106,27,79,78.6,23.8,2017-18,2.631579,2.078947,54.8,1.442105,78.6,2.068421
1,Manchester Utd,38,81,68,28,40,55.7,40.7,2017-18,2.131579,1.052632,15.0,0.394737,55.7,1.465789
2,Tottenham,38,77,74,36,38,64.7,33.9,2017-18,2.026316,1.0,30.8,0.810526,64.7,1.702632
3,Liverpool,38,75,84,38,46,72.9,33.8,2017-18,1.973684,1.210526,39.1,1.028947,72.9,1.918421
4,Chelsea,38,70,62,38,24,54.4,33.8,2017-18,1.842105,0.631579,20.6,0.542105,54.4,1.431579


## Targets + Feature Set

**Primary target:** `points` (regression)

We'll keep points and also include helpful alternative targets for later:
- `goal_diff`
- `xgd_league`

We also create:
- `season_start_year` (e.g., 2017 from "2017-18")
- `season_idx` for time ordering


In [9]:
# First four characters of the season label are the starting calendar year.
df["season_start_year"] = df["season"].str[:4].astype(int)

# time index (0..7): position of each season once ordered by starting year
season_order = sorted(df["season"].unique(), key=lambda label: int(label[:4]))
season_to_idx = dict(zip(season_order, range(len(season_order))))
df["season_idx"] = df["season"].map(season_to_idx)

# Chronological, then alphabetical by team, with a fresh index.
df = df.sort_values(["season_start_year","team"]).reset_index(drop=True)
df[["season","season_start_year","season_idx"]].drop_duplicates().head(10)


Unnamed: 0,season,season_start_year,season_idx
0,2017-18,2017,0
20,2018-19,2018,1
40,2019-20,2019,2
60,2020-21,2020,3
80,2021-22,2021,4
100,2022-23,2022,5
120,2023-24,2023,6
140,2024-25,2024,7


## Train/Test Split (time-based)

To avoid leakage, we do a simple time split:
- Train: 2017–18 → 2022–23
- Test: 2023–24 → 2024–25

We label each row with `split`.


In [10]:
# Seasons starting in 2022 or earlier are training rows; later seasons are test rows.
is_train_season = df["season_start_year"].le(2022)
df["split"] = np.where(is_train_season, "train", "test")
df["split"].value_counts()

Unnamed: 0_level_0,count
split,Unnamed: 1_level_1
train,120
test,40


## Final Dataset + Save

We save:
- identifiers: season, team
- split keys: season_start_year, season_idx, split
- target(s): points, goal_diff
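- features: matches, goals for/against, xG/xGA/xGD metrics, per-match metrics, squad xG metrics (note: `points_per_match` and `gd_per_match` are computed directly from the targets, so they should not be used as predictors of `points` / `goal_diff`)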

In [11]:
# Column groups of the saved dataset, concatenated in output order.
identifier_cols = ["season", "team"]
split_key_cols = ["season_start_year", "season_idx", "split"]
target_cols = ["points", "goal_diff"]
# Core league features. points_per_match and gd_per_match are points / matches
# and goal_diff / matches, i.e. direct transforms of the targets.
league_feature_cols = [
    "matches", "goals_for", "goals_against",
    "points_per_match", "gd_per_match",
    "xg_league", "xga_league", "xgd_league", "xgd_per_match",
]
squad_feature_cols = ["xg_squad", "xg_per_match_squad"]

final_cols = (
    identifier_cols
    + split_key_cols
    + target_cols
    + league_feature_cols
    + squad_feature_cols
)

# Independent copy so later edits to phase2_df do not touch df.
phase2_df = df.loc[:, final_cols].copy()

out_path = f"{OUT_DIR}/phase2_model_dataset.csv"
phase2_df.to_csv(out_path, index=False)  # index is just 0..n-1, so leave it out

print("Saved:", out_path)
phase2_df.head()

Saved: data/processed/phase2_model_dataset.csv


Unnamed: 0,season,team,season_start_year,season_idx,split,points,goal_diff,matches,goals_for,goals_against,points_per_match,gd_per_match,xg_league,xga_league,xgd_league,xgd_per_match,xg_squad,xg_per_match_squad
0,2017-18,Arsenal,2017,0,train,63,23,38,74,51,1.657895,0.605263,68.3,47.8,20.5,0.539474,68.3,1.797368
1,2017-18,Bournemouth,2017,0,train,44,-16,38,45,61,1.157895,-0.421053,38.8,59.2,-20.4,-0.536842,38.8,1.021053
2,2017-18,Brighton,2017,0,train,40,-20,38,34,54,1.052632,-0.526316,37.0,50.8,-13.8,-0.363158,37.0,0.973684
3,2017-18,Burnley,2017,0,train,54,-3,38,36,39,1.421053,-0.078947,32.3,51.2,-18.9,-0.497368,32.3,0.85
4,2017-18,Chelsea,2017,0,train,70,24,38,62,38,1.842105,0.631579,54.4,33.8,20.6,0.542105,54.4,1.431579


In [12]:
# Row count and the columns with the most missing values.
print("Rows:", len(phase2_df))
print("Nulls per col (top):")
null_counts = phase2_df.isna().sum().sort_values(ascending=False)
display(null_counts.head(15))

# quick season summary check: per-season means of the headline metrics
summary_cols = ["points","xgd_league","xg_league","xga_league"]
season_means = phase2_df.groupby("season")[summary_cols].mean()
display(season_means)

Rows: 160
Nulls per col (top):


Unnamed: 0,0
season,0
team,0
season_start_year,0
season_idx,0
split,0
points,0
goal_diff,0
matches,0
goals_for,0
goals_against,0


Unnamed: 0_level_0,points,xgd_league,xg_league,xga_league
season,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017-18,52.05,0.005,47.29,47.285
2018-19,53.45,-0.005,51.105,51.11
2019-20,52.4,-0.005,52.055,52.06
2020-21,52.85,-0.015,49.115,49.13
2021-22,52.6,0.005,50.895,50.89
2022-23,52.65,0.005,53.89,53.885
2023-24,52.3,-0.01,58.87,58.88
2024-25,52.35,0.015,53.905,53.89
