<a href="https://colab.research.google.com/github/Ashvin7/pl-xg-ml/blob/main/05_match_results_ingestion.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Phase 5 (Prep): Match Results Ingestion (2017–2025)

Goal: Load the historical EPL match-level dataset (2017–2025), validate schema, clean types, standardize column names, and export a canonical “immutable” match dataset for downstream modeling and simulation.

Inputs: /content/epl_2017_18_onwards.csv
Outputs:

data/processed/epl_match_results_2017_2025.csv (canonical match dataset)

data/processed/match_team_season_agg_2017_2025.csv (optional team-season aggregates)

In [1]:
import os, glob
import pandas as pd
import numpy as np

# Location of the raw match-level CSV read by the load step.
# NOTE(review): "/content" looks like a Colab upload directory — adjust when running elsewhere.
RAW_MATCH_PATH = "/content/epl_2017_18_onwards.csv"

# Output directory (relative to the working directory) and the two artifacts written into it.
OUT_DIR = "data/processed"
OUT_MATCH_PATH = f"{OUT_DIR}/epl_match_results_2017_2025.csv"
OUT_TEAM_SEASON_PATH = f"{OUT_DIR}/match_team_season_agg_2017_2025.csv"

# Create the output directory up front; exist_ok=True makes re-running this cell safe.
os.makedirs(OUT_DIR, exist_ok=True)

# Report whether the input is present and where the canonical dataset will be written.
print("Exists RAW_MATCH_PATH?", os.path.exists(RAW_MATCH_PATH))
print("Will write to:", OUT_MATCH_PATH)


Exists RAW_MATCH_PATH? True
Will write to: data/processed/epl_match_results_2017_2025.csv


Load raw match CSV and inspect

In [2]:
# Read the raw match CSV from the configured input path.
df_raw = pd.read_csv(RAW_MATCH_PATH)

# Report the frame's dimensions and header names, then preview the first rows.
raw_columns = df_raw.columns.tolist()
print("Shape:", df_raw.shape)
print("Columns:", raw_columns)
df_raw.head(n=5)


Shape: (3010, 22)
Columns: ['Season', 'MatchDate', 'HomeTeam', 'AwayTeam', 'FullTimeHomeGoals', 'FullTimeAwayGoals', 'FullTimeResult', 'HalfTimeHomeGoals', 'HalfTimeAwayGoals', 'HalfTimeResult', 'HomeShots', 'AwayShots', 'HomeShotsOnTarget', 'AwayShotsOnTarget', 'HomeCorners', 'AwayCorners', 'HomeFouls', 'AwayFouls', 'HomeYellowCards', 'AwayYellowCards', 'HomeRedCards', 'AwayRedCards']


Unnamed: 0,Season,MatchDate,HomeTeam,AwayTeam,FullTimeHomeGoals,FullTimeAwayGoals,FullTimeResult,HalfTimeHomeGoals,HalfTimeAwayGoals,HalfTimeResult,...,HomeShotsOnTarget,AwayShotsOnTarget,HomeCorners,AwayCorners,HomeFouls,AwayFouls,HomeYellowCards,AwayYellowCards,HomeRedCards,AwayRedCards
0,2017/18,2017-08-11,Arsenal,Leicester,4,3,H,2,2,D,...,10,3,9,4,9,12,0,1,0,0
1,2017/18,2017-08-12,Brighton,Man City,0,2,A,0,0,D,...,2,4,3,10,6,9,0,2,0,0
2,2017/18,2017-08-12,Chelsea,Burnley,2,3,A,0,3,A,...,6,5,8,5,16,11,3,3,2,0
3,2017/18,2017-08-12,Crystal Palace,Huddersfield,0,3,A,0,2,A,...,4,6,12,9,7,19,1,3,0,0
4,2017/18,2017-08-12,Everton,Stoke,1,0,H,1,0,H,...,4,1,6,7,13,10,1,1,0,0


Normalize column names

This makes later code robust even if the source file has slightly different headings (spaces, caps, etc.).

In [3]:
# Work on a copy so the raw frame is left untouched.
df = df_raw.copy()

# Normalize every header: cast to text, trim surrounding whitespace, lowercase,
# then turn spaces and dashes into underscores.
df.columns = [
    str(header).strip().lower().replace(" ", "_").replace("-", "_")
    for header in df.columns
]

normalized_columns = list(df.columns)
print("Normalized columns:", normalized_columns)
df.head(n=3)


Normalized columns: ['season', 'matchdate', 'hometeam', 'awayteam', 'fulltimehomegoals', 'fulltimeawaygoals', 'fulltimeresult', 'halftimehomegoals', 'halftimeawaygoals', 'halftimeresult', 'homeshots', 'awayshots', 'homeshotsontarget', 'awayshotsontarget', 'homecorners', 'awaycorners', 'homefouls', 'awayfouls', 'homeyellowcards', 'awayyellowcards', 'homeredcards', 'awayredcards']


Unnamed: 0,season,matchdate,hometeam,awayteam,fulltimehomegoals,fulltimeawaygoals,fulltimeresult,halftimehomegoals,halftimeawaygoals,halftimeresult,...,homeshotsontarget,awayshotsontarget,homecorners,awaycorners,homefouls,awayfouls,homeyellowcards,awayyellowcards,homeredcards,awayredcards
0,2017/18,2017-08-11,Arsenal,Leicester,4,3,H,2,2,D,...,10,3,9,4,9,12,0,1,0,0
1,2017/18,2017-08-12,Brighton,Man City,0,2,A,0,0,D,...,2,4,3,10,6,9,0,2,0,0
2,2017/18,2017-08-12,Chelsea,Burnley,2,3,A,0,3,A,...,6,5,8,5,16,11,3,3,2,0


Identify required fields (and map if needed)

For Phase 5 match modeling, we typically need:

date

season

home_team, away_team

home_goals, away_goals (or equivalent)

optional: xg, shot counts, etc.

This cell detects common variants and maps them into a standard schema.

In [5]:
# helpers: find first matching column name from a list
def pick_col(candidates, columns=None):
    """Return the first name in ``candidates`` that is an available column.

    Parameters
    ----------
    candidates : iterable of str
        Column names to try, in priority order.
    columns : collection of str, optional
        Column names to search. When omitted, the columns of the
        notebook-level ``df`` are searched, which is the original behaviour.

    Returns
    -------
    str or None
        The first matching candidate, or ``None`` when none of them is present.
    """
    # Resolve the global lazily so an explicit ``columns`` never touches ``df``.
    available = df.columns if columns is None else columns
    return next((c for c in candidates if c in available), None)

# candidate lists for common match datasets
col_date   = pick_col(["date", "matchdate", "match_date", "game_date"])
col_season = pick_col(["season", "year", "season_year"])
col_home   = pick_col(["home_team", "home", "hometeam", "team_home"])
col_away   = pick_col(["away_team", "away", "awayteam", "team_away"])

# goals columns: lots of naming conventions exist
col_hg = pick_col(["home_goals", "hg", "fthg", "fulltimehomegoals", "home_score", "home_goals_ft"])
col_ag = pick_col(["away_goals", "ag", "ftag", "fulltimeawaygoals", "away_score", "away_goals_ft"])

print("Detected:")
print("date:", col_date)
print("season:", col_season)
print("home:", col_home)
print("away:", col_away)
print("home_goals:", col_hg)
print("away_goals:", col_ag)

# Pair each canonical field with the source column detected for it, so a
# failure can name exactly which fields had no match (None means "not found").
detected_by_field = {
    "date": col_date,
    "season": col_season,
    "home_team": col_home,
    "away_team": col_away,
    "home_goals": col_hg,
    "away_goals": col_ag,
}
required = list(detected_by_field.values())
missing = [field for field, col in detected_by_field.items() if col is None]

# Stop here rather than failing later with a confusing KeyError on a None column.
if missing:
    print("\n❌ Missing one or more required columns.")
    print("Required:", list(detected_by_field))
    print("Detected:", required)
    print("Missing:", missing)
    raise ValueError(
        "Update the column mapping lists above to match your CSV. "
        f"No column found for: {missing}"
    )

Detected:
date: matchdate
season: season
home: hometeam
away: awayteam
home_goals: fulltimehomegoals
away_goals: fulltimeawaygoals


Build canonical match table + type cleanup

We’ll create:

date as datetime

season as string (e.g., 2017/18)

goals as integers

derived (added in the next step): result (H/D/A) and points for each team (useful later)

In [7]:
# Select the six detected source columns and rename them to the canonical schema.
matches = df[[col_date, col_season, col_home, col_away, col_hg, col_ag]].copy()

matches = matches.rename(columns={
    col_date: "date",
    col_season: "season",
    col_home: "home_team",
    col_away: "away_team",
    col_hg: "home_goals",
    col_ag: "away_goals",
})

# parse date; values that cannot be parsed become NaT and are dropped below
matches["date"] = pd.to_datetime(matches["date"], errors="coerce")

# normalize season to a stripped string, but keep missing seasons missing:
# a bare astype(str) can turn NaN into the literal text "nan", which the
# dropna() below would not recognise as a missing value.
season_raw = matches["season"]
matches["season"] = season_raw.astype(str).str.strip().where(season_raw.notna())

# numeric goals; non-numeric values become NaN and are dropped below
matches["home_goals"] = pd.to_numeric(matches["home_goals"], errors="coerce")
matches["away_goals"] = pd.to_numeric(matches["away_goals"], errors="coerce")

# drop rows missing critical fields
before = len(matches)
matches = matches.dropna(subset=["date","season","home_team","away_team","home_goals","away_goals"]).copy()
after = len(matches)

# Safe to cast only now: an integer column cannot hold the NaNs removed above.
matches["home_goals"] = matches["home_goals"].astype(int)
matches["away_goals"] = matches["away_goals"].astype(int)

print("Dropped rows due to missing critical fields:", before - after)
matches.head(5)

Dropped rows due to missing critical fields: 0


Unnamed: 0,date,season,home_team,away_team,home_goals,away_goals
0,2017-08-11,2017/18,Arsenal,Leicester,4,3
1,2017-08-12,2017/18,Brighton,Man City,0,2
2,2017-08-12,2017/18,Chelsea,Burnley,2,3
3,2017-08-12,2017/18,Crystal Palace,Huddersfield,0,3
4,2017-08-12,2017/18,Everton,Stoke,1,0


Add results + points (H/D/A)

In [9]:
def match_result(hg, ag):
    """Classify a final score from the home side's point of view.

    Returns "H" when the home goals exceed the away goals, "A" when the away
    goals exceed the home goals, and "D" in every other case.
    """
    return "H" if hg > ag else ("A" if hg < ag else "D")

# Label every match from its final score, pairing home and away goals row by row.
matches["result"] = list(map(match_result, matches["home_goals"], matches["away_goals"]))

# Points per side: 3 for the winner, 1 each for a draw, 0 otherwise.
outcome = matches["result"]
draw_points = np.where(outcome == "D", 1, 0)
matches["home_points"] = np.where(outcome == "H", 3, draw_points)
matches["away_points"] = np.where(outcome == "A", 3, draw_points)

matches.head(n=5)

Unnamed: 0,date,season,home_team,away_team,home_goals,away_goals,result,home_points,away_points
0,2017-08-11,2017/18,Arsenal,Leicester,4,3,H,3,0
1,2017-08-12,2017/18,Brighton,Man City,0,2,A,0,3
2,2017-08-12,2017/18,Chelsea,Burnley,2,3,A,0,3
3,2017-08-12,2017/18,Crystal Palace,Huddersfield,0,3,A,0,3
4,2017-08-12,2017/18,Everton,Stoke,1,0,H,3,0


Quality checks

We want:

no duplicates (or at least understand them)

season looks right

teams are not null

goal distributions are sane

In [11]:
# Size of the table and the season labels it contains (first ten shown).
season_labels = sorted(matches["season"].unique())
print("Rows:", len(matches))
print("Seasons:", season_labels[:10], " ... total:", matches["season"].nunique())

# Count rows that repeat an earlier row on every identifying field, score included.
dup_cols = ["date","season","home_team","away_team","home_goals","away_goals"]
duplicate_mask = matches.duplicated(subset=dup_cols)
dups = duplicate_mask.sum()
print("Exact duplicate match rows:", dups)

# Distribution of goals for each side.
goal_cols = ["home_goals","away_goals"]
print("\nGoals summary:")
print(matches[goal_cols].describe())

# quick sanity: flag any match in which either side scored more than 10
has_huge_score = matches[goal_cols].gt(10).any(axis=1)
weird = matches[has_huge_score]
print("\nMatches with >10 goals for a side:", len(weird))
weird.head(n=5)

Rows: 3010
Seasons: ['2017/18', '2018/19', '2019/20', '2020/21', '2021/22', '2022/23', '2023/24', '2024/25']  ... total: 8
Exact duplicate match rows: 0

Goals summary:
        home_goals   away_goals
count  3010.000000  3010.000000
mean      1.556811     1.296013
std       1.333265     1.220634
min       0.000000     0.000000
25%       1.000000     0.000000
50%       1.000000     1.000000
75%       2.000000     2.000000
max       9.000000     9.000000

Matches with >10 goals for a side: 0


Unnamed: 0,date,season,home_team,away_team,home_goals,away_goals,result,home_points,away_points


Sort + export canonical dataset

This becomes the “immutable” match dataset for Phase 5 modeling.

In [13]:
# Order rows by season, then date, then home and away team, renumbering the index from zero.
sort_keys = ["season", "date", "home_team", "away_team"]
matches = matches.sort_values(by=sort_keys, ignore_index=True)

# Write the canonical dataset without the index column and confirm what was saved.
matches.to_csv(OUT_MATCH_PATH, index=False)
n_saved = len(matches)
print("✅ Saved:", OUT_MATCH_PATH)
print("File size (rows):", n_saved)

✅ Saved: data/processed/epl_match_results_2017_2025.csv
File size (rows): 3010


Optional: team-season aggregates from match results

This is still season-level but derived from actual match outcomes (not xG).
Helpful for sanity checking, baselines, and features.

In [15]:
# View of every match from the home team's side.
home_side = (
    matches[["season","home_team","home_goals","away_goals","home_points"]]
    .rename(columns={
        "home_team":"team",
        "home_goals":"goals_for",
        "away_goals":"goals_against",
        "home_points":"points"
    })
    .assign(venue="home")
)

# View of every match from the away team's side (goals for/against swapped).
away_side = (
    matches[["season","away_team","away_goals","home_goals","away_points"]]
    .rename(columns={
        "away_team":"team",
        "away_goals":"goals_for",
        "home_goals":"goals_against",
        "away_points":"points"
    })
    .assign(venue="away")
)

# Stack both views so each match contributes one row per participating team.
long = pd.concat([home_side, away_side], ignore_index=True)

# Collapse to one row per (season, team), add goal difference and points per
# game, then rank within each season by points and goal difference (descending).
agg = (
    long
    .groupby(["season","team"], as_index=False)
    .agg(
        matches=("points","size"),
        points=("points","sum"),
        goals_for=("goals_for","sum"),
        goals_against=("goals_against","sum"),
    )
    .assign(
        goal_diff=lambda t: t["goals_for"] - t["goals_against"],
        ppg=lambda t: t["points"] / t["matches"],
    )
    .sort_values(["season","points","goal_diff"], ascending=[True, False, False])
    .reset_index(drop=True)
)

agg.head(n=10)

Unnamed: 0,season,team,matches,points,goals_for,goals_against,goal_diff,ppg
0,2017/18,Man City,38,100,106,27,79,2.631579
1,2017/18,Man United,38,81,68,28,40,2.131579
2,2017/18,Tottenham,38,77,74,36,38,2.026316
3,2017/18,Liverpool,38,75,84,38,46,1.973684
4,2017/18,Chelsea,38,70,62,38,24,1.842105
5,2017/18,Arsenal,38,63,74,51,23,1.657895
6,2017/18,Burnley,38,54,36,39,-3,1.421053
7,2017/18,Everton,38,49,44,58,-14,1.289474
8,2017/18,Leicester,38,47,56,60,-4,1.236842
9,2017/18,Newcastle,38,44,39,47,-8,1.157895


Export team-season aggregates (optional artifact)

In [17]:
# Write the team-season table without the index column.
agg.to_csv(OUT_TEAM_SEASON_PATH, index=False)
print("✅ Saved:", OUT_TEAM_SEASON_PATH)

# Summarize what was written: row count, season count, and distinct teams per season.
n_rows = len(agg)
n_seasons = agg["season"].nunique()
print("Rows:", n_rows, "| Seasons:", n_seasons, "| Teams/season (example):")
teams_per_season = agg.groupby("season")["team"].nunique()
print(teams_per_season.head())

✅ Saved: data/processed/match_team_season_agg_2017_2025.csv
Rows: 160 | Seasons: 8 | Teams/season (example):
season
2017/18    20
2018/19    20
2019/20    20
2020/21    20
2021/22    20
Name: team, dtype: int64
