# NOTEBOOK-FIRST — Simple EPL Team Goals Prediction (Linear Regression, Offline)

This notebook builds a **simple, offline** model to predict:
- **Home goals**
- **Away goals**

Using:
- **Multiple EPL seasons** from **football-data.co.uk**
- **Only** `sklearn.linear_model.LinearRegression`
- Minimal, leakage-safe **rolling features** + **rest days**
- A single **time-based** train/test split (earliest 80% train, latest 20% test)

Outputs saved:
- `data/processed/features.parquet`
- `models/home_model.joblib`, `models/away_model.joblib`
- `figures/` plots
- `reports/summary.md`


## 1. Setup & imports

In [None]:
!pip -q install pyarrow

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error

import joblib

# Seed NumPy's global random number generator.
np.random.seed(42)

# Create every output directory up front; exist_ok=True makes re-running harmless.
for out_dir in ("data/processed", "models", "reports", "figures"):
    os.makedirs(out_dir, exist_ok=True)

## 2. Data download & loading

We download multiple EPL seasons from football-data.co.uk. If a season file is missing, we skip it so the notebook still runs.

In [None]:
# Root of the football-data.co.uk archive used to build each season's CSV URL.
BASE_URL = "https://www.football-data.co.uk/mmz4281"

# EPL season codes to attempt: two-digit start year followed by two-digit end
# year, i.e. 2018/19 -> "1819", 2019/20 -> "1920", ..., 2024/25 -> "2425".
# Some may not exist depending on current hosting.
SEASON_CODES = [f"{start % 100:02d}{(start + 1) % 100:02d}" for start in range(2018, 2025)]

def season_label(code: str) -> str:
    """Turn a season code such as "1819" into a "2018/2019" label.

    The first two characters are read as the start year and the remaining
    characters as the end year, each as an offset from 2000.

    Raises:
        ValueError: if either part of the code is not an integer.
    """
    start_year, end_year = (2000 + int(part) for part in (code[:2], code[2:]))
    return f"{start_year}/{end_year}"

def safe_read_csv(url: str):
    """Read a CSV from a URL or path, returning None instead of raising.

    Any failure is reported on stdout together with its reason, so a caller
    can skip that source and carry on.

    Returns:
        The parsed DataFrame, or None when reading failed.
    """
    frame = None
    try:
        frame = pd.read_csv(url)
    except Exception as exc:  # deliberately broad: any failure means "skip this source"
        print(f"  - Skipping: {url}\n    Reason: {exc}")
    return frame

# Fetch every configured season; each frame that loads is tagged with its
# human-readable label and raw season code before being collected.
dfs = []
for code in SEASON_CODES:
    label = season_label(code)
    url = f"{BASE_URL}/{code}/E0.csv"
    print(f"Fetching EPL season {label} ({code}) ...")
    df_season = safe_read_csv(url)
    if df_season is not None:
        df_season["season"] = label
        df_season["_season_code"] = code
        dfs.append(df_season)

# Nothing downstream can run without at least one season.
if not dfs:
    raise RuntimeError("No seasons were downloaded. Check internet access or football-data.co.uk availability.")

# Stack all seasons into one frame with a fresh 0..n-1 index.
raw = pd.concat(dfs, ignore_index=True)
print("Raw combined shape:", raw.shape)
raw.head()

## 3. Cleaning & schema checks

We standardize to:

`match_id, date, season, home_team, away_team, home_goals, away_goals`

In [None]:
# Fail fast when the download lacks any column the rest of the notebook uses.
needed = ["Date", "HomeTeam", "AwayTeam", "FTHG", "FTAG", "season"]
missing = [col for col in needed if col not in raw.columns]
if missing:
    raise ValueError(f"Missing required columns in downloaded data: {missing}")

# Source column name -> standardized name ("season" keeps its name).
column_map = {
    "Date": "date",
    "HomeTeam": "home_team",
    "AwayTeam": "away_team",
    "FTHG": "home_goals",
    "FTAG": "away_goals",
}
goal_cols = ["home_goals", "away_goals"]

df = raw[needed].copy()
df = df.rename(columns=column_map)

# Dates are parsed day-first; unparseable values become NaT and are dropped
# along with rows missing a team or a score.
df["date"] = pd.to_datetime(df["date"], dayfirst=True, errors="coerce")
df = df.dropna(subset=["date", "home_team", "away_team", *goal_cols]).copy()

# Coerce scores to numbers, drop rows whose score is not numeric, then store as int.
for col in goal_cols:
    df[col] = pd.to_numeric(df[col], errors="coerce")
df = df.dropna(subset=goal_cols).copy()
for col in goal_cols:
    df[col] = df[col].astype(int)

# match_id is the row position after ordering by date, season and teams.
df = df.sort_values(["date", "season", "home_team", "away_team"]).reset_index(drop=True)
df["match_id"] = np.arange(len(df), dtype=int)

print("Clean shape:", df.shape)
print("Date range:", df["date"].min().date(), "->", df["date"].max().date())
df.head()

## 4. Feature engineering (rolling features, rest days)

Leakage-safe rolling features per team using only past matches:
- Rolling mean of goals scored (last 5 matches)
- Rolling mean of goals conceded (last 5 matches)
- Rolling mean of points earned (last 5 matches)
- Rest days since last match

We compute these on a team-match long table, then merge back for home and away teams.

In [None]:
def _match_points(goals_for: int, goals_against: int) -> int:
    """Points for one side of a match: 3 for a win, 1 for a draw, 0 for a loss."""
    if goals_for > goals_against:
        return 3
    if goals_for < goals_against:
        return 0
    return 1


def _prior_rolling_mean(s: pd.Series) -> pd.Series:
    """Mean of up to 5 values preceding each row; the shift excludes the row itself."""
    return s.shift(1).rolling(5, min_periods=1).mean()


# Long table: two rows per match (home side first, then away side), each seen
# from that team's perspective (gf = goals for, ga = goals against).
team_rows = []
for r in df.itertuples(index=False):
    perspectives = (
        (r.home_team, 1, r.home_goals, r.away_goals),
        (r.away_team, 0, r.away_goals, r.home_goals),
    )
    for side_team, home_flag, scored, conceded in perspectives:
        team_rows.append({
            "match_id": r.match_id,
            "date": r.date,
            "season": r.season,
            "team": side_team,
            "is_home": home_flag,
            "gf": scored,
            "ga": conceded,
            "points": _match_points(scored, conceded),
        })

# Chronological order within each team is what makes shift/diff below meaningful.
team_df = pd.DataFrame(team_rows).sort_values(["team", "date", "match_id"]).reset_index(drop=True)

# Per-team rolling means over previous matches only.
for stat in ("gf", "ga", "points"):
    team_df[f"{stat}_rolling_5"] = team_df.groupby("team")[stat].transform(_prior_rolling_mean)

# Days since the team's previous row; a team's first row has none and gets 7.
team_df["rest_days"] = team_df.groupby("team")["date"].diff().dt.days.fillna(7)

# A team's first row has no previous matches, so its rolling means are set to 0.
rolling_cols = ["gf_rolling_5", "ga_rolling_5", "points_rolling_5"]
team_df[rolling_cols] = team_df[rolling_cols].fillna(0.0)

team_df.head()

In [None]:
# Per-team columns that become features once prefixed with "home_" / "away_".
_STAT_COLS = ["gf_rolling_5", "ga_rolling_5", "points_rolling_5", "rest_days"]


def _side_features(side: str, home_flag: int) -> pd.DataFrame:
    """Pick the team_df rows for one side and prefix team/stat columns with `side`."""
    renames = {"team": f"{side}_team"}
    renames.update({col: f"{side}_{col}" for col in _STAT_COLS})
    side_rows = team_df[team_df["is_home"] == home_flag]
    return side_rows[["match_id", "team", *_STAT_COLS]].rename(columns=renames)


home_feat = _side_features("home", 1)
away_feat = _side_features("away", 0)

# Attach both sides' pre-match stats to the one-row-per-match table.
feat = df.merge(home_feat, on=["match_id", "home_team"], how="left")
feat = feat.merge(away_feat, on=["match_id", "away_team"], how="left")

# Home block first, then away block, each in _STAT_COLS order.
feature_cols = [f"{side}_{col}" for side in ("home", "away") for col in _STAT_COLS]
# Any value left missing by the left joins is set to 0.
feat[feature_cols] = feat[feature_cols].fillna(0.0)

feat.head()

## 5. Time-based train/test split

Single split: earliest 80% train, latest 20% test.

In [None]:
# Order matches chronologically so a row position corresponds to a point in time.
feat = feat.sort_values("date").reset_index(drop=True)

# The date of the row at the 80% position is the boundary: matches on or
# before that date go to train, strictly later ones to test.
cut_idx = int(0.8 * len(feat))
cut_date = feat.at[cut_idx, "date"]

on_or_before_cut = feat["date"] <= cut_date
after_cut = feat["date"] > cut_date
train_df = feat[on_or_before_cut].copy()
test_df = feat[after_cut].copy()

print("Train size:", len(train_df), "Test size:", len(test_df))
print("Train range:", train_df["date"].min().date(), "->", train_df["date"].max().date())
print("Test range :", test_df["date"].min().date(), "->", test_df["date"].max().date())

# Model inputs: the two team names plus the engineered numeric features.
FEATURES = ["home_team", "away_team", *feature_cols]

X_train, X_test = train_df[FEATURES], test_df[FEATURES]

# Targets are cast to float for regression.
y_train_home, y_train_away = (train_df[col].astype(float) for col in ("home_goals", "away_goals"))
y_test_home, y_test_away = (test_df[col].astype(float) for col in ("home_goals", "away_goals"))

## 6. Train linear regression models (home/away)

Two separate `LinearRegression` models in pipelines.

In [None]:
categorical = ["home_team", "away_team"]
numeric = feature_cols


def _make_preprocess() -> ColumnTransformer:
    """Build a fresh, unfitted transformer.

    Team names are one-hot encoded (categories unseen during fit are ignored
    at predict time); numeric features are passed through unchanged.
    """
    return ColumnTransformer(
        transformers=[
            ("teams", OneHotEncoder(handle_unknown="ignore"), categorical),
            ("num", "passthrough", numeric),
        ]
    )


# Each pipeline gets its OWN transformer instance. Pipeline.fit fits its steps
# in place, so a single shared ColumnTransformer would be refitted by whichever
# model is trained last and silently change the other model's preprocessing if
# the two were ever fitted on different data.
preprocess = _make_preprocess()
model_home = Pipeline([("preprocess", preprocess), ("lr", LinearRegression())])
model_away = Pipeline([("preprocess", _make_preprocess()), ("lr", LinearRegression())])

# Two independent regressions on the same inputs: one per goal target.
model_home.fit(X_train, y_train_home)
model_away.fit(X_train, y_train_away)

print("Models trained.")

## 7. Evaluation (tables + plots)

We compute MAE/RMSE on the test set and save two simple plots.

In [None]:
def rmse(y_true, y_pred):
    """Root mean squared error, returned as a plain Python float."""
    mse = mean_squared_error(y_true, y_pred)
    return float(np.sqrt(mse))


# Test-set predictions for each target; total goals is the sum of the two.
pred_home = model_home.predict(X_test)
pred_away = model_away.predict(X_test)

pred_total = pred_home + pred_away
true_total = (y_test_home + y_test_away).values

# One (label, actual, predicted) triple per row of the metrics table.
scored_targets = [
    ("home_goals", y_test_home, pred_home),
    ("away_goals", y_test_away, pred_away),
    ("total_goals", true_total, pred_total),
]
metrics = pd.DataFrame(
    [
        {
            "target": label,
            "MAE": mean_absolute_error(actual, predicted),
            "RMSE": rmse(actual, predicted),
        }
        for label, actual, predicted in scored_targets
    ]
)

metrics.round(4)

In [None]:
# Plot 1: predicted vs actual home goals, with a dashed y = x reference line.
fig, ax = plt.subplots(figsize=(6, 5))
ax.scatter(y_test_home, pred_home, alpha=0.5)
# Extend the reference line slightly past the largest actual or predicted value.
mx = max(float(y_test_home.max()), float(np.max(pred_home))) + 0.5
ax.plot([0, mx], [0, mx], linestyle="--")
ax.set_xlabel("Actual home goals")
ax.set_ylabel("Predicted home goals")
ax.set_title("Predicted vs Actual: Home Goals (Test Set)")
fig.tight_layout()
fig.savefig("figures/pred_vs_actual_home.png", dpi=150)
plt.show()

# Plot 2: histogram of home-goal residuals, with a dashed line at zero error.
res = pred_home - y_test_home.values
fig, ax = plt.subplots(figsize=(6, 4))
ax.hist(res, bins=25, alpha=0.7)
ax.axvline(0, linestyle="--")
ax.set_xlabel("Residual (pred - actual)")
ax.set_ylabel("Count")
ax.set_title("Residuals: Home Goals (Test Set)")
fig.tight_layout()
fig.savefig("figures/residuals_home.png", dpi=150)
plt.show()

## 8. Save artifacts + `predict_match()`

We save the feature table and models, write a short report, then define `predict_match`.

In [None]:
def _metrics_markdown(table: pd.DataFrame) -> str:
    """Render `table` as a Markdown table without its index.

    DataFrame.to_markdown needs the optional `tabulate` package; when that
    import fails, a plain pipe table is assembled by hand so the report can
    still be written.
    """
    try:
        return table.to_markdown(index=False)
    except ImportError:
        header = "| " + " | ".join(str(col) for col in table.columns) + " |"
        divider = "| " + " | ".join("---" for _ in table.columns) + " |"
        body = [
            "| " + " | ".join(str(value) for value in row) + " |"
            for row in table.itertuples(index=False, name=None)
        ]
        return "\n".join([header, divider, *body])


# Persist the feature table and both fitted pipelines.
feat.to_parquet("data/processed/features.parquet", index=False)
joblib.dump(model_home, "models/home_model.joblib")
joblib.dump(model_away, "models/away_model.joblib")

summary_lines = [
    "# Summary",
    "",
    f"- Seasons used: {sorted(feat['season'].unique().tolist())}",
    f"- Rows: {len(feat)}",
    "",
    "## Test Metrics",
    _metrics_markdown(metrics),
    "",
    "## Notes",
    "- Model: sklearn LinearRegression (two models: home and away).",
    "- Features: leakage-safe rolling means (last 5 matches) + rest days + one-hot team IDs.",
]
# Explicit UTF-8 so the report does not depend on the platform's default encoding.
with open("reports/summary.md", "w", encoding="utf-8") as f:
    f.write("\n".join(summary_lines))

print("Saved artifacts to data/processed, models, reports, figures.")

In [None]:
def _team_history_before_date(team_name: str, cutoff_date: pd.Timestamp) -> pd.DataFrame:
    """Return the team_df rows for `team_name` dated strictly before `cutoff_date`, sorted by date."""
    is_team = team_df["team"] == team_name
    is_earlier = team_df["date"] < cutoff_date
    return team_df[is_team & is_earlier].sort_values("date")

def predict_match(home_team: str, away_team: str, match_date: str) -> dict:
    """Predict expected goals for a fixture from each team's earlier matches.

    Only rows in team_df dated strictly before `match_date` are used. For each
    side the features are the means of goals for, goals against and points
    over its last (up to) 5 such matches, plus whole days since its most
    recent one.

    Args:
        home_team: Home side; must appear in team_df.
        away_team: Away side; must appear in team_df and differ from home_team.
        match_date: Anything pandas.to_datetime can parse.

    Returns:
        Dict with "predicted_home_goals", "predicted_away_goals" and
        "predicted_total_goals", each rounded to 3 decimals.

    Raises:
        ValueError: for an unparseable date, identical teams, an unknown
            team, or a team with no match before match_date.
    """
    when = pd.to_datetime(match_date, errors="coerce")
    if pd.isna(when):
        raise ValueError(f"Invalid match_date: {match_date}")

    if home_team == away_team:
        raise ValueError("home_team and away_team must be different.")

    # Home side is validated first, then away.
    known_teams = set(team_df["team"].unique().tolist())
    for role, name in (("home_team", home_team), ("away_team", away_team)):
        if name not in known_teams:
            raise ValueError(f"Unknown {role}: {name}")

    histories = {
        "home": _team_history_before_date(home_team, when),
        "away": _team_history_before_date(away_team, when),
    }
    if any(len(hist) == 0 for hist in histories.values()):
        raise ValueError("Not enough history before match_date for one or both teams.")

    # Build the single feature row: team names, then the home block, then the away block.
    features = {"home_team": home_team, "away_team": away_team}
    for side, hist in histories.items():
        recent = hist.tail(5)
        features[f"{side}_gf_rolling_5"] = float(recent["gf"].mean())
        features[f"{side}_ga_rolling_5"] = float(recent["ga"].mean())
        features[f"{side}_points_rolling_5"] = float(recent["points"].mean())
        # Whole days between the fixture and the side's latest earlier match.
        features[f"{side}_rest_days"] = int((when - hist["date"].iloc[-1]).days)

    row = pd.DataFrame([features])

    ph = float(model_home.predict(row)[0])
    pa = float(model_away.predict(row)[0])

    return {
        "predicted_home_goals": round(ph, 3),
        "predicted_away_goals": round(pa, 3),
        "predicted_total_goals": round(ph + pa, 3),
    }

# Example call: use the date at the 95th percentile of the dataset so that
# both teams have earlier matches to draw on.
example_ts = feat["date"].quantile(0.95)
example_date = str(example_ts.date())
print("Example date:", example_date)
example_prediction = predict_match("Arsenal", "Chelsea", example_date)
print(example_prediction)