In [8]:
import time
t0 = time.time()
print("⏱️ Timer started")


⏱️ Timer started


In [9]:
# Load & Prepare Base Data

import pandas as pd
import numpy as np

# Load base data from EDA logic
df = pd.read_csv("../data/epl_final.csv")

# Target encoding
df["target"] = df["FullTimeResult"].map({"H": 2, "D": 1, "A": 0})

# Convert date & sort
df["MatchDate"] = pd.to_datetime(df["MatchDate"])
df = df.sort_values("MatchDate").reset_index(drop=True)

df.head()


Unnamed: 0,Season,MatchDate,HomeTeam,AwayTeam,FullTimeHomeGoals,FullTimeAwayGoals,FullTimeResult,HalfTimeHomeGoals,HalfTimeAwayGoals,HalfTimeResult,...,AwayShotsOnTarget,HomeCorners,AwayCorners,HomeFouls,AwayFouls,HomeYellowCards,AwayYellowCards,HomeRedCards,AwayRedCards,target
0,2000/01,2000-08-19,Charlton,Man City,4,0,H,2,0,H,...,4,6,6,13,12,1,2,0,0,2
1,2000/01,2000-08-19,Chelsea,West Ham,4,2,H,1,0,H,...,5,7,7,19,14,1,2,0,0,2
2,2000/01,2000-08-19,Coventry,Middlesbrough,1,3,A,1,1,D,...,9,8,4,15,21,5,3,1,0,0
3,2000/01,2000-08-19,Derby,Southampton,2,2,D,1,2,A,...,6,5,8,11,13,1,1,0,0,1
4,2000/01,2000-08-19,Leeds,Everton,2,0,H,2,0,H,...,6,6,4,21,20,1,3,0,0,2


In [10]:
# Create Team-Match Long Format

home_df = df[[
    "MatchDate", "Season",
    "HomeTeam", "AwayTeam",
    "FullTimeHomeGoals", "FullTimeAwayGoals", "target"
]].copy()

home_df.columns = [
    "MatchDate", "Season",
    "Team", "Opponent",
    "GoalsFor", "GoalsAgainst", "target"
]
home_df["is_home"] = 1

away_df = df[[
    "MatchDate", "Season",
    "AwayTeam", "HomeTeam",
    "FullTimeAwayGoals", "FullTimeHomeGoals", "target"
]].copy()

away_df.columns = [
    "MatchDate", "Season",
    "Team", "Opponent",
    "GoalsFor", "GoalsAgainst", "target"
]
away_df["is_home"] = 0

team_df = pd.concat([home_df, away_df])
team_df = team_df.sort_values("MatchDate").reset_index(drop=True)

team_df.head()


Unnamed: 0,MatchDate,Season,Team,Opponent,GoalsFor,GoalsAgainst,target,is_home
0,2000-08-19,2000/01,Charlton,Man City,4,0,2,1
1,2000-08-19,2000/01,Southampton,Derby,2,2,1,0
2,2000-08-19,2000/01,Everton,Leeds,0,2,2,0
3,2000-08-19,2000/01,Aston Villa,Leicester,0,0,1,0
4,2000-08-19,2000/01,Bradford,Liverpool,0,1,2,0


In [11]:
# Rolling Form Features (NO LEAKAGE)

WINDOW = 5

gf = team_df.groupby("Team")["GoalsFor"]
ga = team_df.groupby("Team")["GoalsAgainst"]

# No leakage: shift(1) so we only use past matches
team_df["gf_rolling"] = (
    gf.shift(1)
      .rolling(WINDOW, min_periods=WINDOW)
      .mean()
      .reset_index(level=0, drop=True)
)

team_df["ga_rolling"] = (
    ga.shift(1)
      .rolling(WINDOW, min_periods=WINDOW)
      .mean()
      .reset_index(level=0, drop=True)
)

team_df["matches_played"] = team_df.groupby("Team").cumcount()

team_df.head(10)



Unnamed: 0,MatchDate,Season,Team,Opponent,GoalsFor,GoalsAgainst,target,is_home,gf_rolling,ga_rolling,matches_played
0,2000-08-19,2000/01,Charlton,Man City,4,0,2,1,,,0
1,2000-08-19,2000/01,Southampton,Derby,2,2,1,0,,,0
2,2000-08-19,2000/01,Everton,Leeds,0,2,2,0,,,0
3,2000-08-19,2000/01,Aston Villa,Leicester,0,0,1,0,,,0
4,2000-08-19,2000/01,Bradford,Liverpool,0,1,2,0,,,0
5,2000-08-19,2000/01,Arsenal,Sunderland,0,1,2,0,,,0
6,2000-08-19,2000/01,Ipswich,Tottenham,1,3,2,0,,,0
7,2000-08-19,2000/01,Man City,Charlton,0,4,2,0,,,0
8,2000-08-19,2000/01,Middlesbrough,Coventry,3,1,0,0,,,0
9,2000-08-19,2000/01,West Ham,Chelsea,2,4,2,0,,,0


In [12]:
# Merge Home & Away Features Back

home_features = team_df[team_df["is_home"] == 1][[
    "MatchDate", "Team",
    "gf_rolling", "ga_rolling", "matches_played"
]].copy()

away_features = team_df[team_df["is_home"] == 0][[
    "MatchDate", "Team",
    "gf_rolling", "ga_rolling", "matches_played"
]].copy()

home_features.columns = [
    "MatchDate", "HomeTeam",
    "home_gf_form", "home_ga_form", "home_matches"
]

away_features.columns = [
    "MatchDate", "AwayTeam",
    "away_gf_form", "away_ga_form", "away_matches"
]

final_df = df.merge(home_features, on=["MatchDate", "HomeTeam"], how="left")
final_df = final_df.merge(away_features, on=["MatchDate", "AwayTeam"], how="left")

final_df = final_df.sort_values("MatchDate").reset_index(drop=True)
final_df.head()



Unnamed: 0,Season,MatchDate,HomeTeam,AwayTeam,FullTimeHomeGoals,FullTimeAwayGoals,FullTimeResult,HalfTimeHomeGoals,HalfTimeAwayGoals,HalfTimeResult,...,AwayYellowCards,HomeRedCards,AwayRedCards,target,home_gf_form,home_ga_form,home_matches,away_gf_form,away_ga_form,away_matches
0,2000/01,2000-08-19,Charlton,Man City,4,0,H,2,0,H,...,2,0,0,2,,,0,,,0
1,2000/01,2000-08-19,Chelsea,West Ham,4,2,H,1,0,H,...,2,0,0,2,,,0,,,0
2,2000/01,2000-08-19,Coventry,Middlesbrough,1,3,A,1,1,D,...,3,1,0,0,,,0,,,0
3,2000/01,2000-08-19,Derby,Southampton,2,2,D,1,2,A,...,1,0,0,1,,,0,,,0
4,2000/01,2000-08-19,Leeds,Everton,2,0,H,2,0,H,...,3,0,0,2,,,0,,,0


In [13]:
# UI-ready features (keep team names for the interface)
ui_df = final_df[[
    "MatchDate", "HomeTeam", "AwayTeam",
    "home_gf_form", "home_ga_form", "home_matches",
    "away_gf_form", "away_ga_form", "away_matches",
    "target"
]].dropna().sort_values("MatchDate").reset_index(drop=True)

ui_df.to_csv("../data/ui_features.csv", index=False)
print("✅ Saved: ../data/ui_features.csv")
print("Shape:", ui_df.shape)

ui_df.head()



✅ Saved: ../data/ui_features.csv
Shape: (9271, 10)


Unnamed: 0,MatchDate,HomeTeam,AwayTeam,home_gf_form,home_ga_form,home_matches,away_gf_form,away_ga_form,away_matches,target
0,2000-08-22,Ipswich,Man United,2.0,1.4,1,2.0,0.8,1,1
1,2000-08-23,Man City,Sunderland,0.8,2.8,1,1.4,1.0,1,2
2,2000-08-23,Southampton,Coventry,2.0,2.0,1,0.6,2.2,1,0
3,2000-08-23,Everton,Charlton,1.6,2.0,1,2.0,0.8,1,2
4,2000-08-23,West Ham,Leicester,2.4,1.4,1,0.4,2.0,1,0


In [14]:
# Model features (training/evaluation table)
model_df = ui_df[[
    "MatchDate",
    "home_gf_form", "home_ga_form", "home_matches",
    "away_gf_form", "away_ga_form", "away_matches",
    "target"
]].copy()

model_df.to_csv("../data/model_features.csv", index=False)

print("✅ Saved: ../data/model_features.csv")
print("Shape:", model_df.shape)
print(model_df.head())



✅ Saved: ../data/model_features.csv
Shape: (9271, 8)
   MatchDate  home_gf_form  home_ga_form  home_matches  away_gf_form  \
0 2000-08-22           2.0           1.4             1           2.0   
1 2000-08-23           0.8           2.8             1           1.4   
2 2000-08-23           2.0           2.0             1           0.6   
3 2000-08-23           1.6           2.0             1           2.0   
4 2000-08-23           2.4           1.4             1           0.4   

   away_ga_form  away_matches  target  
0           0.8             1       1  
1           1.0             1       2  
2           2.2             1       0  
3           0.8             1       2  
4           2.0             1       0  


In [15]:
print("⏱️ Total elapsed seconds:", round(time.time() - t0, 2))


⏱️ Total elapsed seconds: 0.26
