In [1]:
import pandas as pd

In [2]:
ELO_K = 20
ELO_HOME_ADV = 60
ROLLING_WINDOW = 5
FALLBACK_WIN_RATION = 1.0
label_map = {'H_or_D': 0, 'A': 1}

In [3]:
matches = pd.read_csv('../data/matches.csv', parse_dates=['kickoff_datetime'])
matches.head()

Unnamed: 0,match_id,season,kickoff,home_team,home_team_id,home_score,away_team,away_team_id,away_score,outcome,winner,kickoff_datetime
0,2561895,2025,2025-08-15 20:00:00,Liverpool,14,4,Bournemouth,91,2,H_or_D,Liverpool,2025-08-15 20:00:00
1,2561896,2025,2025-08-16 12:30:00,Aston Villa,7,0,Newcastle United,4,0,H_or_D,Draw,2025-08-16 12:30:00
2,2561897,2025,2025-08-16 15:00:00,Brighton and Hove Albion,36,1,Fulham,54,1,H_or_D,Draw,2025-08-16 15:00:00
3,2561898,2025,2025-08-17 14:00:00,Nottingham Forest,17,3,Brentford,94,1,H_or_D,Nottingham Forest,2025-08-17 14:00:00
4,2561899,2025,2025-08-16 15:00:00,Sunderland,56,3,West Ham United,21,0,H_or_D,Sunderland,2025-08-16 15:00:00


In [4]:
def init_elo(teams, base=1500):
    return {t: base for t in teams}

In [5]:

def expected_score(elo_ta, elo_th):
    return 1 / (1 + 10 ** ((elo_ta - elo_th) / 400.0))

In [6]:

def compute_elo_ratings(df):
    """
    Compute per-match pre-game ELO ratings for home and away teams.
    Returns two new columns: elo_home_pre, elo_away_pre
    """

    team_col_home = 'home_team'
    team_col_away = 'away_team'
    score_home = 'home_score'
    score_away = 'away_score'
    season_order_col = 'kickoff_datetime'

    teams = pd.concat([df[team_col_home], df[team_col_away]]).unique()
    elo = init_elo(teams)
    elo_home_pre = []
    elo_away_pre = []

    df_sorted = df.sort_values(by=season_order_col).reset_index(drop=True)
    for _, row in df_sorted.iterrows():
        th = row[team_col_home]
        ta = row[team_col_away]
        elo_home_pre.append(elo[th])
        elo_away_pre.append(elo[ta])

        # compute outcome
        if row[score_home] > row[score_away]:
            s_h, s_a = 1.0, 0.0
        elif row[score_home] < row[score_away]:
            s_h, s_a = 0.0, 1.0
        else:
            s_h, s_a = 0.5, 0.5

        exp_h = expected_score(elo[ta], elo[th])
        exp_a = 1 - exp_h

        elo[th] = elo[th] + ELO_K * (s_h - exp_h)
        elo[ta] = elo[ta] + ELO_K * (s_a - exp_a)

    # attach to original index
    df_out = df_sorted.copy()
    df_out['elo_home_pre'] = elo_home_pre
    df_out['elo_away_pre'] = elo_away_pre
    return df_out.sort_index()


In [7]:
matches = matches.rename(
    columns={'winner': 'winner_label', 'outcome': 'outcome_label'}) if 'winner' in matches.columns else matches
df = compute_elo_ratings(matches)

df['elo_diff_pre'] = df['elo_home_pre'] - df['elo_away_pre']

home_rows = df[['match_id', 'kickoff_datetime', 'season', 'home_team', 'away_team', 'home_score', 'away_score']].copy()
home_rows.columns = ['match_id', 'kickoff_datetime', 'season', 'team', 'opponent', 'score_for', 'score_against']
home_rows['is_home'] = 1
away_rows = df[['match_id', 'kickoff_datetime', 'season', 'away_team', 'home_team', 'away_score', 'home_score']].copy()
away_rows.columns = ['match_id', 'kickoff_datetime', 'season', 'team', 'opponent', 'score_for', 'score_against']
away_rows['is_home'] = 0
team_rows = pd.concat([home_rows, away_rows], ignore_index=True)

team_rows = team_rows.sort_values(['team', 'kickoff_datetime'])


In [8]:
team_rows

Unnamed: 0,match_id,kickoff_datetime,season,team,opponent,score_for,score_against,is_home
7,803171,2015-08-09 13:30:00,2015,Arsenal,West Ham United,0,2,1
4015,803172,2015-08-16 13:30:00,2015,Arsenal,Crystal Palace,2,1,0
29,803185,2015-08-24 20:00:00,2015,Arsenal,Liverpool,0,0,1
4028,803192,2015-08-29 12:45:00,2015,Arsenal,Newcastle United,1,0,0
45,803209,2015-09-12 15:00:00,2015,Arsenal,Stoke City,2,0,1
...,...,...,...,...,...,...,...,...
7950,2562045,2025-12-13 20:00:00,2025,Wolverhampton Wanderers,Arsenal,1,2,0
3963,2562064,2025-12-20 15:00:00,2025,Wolverhampton Wanderers,Brentford,0,2,1
7970,2562070,2025-12-27 15:00:00,2025,Wolverhampton Wanderers,Liverpool,1,2,0
7982,2562081,2025-12-30 20:15:00,2025,Wolverhampton Wanderers,Manchester United,1,1,0


In [9]:

#compute rolling avg
team_rows['points'] = team_rows.apply(
    lambda r: 3 if r['score_for'] > r['score_against'] else (1 if r['score_for'] == r['score_against'] else 0), axis=1)
team_rows

Unnamed: 0,match_id,kickoff_datetime,season,team,opponent,score_for,score_against,is_home,points
7,803171,2015-08-09 13:30:00,2015,Arsenal,West Ham United,0,2,1,0
4015,803172,2015-08-16 13:30:00,2015,Arsenal,Crystal Palace,2,1,0,3
29,803185,2015-08-24 20:00:00,2015,Arsenal,Liverpool,0,0,1,1
4028,803192,2015-08-29 12:45:00,2015,Arsenal,Newcastle United,1,0,0,3
45,803209,2015-09-12 15:00:00,2015,Arsenal,Stoke City,2,0,1,3
...,...,...,...,...,...,...,...,...,...
7950,2562045,2025-12-13 20:00:00,2025,Wolverhampton Wanderers,Arsenal,1,2,0,0
3963,2562064,2025-12-20 15:00:00,2025,Wolverhampton Wanderers,Brentford,0,2,1,0
7970,2562070,2025-12-27 15:00:00,2025,Wolverhampton Wanderers,Liverpool,1,2,0,0
7982,2562081,2025-12-30 20:15:00,2025,Wolverhampton Wanderers,Manchester United,1,1,0,1


In [10]:
team_rows

Unnamed: 0,match_id,kickoff_datetime,season,team,opponent,score_for,score_against,is_home,points
7,803171,2015-08-09 13:30:00,2015,Arsenal,West Ham United,0,2,1,0
4015,803172,2015-08-16 13:30:00,2015,Arsenal,Crystal Palace,2,1,0,3
29,803185,2015-08-24 20:00:00,2015,Arsenal,Liverpool,0,0,1,1
4028,803192,2015-08-29 12:45:00,2015,Arsenal,Newcastle United,1,0,0,3
45,803209,2015-09-12 15:00:00,2015,Arsenal,Stoke City,2,0,1,3
...,...,...,...,...,...,...,...,...,...
7950,2562045,2025-12-13 20:00:00,2025,Wolverhampton Wanderers,Arsenal,1,2,0,0
3963,2562064,2025-12-20 15:00:00,2025,Wolverhampton Wanderers,Brentford,0,2,1,0
7970,2562070,2025-12-27 15:00:00,2025,Wolverhampton Wanderers,Liverpool,1,2,0,0
7982,2562081,2025-12-30 20:15:00,2025,Wolverhampton Wanderers,Manchester United,1,1,0,1


In [11]:
team_rows[['gf_roll', 'ga_roll', 'pts_roll']] = (team_rows
.groupby('team')[['score_for', 'score_against', 'points']]
.shift(1)
.groupby(team_rows['team'])  # Rolling pro Team
.rolling(window=ROLLING_WINDOW)
.mean()
.fillna(1.5)
.reset_index(level=0, drop=True)[['score_for', 'score_against', 'points']])

In [12]:
 # take last pre-match rolling values for home and away
team_rows = team_rows.sort_values(['match_id', 'team', 'kickoff_datetime']).drop_duplicates(subset=['match_id', 'team'],
                                                                                            keep='last')
home_features = team_rows[team_rows['is_home'] == 1][['match_id', 'gf_roll', 'ga_roll', 'pts_roll']].rename(
    columns=lambda c: f'home_{c}' if c != 'match_id' else c)
away_features = team_rows[team_rows['is_home'] == 0][['match_id', 'gf_roll', 'ga_roll', 'pts_roll']].rename(
    columns=lambda c: f'away_{c}' if c != 'match_id' else c)
df = df.merge(home_features, on='match_id', how='left').merge(away_features, on='match_id', how='left')

In [13]:
df["kickoff_datetime"] = pd.to_datetime(df["kickoff_datetime"])
df = df.sort_values('kickoff_datetime').reset_index(drop=True)
df["h2h_avg_points_home"] = 1
df["h2h_avg_points_away"] = 1



In [14]:
df.dtypes

match_id                        int64
season                          int64
kickoff                        object
home_team                      object
home_team_id                    int64
home_score                      int64
away_team                      object
away_team_id                    int64
away_score                      int64
outcome_label                  object
winner_label                   object
kickoff_datetime       datetime64[ns]
elo_home_pre                  float64
elo_away_pre                  float64
elo_diff_pre                  float64
home_gf_roll                  float64
home_ga_roll                  float64
home_pts_roll                 float64
away_gf_roll                  float64
away_ga_roll                  float64
away_pts_roll                 float64
h2h_avg_points_home             int64
h2h_avg_points_away             int64
dtype: object

In [15]:
for i, row in df.iterrows():
    home_team = row["home_team"]
    away_team = row["away_team"]
    kickoff = row["kickoff_datetime"]

    past_games = df.loc[
        (df["kickoff_datetime"] < kickoff) &
        (
            ((df["home_team"] == home_team) & (df["away_team"] == away_team)) |
            ((df["home_team"] == away_team) & (df["away_team"] == home_team))
        )
    ]

    if past_games.empty:
        continue

    home_points = []
    away_points = []


    for _, past_game in past_games.iterrows():
        if past_game['home_score'] > past_game['away_score']:
           points_home_team = 3
           points_away_team = 0

        elif past_game['home_score'] < past_game['away_score']:
            points_home_team = 0
            points_away_team = 3

        else:
            points_home_team = 1
            points_away_team = 1

        if past_game['home_team'] == home_team:
            home_points.append(points_home_team)
            away_points.append(points_away_team)
        else:
            home_points.append(points_away_team)
            away_points.append(points_home_team)


    df.at[i, "h2h_avg_points_home"] = sum(home_points) / len(home_points)
    df.at[i, "h2h_avg_points_away"] = sum(away_points) / len(away_points)





  df.at[i, "h2h_avg_points_home"] = sum(home_points) / len(home_points)
  df.at[i, "h2h_avg_points_away"] = sum(away_points) / len(away_points)


In [16]:
df = df.sort_values('kickoff_datetime').reset_index(drop=True)

last_kickoff = {}
rest_days_home = []
rest_days_away = []
for _, row in df.iterrows():
    th = row['home_team'];
    ta = row['away_team'];
    t0 = row['kickoff_datetime']

    if th in last_kickoff:
        delta = (t0 - last_kickoff[th]).total_seconds() / (24 * 3600)
        rest_days_home.append(delta)
    else:
        rest_days_home.append(7)
    last_kickoff[th] = t0

    if ta in last_kickoff:
        delta = (t0 - last_kickoff[ta]).total_seconds() / (24 * 3600)
        rest_days_away.append(delta)
    else:
        rest_days_away.append(7)
    last_kickoff[ta] = t0
df['rest_days_home'] = rest_days_home
df['rest_days_away'] = rest_days_away
df['rest_days_diff'] = df['rest_days_home'] - df['rest_days_away']

df['target'] = df['outcome_label'].map(label_map)

df.to_csv('./../data/preprocessed_matches.csv', index=False)