In [25]:
import os
from collections import defaultdict, deque

import pandas as pd

DATA_PATH = os.path.join('data')

# os.listdir returns entries in arbitrary order and may contain non-CSV entries
# (hidden files, checkpoint folders), so filter and sort explicitly to get a
# reproducible load order.
files = sorted(name for name in os.listdir(DATA_PATH) if name.lower().endswith('.csv'))

# Read every file once and concatenate a single time at the end: growing a
# DataFrame with concat inside the loop re-copies all previous rows each pass.
season_frames = []
for file in files:
    file_path = os.path.join(DATA_PATH, file)
    season = file.split('.')[0]  # text before the first dot, e.g. '2018-2019'
    df = pd.read_csv(file_path)
    df['Season'] = season
    season_frames.append(df)

# pd.concat raises on an empty list, so fall back to an empty frame.
data = pd.concat(season_frames, ignore_index=True) if season_frames else pd.DataFrame()

I will not use the betting odds as model features, for the following reasons:

- Some bookmaker statistics are in one file but not in another
- If the logic a betting site uses to set its odds changes from one year to the next, using those odds would introduce bias into the model

We may still build a model that uses the odds if time allows, or simply to compare its results with those of the odds-free model.

In [26]:
# Load the notes file, then cut its text into sections at every blank line.
with open('notes.txt', mode='r', encoding='utf-8') as f:
    content = f.read()
sections = content.split('\n\n')

In [27]:
def get_columns_from_section(section, all_sections=None):
    """
    Extract the column names defined in one section of the notes text.

    A definition line has the form ``NAME = description`` or
    ``NAME1 and NAME2 = description``. Only the text to the left of the first
    '=' is parsed, and it is split on the standalone word " and ", so an "and"
    inside a name or inside the description (e.g. "handicap") never produces
    a bogus column name.

    Parameters
    ----------
    section : int
        Index of the section to parse.
    all_sections : list of str, optional
        Sections to index into. Defaults to the notebook-level ``sections``
        list.

    Returns
    -------
    list of str
        Column names in order of appearance; lines without '=' are ignored.
    """
    if all_sections is None:
        all_sections = sections

    col_names = []
    for line in all_sections[section].split('\n'):
        if '=' not in line:
            continue
        # Everything after the first '=' is free-text description.
        names_part = line.split('=', 1)[0]
        for name in names_part.split(' and '):
            name = name.strip()
            if name:
                col_names.append(name)
    return col_names

# Column names described in sections 3 and 4 of the notes, plus the season label.
cols_third_section, cols_fourth_section = (
    get_columns_from_section(section_index) for section_index in (3, 4)
)
cols_to_keep = ['Season', *cols_third_section, *cols_fourth_section]

In [28]:
# Keep only the requested columns that actually exist in the data.
# dict.fromkeys drops repeated names while preserving order, so the selection
# never yields duplicate column labels.
existing_cols = [c for c in dict.fromkeys(cols_to_keep) if c in data.columns]
# .copy() makes final_data independent of `data`, so later column assignments
# do not write into a slice (SettingWithCopyWarning).
final_data = data[existing_cols].copy()

In [29]:
# Number of missing values in each column
final_data.isnull().sum()

Season        0
Div           0
Date          0
Time        380
HomeTeam      0
AwayTeam      0
FTHG          0
FTAG          0
FTR           0
HTHG          0
HTAG          0
HTR           0
Referee       0
HS            0
AS            0
HST           0
AST           0
HC            0
AC            0
HF            0
AF            0
HY            0
AY            0
HR            0
AR            0
dtype: int64

In [30]:
# Seasons that contain at least one match without a kick-off time
final_data.loc[final_data['Time'].isna(), 'Season'].unique()

array(['2018-2019'], dtype=object)

In [31]:
# Drop season 2018-2019. .copy() makes the result an independent frame, so the
# column assignment done on final_data afterwards does not write into a slice
# (SettingWithCopyWarning).
final_data = final_data[final_data['Season'] != '2018-2019'].copy()

I think "Time" would be interesting to keep for the modeling part, so I drop season 2018-2019, the only season where it is missing. The 4 remaining seasons should be sufficient for training.

Let us now use rolling windows to compute, for each match, features based only on data from before that match.

In [32]:
# ...existing code...
import numpy as np
from datetime import timedelta

# Parse the day-first date strings into datetimes (unparseable values become NaT).
# .assign() rebinds final_data to a new frame instead of writing a column into
# a filtered slice, which would trigger SettingWithCopyWarning.
final_data = final_data.assign(
    Date=pd.to_datetime(final_data['Date'], dayfirst=True, errors='coerce')
)

# 1) Sort chronologically once and use the resulting position as match_id.
#    Kick-off time breaks ties between matches played on the same day, so the
#    row order (which the history-based features rely on) is deterministic.
sort_keys = ['Date', 'Time'] if 'Time' in final_data.columns else ['Date']
final_data_sorted = final_data.sort_values(sort_keys).reset_index(drop=True)
final_data_sorted['match_id'] = final_data_sorted.index

# 2) Build the long table: one row per (match, team), flagged home or away
home_long = final_data_sorted[['match_id', 'Date', 'HomeTeam']].rename(columns={'HomeTeam': 'Team'})
away_long = final_data_sorted[['match_id', 'Date', 'AwayTeam']].rename(columns={'AwayTeam': 'Team'})
long = pd.concat([home_long.assign(is_home=1), away_long.assign(is_home=0)], ignore_index=True)
long = long.sort_values(['Team', 'Date'])

# 3) Previous match date per team and the gap in days.
#    A team's first match has no previous date and gets days_rest = 0.
long['prev_date'] = long.groupby('Team')['Date'].shift(1)
long['days_rest'] = (long['Date'] - long['prev_date']).dt.days.fillna(0).astype(int)

# 4) Join the per-team rest days back onto the one-row-per-match frame
home_rest = long[long['is_home'] == 1][['match_id', 'days_rest']].rename(columns={'days_rest': 'home_days_rest'})
away_rest = long[long['is_home'] == 0][['match_id', 'days_rest']].rename(columns={'days_rest': 'away_days_rest'})

final = final_data_sorted.merge(home_rest, on='match_id', how='left').merge(away_rest, on='match_id', how='left')
final['home_days_rest'] = final['home_days_rest'].fillna(0).astype(int)
final['away_days_rest'] = final['away_days_rest'].fillna(0).astype(int)

def create_features_before_match(data, recent_matches=5):
    """
    Build pre-match features using ONLY the rows that precede each match.

    Rows are processed in their existing order, which is taken to be
    chronological: sort `data` by date before calling. History is tracked by
    row position, so the result does not depend on the index labels of `data`.

    Parameters
    ----------
    data : pandas.DataFrame
        One row per match with columns 'HomeTeam', 'AwayTeam', 'FTHG' (home
        goals), 'FTAG' (away goals) and 'FTR' ('H', 'D' or 'A').
    recent_matches : int, default 5
        Number of a team's most recent matches (home or away) summed into the
        form features.

    Returns
    -------
    pandas.DataFrame
        Copy of `data` (same index and row order) with ten float columns:

        - home_avg_goals / home_avg_conceded, away_avg_goals / away_avg_conceded:
          goals scored / conceded per match over ALL earlier matches of the
          team (an expanding mean, not a fixed-size window).
        - home_recent_form / away_recent_form: points (3 win, 1 draw, 0 loss)
          over the team's last `recent_matches` matches.
        - home_h2h_wins / away_h2h_wins: wins of each side in earlier matches
          between the two teams, whatever the venue.
        - home_home_record: share of the home team's earlier home matches it won.
        - away_away_record: share of the away team's earlier away matches it won.

        A feature is 0.0 whenever there is no earlier match to compute it
        from; this includes the very first row.

    Raises
    ------
    ValueError
        If `recent_matches` is negative.
    """
    if recent_matches < 0:
        raise ValueError("recent_matches must be non-negative")

    features = data.copy()

    # Running tallies, updated AFTER each match has received its features,
    # so a match never sees its own result.
    played = defaultdict(int)
    scored = defaultdict(float)
    conceded = defaultdict(float)
    last_points = defaultdict(lambda: deque(maxlen=recent_matches))
    home_games = defaultdict(int)
    home_wins = defaultdict(int)
    away_games = defaultdict(int)
    away_wins = defaultdict(int)
    h2h_wins = defaultdict(int)  # key: (winner, opponent)

    def ratio(numerator, denominator):
        # No history -> 0.0 rather than a division by zero
        return numerator / denominator if denominator else 0.0

    def goal_count(value):
        # A missing goal count adds nothing to the totals
        return 0.0 if pd.isna(value) else float(value)

    columns = {
        'home_avg_goals': [],
        'home_avg_conceded': [],
        'away_avg_goals': [],
        'away_avg_conceded': [],
        'home_recent_form': [],   # points over the last N matches
        'away_recent_form': [],
        'home_h2h_wins': [],      # head-to-head history
        'away_h2h_wins': [],
        'home_home_record': [],   # win rate at home
        'away_away_record': [],   # win rate away
    }

    match_rows = zip(
        features['HomeTeam'], features['AwayTeam'],
        features['FTHG'], features['FTAG'], features['FTR'],
    )
    for home, away, fthg, ftag, ftr in match_rows:
        # ----- features from the history BEFORE this match -----
        columns['home_avg_goals'].append(ratio(scored[home], played[home]))
        columns['home_avg_conceded'].append(ratio(conceded[home], played[home]))
        columns['away_avg_goals'].append(ratio(scored[away], played[away]))
        columns['away_avg_conceded'].append(ratio(conceded[away], played[away]))
        columns['home_recent_form'].append(float(sum(last_points[home])))
        columns['away_recent_form'].append(float(sum(last_points[away])))
        columns['home_h2h_wins'].append(float(h2h_wins[(home, away)]))
        columns['away_h2h_wins'].append(float(h2h_wins[(away, home)]))
        columns['home_home_record'].append(ratio(home_wins[home], home_games[home]))
        columns['away_away_record'].append(ratio(away_wins[away], away_games[away]))

        # ----- fold this match into the running tallies -----
        home_goals = goal_count(fthg)
        away_goals = goal_count(ftag)
        played[home] += 1
        played[away] += 1
        scored[home] += home_goals
        conceded[home] += away_goals
        scored[away] += away_goals
        conceded[away] += home_goals
        home_games[home] += 1
        away_games[away] += 1

        if ftr == 'H':
            home_points, away_points = 3, 0
            home_wins[home] += 1
            h2h_wins[(home, away)] += 1
        elif ftr == 'A':
            home_points, away_points = 0, 3
            away_wins[away] += 1
            h2h_wins[(away, home)] += 1
        elif ftr == 'D':
            home_points, away_points = 1, 1
        else:
            # Unknown or missing result: nobody gets points
            home_points, away_points = 0, 0
        last_points[home].append(home_points)
        last_points[away].append(away_points)

    # Positional assignment: independent of the index labels of `data`
    for column_name, values in columns.items():
        features[column_name] = np.asarray(values, dtype=float)

    return features

# Pre-match features for every match, with form measured over the last 5 matches
features_df = create_features_before_match(data=final, recent_matches=5)

In [33]:
# Rows of the 2019-2020 season only
is_2019_2020 = features_df['Season'].eq('2019-2020')
season_2019_2020 = features_df.loc[is_2019_2020]

I want to briefly check what was done: let us look at the data at row 379.

In [34]:
# Matches of the 2019-2020 season in which West Ham played, home or away
involves_west_ham = (
    season_2019_2020['HomeTeam'].eq('West Ham')
    | season_2019_2020['AwayTeam'].eq('West Ham')
)
west_ham_2019_2020 = season_2019_2020.loc[involves_west_ham]

In [35]:
# Show the teams, kick-off date/time and rest days to eyeball the days-rest computation
west_ham_2019_2020.loc[:, ['HomeTeam', 'AwayTeam', 'Date', 'Time', 'home_days_rest', 'away_days_rest']]

Unnamed: 0,HomeTeam,AwayTeam,Date,Time,home_days_rest,away_days_rest
1,West Ham,Man City,2019-08-10,12:30,0,0
13,Brighton,West Ham,2019-08-17,15:00,7,7
26,Watford,West Ham,2019-08-24,15:00,7,7
36,West Ham,Norwich,2019-08-31,15:00,7,7
49,Aston Villa,West Ham,2019-09-16,20:00,16,16
57,West Ham,Man United,2019-09-22,14:00,6,8
60,Bournemouth,West Ham,2019-09-28,15:00,8,6
74,West Ham,Crystal Palace,2019-10-05,17:30,7,7
80,Everton,West Ham,2019-10-19,12:30,14,14
94,West Ham,Sheffield United,2019-10-26,15:00,7,5
