In [111]:
import pandas as pd
# Display every column when a DataFrame is rendered (None removes the column
# limit), since the match table in this notebook is very wide.
pd.set_option('display.max_columns', None)

In [112]:
# Load the preprocessed 2020-21 match table. The 'date' column is converted to
# datetimes so that later cells can order each team's matches chronologically.
df = pd.read_csv("../data/02_preprocessed/2020-2021.csv")
df = df.assign(date=pd.to_datetime(df['date']))
df.head()

Unnamed: 0,date,time,round,home_team,away_team,venue,result,home_goals,away_goals,home_poss,away_poss,home_xg,away_xg,home_sh,away_sh,home_shot_on_target,away_shot_on_target,home_dist_covered,away_dist_covered,home_formation,away_formation,season
0,2020-09-12,12:30,Matchweek 1,Fulham,Arsenal,Home,L,0,3,46.0,54.0,0.1,1.9,5.0,13.0,2.0,5.0,25.1,14.1,4-2-3-1,3-4-3,2021
1,2020-09-12,15:00,Matchweek 1,Crystal Palace,Southampton,Home,W,1,0,31.0,69.0,1.1,0.9,5.0,9.0,3.0,5.0,10.6,15.3,4-4-2,4-4-2,2021
2,2020-09-12,17:30,Matchweek 1,Liverpool,Leeds United,Home,W,4,3,49.0,51.0,2.7,0.3,20.0,6.0,4.0,3.0,18.4,19.9,4-3-3,4-1-4-1,2021
3,2020-09-12,20:00,Matchweek 1,West Ham,Newcastle Utd,Home,L,0,2,58.0,42.0,1.0,1.6,15.0,15.0,3.0,2.0,16.5,17.6,4-2-3-1,4-4-2,2021
4,2020-09-13,14:00,Matchweek 1,West Brom,Leicester City,Home,L,0,3,36.0,64.0,0.4,2.8,7.0,11.0,1.0,5.0,18.4,19.3,5-4-1,4-1-4-1,2021


In [113]:
# Encode the match outcome as an integer. The outcome is always expressed from
# the home team's point of view: win = 1, draw = 0, loss = -1.
results_map = {"W": 1, "D": 0, "L": -1}

# Work on a copy so the raw frame `df` stays available unchanged.
df_prep = df.copy()
# Direct dictionary lookup: any value other than W/D/L raises a KeyError.
df_prep['result'] = df_prep['result'].apply(results_map.__getitem__)
df_prep.head()

Unnamed: 0,date,time,round,home_team,away_team,venue,result,home_goals,away_goals,home_poss,away_poss,home_xg,away_xg,home_sh,away_sh,home_shot_on_target,away_shot_on_target,home_dist_covered,away_dist_covered,home_formation,away_formation,season
0,2020-09-12,12:30,Matchweek 1,Fulham,Arsenal,Home,-1,0,3,46.0,54.0,0.1,1.9,5.0,13.0,2.0,5.0,25.1,14.1,4-2-3-1,3-4-3,2021
1,2020-09-12,15:00,Matchweek 1,Crystal Palace,Southampton,Home,1,1,0,31.0,69.0,1.1,0.9,5.0,9.0,3.0,5.0,10.6,15.3,4-4-2,4-4-2,2021
2,2020-09-12,17:30,Matchweek 1,Liverpool,Leeds United,Home,1,4,3,49.0,51.0,2.7,0.3,20.0,6.0,4.0,3.0,18.4,19.9,4-3-3,4-1-4-1,2021
3,2020-09-12,20:00,Matchweek 1,West Ham,Newcastle Utd,Home,-1,0,2,58.0,42.0,1.0,1.6,15.0,15.0,3.0,2.0,16.5,17.6,4-2-3-1,4-4-2,2021
4,2020-09-13,14:00,Matchweek 1,West Brom,Leicester City,Home,-1,0,3,36.0,64.0,0.4,2.8,7.0,11.0,1.0,5.0,18.4,19.3,5-4-1,4-1-4-1,2021


In [114]:
# Rolling "last 5 matches" form features for every team, regardless of venue.
#
# For each team we build a team-perspective view of all of its matches (home
# and away), average each stat over the team's previous five matches, and write
# the values back onto the match rows as home_* / away_* columns.
#
# The write-back relies on index labels: the per-team frames are slices of
# df_prep, so they keep df_prep's index and each rolling value can be aligned
# to its own match row directly. This assumes df_prep has a unique index.

# Team-perspective stat -> (source column when the team is at home,
#                           source column when the team is away).
team_stat_sources = {
    'team_goals': ('home_goals', 'away_goals'),
    'team_xg': ('home_xg', 'away_xg'),
    'team_shots': ('home_sh', 'away_sh'),
    'team_shots_on_target': ('home_shot_on_target', 'away_shot_on_target'),
    'team_poss': ('home_poss', 'away_poss'),
    'team_goals_conceded': ('away_goals', 'home_goals'),
    'team_xg_conceded': ('away_xg', 'home_xg'),
}

# Output feature suffix -> team-perspective column that it averages.
rolling_features = {
    'form_last_5': 'result',
    'avg_goals_last_5': 'team_goals',
    'avg_goals_conceded_last_5': 'team_goals_conceded',
    'avg_xg_last_5': 'team_xg',
    'avg_xg_conceded_last_5': 'team_xg_conceded',
    'avg_poss_last_5': 'team_poss',
    'avg_shots_last_5': 'team_shots',
    'avg_shots_on_target_last_5': 'team_shots_on_target',
}

# Create (or reset) the output columns up front so they always exist, appear in
# a fixed order (all home_* then all away_*), and never keep values from an
# earlier run of this cell.
for side in ('home', 'away'):
    for feature in rolling_features:
        df_prep[f'{side}_{feature}'] = float('nan')

for team in df_prep['home_team'].unique():
    home_games = df_prep[df_prep['home_team'] == team]
    away_games = df_prep[df_prep['away_team'] == team]

    # Re-express both sets of matches from this team's point of view.
    home_view = pd.DataFrame({
        'date': home_games['date'],
        'result': home_games['result'],
        **{stat: home_games[home_col]
           for stat, (home_col, away_col) in team_stat_sources.items()},
    })
    away_view = pd.DataFrame({
        'date': away_games['date'],
        # 'result' is stored from the home side's perspective, so flip it.
        'result': -away_games['result'],
        **{stat: away_games[away_col]
           for stat, (home_col, away_col) in team_stat_sources.items()},
    })

    # All of the team's matches in chronological order.
    team_games = pd.concat([home_view, away_view]).sort_values('date')

    # Mean over up to five matches, then shift by one so that each match only
    # sees matches played before it (a team's first match therefore gets NaN).
    rolled = (
        team_games[list(rolling_features.values())]
        .rolling(5, min_periods=1)
        .mean()
        .shift(1)
    )

    # Write the values back onto the team's rows; Series assignment through
    # .loc aligns on the shared index labels.
    for side, games in (('home', home_games), ('away', away_games)):
        for feature, source in rolling_features.items():
            df_prep.loc[games.index, f'{side}_{feature}'] = rolled.loc[games.index, source]



In [115]:
# Preview the first ten matches to check the new rolling columns: they are NaN
# for a team's first match because there is no earlier match to average.
df_prep.head(10)

Unnamed: 0,date,time,round,home_team,away_team,venue,result,home_goals,away_goals,home_poss,away_poss,home_xg,away_xg,home_sh,away_sh,home_shot_on_target,away_shot_on_target,home_dist_covered,away_dist_covered,home_formation,away_formation,season,home_form_last_5,home_avg_goals_last_5,home_avg_goals_conceded_last_5,home_avg_xg_last_5,home_avg_xg_conceded_last_5,home_avg_poss_last_5,home_avg_shots_last_5,home_avg_shots_on_target_last_5,away_form_last_5,away_avg_goals_last_5,away_avg_goals_conceded_last_5,away_avg_xg_last_5,away_avg_xg_conceded_last_5,away_avg_poss_last_5,away_avg_shots_last_5,away_avg_shots_on_target_last_5
0,2020-09-12,12:30,Matchweek 1,Fulham,Arsenal,Home,-1,0,3,46.0,54.0,0.1,1.9,5.0,13.0,2.0,5.0,25.1,14.1,4-2-3-1,3-4-3,2021,,,,,,,,,,,,,,,,
1,2020-09-12,15:00,Matchweek 1,Crystal Palace,Southampton,Home,1,1,0,31.0,69.0,1.1,0.9,5.0,9.0,3.0,5.0,10.6,15.3,4-4-2,4-4-2,2021,,,,,,,,,,,,,,,,
2,2020-09-12,17:30,Matchweek 1,Liverpool,Leeds United,Home,1,4,3,49.0,51.0,2.7,0.3,20.0,6.0,4.0,3.0,18.4,19.9,4-3-3,4-1-4-1,2021,,,,,,,,,,,,,,,,
3,2020-09-12,20:00,Matchweek 1,West Ham,Newcastle Utd,Home,-1,0,2,58.0,42.0,1.0,1.6,15.0,15.0,3.0,2.0,16.5,17.6,4-2-3-1,4-4-2,2021,,,,,,,,,,,,,,,,
4,2020-09-13,14:00,Matchweek 1,West Brom,Leicester City,Home,-1,0,3,36.0,64.0,0.4,2.8,7.0,11.0,1.0,5.0,18.4,19.3,5-4-1,4-1-4-1,2021,,,,,,,,,,,,,,,,
5,2020-09-13,16:30,Matchweek 1,Tottenham,Everton,Home,-1,0,1,52.0,48.0,1.1,1.2,9.0,15.0,5.0,4.0,12.6,17.8,4-2-3-1,4-3-3,2021,,,,,,,,,,,,,,,,
6,2020-09-14,18:00,Matchweek 1,Sheffield Utd,Wolves,Home,-1,0,2,55.0,45.0,1.0,1.4,9.0,11.0,1.0,4.0,12.9,17.2,3-5-2,3-4-3,2021,,,,,,,,,,,,,,,,
7,2020-09-14,20:15,Matchweek 1,Brighton,Chelsea,Home,-1,1,3,52.0,48.0,1.1,1.2,13.0,9.0,3.0,4.0,17.6,21.3,3-5-2,4-2-2-2,2021,,,,,,,,,,,,,,,,
8,2020-09-19,12:30,Matchweek 2,Everton,West Brom,Home,1,5,2,71.0,29.0,3.9,0.3,17.0,6.0,7.0,4.0,12.0,22.2,4-3-3,5-4-1,2021,1.0,1.0,0.0,1.2,1.1,48.0,15.0,4.0,-1.0,0.0,3.0,0.4,2.8,36.0,7.0,1.0
9,2020-09-19,15:00,Matchweek 2,Leeds United,Fulham,Home,1,4,3,51.0,49.0,1.4,1.7,9.0,13.0,6.0,5.0,16.9,21.5,4-1-4-1,4-2-3-1,2021,-1.0,3.0,4.0,0.3,2.7,51.0,6.0,3.0,-1.0,0.0,3.0,0.1,1.9,46.0,5.0,2.0


In [116]:
# One-hot encode the home and away team names separately, producing integer
# indicator columns named 'home_<team>' and 'away_<team>'.
dummies = pd.get_dummies(df_prep[['home_team', 'away_team']], prefix=['home', 'away'], dtype=int)

# Append the indicator columns to the match table. Any indicator columns left
# over from a previous run of this cell are dropped first, so re-running the
# cell does not create duplicate column labels.
df_prep = pd.concat(
    [df_prep.drop(columns=dummies.columns, errors='ignore'), dummies],
    axis=1,
)

In [118]:
# List the column labels to confirm the rolling and indicator columns exist.
df_prep.columns

Index(['date', 'time', 'round', 'home_team', 'away_team', 'venue', 'result',
       'home_goals', 'away_goals', 'home_poss', 'away_poss', 'home_xg',
       'away_xg', 'home_sh', 'away_sh', 'home_shot_on_target',
       'away_shot_on_target', 'home_dist_covered', 'away_dist_covered',
       'home_formation', 'away_formation', 'season', 'home_form_last_5',
       'home_avg_goals_last_5', 'home_avg_goals_conceded_last_5',
       'home_avg_xg_last_5', 'home_avg_xg_conceded_last_5',
       'home_avg_poss_last_5', 'home_avg_shots_last_5',
       'home_avg_shots_on_target_last_5', 'away_form_last_5',
       'away_avg_goals_last_5', 'away_avg_goals_conceded_last_5',
       'away_avg_xg_last_5', 'away_avg_xg_conceded_last_5',
       'away_avg_poss_last_5', 'away_avg_shots_last_5',
       'away_avg_shots_on_target_last_5', 'home_Arsenal', 'home_Aston Villa',
       'home_Brighton', 'home_Burnley', 'home_Chelsea', 'home_Crystal Palace',
       'home_Everton', 'home_Fulham', 'home_Leeds United',
 

In [71]:
# Head-to-head comparison features built from the rolling five-match averages.
# Each entry is (new column, minuend column, subtrahend column).
matchup_features = [
    # Recent form: home minus away.
    ('form_difference', 'home_form_last_5', 'away_form_last_5'),
    # Attacking / control measures: home minus away.
    ('goals_difference', 'home_avg_goals_last_5', 'away_avg_goals_last_5'),
    ('xg_difference', 'home_avg_xg_last_5', 'away_avg_xg_last_5'),
    ('poss_difference', 'home_avg_poss_last_5', 'away_avg_poss_last_5'),
    # Defence is away minus home, so a positive value means the home side has
    # been conceding fewer goals than the away side.
    ('defensive_difference', 'away_avg_goals_conceded_last_5', 'home_avg_goals_conceded_last_5'),
]

for feature_name, minuend, subtrahend in matchup_features:
    df_prep[feature_name] = df_prep[minuend] - df_prep[subtrahend]

In [77]:
# Remove the 'season' column. The frame is rebound rather than mutated in
# place, and errors='ignore' keeps the cell re-runnable: a second run no longer
# raises KeyError because the column has already been removed.
df_prep = df_prep.drop(columns=['season'], errors='ignore')

In [78]:
# Display the engineered feature table.
df_prep

Unnamed: 0,date,time,round,result,home_goals,away_goals,home_poss,away_poss,home_xg,away_xg,home_sh,away_sh,home_shot_on_target,away_shot_on_target,home_dist_covered,away_dist_covered,home_form_last_5,home_avg_goals_last_5,home_avg_goals_conceded_last_5,home_avg_xg_last_5,home_avg_xg_conceded_last_5,home_avg_poss_last_5,home_avg_shots_last_5,home_avg_shots_on_target_last_5,away_form_last_5,away_avg_goals_last_5,away_avg_goals_conceded_last_5,away_avg_xg_last_5,away_avg_xg_conceded_last_5,away_avg_poss_last_5,away_avg_shots_last_5,away_avg_shots_on_target_last_5,home_Arsenal,home_Aston Villa,home_Brighton,home_Burnley,home_Chelsea,home_Crystal Palace,home_Everton,home_Fulham,home_Leeds United,home_Leicester City,home_Liverpool,home_Manchester City,home_Manchester Utd,home_Newcastle Utd,home_Sheffield Utd,home_Southampton,home_Tottenham,home_West Brom,home_West Ham,home_Wolves,away_Arsenal,away_Aston Villa,away_Brighton,away_Burnley,away_Chelsea,away_Crystal Palace,away_Everton,away_Fulham,away_Leeds United,away_Leicester City,away_Liverpool,away_Manchester City,away_Manchester Utd,away_Newcastle Utd,away_Sheffield Utd,away_Southampton,away_Tottenham,away_West Brom,away_West Ham,away_Wolves,form_difference,goals_difference,xg_difference,poss_difference,defensive_difference
0,2020-09-12,12:30,Matchweek 1,-1,0,3,46.0,54.0,0.1,1.9,5.0,13.0,2.0,5.0,25.1,14.1,,,,,,,,,,,,,,,,,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,,,,,
1,2020-09-12,15:00,Matchweek 1,1,1,0,31.0,69.0,1.1,0.9,5.0,9.0,3.0,5.0,10.6,15.3,,,,,,,,,,,,,,,,,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,,,,,
2,2020-09-12,17:30,Matchweek 1,1,4,3,49.0,51.0,2.7,0.3,20.0,6.0,4.0,3.0,18.4,19.9,,,,,,,,,,,,,,,,,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,,,,,
3,2020-09-12,20:00,Matchweek 1,-1,0,2,58.0,42.0,1.0,1.6,15.0,15.0,3.0,2.0,16.5,17.6,,,,,,,,,,,,,,,,,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,,,,,
4,2020-09-13,14:00,Matchweek 1,-1,0,3,36.0,64.0,0.4,2.8,7.0,11.0,1.0,5.0,18.4,19.3,,,,,,,,,,,,,,,,,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
375,2021-05-23,16:00,Matchweek 38,1,2,0,69.0,31.0,1.6,0.6,19.0,5.0,5.0,4.0,14.7,19.4,0.8,2.4,0.8,2.26,1.16,60.8,19.8,6.4,-0.2,1.4,2.0,1.46,1.36,39.4,15.0,5.8,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,1.0,1.0,0.80,21.4,1.2
376,2021-05-23,16:00,Matchweek 38,1,5,0,67.0,33.0,2.5,1.1,21.0,7.0,11.0,2.0,15.6,23.1,0.2,2.2,1.8,1.42,1.12,61.4,14.2,3.8,0.0,0.6,0.6,1.12,1.00,44.8,14.0,4.2,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,0.2,1.6,0.30,16.6,-1.2
377,2021-05-23,16:00,Matchweek 38,1,1,0,43.0,57.0,0.5,1.1,12.0,10.0,3.0,3.0,17.4,13.4,-0.2,0.4,1.4,0.80,1.84,40.6,8.6,1.6,-0.2,1.4,1.8,1.52,1.56,41.6,12.2,4.0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,0.0,-1.0,-0.72,-1.0,0.4
378,2021-05-23,16:00,Matchweek 38,1,3,0,38.0,62.0,1.3,1.5,14.0,17.0,7.0,5.0,15.9,17.0,0.0,1.2,1.0,1.82,1.58,53.0,15.4,3.4,0.0,1.4,1.4,1.28,1.52,47.2,11.6,4.6,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,0.0,-0.2,0.54,5.8,0.4


In [None]:
# Explore the distinct home formations as a possible future feature.
# NOTE(review): the output below shows some values with a trailing '◆'
# (e.g. '3-4-3◆'); these would presumably need normalising before encoding.
df['home_formation'].unique() # interested in turning formations into a feature

array(['4-2-3-1', '4-4-2', '4-3-3', '5-4-1', '3-5-2', '4-1-4-1', '3-4-3',
       '5-3-2', '3-1-4-2', '4-3-2-1', '4-1-2-1-2◆', '3-4-1-2', '3-4-3◆',
       '3-3-3-1', '4-4-1-1', '4-3-1-2', '4-2-2-2', '4-1-3-2', '3-5-1-1'],
      dtype=object)