In [1]:
import pandas as pd

# Load the processed La Liga match table.
# The path is written as a raw string so the Windows backslashes are taken
# literally; in a normal string literal "\d", "\p", "\F" and "\l" are invalid
# escape sequences (deprecated, SyntaxWarning on recent Python versions).
# NOTE(review): this is an absolute, machine-specific path — consider a
# configurable data directory so the notebook runs on other machines.
data = pd.read_csv(r'D:\dev\project\Football-Match-Prediction\data\processed\la_liga_data.csv')
# Convert the 'Time' column to pandas datetimes.
data['Time'] = pd.to_datetime(data['Time'])
# Print every column with its dtype.
data.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5520 entries, 0 to 5519
Data columns (total 165 columns):
 #    Column                Dtype         
---   ------                -----         
 0    Time                  datetime64[ns]
 1    Comp                  object        
 2    Round                 int64         
 3    Day                   int64         
 4    Venue                 int64         
 5    Result                object        
 6    GF                    float64       
 7    GA                    float64       
 8    Opponent              object        
 9    xG                    float64       
 10   xGA                   float64       
 11   Captain               object        
 12   Formation             object        
 13   Opp Formation         object        
 14   Referee               object        
 15   Standard__Gls         int64         
 16   Standard__Sh          float64       
 17   Standard__SoT         float64       
 18   Standard__SoT%        floa

In [2]:
# Preview the first five rows of the loaded table.
data.head(n=5)

Unnamed: 0,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,xG,...,Performance__OG,Performance__Recov,Aerial Duels__Won,Aerial Duels__Lost,Aerial Duels__Won%,Season,Team,Points,Poss,GD
0,2017-08-18 20:15:00,La Liga,1,5,0,L,0.0,1.0,Leganes,1.1,...,0.0,60.0,16.0,30.0,34.8,2017,Alaves,0,47.0,-1.0
1,2017-08-18 20:15:00,La Liga,1,5,1,W,1.0,0.0,Alaves,1.3,...,0.0,55.0,30.0,16.0,65.2,2017,Leganes,3,53.0,1.0
2,2017-08-18 22:15:00,La Liga,1,5,0,L,0.0,1.0,Valencia,0.2,...,0.0,42.0,6.0,8.0,42.9,2017,Las Palmas,0,53.0,-1.0
3,2017-08-18 22:15:00,La Liga,1,5,1,W,1.0,0.0,Las Palmas,1.4,...,0.0,54.0,8.0,6.0,57.1,2017,Valencia,3,47.0,1.0
4,2017-08-19 18:15:00,La Liga,1,6,1,L,2.0,3.0,Real Sociedad,1.8,...,0.0,48.0,11.0,12.0,47.8,2017,Celta Vigo,0,53.0,-1.0


In [3]:
# Drop six columns that the later cells shown here do not use.
# The result is reassigned instead of dropping in place, and missing columns
# are ignored, so re-running this cell does not raise KeyError once the
# columns are already gone.
data = data.drop(
    columns=['Comp', 'Day', 'Captain', 'Formation', 'Opp Formation', 'Referee'],
    errors='ignore',
)

#### Get current form features

In [4]:
# Per-match columns that are averaged over each team's previous matches.
current_form_features = [
    'GF',
    'GA',
    'GD',
    'Standard__SoT',
    'Poss',
    'Performance__Save%',
]
# Number of previous matches covered by the rolling window.
window_size = 5

In [5]:
# Order every team's matches chronologically.
# Sorting on 'Round' ahead of 'Time' would place a rescheduled fixture at its
# nominal round, so the shifted rolling/cumulative features below could
# include a match that had not yet been played at prediction time.
data = data.sort_values(['Team', 'Time'])

# Columns carried into the training table; form features are appended below.
train_data = data[['Time', 'Season', 'Venue', 'Round', 'Team', 'Opponent', 'GF', 'GA', 'Result']].copy()

# Rolling mean of each form feature over the team's previous `window_size`
# matches. shift() moves the window back by one row so the current match is
# excluded; min_periods=1 gives a value as soon as one earlier match exists,
# leaving NaN only for a team's first row. The grouping is by 'Team' only, so
# the window runs across season boundaries.
# The column suffix follows `window_size` so the name cannot disagree with
# the window actually used (it is "_avg_last5" for window_size == 5).
for feature in current_form_features:
    train_data[f"{feature}_avg_last{window_size}"] = data.groupby('Team')[feature].transform(
        lambda x: x.shift().rolling(window=window_size, min_periods=1).mean()
    )

# Running total of 'Points' within each (Team, Season) before the current
# match. The shift leaves NaN on the first row of each group, which fillna
# turns into 0.
train_data['CurrentPoints'] = data.groupby(['Team', 'Season'])['Points'].transform(
    lambda x: x.shift().cumsum()
).fillna(0)

In [6]:
# Arrange rows by team, then season, round and kick-off time, and show the
# first 45 rows of the running points total.
train_data = train_data.sort_values(by=['Team', 'Season', 'Round', 'Time'])
train_data[['Team', 'Season', 'Round', 'Time', 'CurrentPoints']].iloc[:45]

Unnamed: 0,Team,Season,Round,Time,CurrentPoints
0,Alaves,2017,1,2017-08-18 20:15:00,
25,Alaves,2017,2,2017-08-26 18:15:00,0.0
55,Alaves,2017,3,2017-09-10 18:30:00,0.0
70,Alaves,2017,4,2017-09-17 12:00:00,0.0
88,Alaves,2017,5,2017-09-20 21:00:00,0.0
103,Alaves,2017,6,2017-09-23 16:15:00,0.0
127,Alaves,2017,7,2017-09-30 18:30:00,0.0
147,Alaves,2017,8,2017-10-14 18:30:00,3.0
162,Alaves,2017,9,2017-10-21 16:15:00,3.0
181,Alaves,2017,10,2017-10-28 13:00:00,3.0


In [None]:
def _one_side(matches, venue, prefix, team_as, opponent_as, key_cols):
    """Return the rows of one venue with side-specific column names.

    Parameters
    ----------
    matches : DataFrame with 'Venue', 'Team' and 'Opponent' columns.
    venue : value of 'Venue' to keep (1 for the home rows, 0 for the away rows).
    prefix : text prepended (with an underscore) to every column not in `key_cols`.
    team_as, opponent_as : names given to the 'Team' and 'Opponent' columns.
    key_cols : columns left unprefixed because both sides share them.

    Returns
    -------
    DataFrame without 'Venue', with the prefixed columns first and the two
    team-name columns appended last.
    """
    side = matches[matches['Venue'] == venue]
    stats = side.drop(columns=['Venue', 'Team', 'Opponent'])
    stats = stats.rename(columns=lambda c: c if c in key_cols else f"{prefix}_{c}")
    # Team names go last so the merged column order matches the previous layout.
    return stats.assign(**{team_as: side['Team'], opponent_as: side['Opponent']})


# Columns that identify a match and are identical for both sides.
prematch_cols = ['Time', 'Round', 'Season']
match_keys = prematch_cols + ['HomeTeam', 'AwayTeam']

# Home rows (Venue == 1): 'Team' is the home side, 'Opponent' the away side.
df_team = _one_side(train_data, 1, 'HomeTeam', 'HomeTeam', 'AwayTeam', prematch_cols)
# Away rows (Venue == 0): 'Team' is the away side, 'Opponent' the home side.
df_opponent = _one_side(train_data, 0, 'AwayTeam', 'AwayTeam', 'HomeTeam', prematch_cols)

# Join the two sides into one row per match. validate='one_to_one' raises
# MergeError if a match key is duplicated on either side instead of silently
# multiplying rows.
train_data_merged = pd.merge(df_team, df_opponent, on=match_keys, validate='one_to_one')
# Keep one goals column per side and a single result column (the home one).
# NOTE(review): presumably the dropped columns are redundant with the other
# side's GF / Result — confirm.
train_data_merged = train_data_merged.drop(columns=['HomeTeam_GA', 'AwayTeam_GA', 'AwayTeam_Result'])

In [11]:
# Order matches by season, round and kick-off time, renumber the rows from 0,
# and show the first 45.
train_data_merged = (
    train_data_merged
    .sort_values(by=['Season', 'Round', 'Time'])
    .reset_index(drop=True)
)
train_data_merged.iloc[:45]

Unnamed: 0,Time,Season,Round,HomeTeam_GF,HomeTeam_Result,HomeTeam_GF_avg_last5,HomeTeam_GA_avg_last5,HomeTeam_GD_avg_last5,HomeTeam_Standard__SoT_avg_last5,HomeTeam_Poss_avg_last5,...,HomeTeam,AwayTeam,AwayTeam_GF,AwayTeam_GF_avg_last5,AwayTeam_GA_avg_last5,AwayTeam_GD_avg_last5,AwayTeam_Standard__SoT_avg_last5,AwayTeam_Poss_avg_last5,AwayTeam_Performance__Save%_avg_last5,AwayTeam_CurrentPoints
0,2017-08-18 20:15:00,2017,1,1.0,W,,,,,,...,Leganes,Alaves,0.0,,,,,,,
1,2017-08-18 22:15:00,2017,1,1.0,W,,,,,,...,Valencia,Las Palmas,0.0,,,,,,,
2,2017-08-19 18:15:00,2017,1,2.0,L,,,,,,...,Celta Vigo,Real Sociedad,3.0,,,,,,,
3,2017-08-19 20:15:00,2017,1,2.0,D,,,,,,...,Girona,Atletico Madrid,2.0,,,,,,,
4,2017-08-19 22:15:00,2017,1,1.0,D,,,,,,...,Sevilla,Espanyol,1.0,,,,,,,
5,2017-08-20 18:15:00,2017,1,0.0,D,,,,,,...,Athletic Club,Getafe,0.0,,,,,,,
6,2017-08-20 20:15:00,2017,1,2.0,W,,,,,,...,Barcelona,Real Betis,0.0,,,,,,,
7,2017-08-20 22:15:00,2017,1,0.0,L,,,,,,...,Deportivo La Coruna,Real Madrid,3.0,,,,,,,
8,2017-08-21 20:15:00,2017,1,1.0,W,,,,,,...,Levante,Villarreal,0.0,,,,,,,
9,2017-08-21 22:00:00,2017,1,0.0,L,,,,,,...,Malaga,Eibar,1.0,,,,,,,
