In [1]:
import pandas as pd
import numpy as np
# Load the processed La Liga match table.
# The Windows path is written as a raw string so the backslashes are taken
# literally; in a normal string literal sequences such as "\d" or "\p" are
# invalid escapes and trigger a SyntaxWarning on current Python versions.
data = pd.read_csv(r'D:\dev\project\Football-Match-Prediction\data\processed\la_liga_data.csv')
# Parse 'Time' as datetime so later cells can sort and compare matches by it.
data['Time'] = pd.to_datetime(data['Time'])
data.info(verbose=True)

In [2]:
# Show the first rows of the loaded frame as a quick sanity check.
data.head()

In [3]:
# Drop the listed match-metadata columns from the working frame.
data = data.drop(
    columns=[
        'Comp',
        'Day',
        'Captain',
        'Formation',
        'Opp Formation',
        'Referee',
    ]
)

In [4]:
# Impute missing numeric values with the mean of the same column for the same
# team. The per-team means are computed once for all numeric columns instead of
# re-running the groupby for every single column.
numeric_cols = list(data.select_dtypes(include=[np.number]).columns)
if numeric_cols:
    team_means = data.groupby('Team')[numeric_cols].transform('mean')
    for col in numeric_cols:
        # A value stays NaN when the team has no observed value for this column.
        data[col] = data[col].fillna(team_means[col])

#### Get current form features

In [5]:
# Number of previous matches aggregated by the rolling form features.
window_size = 5

# Feature names that are removed from the form-feature list.
excluded_features = {'Performance__GA', 'Standard__Gls'}

feature_names = pd.read_csv(
    'D:/dev/project/Football-Match-Prediction/src/eda/eda_data/selected/all_feature_names.csv'
)['Feature_Name']

# De-duplicate while keeping the order of the file. Going through a set (as
# before) made the feature order, and therefore the column order of every
# derived frame and exported CSV, vary between kernel restarts.
current_form_features = [
    name for name in dict.fromkeys(feature_names) if name not in excluded_features
]
current_form_features

In [6]:
# Order rows by Team, Season, Round and Time. The shifted rolling windows and
# the cumulative points below operate on this row order within each group.
# NOTE(review): this assumes Season/Round order equals chronological order for
# every team (e.g. numeric rounds, no rescheduled fixtures) - confirm, otherwise
# a window could include a match that was actually played later.
data = data.sort_values(['Team', 'Season', 'Round', 'Time'])

base_columns = ['Time', 'Season', 'Venue', 'Round', 'Team', 'Opponent', 'GF', 'GA', 'Result']


def _previous_matches(series):
    """Rolling window over up to `window_size` earlier rows, excluding the current row."""
    return series.shift().rolling(window=window_size, min_periods=1)


def _rolling_mean_before(series):
    """Mean of the previous matches' values."""
    return _previous_matches(series).mean()


def _rolling_sum_before(series):
    """Sum of the previous matches' values."""
    return _previous_matches(series).sum()


# Collect all form columns first and attach them in a single concat; adding
# them one at a time inserts a new block per column and fragments the frame.
matches_by_team = data.groupby('Team')
form_columns = {}
for feature in current_form_features:
    # 'Performance__CS' is summed over the window, every other feature is
    # averaged. The '_avg_last5' suffix is kept for both so that existing
    # column names do not change.
    aggregate = _rolling_sum_before if feature == 'Performance__CS' else _rolling_mean_before
    form_columns[f"{feature}_avg_last5"] = matches_by_team[feature].transform(aggregate)

train_data = pd.concat(
    [data[base_columns], pd.DataFrame(form_columns, index=data.index)],
    axis=1,
)

# Points accumulated by the team in the season before the current row;
# the first row of each (Team, Season) group has no history and gets 0.
train_data['CurrentPoints'] = (
    data.groupby(['Team', 'Season'])['Points']
    .transform(lambda points: points.shift().cumsum())
    .fillna(0)
)

In [7]:
# Re-order by team and fixture order, then show the running points for the
# first 45 rows as a visual check.
train_data = train_data.sort_values(by=['Team', 'Season', 'Round', 'Time'])
train_data.loc[:, ['Team', 'Season', 'Round', 'Time', 'CurrentPoints']].head(45)

In [8]:
# Columns describing the fixture itself; together with the two team names they
# identify a match and are therefore left without a prefix.
prematch_cols = ['Time', 'Round', 'Season']
match_keys = prematch_cols + ['HomeTeam', 'AwayTeam']


def _venue_side(rows, venue_flag, own_label, rival_label):
    """Return the rows with the given Venue flag, reshaped for the merge.

    'Team' becomes `own_label`, 'Opponent' becomes `rival_label`, 'Venue' is
    dropped, and every column that is not a match key is prefixed with
    `own_label` followed by an underscore.
    """
    side = rows[rows['Venue'] == venue_flag].copy()
    # pop + assign moves the team columns to the end, in this order.
    side[own_label] = side.pop('Team')
    side[rival_label] = side.pop('Opponent')
    side = side.drop(columns=['Venue'])
    prefixed = {
        name: f"{own_label}_{name}" for name in side.columns if name not in match_keys
    }
    return side.rename(columns=prefixed)


# Venue == 1 rows are the home side's view of a match, Venue == 0 the away side's.
df_team = _venue_side(train_data, 1, 'HomeTeam', 'AwayTeam')
df_opponent = _venue_side(train_data, 0, 'AwayTeam', 'HomeTeam')

# Join both views into one row per match and drop the columns that duplicate
# information already present from the home side (goals against, away result).
train_data_merged = pd.merge(df_team, df_opponent, on=match_keys)
train_data_merged = train_data_merged.drop(
    columns=['HomeTeam_GA', 'AwayTeam_GA', 'AwayTeam_Result']
)

In [9]:
# Order matches by Season, Round and Time and renumber the index from 0.
train_data_merged = train_data_merged.sort_values(
    by=['Season', 'Round', 'Time'],
    ignore_index=True,
)

In [10]:
# List every column of the merged frame with its dtype and non-null count.
train_data_merged.info(verbose=True)

In [11]:
train_data = train_data.sort_values('Time')

h2h_columns = [
    'H2H_Total_Matches',
    'H2H_Home_Wins',
    'H2H_Away_Wins',
    'H2H_Draws',
    'H2H_Home_Goals',
    'H2H_Away_Goals',
]

# train_data is expected to hold every match twice, once from each team's
# perspective (the Venue == 1 / Venue == 0 rows that were merged earlier in
# this notebook). Head-to-head history is therefore read only from the rows
# where the current home team is 'Team'. Reading both perspectives counted
# each past match twice in the totals, draws and goals, while wins were
# counted once.
# Grouping once by (Team, Opponent) also avoids scanning the whole frame for
# every fixture.
history_by_pair = {
    pair: group for pair, group in train_data.groupby(['Team', 'Opponent'])
}

h2h_stats = []
fixtures = zip(
    train_data_merged['HomeTeam'],
    train_data_merged['AwayTeam'],
    train_data_merged['Time'],
)
for home_team, away_team, match_time in fixtures:
    history = history_by_pair.get((home_team, away_team))
    if history is None:
        total_matches = home_wins = away_wins = draws = home_goals = away_goals = 0
    else:
        # Only matches played strictly before this fixture may be used.
        past_matches = history[history['Time'] < match_time]
        total_matches = len(past_matches)

        # 'Result' is from the home team's point of view on these rows.
        home_wins = (past_matches['Result'] == 'W').sum()
        away_wins = (past_matches['Result'] == 'L').sum()
        draws = (past_matches['Result'] == 'D').sum()

        # GF / GA on these rows are goals for / against the current home team.
        home_goals = past_matches['GF'].sum()
        away_goals = past_matches['GA'].sum()

    h2h_stats.append({
        'H2H_Total_Matches': total_matches,
        'H2H_Home_Wins': home_wins,
        'H2H_Away_Wins': away_wins,
        'H2H_Draws': draws,
        'H2H_Home_Goals': home_goals,
        'H2H_Away_Goals': away_goals,
    })

# Naming the columns explicitly keeps them present even when there are no fixtures.
h2h_stats_df = pd.DataFrame(h2h_stats, columns=h2h_columns)

# Attach the head-to-head stats to the merged matches; both frames share the
# same row order and a fresh 0..n-1 index.
df_merged = pd.concat([train_data_merged.reset_index(drop=True), h2h_stats_df], axis=1)

In [15]:
# Put the matches in Time order and show the earliest ones.
df_merged = df_merged.sort_values(by='Time')
df_merged.head()

In [25]:
# Write the final feature table, ordered by Time, without the index column.
df_merged = df_merged.sort_values(by='Time')
df_merged.to_csv(
    'D:/dev/project/Football-Match-Prediction/data/processed/df_merged.csv',
    index=False,
)

In [12]:
# import pandas as pd

# # Ensure 'Time' is datetime and sort the DataFrame chronologically
# train_data_merged['Time'] = pd.to_datetime(train_data_merged['Time'])
# train_data_merged.sort_values('Time', inplace=True)

# # Create a unique identifier for each team pair
# train_data_merged['TeamPair'] = train_data_merged.apply(lambda x: '_'.join(sorted([x['HomeTeam'], x['AwayTeam']])), axis=1)

# # Sort by 'TeamPair' and 'Time' to prepare for cumulative calculations
# train_data_merged.sort_values(['TeamPair', 'Time'], inplace=True)
# train_data_merged.head()


In [13]:
# # Initialize the head-to-head features
# head_to_head_features = [
#     'HeadToHead_Matches',
#     'HeadToHead_HomeWins',
#     'HeadToHead_AwayWins',
#     'HeadToHead_Draws',
#     'HeadToHead_HomeGoals',
#     'HeadToHead_AwayGoals'
# ]

# for feature in head_to_head_features:
#     train_data_merged[feature] = 0

# # Calculate cumulative head-to-head statistics excluding current match
# train_data_merged['HeadToHead_Matches'] = train_data_merged.groupby('TeamPair').cumcount()

# # Home Wins
# train_data_merged['HeadToHead_HomeWins'] = train_data_merged.groupby('TeamPair')['HomeTeam_Result'].transform(lambda x: (x == 'W').cumsum()).shift(1).fillna(0)

# # Away Wins
# train_data_merged['HeadToHead_AwayWins'] = train_data_merged.groupby('TeamPair')['HomeTeam_Result'].transform(lambda x: (x == 'L').cumsum()).shift(1).fillna(0)

# # Draws
# train_data_merged['HeadToHead_Draws'] = train_data_merged.groupby('TeamPair')['HomeTeam_Result'].transform(lambda x: (x == 'D').cumsum()).shift(1).fillna(0)

# # Cumulative Goals
# train_data_merged['HeadToHead_HomeGoals'] = train_data_merged.groupby('TeamPair')['HomeTeam_GF'].transform('cumsum').shift(1).fillna(0)
# train_data_merged['HeadToHead_AwayGoals'] = train_data_merged.groupby('TeamPair')['AwayTeam_GF'].transform('cumsum').shift(1).fillna(0)

In [14]:
# Show every column when a frame is displayed.
pd.set_option('display.max_columns', None)

# Order the merged matches by Time and display the first rows.
train_data_merged = train_data_merged.sort_values(by='Time')
train_data_merged.head()