Import necessary libraries

In [130]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error

Import all data

In [150]:
seasons = range(2010,2023)
premier_match_data = {}
premier_team_data = {}
premier_player_data = {}

for i in seasons:
    match_data_file = f'..//data//england_premier_match//england-premier-league-matches-{i}-to-{i+1}-stats.csv'
    team_data_file = f'..//data//england_premier_team//england-premier-league-teams-{i}-to-{i+1}-stats.csv'
    player_data_file = f'..//data//england_premier_player//england-premier-league-players-{i}-to-{i+1}-stats.csv'

    premier_match_data[f'{i}_{i+1}'] = pd.read_csv(match_data_file)
    premier_team_data[f'{i}_{i+1}'] = pd.read_csv(team_data_file)
    premier_player_data[f'{i}_{i+1}'] = pd.read_csv(player_data_file)


Pre processing data
1. fill all missing value with -1 
2. Split goal scoring minutes into first half and second half
3. label encoding all object column
4. group previous three matches as the feature to train model.

In [151]:
def bin_goal_timings(goal_timings):
    # Parse the string into a list of integers
    if not isinstance(goal_timings, str):
        return (0,0)  # Return (0,0) if goal_timings is not a string

    if goal_timings=='-1':
        return (0,0)
    # Split the string into a list of times
    goal_times = goal_timings.split(',')

    first_half_goals = 0
    second_half_goals = 0

    for time in goal_times:
        # Check if it's stoppage time
        if "'" in time:
            time_parts = time.split("'")
            # Consider stoppage time as part of the second half
            if int(time_parts[0]) >= 45:
                second_half_goals += 1
        else:
            # Check if the goal was scored in the first or second half
            if int(time) <= 45:
                first_half_goals += 1
            else:
                second_half_goals += 1
    return first_half_goals, second_half_goals


In [152]:
# Create a list of all team names across all seasons
all_teams = []
all_referee = []
all_stadiums = []
for season in premier_match_data.keys():
    all_teams.extend(list(premier_match_data[season]['home_team_name'].unique()))
    all_referee.extend(list(premier_match_data[season]['referee'].unique()))
    all_stadiums.extend(list(premier_match_data[season]['stadium_name'].unique()))
    

In [153]:
le = LabelEncoder()
le_teams = LabelEncoder()
le_referee = LabelEncoder()
le_stadium = LabelEncoder()

le_teams.fit(all_teams)
le_referee.fit(all_referee)
le_stadium.fit(all_stadiums)


avoid_column = ['home_team_name','away_team_name','referee','stadium_name']
for season, df in premier_match_data.items():

    # Transform teams
    df['home_team_name'] = le_teams.transform(df['home_team_name'])
    df['away_team_name'] = le_teams.transform(df['away_team_name'])
    df['referee'] = le_referee.transform(df['referee'])
    df['stadium_name'] = le_stadium.transform(df['stadium_name'])

    premier_match_data[season] = df.fillna(-1,inplace=True)
   
    # Convert the column to string type
    for col in df.columns:
        
        if df[col].dtype == 'object' and col not in avoid_column:
            # Convert the column to string type
            df[col] = df[col].astype(str)
            
            # Apply the label encoder
            le.fit(df[col])
            df[col] = le.transform(df[col])

    
    premier_match_data[season] = df
    

Now we need to group previous 3 game stats as features of this game

In [154]:
def calculate_rolling_stats(group):
    # List of columns to skip
    columns_to_skip = ['timestamp', 'date_GMT', 'status', 'home_team_name', 'away_team_name', 'referee', 'Game Week']
    group = group.sort_values('Game Week')
    # Iterate over all columns in the group
    for col in group.columns:
        # Skip the column if it is in the list of columns to skip
        if col not in columns_to_skip:
            # The rolling window size is 3, which means the previous 3 games
            group['rolling_avg_' + col] = group[col].rolling(window=3).mean()

    # Shift the data down 1 so the current game's stats aren't included
    group = group.shift(1)

    # Drop the first two rows (which won't have any rolling data)
    group = group.iloc[3:]
    
    return group


Get all rolling data and remove those without. 
Leave the 2022-23 as a test dataset.

In [155]:
# Initialize empty dictionaries to hold the stats for all seasons
all_seasons_home_stats = {}
all_seasons_away_stats = {}

# Get a list of all season keys, sorted in ascending order
seasons = sorted(premier_match_data.keys())

# Use all seasons except the last one
for season in seasons[:-1]:
    # Apply the calculate_rolling_stats function to the home and away data for this season
    all_seasons_home_stats[season] = premier_match_data[season].groupby('home_team_name').apply(calculate_rolling_stats)
    all_seasons_away_stats[season] = premier_match_data[season].groupby('away_team_name').apply(calculate_rolling_stats)

# Convert the dictionaries to dataframes
all_seasons_home_stats_df = pd.concat(all_seasons_home_stats, keys=all_seasons_home_stats.keys())
all_seasons_away_stats_df = pd.concat(all_seasons_away_stats, keys=all_seasons_away_stats.keys())

# Now, the last season's data can be accessed separately like this
last_season_home_stats = premier_match_data[seasons[-1]].groupby('home_team_name').apply(calculate_rolling_stats)
last_season_away_stats = premier_match_data[seasons[-1]].groupby('away_team_name').apply(calculate_rolling_stats)


In [156]:
all_seasons_home_stats_df.to_csv("2022-23.csv", index=False)

Train

In [159]:
home_goals = all_seasons_home_stats_df['home_team_goal_count']
away_goals = all_seasons_away_stats_df['away_team_goal_count']

columns_to_drop_home = ['home_team_goal_count','total_goal_count','total_goals_at_half_time','home_team_goal_count_half_time','away_team_goal_count_half_time','home_team_goal_timings'
                   ,'away_team_goal_timings','home_team_corner_count','away_team_corner_count','home_team_yellow_cards','home_team_red_cards','away_team_yellow_cards','away_team_red_cards',
                   'home_team_first_half_cards','home_team_second_half_cards','away_team_first_half_cards','away_team_second_half_cards','home_team_shots','away_team_shots','home_team_shots_on_target',
                   'away_team_shots_on_target','home_team_shots_off_target','away_team_shots_off_target','home_team_fouls','away_team_fouls','home_team_possession','away_team_possession',
                   'status','attendance']

columns_to_drop_away = ['away_team_goal_count','total_goal_count','total_goals_at_half_time','home_team_goal_count_half_time','away_team_goal_count_half_time','home_team_goal_timings'
                   ,'away_team_goal_timings','home_team_corner_count','away_team_corner_count','home_team_yellow_cards','home_team_red_cards','away_team_yellow_cards','away_team_red_cards',
                   'home_team_first_half_cards','home_team_second_half_cards','away_team_first_half_cards','away_team_second_half_cards','home_team_shots','away_team_shots','home_team_shots_on_target',
                   'away_team_shots_on_target','home_team_shots_off_target','away_team_shots_off_target','home_team_fouls','away_team_fouls','home_team_possession','away_team_possession',
                   'status','attendance']

# Drop these columns from the predictor variables DataFrame
X_home = all_seasons_home_stats_df.drop(columns_to_drop_home, axis=1)
X_away = all_seasons_away_stats_df.drop(columns_to_drop_away, axis=1)

# Use train_test_split to split the data
X_train_home, X_test_home, y_home_train, y_home_test = train_test_split(X_home, home_goals, test_size=0.2, random_state=42)
X_train_away, X_test_away, y_away_train, y_away_test = train_test_split(X_away, away_goals, test_size=0.2, random_state=42)

In [160]:
X_home

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,timestamp,date_GMT,home_team_name,away_team_name,referee,Game Week,Pre-Match PPG (Home),Pre-Match PPG (Away),home_ppg,away_ppg,...,rolling_avg_odds_ft_home_team_win,rolling_avg_odds_ft_draw,rolling_avg_odds_ft_away_team_win,rolling_avg_odds_ft_over15,rolling_avg_odds_ft_over25,rolling_avg_odds_ft_over35,rolling_avg_odds_ft_over45,rolling_avg_odds_btts_yes,rolling_avg_odds_btts_no,rolling_avg_stadium_name
Unnamed: 0_level_1,home_team_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
2010_2011,1,70,1.285423e+09,189.0,1.0,35.0,21.0,6.0,3.00,0.00,1.95,0.89,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,16.0
2010_2011,1,90,1.287238e+09,162.0,1.0,3.0,20.0,8.0,2.00,0.67,1.95,0.68,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,16.0
2010_2011,1,107,1.288447e+09,173.0,1.0,36.0,24.0,10.0,2.25,0.50,1.95,0.68,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,16.0
2010_2011,1,130,1.289137e+09,135.0,1.0,23.0,23.0,11.0,2.40,1.40,1.95,1.05,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,16.0
2010_2011,1,150,1.290257e+09,147.0,1.0,33.0,29.0,14.0,2.00,1.17,1.95,1.37,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,16.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021_2022,38,284,1.646492e+09,131.0,38.0,12.0,1.0,28.0,1.42,0.92,1.26,1.00,...,2.760000,3.220000,2.923333,1.456667,2.320000,4.316667,9.016667,1.983333,1.766667,35.0
2021_2022,38,293,1.647634e+09,147.0,38.0,17.0,15.0,30.0,1.43,0.71,1.26,1.05,...,2.280000,3.286667,3.290000,1.426667,2.170000,4.083333,8.483333,1.916667,1.833333,35.0
2021_2022,38,356,1.648908e+09,1.0,38.0,2.0,7.0,31.0,1.33,1.20,1.26,1.16,...,2.440000,3.286667,3.073333,1.430000,2.180000,4.083333,8.616667,1.916667,1.833333,35.0
2021_2022,38,336,1.652296e+09,162.0,38.0,20.0,20.0,33.0,1.35,2.47,1.26,2.42,...,5.330000,4.050000,2.323333,1.326667,1.933333,3.483333,7.200000,1.983333,1.793333,35.0


In [161]:
models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest": RandomForestRegressor(),
    "Gradient Boosting": GradientBoostingRegressor(),
    "Support Vector Regression": SVR()
}

In [162]:
def fit_and_compute_mse(models, X_train, X_test, y_train, y_test):
    for name, model in models.items():
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
        mse = mean_squared_error(y_test, predictions)
        print(f"{name} MSE: {mse}")


In [163]:
print("Home Goals:")
fit_and_compute_mse(models, X_train_home, X_test_home, y_home_train, y_home_test)

print("Away Goals:")
fit_and_compute_mse(models, X_train_away, X_test_away, y_away_train, y_away_test)

Home Goals:
Linear Regression MSE: 0.8066430108305854
Decision Tree MSE: 1.9114583333333333
Random Forest MSE: 0.9554078125
Gradient Boosting MSE: 0.8930710127085946
Support Vector Regression MSE: 1.8178025299790537
Away Goals:
Linear Regression MSE: 0.7003482287265063
Decision Tree MSE: 1.8528645833333333
