Import necessary libraries

In [85]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

Import all data

In [144]:
seasons = range(2010,2023)
premier_match_data = {}
premier_team_data = {}
premier_player_data = {}

for i in seasons:
    match_data_file = f'..//data//england_premier_match//england-premier-league-matches-{i}-to-{i+1}-stats.csv'
    team_data_file = f'..//data//england_premier_team//england-premier-league-teams-{i}-to-{i+1}-stats.csv'
    player_data_file = f'..//data//england_premier_player//england-premier-league-players-{i}-to-{i+1}-stats.csv'

    premier_match_data[f'{i}_{i+1}'] = pd.read_csv(match_data_file)
    premier_team_data[f'{i}_{i+1}'] = pd.read_csv(team_data_file)
    premier_player_data[f'{i}_{i+1}'] = pd.read_csv(player_data_file)


Pre processing data
1. fill all missing value with -1 
2. Split goal scoring minutes into first half and second half
3. label encoding referee column



In [136]:
def bin_goal_timings(goal_timings):
    # Parse the string into a list of integers
    if not isinstance(goal_timings, str):
        return (0,0)  # Return (0,0) if goal_timings is not a string

    if goal_timings=='-1':
        return (0,0)
    # Split the string into a list of times
    goal_times = goal_timings.split(',')

    first_half_goals = 0
    second_half_goals = 0

    for time in goal_times:
        # Check if it's stoppage time
        if "'" in time:
            time_parts = time.split("'")
            # Consider stoppage time as part of the second half
            if int(time_parts[0]) >= 45:
                second_half_goals += 1
        else:
            # Check if the goal was scored in the first or second half
            if int(time) <= 45:
                first_half_goals += 1
            else:
                second_half_goals += 1
    return first_half_goals, second_half_goals


In [145]:
le = LabelEncoder()

for season, df in premier_match_data.items():
    premier_match_data[season] = df.fillna(-1)
    # split goal scoring time into first and second half
    df['first_half_goals_home'], df['second_half_goals_home'] = zip(*df['home_team_goal_timings'].apply(bin_goal_timings))
    df['first_half_goals_away'], df['second_half_goals_away'] = zip(*df['away_team_goal_timings'].apply(bin_goal_timings))

    # Convert the column to string type
    for col in df.columns:
        

        if df[col].dtype == 'object' and (col != 'home_team_name' or col != 'away_team_name'):
            # Convert the column to string type
            df[col] = df[col].astype(str)
            
            # Apply the label encoder
            le.fit(df[col])
            df[col] = le.transform(df[col])


    
    # label team names
    le.fit(pd.concat([df['home_team_name'], df['away_team_name']]))
    # Transform both columns
    df['home_team_name'] = le.transform(df['home_team_name'])
    df['away_team_name'] = le.transform(df['away_team_name'])


    premier_match_data[season] = df
    
    

In [147]:
premier_match_data['2022_2023']

Unnamed: 0,timestamp,date_GMT,status,attendance,home_team_name,away_team_name,referee,Game Week,Pre-Match PPG (Home),Pre-Match PPG (Away),...,odds_ft_over25,odds_ft_over35,odds_ft_over45,odds_btts_yes,odds_btts_no,stadium_name,first_half_goals_home,second_half_goals_home,first_half_goals_away,second_half_goals_away
0,1659726000,37,0,25286.0,6,1,2,1,0.00,0.00,...,1.90,3.25,6.75,1.80,1.95,11,0,0,1,1
1,1659785400,38,0,,8,11,1,1,0.00,0.00,...,1.66,2.43,4.10,1.91,1.91,1,1,1,0,2
2,1659794400,39,0,,0,2,15,1,0.00,0.00,...,1.95,3.66,7.50,1.80,1.95,19,1,1,0,0
3,1659794400,39,0,,9,19,16,1,0.00,0.00,...,2.03,3.62,7.50,1.80,1.95,2,1,1,1,0
4,1659794400,39,0,52245.0,14,15,18,1,0.00,0.00,...,1.94,3.73,7.50,2.00,1.75,12,0,2,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
375,1685287800,170,0,,7,0,19,38,1.00,0.94,...,1.67,2.63,4.50,1.91,1.91,5,0,1,0,0
376,1685287800,170,0,,9,17,2,38,1.22,1.11,...,1.44,2.30,3.75,1.44,2.63,2,0,1,1,3
377,1685287800,170,0,,10,18,18,38,0.89,0.67,...,1.61,2.60,4.50,1.62,2.20,7,1,1,0,1
378,1685287800,170,0,,13,8,16,38,2.50,1.28,...,1.40,2.10,3.50,1.62,2.20,10,1,1,1,0


In [100]:
seasons_team_names = {}
for season, data in premier_team_data.items():
    seasons_team_names[season] = data['common_name'].unique().tolist()

In [104]:
seasons_team_names

{'2010_2011': ['Arsenal',
  'Tottenham Hotspur',
  'Manchester City',
  'Stoke City',
  'West Bromwich Albion',
  'Everton',
  'Manchester United',
  'Liverpool',
  'Chelsea',
  'West Ham United',
  'Sunderland',
  'Newcastle United',
  'Aston Villa',
  'Fulham',
  'Birmingham City',
  'Blackburn Rovers',
  'Wigan Athletic',
  'Wolverhampton Wanderers',
  'Bolton Wanderers',
  'Blackpool'],
 '2011_2012': ['Arsenal',
  'Tottenham Hotspur',
  'Manchester City',
  'Stoke City',
  'West Bromwich Albion',
  'Everton',
  'Manchester United',
  'Liverpool',
  'Chelsea',
  'Swansea City',
  'Sunderland',
  'Newcastle United',
  'Aston Villa',
  'Norwich City',
  'Queens Park Rangers',
  'Fulham',
  'Blackburn Rovers',
  'Wigan Athletic',
  'Wolverhampton Wanderers',
  'Bolton Wanderers'],
 '2012_2013': ['Arsenal',
  'Tottenham Hotspur',
  'Manchester City',
  'Stoke City',
  'West Bromwich Albion',
  'Everton',
  'Southampton',
  'Manchester United',
  'Liverpool',
  'Chelsea',
  'West Ham Uni

In [101]:
home_match_data = {}
away_match_data = {}

for season, data in premier_match_data.items():
    teams = seasons_team_names[season]  # Get the team names for this season
    home_match_data[season] = {}
    away_match_data[season] = {}

    for team in teams:
        home_match_data[season][team] = data[data['home_team_name'] == team]
        away_match_data[season][team] = data[data['away_team_name'] == team]


In [103]:
home_match_data['2010_2011']

{'Arsenal': Empty DataFrame
 Columns: [timestamp, date_GMT, status, attendance, home_team_name, away_team_name, referee, Game Week, Pre-Match PPG (Home), Pre-Match PPG (Away), home_ppg, away_ppg, home_team_goal_count, away_team_goal_count, total_goal_count, total_goals_at_half_time, home_team_goal_count_half_time, away_team_goal_count_half_time, home_team_goal_timings, away_team_goal_timings, home_team_corner_count, away_team_corner_count, home_team_yellow_cards, home_team_red_cards, away_team_yellow_cards, away_team_red_cards, home_team_first_half_cards, home_team_second_half_cards, away_team_first_half_cards, away_team_second_half_cards, home_team_shots, away_team_shots, home_team_shots_on_target, away_team_shots_on_target, home_team_shots_off_target, away_team_shots_off_target, home_team_fouls, away_team_fouls, home_team_possession, away_team_possession, Home Team Pre-Match xG, Away Team Pre-Match xG, team_a_xg, team_b_xg, average_goals_per_match_pre_match, btts_percentage_pre_match