In [52]:
import pandas as pd              # For data manipulation and analysis
import numpy as np               # For numerical operations
import matplotlib.pyplot as plt  # For data visualization (basic)
import seaborn as sns            # For advanced data visualization
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

In [53]:
# Load the match table from the CSV file into the working frame `df`.
csv_path = 'premier_league_combined.csv'
df = pd.read_csv(csv_path)

# Show the first rows as a quick check that the file parsed as expected.
df.head()

Unnamed: 0,Season,SourceFile,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,...,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR
0,1993-94,season-9394.csv,1993-08-14,Arsenal,Coventry,0,3,A,,,...,,,,,,,,,,
1,1993-94,season-9394.csv,1993-08-14,Aston Villa,QPR,4,1,H,,,...,,,,,,,,,,
2,1993-94,season-9394.csv,1993-08-14,Chelsea,Blackburn,1,2,A,,,...,,,,,,,,,,
3,1993-94,season-9394.csv,1993-08-14,Liverpool,Sheffield Weds,2,0,H,,,...,,,,,,,,,,
4,1993-94,season-9394.csv,1993-08-14,Man City,Leeds,1,1,D,,,...,,,,,,,,,,


In [54]:
# --- KEEP ONLY COMPLETE SEASONS
# Columns that must all be populated for a row to count as complete.
performance_features = ['HS','AS','HST','AST','HF','AF','HC','AC','HY','AY','HR','AR']

# Flag every row that lacks at least one of those columns, then collect the
# distinct seasons those rows belong to.
row_has_gap = df[performance_features].isna().any(axis=1)
seasons_with_missing = df.loc[row_has_gap, 'Season'].unique()
print("Seasons containing missing performance features:", seasons_with_missing)

# Remove the whole season (not just the offending rows) and renumber the index
# so it is contiguous again.
in_incomplete_season = df['Season'].isin(seasons_with_missing)
df = df.loc[~in_incomplete_season].reset_index(drop=True)

print(f" Removed {len(seasons_with_missing)} incomplete seasons.")
print("New dataset shape:", df.shape)


Seasons containing missing performance features: ['1993-94' '1994-95' '1995-96' '1996-97' '1997-98' '1998-99' '1999-00']
 Removed 7 incomplete seasons.
New dataset shape: (9500, 24)


In [55]:
# Preview the filtered frame.
# Jupyter renders only the final expression of a cell, so a bare `df.shape`
# placed before `df.head()` displays nothing; the shape is already printed by
# the season-filtering cell, so only the preview is kept here.
df.head()


Unnamed: 0,Season,SourceFile,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,...,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR
0,2000-01,season-0001.csv,2000-08-19,Charlton,Man City,4,0,H,2.0,0.0,...,14.0,4.0,13.0,12.0,6.0,6.0,1.0,2.0,0.0,0.0
1,2000-01,season-0001.csv,2000-08-19,Chelsea,West Ham,4,2,H,1.0,0.0,...,10.0,5.0,19.0,14.0,7.0,7.0,1.0,2.0,0.0,0.0
2,2000-01,season-0001.csv,2000-08-19,Coventry,Middlesbrough,1,3,A,1.0,1.0,...,3.0,9.0,15.0,21.0,8.0,4.0,5.0,3.0,1.0,0.0
3,2000-01,season-0001.csv,2000-08-19,Derby,Southampton,2,2,D,1.0,2.0,...,4.0,6.0,11.0,13.0,5.0,8.0,1.0,1.0,0.0,0.0
4,2000-01,season-0001.csv,2000-08-19,Leeds,Everton,2,0,H,2.0,0.0,...,8.0,6.0,21.0,20.0,6.0,4.0,1.0,3.0,0.0,0.0


In [56]:
# Per-column count of null entries currently present in df.
missing_per_column = df.isna().sum()

print("Remaining missing values per column:")
print(missing_per_column)


Remaining missing values per column:
Season        0
SourceFile    0
Date          0
HomeTeam      0
AwayTeam      0
FTHG          0
FTAG          0
FTR           0
HTHG          0
HTAG          0
HTR           0
Referee       0
HS            0
AS            0
HST           0
AST           0
HF            0
AF            0
HC            0
AC            0
HY            0
AY            0
HR            0
AR            0
dtype: int64


In [57]:
# --- REMOVE NON-RELEVANT COLUMNS
cols_to_drop = ['HTHG', 'HTAG', 'HTR', 'Referee','SourceFile']

# Restrict the request to columns that are actually present. This keeps the
# cell safe to re-run (nothing left to drop -> empty list) and lets the message
# below report what was really removed instead of echoing the requested list.
dropped_cols = [c for c in cols_to_drop if c in df.columns]
df = df.drop(columns=dropped_cols)

print(" Dropped columns:", dropped_cols)
print("New dataset shape:", df.shape)
df.head()

 Dropped columns: ['HTHG', 'HTAG', 'HTR', 'Referee', 'SourceFile']
New dataset shape: (9500, 19)


Unnamed: 0,Season,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HS,AS,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR
0,2000-01,2000-08-19,Charlton,Man City,4,0,H,17.0,8.0,14.0,4.0,13.0,12.0,6.0,6.0,1.0,2.0,0.0,0.0
1,2000-01,2000-08-19,Chelsea,West Ham,4,2,H,17.0,12.0,10.0,5.0,19.0,14.0,7.0,7.0,1.0,2.0,0.0,0.0
2,2000-01,2000-08-19,Coventry,Middlesbrough,1,3,A,6.0,16.0,3.0,9.0,15.0,21.0,8.0,4.0,5.0,3.0,1.0,0.0
3,2000-01,2000-08-19,Derby,Southampton,2,2,D,6.0,13.0,4.0,6.0,11.0,13.0,5.0,8.0,1.0,1.0,0.0,0.0
4,2000-01,2000-08-19,Leeds,Everton,2,0,H,17.0,12.0,8.0,6.0,21.0,20.0,6.0,4.0,1.0,3.0,0.0,0.0


In [58]:
# Integer-code the two team-name columns, keeping every fitted encoder in
# `encoders` (keyed by the source column name).
team_cols = ['HomeTeam', 'AwayTeam']
encoders = {}

# NOTE(review): plain assignment makes df_label another name for the same
# object as df (no copy is taken), so the "*_encoded" columns added below are
# added to df as well.
df_label = df

for col in team_cols:
    le = LabelEncoder()
    encoded_name = col + '_encoded'
    df_label[encoded_name] = le.fit_transform(df_label[col])
    encoders[col] = le  # keep the fitted encoder for this column
    print(f"{col} encoded. Classes: {le.classes_}")

# Side-by-side preview of each name column next to its integer code.
preview_cols = ['HomeTeam', 'HomeTeam_encoded', 'AwayTeam', 'AwayTeam_encoded']
print(df_label[preview_cols].head())

HomeTeam encoded. Classes: ['Arsenal' 'Aston Villa' 'Birmingham' 'Blackburn' 'Blackpool' 'Bolton'
 'Bournemouth' 'Bradford' 'Brentford' 'Brighton' 'Burnley' 'Cardiff'
 'Charlton' 'Chelsea' 'Coventry' 'Crystal Palace' 'Derby' 'Everton'
 'Fulham' 'Huddersfield' 'Hull' 'Ipswich' 'Leeds' 'Leicester' 'Liverpool'
 'Luton' 'Man City' 'Man United' 'Middlesbrough' 'Newcastle' 'Norwich'
 "Nott'm Forest" 'Portsmouth' 'QPR' 'Reading' 'Sheffield United'
 'Southampton' 'Stoke' 'Sunderland' 'Swansea' 'Tottenham' 'Watford'
 'West Brom' 'West Ham' 'Wigan' 'Wolves']
AwayTeam encoded. Classes: ['Arsenal' 'Aston Villa' 'Birmingham' 'Blackburn' 'Blackpool' 'Bolton'
 'Bournemouth' 'Bradford' 'Brentford' 'Brighton' 'Burnley' 'Cardiff'
 'Charlton' 'Chelsea' 'Coventry' 'Crystal Palace' 'Derby' 'Everton'
 'Fulham' 'Huddersfield' 'Hull' 'Ipswich' 'Leeds' 'Leicester' 'Liverpool'
 'Luton' 'Man City' 'Man United' 'Middlesbrough' 'Newcastle' 'Norwich'
 "Nott'm Forest" 'Portsmouth' 'QPR' 'Reading' 'Sheffield United'


In [59]:
df_label.head()

Unnamed: 0,Season,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HS,AS,HST,...,HF,AF,HC,AC,HY,AY,HR,AR,HomeTeam_encoded,AwayTeam_encoded
0,2000-01,2000-08-19,Charlton,Man City,4,0,H,17.0,8.0,14.0,...,13.0,12.0,6.0,6.0,1.0,2.0,0.0,0.0,12,26
1,2000-01,2000-08-19,Chelsea,West Ham,4,2,H,17.0,12.0,10.0,...,19.0,14.0,7.0,7.0,1.0,2.0,0.0,0.0,13,43
2,2000-01,2000-08-19,Coventry,Middlesbrough,1,3,A,6.0,16.0,3.0,...,15.0,21.0,8.0,4.0,5.0,3.0,1.0,0.0,14,28
3,2000-01,2000-08-19,Derby,Southampton,2,2,D,6.0,13.0,4.0,...,11.0,13.0,5.0,8.0,1.0,1.0,0.0,0.0,16,36
4,2000-01,2000-08-19,Leeds,Everton,2,0,H,17.0,12.0,8.0,...,21.0,20.0,6.0,4.0,1.0,3.0,0.0,0.0,22,17


In [60]:
# --- FEATURE ENGINEERING
def _shot_accuracy(on_target, total):
    """Element-wise on_target / total, with 0 wherever total is not > 0.

    Vectorized replacement for a row-wise ``df.apply(..., axis=1)``: the whole
    column is divided at once and ``Series.where`` overwrites the entries whose
    denominator fails the ``> 0`` test (zero, negative or NaN). It also returns
    an empty Series for an empty frame, where the row-wise apply would return
    a DataFrame and break the column assignment.
    """
    return (on_target / total).where(total > 0, 0)


# Goal Difference (Home - Away) → measures dominance in the match
df['GoalDiff'] = df['FTHG'] - df['FTAG']

# Total Goals (Home + Away) → measures match intensity
df['TotalGoals'] = df['FTHG'] + df['FTAG']

# Shot Accuracy (ratio of shots on target to total shots)
df['ShotAcc_H'] = _shot_accuracy(df['HST'], df['HS'])
df['ShotAcc_A'] = _shot_accuracy(df['AST'], df['AS'])

print("New features created: GoalDiff, TotalGoals, ShotAcc_H, ShotAcc_A")
df[['HomeTeam','AwayTeam','GoalDiff','TotalGoals','ShotAcc_H','ShotAcc_A']].head()

New features created: GoalDiff, TotalGoals, ShotAcc_H, ShotAcc_A


Unnamed: 0,HomeTeam,AwayTeam,GoalDiff,TotalGoals,ShotAcc_H,ShotAcc_A
0,Charlton,Man City,4,4,0.823529,0.5
1,Chelsea,West Ham,2,6,0.588235,0.416667
2,Coventry,Middlesbrough,-2,4,0.5,0.5625
3,Derby,Southampton,0,4,0.666667,0.461538
4,Leeds,Everton,2,2,0.470588,0.5


In [61]:
# --- LABEL ENCODING FOR SIMPLE CATEGORIES
# Every listed column that exists in df gets an integer-coded companion named
# "<column>_encoded"; the source column itself is left in place.
label_cols = ['Season', 'FTR']
for col in label_cols:
    if col not in df.columns:
        continue  # nothing to encode for an absent column
    encoder = LabelEncoder()
    codes = encoder.fit_transform(df[col])
    df[col + '_encoded'] = codes
    print(f"Encoded column: {col} -> {col}_encoded")

# --- ONE-HOT ENCODING FOR TEAM NAMES ---
# Replace HomeTeam and AwayTeam with indicator columns. drop_first=True omits
# the indicator of the first category of each column.
team_name_cols = ['HomeTeam', 'AwayTeam']
df = pd.get_dummies(df, columns=team_name_cols, drop_first=True)

print(" Encoding completed. New shape:", df.shape)

Encoded column: Season -> Season_encoded
Encoded column: FTR -> FTR_encoded
 Encoding completed. New shape: (9500, 115)


In [62]:
# --- STANDARDIZATION OF NUMERIC FEATURES ---
# NOTE(review): the scaler is fit on every row of df, and the list includes the
# score-derived columns (FTHG, FTAG, GoalDiff, TotalGoals) — confirm this is
# intended before using the result for predictive modelling.
numeric_features = ['FTHG','FTAG','HS','AS','HST','AST','HF','AF','HC','AC',
                    'HY','AY','HR','AR','GoalDiff','TotalGoals','ShotAcc_H','ShotAcc_A']

scaler = StandardScaler()
scaled_values = scaler.fit_transform(df[numeric_features])

# Wrap the scaled array in a frame aligned on df's index, naming each column
# after its source with a "_z" suffix.
z_columns = [name + '_z' for name in numeric_features]
df_scaled = pd.DataFrame(scaled_values, columns=z_columns, index=df.index)

# Replace original numeric columns by standardized versions.
# The raw columns are removed from df here, so running this cell a second time
# fails on the df[numeric_features] lookup above.
df_without_raw = df.drop(columns=numeric_features)
df = pd.concat([df_without_raw, df_scaled], axis=1)

print(" Standardization completed. Dataset ready for analysis.")
df.head()

 Standardization completed. Dataset ready for analysis.


Unnamed: 0,Season,Date,FTR,HomeTeam_encoded,AwayTeam_encoded,Season_encoded,FTR_encoded,HomeTeam_Aston Villa,HomeTeam_Birmingham,HomeTeam_Blackburn,...,HC_z,AC_z,HY_z,AY_z,HR_z,AR_z,GoalDiff_z,TotalGoals_z,ShotAcc_H_z,ShotAcc_A_z
0,2000-01,2000-08-19,H,12,26,0,2,False,False,False,...,-0.011986,0.446826,-0.38268,0.163747,-0.247221,-0.294277,2.007458,0.767337,2.117276,0.306274
1,2000-01,2000-08-19,H,13,43,0,2,False,False,False,...,0.309683,0.810223,-0.38268,0.163747,-0.247221,-0.294277,0.906592,1.964824,0.810776,-0.119557
2,2000-01,2000-08-19,A,14,28,0,0,False,False,False,...,0.631353,-0.279969,2.905096,0.939606,3.700001,-0.294277,-1.29514,0.767337,0.320839,0.625647
3,2000-01,2000-08-19,D,16,36,0,1,False,False,False,...,-0.333656,1.17362,-0.38268,-0.612112,-0.247221,-0.294277,-0.194274,0.767337,1.246276,0.109737
4,2000-01,2000-08-19,H,22,17,0,2,False,False,False,...,-0.011986,-0.279969,-0.38268,0.939606,-0.247221,-0.294277,0.906592,-0.43015,0.157526,0.306274
