In [3]:
import pandas as pd
import os

# LOAD AND PREPARE THE DATA

**Definitions of every column**

Season	Match Season  
DateTime	Match Date and Time (ISO 8601, UTC, e.g. 1993-08-14T00:00:00Z)  
HomeTeam	Home Team  
AwayTeam	Away Team  
FTHG	Full Time Home Team Goals  
FTAG	Full Time Away Team Goals  
FTR	Full Time Result (H=Home Win, D=Draw, A=Away Win)  
HTHG	Half Time Home Team Goals  
HTAG	Half Time Away Team Goals  
HTR	Half Time Result (H=Home Win, D=Draw, A=Away Win)  
Referee	Match Referee  
HS	Home Team Shots  
AS	Away Team Shots  
HST	Home Team Shots on Target  
AST	Away Team Shots on Target  
HC	Home Team Corners  
AC	Away Team Corners  
HF	Home Team Fouls Committed  
AF	Away Team Fouls Committed  
HY	Home Team Yellow Cards  
AY	Away Team Yellow Cards  
HR	Home Team Red Cards  
AR	Away Team Red Cards  

In [6]:
# Read the raw match results; the file is decoded as Latin-1 rather than the default UTF-8.
df = pd.read_csv(
    'results.csv',
    encoding='latin-1',
)

# Preview the first rows to sanity-check the load.
print(df.head())

    Season              DateTime     HomeTeam        AwayTeam  FTHG  FTAG FTR  \
0  1993-94  1993-08-14T00:00:00Z      Arsenal        Coventry     0     3   A   
1  1993-94  1993-08-14T00:00:00Z  Aston Villa             QPR     4     1   H   
2  1993-94  1993-08-14T00:00:00Z      Chelsea       Blackburn     1     2   A   
3  1993-94  1993-08-14T00:00:00Z    Liverpool  Sheffield Weds     2     0   H   
4  1993-94  1993-08-14T00:00:00Z     Man City           Leeds     1     1   D   

   HTHG  HTAG  HTR  ... HST  AST  HC  AC  HF  AF  HY  AY  HR  AR  
0   NaN   NaN  NaN  ... NaN  NaN NaN NaN NaN NaN NaN NaN NaN NaN  
1   NaN   NaN  NaN  ... NaN  NaN NaN NaN NaN NaN NaN NaN NaN NaN  
2   NaN   NaN  NaN  ... NaN  NaN NaN NaN NaN NaN NaN NaN NaN NaN  
3   NaN   NaN  NaN  ... NaN  NaN NaN NaN NaN NaN NaN NaN NaN NaN  
4   NaN   NaN  NaN  ... NaN  NaN NaN NaN NaN NaN NaN NaN NaN NaN  

[5 rows x 23 columns]


## Dataset overview: shape, columns, data types, and missing values

In [None]:
# Structural overview of the raw frame: size, schema, gaps, and coverage.
print("Dataset shape: ", df.shape)

# Each heading and its value go on separate lines (sep="\n"),
# which is equivalent to two consecutive print() calls.
print("\nColumn names:", df.columns.tolist(), sep="\n")
print("\nData types:", df.dtypes, sep="\n")
print("\nMissing values per column:", df.isna().sum(), sep="\n")
print("\nSeasons available:", df['Season'].unique(), sep="\n")

# Min/max are taken on the DateTime column exactly as loaded from the CSV.
print(f"\nDate range: {df['DateTime'].min()} to {df['DateTime'].max()}")

Dataset shape:  (11113, 23)

Column names:
['Season', 'DateTime', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR', 'HTHG', 'HTAG', 'HTR', 'Referee', 'HS', 'AS', 'HST', 'AST', 'HC', 'AC', 'HF', 'AF', 'HY', 'AY', 'HR', 'AR']

Missing values per column:
Season         0
DateTime       0
HomeTeam       0
AwayTeam       0
FTHG           0
FTAG           0
FTR            0
HTHG         924
HTAG         924
HTR          924
Referee     2824
HS          2824
AS          2824
HST         2824
AST         2824
HC          2824
AC          2824
HF          2824
AF          2824
HY          2824
AY          2824
HR          2824
AR          2824
dtype: int64

Seasons available:
['1993-94' '1994-95' '1995-96' '1996-97' '1997-98' '1998-99' '1999-00'
 '2000-01' '2001-02' '2002-03' '2003-04' '2004-05' '2005-06' '2006-07'
 '2007-08' '2008-09' '2009-10' '2010-11' '2011-12' '2012-13' '2013-14'
 '2014-15' '2015-16' '2016-17' '2017-18' '2018-19' '2019-20' '2020-21'
 '2021-22']

Date range: 1993-08-14T00:00:0

## Conclusion

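The missing-value counts show that detailed match statistics (referee, shots, corners, fouls, cards) are unavailable for 2,824 matches and half-time scores for 924 matches, all from seasons before 2000-01. Because the model will rely on these detailed statistics rather than just the final score, we drop every season before 2000-01.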

In [15]:
# Keep the 2000-01 season and everything after it. The season labels are
# compared as text against '2000-01'; .copy() detaches the result from `df`.
df_filtered = df.loc[df['Season'].ge('2000-01')].copy()

print(f"Original dataset: {len(df)} matches")
print(f"Filtered dataset (2000+): {len(df_filtered)} matches")
print(f"\nSeasons included: {sorted(df_filtered['Season'].unique())}")

# Null counts per column after filtering
print("\nMissing values in filtered data:", df_filtered.isna().sum(), sep="\n")

# The same counts expressed as a percentage of the filtered rows
print(
    "\nPercentage of missing values per column:",
    df_filtered.isna().sum().div(len(df_filtered)).mul(100).round(2),
    sep="\n",
)

# Headline figures: match count, distinct clubs, and outcome split
print("\nBasic stats:")
print(f"Total matches: {len(df_filtered)}")
# A club counts once whether it appears as the home side, the away side, or both.
print(f"Unique teams: {len(set(df_filtered['HomeTeam'].unique()).union(df_filtered['AwayTeam'].unique()))}")
print("\nResult distribution:", df_filtered['FTR'].value_counts(), sep="\n")
print("\nResult percentages:", df_filtered['FTR'].value_counts(normalize=True).mul(100), sep="\n")

Original dataset: 11113 matches
Filtered dataset (2000+): 8289 matches

Seasons included: ['2000-01', '2001-02', '2002-03', '2003-04', '2004-05', '2005-06', '2006-07', '2007-08', '2008-09', '2009-10', '2010-11', '2011-12', '2012-13', '2013-14', '2014-15', '2015-16', '2016-17', '2017-18', '2018-19', '2019-20', '2020-21', '2021-22']

Missing values in filtered data:
Season      0
DateTime    0
HomeTeam    0
AwayTeam    0
FTHG        0
FTAG        0
FTR         0
HTHG        0
HTAG        0
HTR         0
Referee     0
HS          0
AS          0
HST         0
AST         0
HC          0
AC          0
HF          0
AF          0
HY          0
AY          0
HR          0
AR          0
dtype: int64

Percentage of missing values per column:
Season      0.0
DateTime    0.0
HomeTeam    0.0
AwayTeam    0.0
FTHG        0.0
FTAG        0.0
FTR         0.0
HTHG        0.0
HTAG        0.0
HTR         0.0
Referee     0.0
HS          0.0
AS          0.0
HST         0.0
AST         0.0
HC          0.0


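Perfect! The filtered dataset has no missing values, so it is suitable for the model. Next, we prepare it: parse the match dates, sort the matches chronologically, and add calendar features.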

In [19]:
# Convert the DateTime strings into pandas datetimes so they can be sorted
# chronologically and expose the .dt accessor used below.
df_filtered['DateTime'] = pd.to_datetime(df_filtered['DateTime'])

# Order matches chronologically. Many matches share the same timestamp, and the
# default sort algorithm (quicksort) is not stable, so the relative order of
# same-day matches would not be guaranteed. A stable sort (mergesort) keeps tied
# rows in their incoming order, making the row order reproducible across runs.
df_filtered = (
    df_filtered
    .sort_values('DateTime', kind='mergesort')
    .reset_index(drop=True)  # discard the pre-sort index; rows are renumbered 0..n-1
)

# Calendar features derived from the match date.
df_filtered['Year'] = df_filtered['DateTime'].dt.year
df_filtered['Month'] = df_filtered['DateTime'].dt.month
df_filtered['DayOfWeek'] = df_filtered['DateTime'].dt.dayofweek  # Monday=0 ... Sunday=6

print("Data prepared! Shape:", df_filtered.shape)
print(df_filtered[['DateTime', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR']].head(10))

Data prepared! Shape: (8289, 26)
                   DateTime    HomeTeam       AwayTeam  FTHG  FTAG FTR
0 2000-08-19 00:00:00+00:00    Charlton       Man City     4     0   H
1 2000-08-19 00:00:00+00:00     Chelsea       West Ham     4     2   H
2 2000-08-19 00:00:00+00:00    Coventry  Middlesbrough     1     3   A
3 2000-08-19 00:00:00+00:00       Derby    Southampton     2     2   D
4 2000-08-19 00:00:00+00:00       Leeds        Everton     2     0   H
5 2000-08-19 00:00:00+00:00   Leicester    Aston Villa     0     0   D
6 2000-08-19 00:00:00+00:00   Liverpool       Bradford     1     0   H
7 2000-08-19 00:00:00+00:00  Sunderland        Arsenal     1     0   H
8 2000-08-19 00:00:00+00:00   Tottenham        Ipswich     3     1   H
9 2000-08-20 00:00:00+00:00  Man United      Newcastle     2     0   H


# Model