In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [2]:
#loading the data file
data = pd.read_csv('final_dataset.csv')
data["Date"] = pd.to_datetime(data["Date"])

  data["Date"] = pd.to_datetime(data["Date"])


In [3]:
#checking the data types of our columns
data.dtypes

HomeTeam            object
AwayTeam            object
FTHG               float64
FTAG               float64
FTR                 object
HS                 float64
AS                 float64
HST                float64
AST                float64
HC                 float64
AC                 float64
HY                 float64
AY                 float64
HR                 float64
AR                 float64
Date        datetime64[ns]
dtype: object

In [4]:
# Create the 'Home' rows
home_rows = data.copy()
home_rows['Team'] = home_rows['HomeTeam']  # Team column for home team
home_rows['Venue'] = 'Home'  # Venue column for home team
home_rows['Opponent'] = home_rows['AwayTeam']  # Opponent is the away team
home_rows['GoalsFor'] = home_rows['FTHG']  # Goals scored by home team
home_rows['GoalsAgainst'] = home_rows['FTAG']  # Goals conceded by home team

# Create the 'Away' rows
away_rows = data.copy()
away_rows['Team'] = away_rows['AwayTeam']  # Team column for away team
away_rows['Venue'] = 'Away'  # Venue column for away team
away_rows['Opponent'] = away_rows['HomeTeam']  # Opponent is the home team
away_rows['GoalsFor'] = away_rows['FTAG']  # Goals scored by away team
away_rows['GoalsAgainst'] = away_rows['FTHG']  # Goals conceded by away team

# Combine the two datasets
combined_data = pd.concat([home_rows, away_rows], ignore_index=True)

# Drop unnecessary columns to avoid redundancy
combined_data = combined_data.drop(['HomeTeam', 'AwayTeam', 'FTHG', 'FTAG'], axis=1)

In [5]:
#tokenising the data so its ready for machine learning
combined_data['ftr_num'] = combined_data['FTR'].apply(
    lambda x: 1 if x == 'H' else (0 if x == 'D' else -1)
)
combined_data['Venue_num'] = combined_data['Venue'].apply(lambda x: 1 if x == 'Home' else 0)
combined_data["opp_code"] = combined_data["Opponent"].astype("category").cat.codes
combined_data["day_code"] = combined_data["Date"].dt.dayofweek

In [6]:
#date we're using to seperate the data, we're going to test the latest 2 seasons
season_start_date = pd.Timestamp('2024-08-16')

In [7]:
#we initialise the test data
latest_season = combined_data[combined_data['Date'] >= season_start_date]

In [8]:
latest_season

Unnamed: 0,FTR,HS,AS,HST,AST,HC,AC,HY,AY,HR,...,Date,Team,Venue,Opponent,GoalsFor,GoalsAgainst,ftr_num,Venue_num,opp_code,day_code
0,H,14.0,10.0,5.0,2.0,7.0,8.0,2.0,3.0,0.0,...,2024-08-16,Man United,Home,Fulham,1.0,0.0,1,1,10,4.0
1,A,7.0,18.0,2.0,5.0,2.0,10.0,3.0,1.0,0.0,...,2024-08-17,Ipswich,Home,Liverpool,0.0,2.0,-1,1,16,5.0
2,H,18.0,9.0,6.0,3.0,8.0,2.0,2.0,2.0,0.0,...,2024-08-17,Arsenal,Home,Wolves,2.0,0.0,1,1,36,5.0
3,A,9.0,10.0,1.0,5.0,1.0,5.0,1.0,1.0,1.0,...,2024-08-17,Everton,Home,Brighton,0.0,3.0,-1,1,4,5.0
4,H,3.0,19.0,1.0,4.0,3.0,12.0,2.0,4.0,1.0,...,2024-08-17,Newcastle,Home,Southampton,1.0,0.0,1,1,27,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5316,A,17.0,14.0,6.0,6.0,12.0,3.0,1.0,4.0,0.0,...,2025-05-25,Everton,Away,Newcastle,1.0,0.0,-1,0,21,6.0
5317,A,10.0,6.0,2.0,2.0,7.0,4.0,2.0,2.0,0.0,...,2025-05-25,Chelsea,Away,Nott'm Forest,1.0,0.0,-1,0,23,6.0
5318,A,7.0,23.0,2.0,8.0,5.0,8.0,0.0,0.0,0.0,...,2025-05-25,Arsenal,Away,Southampton,2.0,1.0,-1,0,27,6.0
5319,A,4.0,23.0,2.0,8.0,2.0,11.0,3.0,1.0,0.0,...,2025-05-25,Brighton,Away,Tottenham,4.0,1.0,-1,0,31,6.0


In [9]:
#we initialise the training data
historical_data = combined_data[combined_data['Date'] < season_start_date]

In [10]:
# Prepare training data
X_train = historical_data.drop(['ftr_num', 'Team', 'Opponent', 'Venue', 'Date', 'FTR', 'GoalsFor', 'GoalsAgainst', 'HST', 'AST', 'AS', 'HS', 'HY', 'AY', 'HR', 'AR' ], axis=1)
y_train = historical_data['ftr_num']

In [11]:
# Prepare prediction data
X_test = latest_season.drop(['ftr_num', 'Team', 'Opponent', 'Venue', 'Date', 'FTR', 'GoalsFor', 'GoalsAgainst', 'HST', 'AST', 'AS', 'HS', 'HY', 'AY', 'HR', 'AR' ], axis=1)
y_test = latest_season['ftr_num']

In [12]:
#initialise our random forest classifier
rf = RandomForestClassifier(n_estimators=50, min_samples_split=10, random_state=1)

In [13]:
#here we fit the data
rf.fit(X_train, y_train)

In [14]:
#we initialise our test prediction
y_test_pred = rf.predict(X_test)

In [15]:
#here we initialise our accuracy
accuracy = accuracy_score(y_test, y_test_pred)

In [16]:
accuracy

0.41578947368421054

In [17]:
# Add predictions to the latest season dataset
latest_season['y_test_pred'] = y_test_pred

# Filter for matches that have been played (where ftr_num is not NaN)
played_matches = latest_season[latest_season['ftr_num'].notna()]

# Display relevant columns: Date, Team, Opponent, Venue, Actual Result, Predicted Result
played_matches_results = played_matches[['Date', 'Team', 'Opponent', 'Venue', 'ftr_num', 'y_test_pred']]

# Rename columns for clarity
played_matches_results = played_matches_results.rename(columns={
    'ftr_num': 'Actual_Result',
    'y_test_pred': 'Predicted_Result'
})





A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  latest_season['y_test_pred'] = y_test_pred


In [18]:
# Display the first few rows of the dataset
played_matches_results

Unnamed: 0,Date,Team,Opponent,Venue,Actual_Result,Predicted_Result
0,2024-08-16,Man United,Fulham,Home,1,1
1,2024-08-17,Ipswich,Liverpool,Home,-1,-1
2,2024-08-17,Arsenal,Wolves,Home,1,1
3,2024-08-17,Everton,Brighton,Home,-1,1
4,2024-08-17,Newcastle,Southampton,Home,1,1
...,...,...,...,...,...,...
5316,2025-05-25,Everton,Newcastle,Away,-1,0
5317,2025-05-25,Chelsea,Nott'm Forest,Away,-1,1
5318,2025-05-25,Arsenal,Southampton,Away,-1,-1
5319,2025-05-25,Brighton,Tottenham,Away,-1,-1


In [19]:
#we sort our data by team and date, so its ready to calculate rolling averages
combined_data = combined_data.sort_values(by=['Team', 'Date'])

In [20]:
combined_data

Unnamed: 0,FTR,HS,AS,HST,AST,HC,AC,HY,AY,HR,...,Date,Team,Venue,Opponent,GoalsFor,GoalsAgainst,ftr_num,Venue_num,opp_code,day_code
4561,D,14.0,3.0,4.0,2.0,7.0,0.0,0.0,0.0,0.0,...,2012-08-18,Arsenal,Home,Sunderland,0.0,0.0,0,1,29,5.0
9521,D,7.0,16.0,4.0,6.0,0.0,11.0,2.0,0.0,0.0,...,2012-08-26,Arsenal,Away,Stoke,0.0,0.0,0,0,28,6.0
9528,A,17.0,11.0,8.0,7.0,10.0,2.0,2.0,2.0,0.0,...,2012-09-02,Arsenal,Away,Liverpool,2.0,0.0,-1,0,16,6.0
4590,H,20.0,9.0,12.0,4.0,8.0,3.0,0.0,0.0,0.0,...,2012-09-15,Arsenal,Home,Southampton,6.0,1.0,1,1,27,5.0
9548,D,14.0,10.0,10.0,4.0,9.0,4.0,1.0,0.0,0.0,...,2012-09-23,Arsenal,Away,Man City,1.0,1.0,0,0,18,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
353,A,10.0,7.0,3.0,2.0,7.0,4.0,1.0,1.0,0.0,...,2025-05-10,Wolves,Home,Brighton,0.0,2.0,-1,1,4,5.0
5309,H,14.0,12.0,10.0,3.0,0.0,10.0,1.0,2.0,0.0,...,2025-05-20,Wolves,Away,Crystal Palace,2.0,4.0,1,0,8,1.0
379,D,18.0,13.0,6.0,7.0,8.0,3.0,2.0,1.0,0.0,...,2025-05-25,Wolves,Home,Brentford,1.0,1.0,0,1,3,6.0
4180,,,,,,,,,,,...,NaT,,Home,,,,-1,1,-1,


In [21]:
#here we calculate rolling averages for wins, home and away shots, home and away shots on target, the goals for and goals against, corners, yellow and red cards

combined_data['is_win'] = (combined_data['ftr_num'] == 1).astype(int)
combined_data['rolling_wins'] = (
    combined_data.groupby('Team')['is_win']   # Group by team
    .rolling(5, min_periods=1)               # Rolling window of last 5 matches
    .mean()                                  # Calculate mean (percentage of wins)
    .reset_index(level=0, drop=True)         # Align with the original DataFrame
)


combined_data['rolling_HS'] = (
    combined_data.groupby('Team')['HS']
    .rolling(5, min_periods=1)
    .mean()
    .reset_index(level=0, drop=True)
)
combined_data['rolling_AS'] = (
    combined_data.groupby('Team')['AS']
    .rolling(5, min_periods=1)
    .mean()
    .reset_index(level=0, drop=True)
)

combined_data['rolling_HST'] = (
    combined_data.groupby('Team')['HST']
    .rolling(5, min_periods=1)
    .mean()
    .reset_index(level=0, drop=True)
)
combined_data['rolling_AST'] = (
    combined_data.groupby('Team')['AST']
    .rolling(5, min_periods=1)
    .mean()
    .reset_index(level=0, drop=True)
)

combined_data['rolling_goals_for'] = (
    combined_data.groupby('Team')['GoalsFor']
    .rolling(5, min_periods=1)
    .mean()
    .reset_index(level=0, drop=True)
)

combined_data['rolling_goals_against'] = (
    combined_data.groupby('Team')['GoalsAgainst']
    .rolling(5, min_periods=1)
    .mean()
    .reset_index(level=0, drop=True)
)

combined_data['rolling_home_corners'] = (
    combined_data.groupby('Team')['HC']
    .rolling(5, min_periods=1)
    .mean()
    .reset_index(level=0, drop=True)
)

combined_data['rolling_away_corners'] = (
    combined_data.groupby('Team')['AC']
    .rolling(5, min_periods=1)
    .mean()
    .reset_index(level=0, drop=True)
)
combined_data['rolling_home_yellowcards'] = (
    combined_data.groupby('Team')['HY']
    .rolling(5, min_periods=1)
    .mean()
    .reset_index(level=0, drop=True)
)

combined_data['rolling_away_yellowcards'] = (
    combined_data.groupby('Team')['AY']
    .rolling(5, min_periods=1)
    .mean()
    .reset_index(level=0, drop=True)
)

combined_data['rolling_home_redcards'] = (
    combined_data.groupby('Team')['HR']
    .rolling(5, min_periods=1)
    .mean()
    .reset_index(level=0, drop=True)
)

combined_data['rolling_away_redcards'] = (
    combined_data.groupby('Team')['AR']
    .rolling(5, min_periods=1)
    .mean()
    .reset_index(level=0, drop=True)
)

In [22]:
combined_data

Unnamed: 0,FTR,HS,AS,HST,AST,HC,AC,HY,AY,HR,...,rolling_HST,rolling_AST,rolling_goals_for,rolling_goals_against,rolling_home_corners,rolling_away_corners,rolling_home_yellowcards,rolling_away_yellowcards,rolling_home_redcards,rolling_away_redcards
4561,D,14.0,3.0,4.0,2.0,7.0,0.0,0.0,0.0,0.0,...,4.000000,2.00,0.000000,0.00,7.000000,0.000000,0.000000,0.000000,0.0,0.0
9521,D,7.0,16.0,4.0,6.0,0.0,11.0,2.0,0.0,0.0,...,4.000000,4.00,0.000000,0.00,3.500000,5.500000,1.000000,0.000000,0.0,0.0
9528,A,17.0,11.0,8.0,7.0,10.0,2.0,2.0,2.0,0.0,...,5.333333,5.00,0.666667,0.00,5.666667,4.333333,1.333333,0.666667,0.0,0.0
4590,H,20.0,9.0,12.0,4.0,8.0,3.0,0.0,0.0,0.0,...,7.000000,4.75,2.000000,0.25,6.250000,4.000000,1.000000,0.500000,0.0,0.0
9548,D,14.0,10.0,10.0,4.0,9.0,4.0,1.0,0.0,0.0,...,7.600000,4.60,1.800000,0.40,6.800000,4.000000,1.000000,0.400000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
353,A,10.0,7.0,3.0,2.0,7.0,4.0,1.0,1.0,0.0,...,3.600000,2.20,1.600000,1.00,5.000000,3.400000,1.000000,2.000000,0.0,0.0
5309,H,14.0,12.0,10.0,3.0,0.0,10.0,1.0,2.0,0.0,...,4.600000,2.00,1.200000,1.40,4.800000,4.600000,1.000000,2.000000,0.0,0.0
379,D,18.0,13.0,6.0,7.0,8.0,3.0,2.0,1.0,0.0,...,5.400000,3.00,1.200000,1.60,4.600000,4.800000,1.000000,1.600000,0.0,0.0
4180,,,,,,,,,,,...,,,,,,,,,,


In [23]:
latest_season = combined_data[combined_data['Date'] >= season_start_date]

In [24]:
historical_data = combined_data[combined_data['Date'] < season_start_date]

In [25]:
# training data
X_train = historical_data.drop(['ftr_num', 'Team', 'Opponent', 'Venue', 'Date', 'FTR', 'GoalsFor', 'GoalsAgainst', 'HST', 'AST', 'AS', 'HS', 'HY', 'AY', 'HR', 'AR' ], axis=1)
y_train = historical_data['ftr_num']

In [26]:
# prediction data
X_test = latest_season.drop(['ftr_num', 'Team', 'Opponent', 'Venue', 'Date', 'FTR', 'GoalsFor', 'GoalsAgainst', 'HST', 'AST', 'AS', 'HS', 'HY', 'AY', 'HR', 'AR' ], axis=1)
y_test = latest_season['ftr_num']

In [27]:
rf = RandomForestClassifier(n_estimators=50, min_samples_split=10, random_state=1)

In [28]:
rf.fit(X_train, y_train)

In [29]:
y_test_pred = rf.predict(X_test)

In [30]:
accuracy = accuracy_score(y_test, y_test_pred)

In [31]:
accuracy

0.7526315789473684

In [80]:
def club_correct_win(df, club_name):
    """Return #correct WIN predictions, total wins, pct for `club_name`."""
    club_rows = df[df["Team"] == club_name]

    # a win for the club is:
    #   • they were HOME  and ftr_num ==  1   OR
    #   • they were AWAY  and ftr_num == -1
    is_home = club_rows["Venue"] == "Home"
    actual_win = (is_home & (club_rows["ftr_num"] == 1)) | \
                 (~is_home & (club_rows["ftr_num"] == -1))

    predicted_win = (is_home & (club_rows["predicted"] == 1)) | \
                    (~is_home & (club_rows["predicted"] == -1))

    correct = int((actual_win & predicted_win).sum())
    total_wins = int(actual_win.sum())
    pct = round(correct / total_wins * 100, 1) if total_wins else 0.0
    return {"correct": correct,
            "total_wins": total_wins,
            "accuracy_pct": pct}


In [84]:
top4 = ["Liverpool", "Arsenal", "Man City", "Chelsea"]
summary = {club: club_correct_win(latest_season, club) for club in top4}
print(summary)


{'Liverpool': {'correct': 25, 'total_wins': 25, 'accuracy_pct': 100.0}, 'Arsenal': {'correct': 17, 'total_wins': 20, 'accuracy_pct': 85.0}, 'Man City': {'correct': 18, 'total_wins': 21, 'accuracy_pct': 85.7}, 'Chelsea': {'correct': 19, 'total_wins': 20, 'accuracy_pct': 95.0}}
