In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from scipy.stats import spearmanr

In [2]:
path = "../"  

In [3]:
#Loading the Season Summary sheet into a pandas dataframe
file_path = path + "Premier League Data Analysis.xlsx"
season_summary_df = pd.read_excel(file_path, sheet_name="Season Summary")

#Selecting relevant features and target variable (Rank)
features = ['Goal_Difference', 'ShotOnTarget/Shots', 'Avg_Yellow_Cards/Game', 'Avg_Red_Cards/Game', 'Home_Win_Leading_HT(%)', 'Away_Win_Leading_HT(%)', 'Home_Win_Draw_HT(%)',
           'Away_Win_Draw_HT(%)', 'Home_Win_Losing_HT(%)', 'Away_Win_Losing_HT(%)', 'Manager Changes', 'Caretaker Days', 'Total_Corners']

X = season_summary_df[features]
y = season_summary_df['Rank']

#Replacing 'NAN' string with actual NaN values
X.replace('NAN', np.nan, inplace=True)

#Dropping rows with NaN values
X_clean = X.dropna()
y_clean = y[X_clean.index]  #Ensuring the target variable corresponds to the rows in X_clean

#Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_clean, y_clean, test_size=0.2, random_state=42)

#Initialising and training the model (Gradient Boosting Regressor)
model = GradientBoostingRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

#Making predictions and evaluating the model
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
spearman_corr, _ = spearmanr(y_test, y_pred)
print(f"Mean Absolute Error (MAE): {mae}")
print(f"Mean Squared Error (MSE): {mse}")
print(f"Spearman Rank Correlation: {spearman_corr}")

#Predicting and viewing rankings for the test set
predicted_ranks = pd.DataFrame({
    'Team': season_summary_df.loc[X_test.index, 'Team'],
    'Actual Rank': y_test,
    'Predicted Rank': y_pred
})
print(predicted_ranks)


Mean Absolute Error (MAE): 1.7139460396755966
Mean Squared Error (MSE): 5.138745450263321
Spearman Rank Correlation: 0.9244591970187854
            Team  Actual Rank  Predicted Rank
561  Southampton           12       13.889242
617    Liverpool            3        1.759068
325  Aston Villa            6        5.217353
589  Bournemouth           15       18.967236
441    Tottenham            5        6.945158
..           ...          ...             ...
362        Wigan           16       15.559349
536    Newcastle           14       14.072451
465  Bournemouth            8       12.920306
478   Sunderland           20       18.336823
270      Everton            6        5.806789

[97 rows x 3 columns]


  X.replace('NAN', np.nan, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.replace('NAN', np.nan, inplace=True)


In [4]:
#Loading the 2024-2025 season data
season_2025_df = pd.read_excel(file_path, sheet_name="Partial Data")

#Predicting ranks for 2024-2025 season using the trained model
X_2025 = season_2025_df[features] 
predicted_ranks = model.predict(X_2025)  # Use your trained model here

#Adding the predicted ranks to the dataframe
season_2025_df['Predicted_Rank'] = predicted_ranks

In [5]:
#Sorting the dataframe by 'Predicted_Rank' from lowest to highest
sorted_df = season_2025_df[['Team', 'Predicted_Rank']].sort_values(by='Predicted_Rank')

#Displaying the sorted dataframe
print(sorted_df.to_string(index=False))

          Team  Predicted_Rank
     Liverpool        3.169544
       Arsenal        3.644066
      Man City        4.414721
       Chelsea        4.715249
   Bournemouth        6.146448
     Newcastle        8.155845
     Tottenham        8.272617
 Nott'm Forest        8.721764
     Brentford        9.455386
        Fulham       10.329686
      Brighton       11.286386
   Aston Villa       11.305382
Crystal Palace       13.745264
    Man United       14.453913
      West Ham       14.777999
       Everton       15.228723
        Wolves       17.801841
     Leicester       18.167872
       Ipswich       19.498921
   Southampton       20.557718
