In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from scipy.stats import spearmanr
from sklearn.model_selection import GridSearchCV, KFold

In [13]:
path = "C:\\Users\\antoi\\Documents\\University\\IGP\\Excel Documents\\"  

In [19]:
#Loading the Season Summary sheet
file_path = path + "Premier League Data Analysis.xlsx"
season_summary_df = pd.read_excel(file_path, sheet_name="Season Summary")

#Selecting relevant features and target variable (Rank)
features = ['Goal_Difference', 'ShotOnTarget/Shots', 'Avg_Yellow_Cards/Game', 'Avg_Red_Cards/Game', 'Home_Win_Leading_HT(%)', 'Away_Win_Leading_HT(%)', 'Home_Win_Draw_HT(%)',
           'Away_Win_Draw_HT(%)', 'Home_Win_Losing_HT(%)', 'Away_Win_Losing_HT(%)', 'Manager Changes', 'Caretaker Days', 'Total_Corners']

#Replacing 'NAN' strings with actual NaN values
season_summary_df.replace('NAN', np.nan, inplace=True)

#Droping rows with missing values
season_summary_df.dropna(inplace=True)

#Defining features and target variables
X = season_summary_df[features]  
y = season_summary_df['Rank']  

#Defining the hyperparameter grid for Gradient Boosting
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 4, 5],
    'subsample': [0.8, 1.0]
}

#Setting up KFold Cross-Validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

#Initialising the Gradient Boosting Regressor
model = GradientBoostingRegressor()

#Setting up GridSearchCV to perform hyperparameter tuning with Cross-Validation
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=kf,  #Using KFold cross-validation
    scoring='neg_mean_squared_error',  #Using MSE as the scoring metric
    verbose=1,  
    n_jobs=-1  
)

#Fitting GridSearchCV to the data and printing the best parameters found by GridSearchCV
grid_search.fit(X, y)
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validated MSE:", -grid_search.best_score_)  # Negated because it's neg_mean_squared_error

#Using the best model (with the best hyperparameters) for prediction, fitting it and using it to make predictions
best_model = grid_search.best_estimator_
best_model.fit(X, y)
predictions = best_model.predict(X)

#Evaluating the model's performance using MSE and MAE
mse = mean_squared_error(y, predictions)
print("Mean Squared Error (MSE):", mse)

#Adding predictions back to the dataframe
season_summary_df['Predicted_Rank'] = predictions

#Calculating the Spearman Rank Correlation
spearman_corr, _ = spearmanr(season_summary_df['Rank'], season_summary_df['Predicted_Rank'])
print(f"Spearman Rank Correlation: {spearman_corr:.4f}")
print(season_summary_df[['Team', 'Predicted_Rank']].sort_values(by='Predicted_Rank'))

#Finally, predicting the 2024-2025 season final standings
season_2025_df = pd.read_excel(file_path, sheet_name='Partial Data') 
X_2025 = season_2025_df[features]
season_2025_df['Predicted_Rank'] = best_model.predict(X_2025)
season_2025_df = season_2025_df.sort_values(by='Predicted_Rank')
print(season_2025_df[['Team', 'Predicted_Rank']].sort_values(by='Predicted_Rank'))

  season_summary_df.replace('NAN', np.nan, inplace=True)


Fitting 5 folds for each of 54 candidates, totalling 270 fits
Best Parameters: {'learning_rate': 0.05, 'max_depth': 4, 'n_estimators': 100, 'subsample': 0.8}
Best Cross-Validated MSE: 3.5411209483597332
Mean Squared Error (MSE): 0.9473291430260239
Spearman Rank Correlation: 0.9860
             Team  Predicted_Rank
250       Chelsea        1.199514
467       Chelsea        1.264749
295    Man United        1.270687
204       Arsenal        1.284834
533     Liverpool        1.323992
..            ...             ...
259    Sunderland       19.990201
513  Huddersfield       20.117128
563       Watford       20.700268
546   Bournemouth       20.733121
559       Norwich       22.042131

[483 rows x 2 columns]
              Team  Predicted_Rank
11       Liverpool        2.801082
0          Arsenal        4.866737
12        Man City        5.077285
5          Chelsea        5.579634
2      Bournemouth        6.560475
14       Newcastle        7.474201
15   Nott'm Forest        7.611317
17    