In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_absolute_error, mean_squared_error
from xgboost import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

In [None]:
path = "../"

In [6]:
#Loading the data into a pandas dataframe
file_path = path + "Premier League Data Analysis.xlsx"
season_summary_df = pd.read_excel(file_path, sheet_name="Season Summary")

#Replacing 'NAN' strings with actual np.nan
season_summary_df.replace('NAN', np.nan, inplace=True)

#Dropping rows with missing values (NaN)
season_summary_df = season_summary_df.dropna()


#Selecting relevant features and target variable (Rank)
features = ['Goal_Difference', 'ShotOnTarget/Shots', 'Avg_Yellow_Cards/Game', 'Avg_Red_Cards/Game', 'Home_Win_Leading_HT(%)', 'Away_Win_Leading_HT(%)', 'Home_Win_Draw_HT(%)',
           'Away_Win_Draw_HT(%)', 'Home_Win_Losing_HT(%)', 'Away_Win_Losing_HT(%)', 'Manager Changes', 'Caretaker Days', 'Total_Corners']
X = season_summary_df[features]
y = season_summary_df['Rank']

#Defining the models we want to compare
models = {
    "Gradient Boosting": GradientBoostingRegressor(),
    "XGBoost": XGBRegressor(),
    "LightGBM": LGBMRegressor(),
    "CatBoost": CatBoostRegressor(verbose=0)
}

#Initialising KFold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)
results = {}

#Evaluating each model using K-Fold Cross-Validation
for model_name, model in models.items():
    mae_scores = []
    mse_scores = []
    
    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        #Training the model on the training fold
        model.fit(X_train, y_train)
        
        #Predicting on the test fold
        y_pred = model.predict(X_test)
        
        #Calculating the MAE and MSE for each fold
        mae = mean_absolute_error(y_test, y_pred)
        mse = mean_squared_error(y_test, y_pred)
        mae_scores.append(mae)
        mse_scores.append(mse)
    
    #Calculating average MAE and MSE across all folds
    avg_mae = np.mean(mae_scores)
    avg_mse = np.mean(mse_scores)

    #Storing results
    results[model_name] = {'MAE': avg_mae, 'MSE': avg_mse}

#Printing the results for each model
for model_name, metrics in results.items():
    print(f"Model: {model_name}")
    print(f"Average MAE: {metrics['MAE']:.4f}")
    print(f"Average MSE: {metrics['MSE']:.4f}")
    print("\n")

  season_summary_df.replace('NAN', np.nan, inplace=True)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000101 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 507
[LightGBM] [Info] Number of data points in the train set: 386, number of used features: 13
[LightGBM] [Info] Start training from score 10.248705
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000077 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 518
[LightGBM] [Info] Number of data points in the train set: 386, number of used features: 13
[LightGBM] [Info] Start training from score 10.103627
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000082 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 503
[LightGBM] [Info] Number of data points in the train set: 386