In [6]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, log_loss, precision_score, recall_score

In [7]:
# Read the preprocessed match table from disk and preview its first rows.
matches_csv = './../data/preprocessed_matches.csv'
df = pd.read_csv(matches_csv)
df.head(n=5)

Unnamed: 0,match_id,season,kickoff,home_team,home_team_id,home_score,away_team,away_team_id,away_score,outcome_label,...,home_pts_roll,away_gf_roll,away_ga_roll,away_pts_roll,h2h_avg_points_home,h2h_avg_points_away,rest_days_home,rest_days_away,rest_days_diff,target
0,803169,2015,2015-08-08 12:45:00,Manchester United,1,1,Tottenham Hotspur,6,0,H_or_D,...,1.5,1.5,1.5,1.5,1.0,1.0,7.0,7.0,0.0,0
1,803170,2015,2015-08-08 15:00:00,Everton,11,2,Watford,57,2,H_or_D,...,1.5,1.5,1.5,1.5,1.0,1.0,7.0,7.0,0.0,0
2,803162,2015,2015-08-08 15:00:00,Bournemouth,91,0,Aston Villa,7,1,A,...,1.5,1.5,1.5,1.5,1.0,1.0,7.0,7.0,0.0,1
3,803163,2015,2015-08-08 15:00:00,Norwich City,45,1,Crystal Palace,31,3,A,...,1.5,1.5,1.5,1.5,1.0,1.0,7.0,7.0,0.0,1
4,803167,2015,2015-08-08 15:00:00,Leicester City,13,4,Sunderland,56,2,H_or_D,...,1.5,1.5,1.5,1.5,1.0,1.0,7.0,7.0,0.0,0


In [8]:
# Columns fed to the model as inputs, listed one per line and grouped by
# column-name prefix. Order matters: it fixes the column order of X_train/X_test.
features = [
    # Elo columns
    'elo_home_pre',
    'elo_away_pre',
    'elo_diff_pre',
    # home-side rolling columns
    'home_gf_roll',
    'home_ga_roll',
    'home_pts_roll',
    # away-side rolling columns
    'away_gf_roll',
    'away_ga_roll',
    'away_pts_roll',
    # rest-day columns
    'rest_days_home',
    'rest_days_away',
    'rest_days_diff',
    # head-to-head columns
    'h2h_avg_points_home',
    'h2h_avg_points_away',
]

In [9]:
# Keep every row whose season is not 2015, then list the seasons that remain.
# NOTE(review): the 2015 rows shown by head() all share identical rolling/h2h
# values, so they look default-filled -- confirm that is why the season is dropped.
keep_rows = df['season'].ne(2015)
df = df[keep_rows]
df['season'].unique()

array([2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024, 2025])

In [11]:
# Hyperparameters for the boosted-tree classifier, gathered in one mapping and
# unpacked into the constructor. The same estimator object is refit in each
# cross-validation fold.
xgb_params = {
    "n_estimators": 100,
    "max_depth": 3,
    "learning_rate": 0.01543079811345041,
    "subsample": 1,
    "colsample_bytree": 0.8,
    "random_state": 42,
    "objective": "binary:logistic",
    "eval_metric": "logloss",
}
xgb = XGBClassifier(**xgb_params)

In [None]:
# Time-series (expanding-window) cross-validation: for every season after the
# first, fit on all earlier seasons and evaluate on that single season.
# np.sort makes the chronological order explicit instead of relying on row order.
seasons = np.sort(df["season"].unique())

# Per-fold test metrics.
fold_accuracy = []
fold_logloss = []
fold_precision = []
fold_recall = []
# Per-fold training-set size and train/test log-loss.
train_sizes = []
train_scores = []
test_scores = []

# Build the report label from the estimator itself so the printed text cannot
# drift from the model that was actually trained.
model_label = (
    f"{type(xgb).__name__}"
    f"(n_estimators={xgb.n_estimators}, max_depth={xgb.max_depth})"
)

for i in range(1, len(seasons)):
    train_df = df[df["season"].isin(seasons[:i])]
    test_df = df[df["season"] == seasons[i]]

    X_train = train_df[features]
    y_train = train_df['target']
    X_test = test_df[features]
    y_test = test_df['target']

    # fit() returns the estimator itself; each call retrains on this fold's data.
    model = xgb.fit(X_train, y_train)

    # predict() returns hard 0/1 labels. Log-loss and the 0.5 threshold below
    # need the probability of the positive class, i.e. column 1 of predict_proba.
    preds_train = model.predict_proba(X_train)[:, 1]
    preds_test = model.predict_proba(X_test)[:, 1]

    pred_labels = (preds_test >= 0.5).astype(int)

    # Compute each metric once and reuse it for both bookkeeping and printing.
    # labels=[0, 1] keeps log_loss defined when a fold contains a single class.
    acc = accuracy_score(y_test, pred_labels)
    test_ll = log_loss(y_test, preds_test, labels=[0, 1])
    train_ll = log_loss(y_train, preds_train, labels=[0, 1])
    prec = precision_score(y_test, pred_labels, average='binary')
    rec = recall_score(y_test, pred_labels, average='binary')

    fold_accuracy.append(acc)
    fold_logloss.append(test_ll)
    fold_precision.append(prec)
    fold_recall.append(rec)
    print(f"{model_label} \n"
          f"Train({seasons[:i]}) with Test ({[seasons[i]]}) \n"
          f"Accuracy: {acc}, "
          f"Logloss: {test_ll}, "
          f"Precision: {prec}, "
          f"Recall: {rec}")

    train_sizes.append(len(train_df))
    test_scores.append(test_ll)
    train_scores.append(train_ll)

print("---------------------------Final---------------------------")
print(f"{model_label} "
      f"Accuracy: {np.mean(fold_accuracy)}, Logloss: {np.mean(fold_logloss)}, Precision: {np.mean(fold_precision)}, Recall: {np.mean(fold_recall)}")