In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read the preprocessed match table; the path is relative to the notebook's working directory.
df = pd.read_csv('./../data/preprocessed_matches.csv')
df.head()  # not the last expression of the cell, so this preview is not displayed

# The label columns are renamed only when a 'winner' column is present;
# otherwise the frame is used exactly as loaded.
if 'winner' in df.columns:
    df = df.rename(columns={'winner': 'winner_label', 'outcome': 'outcome_label'})

# String label -> integer class id.
label_map = {'H_or_D': 0, 'A': 1}

In [3]:
# Complete list of model input columns, one per line, grouped by column-name family.
features = [
    # elo_* columns
    'elo_home_pre',
    'elo_away_pre',
    'elo_diff_pre',
    # home_*_roll columns
    'home_gf_roll',
    'home_ga_roll',
    'home_pts_roll',
    # away_*_roll columns
    'away_gf_roll',
    'away_ga_roll',
    'away_pts_roll',
    # rest_days_* columns
    'rest_days_home',
    'rest_days_away',
    'rest_days_diff',
    # h2h_* columns
    'h2h_avg_points_home',
    'h2h_avg_points_away',
]

In [8]:
# Drop every row of season 2015 (this rebinds `df`, so the unfiltered frame is
# gone after this cell), then show which seasons are left.
df = df.loc[df['season'] != 2015]
df['season'].unique()

array([2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024, 2025])

In [8]:
import wandb

# Log in to Weights & Biases; the sweep cells further down create sweeps and runs through this client.
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mphilip-baumann[0m ([33mphilip-baumann-hslu[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [8]:
# Column groups from which every candidate feature subset is assembled.
_elo = ['elo_home_pre', 'elo_away_pre', 'elo_diff_pre']
_form = ['home_gf_roll', 'home_ga_roll', 'home_pts_roll',
         'away_gf_roll', 'away_ga_roll', 'away_pts_roll']
_rest = ['rest_days_home', 'rest_days_away', 'rest_days_diff']
_h2h = ['h2h_avg_points_home', 'h2h_avg_points_away']

# Candidate feature subsets, one sweep per entry. Every entry is its own list
# object, and the column order inside each entry is kept exactly as written
# because the sweep cells hash str(subset) to build the W&B project name.
features_subsets = [
    _elo + _form + _rest + _h2h,

    [*_elo],
    _elo + _h2h,
    _elo + _rest,
    _elo + _form,
    _elo + _form + _rest,

    [*_form],
    _form + _rest,
    _form + _h2h,
    _form + _rest + _h2h,

    [*_rest],
    _rest + _form + _h2h,
    _rest + _form + _elo,
    [*_h2h],
]
# Grid sweep for the random forest: 4 n_estimators values x 5 max_depth values
# = 20 configurations, compared on the logged "logloss" (lower is better).
sweep_config = dict(
    method="grid",
    metric=dict(name="logloss", goal="minimize"),
    parameters=dict(
        n_estimators=dict(values=[50, 100, 200, 500]),
        max_depth=dict(values=[None, 50, 100, 200, 500]),
    ),
)

In [10]:
from sklearn.metrics import accuracy_score, log_loss, precision_score, recall_score
from sklearn.ensemble import RandomForestClassifier

def train_wrapper(feature_subset):
    """Create the W&B sweep entry point that cross-validates a random forest.

    The returned zero-argument function starts a W&B run, reads
    ``n_estimators`` and ``max_depth`` from ``run.config`` and evaluates them
    with an expanding-window split over seasons: for every season after the
    first, the forest is fitted on all earlier seasons and scored on that
    season. The per-fold accuracy, log loss, macro precision and macro recall
    are averaged and logged once per run.

    Args:
        feature_subset: Column names of the module-level ``df`` used as model
            inputs. The label is read from ``df['target']``.

    Returns:
        A callable suitable for ``wandb.agent(..., function=...)``.
    """
    def train():
        # Series.unique() keeps order of appearance; sort explicitly so that
        # "train on seasons[:i], test on seasons[i]" is always chronological.
        seasons = np.sort(df["season"].unique())
        fold_accuracy = []
        fold_logloss = []
        fold_precision = []
        fold_recall = []

        run = wandb.init()
        try:
            config = run.config
            # An f-string already renders None as 'None'.
            run.name = f"rf_n{config.n_estimators}_md{config.max_depth}"

            model = RandomForestClassifier(
                n_estimators=config.n_estimators,
                max_depth=config.max_depth,
                random_state=42,
                class_weight='balanced',
            )

            for i in range(1, len(seasons)):
                train_seasons = seasons[:i]
                test_seasons = [seasons[i]]

                train_df = df[df["season"].isin(train_seasons)]
                test_df = df[df["season"].isin(test_seasons)]

                X_train = train_df[feature_subset]
                y_train = train_df['target']
                X_test = test_df[feature_subset]
                y_test = test_df['target']

                # fit() rebuilds the forest from scratch on every call, so
                # reusing one estimator object across folds is safe.
                model.fit(X_train, y_train)

                prob = model.predict_proba(X_test)
                # argmax yields column indices; translate them into the actual
                # class labels so every metric compares labels with labels.
                pred_labels = model.classes_[np.argmax(prob, axis=1)]

                fold_accuracy.append(accuracy_score(y_test, pred_labels))
                # Passing the label set keeps log_loss aligned with the
                # probability columns even if a test season lacks a class.
                fold_logloss.append(log_loss(y_test, prob, labels=model.classes_))
                fold_precision.append(
                    precision_score(y_test, pred_labels, average='macro', zero_division=0)
                )
                fold_recall.append(
                    recall_score(y_test, pred_labels, average='macro', zero_division=0)
                )

            wandb.log({
                "accuracy": np.mean(fold_accuracy),
                "logloss": np.mean(fold_logloss),
                "precision": np.mean(fold_precision),
                "recall": np.mean(fold_recall),
            })
        finally:
            # Close the run even when a fold fails, so the next sweep run
            # does not start while this one is still open.
            wandb.finish()

    return train



In [None]:
import hashlib

# Lookup from the 8-character SHA-1 prefix used in each W&B project name
# back to the feature subset it stands for.
tracker = {}

for subset in features_subsets:
    digest = hashlib.sha1(str(subset).encode()).hexdigest()
    short = digest[:8]
    tracker[short] = subset
    # One sweep per subset, each in its own project; the agent runs at most 20 trials.
    sweep_id = wandb.sweep(sweep_config, project=f"rf-model-7-{short}")
    wandb.agent(sweep_id, function=train_wrapper(subset), count=20)


In [16]:
# Show which feature subset is stored under the short hash 37f16e03
# (the hash is the suffix of the sweep's W&B project name).
tracker["37f16e03"]

['elo_home_pre',
 'elo_away_pre',
 'elo_diff_pre',
 'home_gf_roll',
 'home_ga_roll',
 'home_pts_roll',
 'away_gf_roll',
 'away_ga_roll',
 'away_pts_roll',
 'rest_days_home',
 'rest_days_away',
 'rest_days_diff',
 'h2h_avg_points_home',
 'h2h_avg_points_away']

In [11]:
# Bayesian sweep over LightGBM hyperparameters, compared on the logged
# "logloss" (lower is better). Note that this rebinds the `sweep_config`
# name that the random-forest cells also use.
sweep_config = {
    "method": "bayes",  # author's note: better than random search for log loss
    "metric": {"name": "logloss", "goal": "minimize"},
    "parameters": {
        "learning_rate": {"distribution": "log_uniform_values", "min": 0.01, "max": 0.1},
        "num_leaves": {"values": [15, 31, 63, 127]},
        "min_data_in_leaf": {"values": [10, 20, 50, 100]},
        "feature_fraction": {"values": [0.7, 0.8, 0.9, 1.0]},
        "bagging_fraction": {"values": [0.7, 0.8, 0.9, 1.0]},
        "bagging_freq": {"values": [0, 5, 10]},
    },
}


In [16]:
import lightgbm as lgb
def train_wrapper_lightgbm(feature_subset, num_boost_round=1000):
    """Create the W&B sweep entry point that cross-validates a LightGBM booster.

    The returned zero-argument function starts a W&B run, reads the sampled
    hyperparameters from ``run.config`` and evaluates them with an
    expanding-window split over seasons: for every season after the first, a
    booster is trained on all earlier seasons and scored on that season. The
    per-fold accuracy, log loss, macro precision and macro recall are averaged
    and logged once per run.

    Args:
        feature_subset: Column names of the module-level ``df`` used as model
            inputs. The label is read from ``df['target']``.
        num_boost_round: Boosting iterations per fold. Defaults to 1000, the
            value that was previously hard-coded.

    Returns:
        A callable suitable for ``wandb.agent(..., function=...)``.
    """
    def train():
        # Series.unique() keeps order of appearance; sort explicitly so that
        # "train on seasons[:i], test on seasons[i]" is always chronological.
        seasons = np.sort(df["season"].unique())
        fold_accuracy = []
        fold_logloss = []
        fold_precision = []
        fold_recall = []

        run = wandb.init()
        try:
            config = run.config
            run.name = (
                f"lightgbm_lr-{config.learning_rate}_nl-{config.num_leaves}_mdil-{config.min_data_in_leaf}_fr-{config.feature_fraction}_bfg-{config.bagging_fraction}_bagging_freq-{config.bagging_freq}"
            )

            params = {
                "objective": "binary",
                "metric": "binary_logloss",
                "verbosity": -1,
                "seed": 42,
                "learning_rate": config.learning_rate,
                "num_leaves": config.num_leaves,
                "min_data_in_leaf": config.min_data_in_leaf,
                "feature_fraction": config.feature_fraction,
                # NOTE(review): per the LightGBM parameter docs, bagging_freq=0
                # disables bagging, so bagging_fraction has no effect in those
                # trials - confirm this is intended in the search space.
                "bagging_fraction": config.bagging_fraction,
                "bagging_freq": config.bagging_freq,
            }

            for i in range(1, len(seasons)):
                train_seasons = seasons[:i]
                test_seasons = [seasons[i]]

                train_df = df[df["season"].isin(train_seasons)]
                test_df = df[df["season"].isin(test_seasons)]

                X_train = train_df[feature_subset]
                y_train = train_df['target']
                X_test = test_df[feature_subset]
                y_test = test_df['target']

                lgb_tr = lgb.Dataset(X_train, label=y_train)
                model = lgb.train(params, lgb_tr, num_boost_round=num_boost_round)

                # With the binary objective, predict() returns P(target == 1).
                preds = model.predict(X_test)
                pred_labels = (preds >= 0.5).astype(int)

                fold_accuracy.append(accuracy_score(y_test, pred_labels))
                # Name both classes so a test season that happens to contain
                # only one of them does not make log_loss raise.
                fold_logloss.append(log_loss(y_test, preds, labels=[0, 1]))
                fold_precision.append(
                    precision_score(y_test, pred_labels, average='macro', zero_division=0)
                )
                fold_recall.append(
                    recall_score(y_test, pred_labels, average='macro', zero_division=0)
                )

            wandb.log({
                "accuracy": np.mean(fold_accuracy),
                "logloss": np.mean(fold_logloss),
                "precision": np.mean(fold_precision),
                "recall": np.mean(fold_recall),
            })
        finally:
            # Close the run even when a fold fails, so the next sweep run
            # does not start while this one is still open.
            wandb.finish()

    return train



In [None]:
import hashlib

# Lookup from the 8-character SHA-1 prefix used in each W&B project name
# back to the feature subset it stands for.
tracker = {}

for subset in features_subsets:
    digest = hashlib.sha1(str(subset).encode()).hexdigest()
    short = digest[:8]
    tracker[short] = subset
    # One sweep per subset, each in its own project; the agent runs at most 20 trials.
    sweep_id = wandb.sweep(sweep_config, project=f"lightgbm-model-3-{short}")
    wandb.agent(sweep_id, function=train_wrapper_lightgbm(subset), count=20)

In [10]:
# Show which feature subset is stored under the short hash 04a61c2b
# (the hash is the suffix of the sweep's W&B project name).
tracker["04a61c2b"]

['elo_home_pre',
 'elo_away_pre',
 'elo_diff_pre',
 'home_gf_roll',
 'home_ga_roll',
 'home_pts_roll',
 'away_gf_roll',
 'away_ga_roll',
 'away_pts_roll']

In [10]:
# Bayesian sweep over XGBoost hyperparameters, compared on the logged
# "logloss" (lower is better). Author's note on the method: better than
# random search for log loss.
sweep_params = dict(
    method="bayes",
    metric=dict(name="logloss", goal="minimize"),
    parameters=dict(
        learning_rate=dict(distribution="log_uniform_values", min=0.01, max=0.2),
        n_estimators=dict(values=[100, 200, 400, 800]),
        max_depth=dict(values=[3, 4, 5, 6, 8]),
        min_child_weight=dict(values=[1, 5, 10, 20]),
        subsample=dict(values=[0.6, 0.8, 1.0]),
        colsample_bytree=dict(values=[0.6, 0.8, 1.0]),
    ),
)


In [11]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, log_loss, precision_score, recall_score

def train_wrapper_xgboost(feature_subset):
    """Create the W&B sweep entry point that cross-validates an XGBoost classifier.

    The returned zero-argument function starts a W&B run, reads the sampled
    hyperparameters from ``run.config`` and evaluates them with an
    expanding-window split over seasons: for every season after the first, the
    classifier is fitted on all earlier seasons and scored on that season. The
    per-fold accuracy, log loss, macro precision and macro recall are averaged
    and logged once per run.

    Args:
        feature_subset: Column names of the module-level ``df`` used as model
            inputs. The label is read from ``df['target']``.

    Returns:
        A callable suitable for ``wandb.agent(..., function=...)``.
    """
    def train():
        # Series.unique() keeps order of appearance; sort explicitly so that
        # "train on seasons[:i], test on seasons[i]" is always chronological.
        seasons = np.sort(df["season"].unique())
        fold_accuracy = []
        fold_logloss = []
        fold_precision = []
        fold_recall = []

        run = wandb.init()
        try:
            config = run.config
            run.name = (
                f"xgbclassifier_lr-{config.learning_rate}_n-{config.n_estimators}md-{config.max_depth}_mcw-{config.min_child_weight}_sub-{config.subsample}_cb-{config.colsample_bytree}"
            )

            xgb_model = XGBClassifier(
                n_estimators=config.n_estimators,
                max_depth=config.max_depth,
                learning_rate=config.learning_rate,
                # Swept and shown in the run name, so it must actually reach
                # the model; otherwise this search dimension has no effect.
                min_child_weight=config.min_child_weight,
                subsample=config.subsample,
                colsample_bytree=config.colsample_bytree,
                random_state=42,
                objective="binary:logistic",
                eval_metric="logloss"
            )

            for i in range(1, len(seasons)):
                train_seasons = seasons[:i]
                test_seasons = [seasons[i]]

                train_df = df[df["season"].isin(train_seasons)]
                test_df = df[df["season"].isin(test_seasons)]

                X_train = train_df[feature_subset]
                y_train = train_df['target']
                X_test = test_df[feature_subset]
                y_test = test_df['target']

                xgb_model.fit(X_train, y_train)

                # Column 1 of predict_proba is the probability of class 1.
                prob = xgb_model.predict_proba(X_test)[:, 1]
                pred_labels = (prob >= 0.5).astype(int)

                fold_accuracy.append(accuracy_score(y_test, pred_labels))
                # Name both classes so a test season that happens to contain
                # only one of them does not make log_loss raise.
                fold_logloss.append(log_loss(y_test, prob, labels=[0, 1]))
                # Macro averaging, matching the random-forest and LightGBM
                # wrappers, so precision/recall are comparable across models.
                fold_precision.append(
                    precision_score(y_test, pred_labels, average='macro', zero_division=0)
                )
                fold_recall.append(
                    recall_score(y_test, pred_labels, average='macro', zero_division=0)
                )

            wandb.log({
                "accuracy": np.mean(fold_accuracy),
                "logloss": np.mean(fold_logloss),
                "precision": np.mean(fold_precision),
                "recall": np.mean(fold_recall),
            })
        finally:
            # Close the run even when a fold fails, so the next sweep run
            # does not start while this one is still open.
            wandb.finish()

    return train



In [4]:
import hashlib

# Rebuild the short-hash -> feature-subset lookup only; the sweep launch
# below is commented out, so running this cell starts no W&B runs.
tracker = {}

for subset in features_subsets:
    short = hashlib.sha1(str(subset).encode()).hexdigest()[:8]
    tracker[short] = subset
    # Disabled sweep launch, kept for reference:
    # sweep_id = wandb.sweep(sweep_params, project=f"xgboost-model-3-{short}")
    # wandb.agent(sweep_id, function=train_wrapper_xgboost(subset), count=20)

In [11]:
# Show which feature subset is stored under the short hash a1d097d6.
tracker["a1d097d6"]

['elo_home_pre', 'elo_away_pre', 'elo_diff_pre']