In [7]:
from itertools import product
from time import time

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier

In [8]:
# Columns removed from both rounds tables: the `team_name` and `name`
# column of each of players 1..10.
player_label_columns = [
    f'player_{i}_{suffix}'
    for i in range(1, 11)
    for suffix in ['team_name', 'name']
]

# Dataset 1: loaded from match_data.csv; the target is the 'Score' column and
# the Date/Team1/Team2 columns are excluded from the features.
ds1_name = "match_data"
df1 = pd.read_csv('../data/match_data.csv')
y1 = df1['Score']
X1 = df1.drop(columns=['Date', 'Team1', 'Team2', 'Score'])

# Dataset 2: loaded from cleaned_rounds_data.csv; the target is 'round_winner'.
ds2_name = "rounds_data"
df2 = pd.read_csv('../data/cleaned_rounds_data.csv').drop(columns=player_label_columns)
y2 = df2['round_winner']
X2 = df2.drop(columns=['round_winner'])

# Dataset 3: loaded from cleaned_rounds_data_with_stats.csv; same target and
# the same per-player columns dropped as dataset 2.
ds3_name = "rounds_data_with_stats"
df3 = pd.read_csv('../data/cleaned_rounds_data_with_stats.csv').drop(columns=player_label_columns)
y3 = df3['round_winner']
X3 = df3.drop(columns=['round_winner'])

# (features, target, label) triples consumed by the search loop.
datasets = [
    (X1, y1, ds1_name),
    (X2, y2, ds2_name),
    (X3, y3, ds3_name),
]

In [9]:
# Candidate values for the grid search; each list is passed to XGBClassifier
# under the keyword argument of the same name.
max_depth = list(range(3, 12, 2))  # 3, 5, 7, 9, 11
learning_rate = [
    0.01,
    0.03,
    0.05,
    0.1,
    0.3,
]
gamma = [
    0,
    0.1,
    0.2,
    0.3,
    0.4,
]
reg_lambda = [
    0,
    0.1,
    0.2,
    0.3,
    0.4,
]

In [12]:
# Exhaustive grid search over (max_depth, learning_rate, gamma, reg_lambda)
# for every dataset in `datasets`.
#
# Per dataset:
#   1. Split 60/20/20 into train / test / validation (stratified, fixed seed).
#   2. Fit the scaler on the training split only and apply it to all three
#      splits, so no validation/test statistics leak into preprocessing.
#   3. For each grid point, train with early stopping on the validation split
#      and keep the configuration with the highest validation accuracy.
#   4. Refit the winning configuration on the training split with the selected
#      number of boosting rounds and report accuracy on the untouched test split.
#
# Result: `best_params` holds one tuple per dataset:
#   (ds_name, max_depth, learning_rate, gamma, reg_lambda, n_estimators, test_accuracy)
best_params = []
for X, y, ds_name in datasets:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42, stratify=y)
    X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.5, random_state=42, stratify=y_test)

    # Fit scaling statistics on the training rows only, then reuse them.
    scaler = StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_val = scaler.transform(X_val)
    X_test = scaler.transform(X_test)

    # Start below any attainable accuracy so the first configuration is always
    # recorded (otherwise an accuracy of exactly 0 would leave n_estimators at 0).
    best_acc = float("-inf")
    best_md = 0
    best_lr = 0
    best_g = 0
    best_rl = 0
    best_n_estimators = 0
    time_start = time()
    # product() iterates in the same order as the equivalent nested loops
    # (right-most grid varies fastest).
    for md, lr, g, rl in product(max_depth, learning_rate, gamma, reg_lambda):
        model = XGBClassifier(max_depth=md, n_estimators=500, learning_rate=lr, gamma=g, reg_lambda=rl, early_stopping_rounds=20)
        model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=0)
        acc = model.score(X_val, y_val)
        if acc > best_acc:
            best_md, best_lr, best_g, best_rl, best_acc = md, lr, g, rl, acc
            # best_iteration is the 0-based round with the best validation
            # metric; reading it avoids hard-coding the metric name
            # ('logloss' only exists for binary targets).
            best_n_estimators = model.best_iteration + 1
            print(f"{ds_name}: new best validation accuracy={acc} with max_depth={md}, learning_rate={lr}, gamma={g}, reg_lambda={rl}, n_estimators={best_n_estimators}")

    # Refit the selected configuration without early stopping and score it on
    # the test split, which played no part in model selection.
    model = XGBClassifier(max_depth=best_md, learning_rate=best_lr, gamma=best_g, reg_lambda=best_rl, n_estimators=best_n_estimators)
    model.fit(X_train, y_train)
    acc = model.score(X_test, y_test)
    time_end = time()
    print(f"Dataset: {ds_name}, Accuracy:{acc}, Best max_depth:{best_md}, Best learning_rate:{best_lr}, Best gamma:{best_g}, Best reg_lambda:{best_rl}, Best n_estimators: {best_n_estimators}, Duration:{time_end-time_start}")

    best_params.append((ds_name, best_md, best_lr, best_g, best_rl, best_n_estimators, acc))

match_data: new best accuracy=0.6027397260273972 with max_depth=3, learning_rate=0.01, gamma=0, reg_lambda=0, n_estimators=163
match_data: new best accuracy=0.6046966731898239 with max_depth=3, learning_rate=0.05, gamma=0, reg_lambda=0.1, n_estimators=55
match_data: new best accuracy=0.6066536203522505 with max_depth=3, learning_rate=0.1, gamma=0, reg_lambda=0.4, n_estimators=17
match_data: new best accuracy=0.6086105675146771 with max_depth=3, learning_rate=0.3, gamma=0, reg_lambda=0, n_estimators=8
match_data: new best accuracy=0.6301369863013698 with max_depth=3, learning_rate=0.3, gamma=0, reg_lambda=0.2, n_estimators=7
Dataset: match_data, Accuracy:0.6164383561643836, Best max_depth:3, Best learning_rate:0.3, Best gamma:0, Best reg_lambda:0.2, Best n_estimators: 7, Duration:38.15264415740967
rounds_data: new best accuracy=0.7784431137724551 with max_depth=3, learning_rate=0.01, gamma=0, reg_lambda=0, n_estimators=500
rounds_data: new best accuracy=0.7788708297690333 with max_depth

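NOTE: The per-configuration "new best ..." lines in the log above report accuracy on the validation set (the best value found so far during the search), not test accuracy.
The test accuracy of the final model for each dataset is printed below.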

In [14]:
# Replay the stored search results, one line per dataset. `acc` is the last
# field of each record, i.e. the score the search cell computed on the test split.
for record in best_params:
    ds_name, best_md, best_lr, best_g, best_rl, best_n_estimators, acc = record
    print(f"Dataset: {ds_name}, Accuracy:{acc}, Best max_depth:{best_md}, Best learning_rate:{best_lr}, Best gamma:{best_g}, Best reg_lambda:{best_rl}, Best n_estimators: {best_n_estimators}")

Dataset: match_data, Accuracy:0.6164383561643836, Best max_depth:3, Best learning_rate:0.3, Best gamma:0, Best reg_lambda:0.2, Best n_estimators: 7
Dataset: rounds_data, Accuracy:0.8241335044929397, Best max_depth:9, Best learning_rate:0.05, Best gamma:0.2, Best reg_lambda:0.2, Best n_estimators: 146
Dataset: rounds_data_with_stats, Accuracy:0.8241335044929397, Best max_depth:9, Best learning_rate:0.1, Best gamma:0.3, Best reg_lambda:0, Best n_estimators: 62
