In [3]:
import numpy as np
import pandas as pd
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, log_loss
from time import time

In [6]:
df1 = pd.read_csv('../data/match_data.csv')
X1 = df1.drop(columns=['Date', 'Team1', 'Team2', 'Score'])
y1 = df1['Score']
ds1_name = "match_data"

df2 = pd.read_csv('../data/cleaned_rounds_data.csv')
df2 = df2.drop(columns=[f'player_{i}_{suffix}' for i in range(1, 11) for suffix in ['team_name', 'name']])
X2 = df2.drop(['round_winner'], axis=1)
y2 = df2['round_winner']
ds2_name = "rounds_data"

df3 = pd.read_csv('../data/cleaned_rounds_data_with_stats.csv')
df3 = df3.drop(columns=[f'player_{i}_{suffix}' for i in range(1, 11) for suffix in ['team_name', 'name']])
X3 = df3.drop(columns=['round_winner'])
y3 = df3['round_winner']
ds3_name = "rounds_data_with_stats"

datasets = [(X1, y1, ds1_name), (X2, y2, ds2_name), (X3, y3, ds3_name)]

In [7]:
learning_rates = [0.01, 0.03, 0.05, 0.07, 0.1 ,0.3, 0.5, 0.7, 1]
max_depths = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

In [12]:
best_params = []
for X, y, ds_name in datasets:
    X = StandardScaler().fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42, stratify=y)
    X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.5, random_state=42, stratify=y_test)
    best_acc = 0
    best_lr = 0
    best_n_est = 0
    best_max_d = 0
    start_time = time()
    for lr in learning_rates:
        for max_depth in max_depths:
            model = AdaBoostClassifier(algorithm="SAMME", n_estimators=150, learning_rate=lr, random_state=42, estimator=DecisionTreeClassifier(max_depth=max_depth))
            model.fit(X_train, y_train)
            y_pred = model.predict(X_val)
            acc = accuracy_score(y_val, y_pred)
            if acc > best_acc:
                best_acc = acc
                best_lr = lr
                best_max_d = max_depth
                log_losses = []
                for y_prob in model.staged_predict_proba(X_train):
                    log_losses.append(log_loss(y_train, y_prob, labels=[0, 1]))
                best_n_est = np.argmin(log_losses) + 1
                print(f"Dataset: {ds_name}, New best acuracy: {acc}, Learning Rate: {lr}, N Estimators: {best_n_est}, Max Depth: {max_depth}")
    
    model = AdaBoostClassifier(algorithm="SAMME", n_estimators=best_n_est, learning_rate=best_lr, random_state=42, estimator=DecisionTreeClassifier(max_depth=best_max_d))
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    end_time = time()
    print(f"Dataset: {ds_name}, Test accuracy: {acc}, Learning Rate: {best_lr}, N Estimators: {best_n_est}, Max Depth: {best_max_d}, Duration: {end_time - start_time}'s")
    best_params.append((ds_name, best_lr, best_n_est, best_max_d, acc))

Dataset: match_data, New best acuracy: 0.6086105675146771, Learning Rate: 0.01, N Estimators: 149, Max Depth: 1
Dataset: match_data, New best acuracy: 0.62426614481409, Learning Rate: 0.01, N Estimators: 150, Max Depth: 3
Dataset: match_data, New best acuracy: 0.6281800391389433, Learning Rate: 0.05, N Estimators: 150, Max Depth: 2
Dataset: match_data, Test accuracy: 0.62426614481409, Learning Rate: 0.05, N Estimators: 150, Max Depth: 2, Duration: 120.34997701644897's
Dataset: rounds_data, New best acuracy: 0.5885372112917023, Learning Rate: 0.01, N Estimators: 150, Max Depth: 1
Dataset: rounds_data, New best acuracy: 0.7335329341317365, Learning Rate: 0.01, N Estimators: 150, Max Depth: 2
Dataset: rounds_data, New best acuracy: 0.7583404619332763, Learning Rate: 0.01, N Estimators: 150, Max Depth: 3
Dataset: rounds_data, New best acuracy: 0.760906757912746, Learning Rate: 0.01, N Estimators: 150, Max Depth: 5
Dataset: rounds_data, New best acuracy: 0.7720273738237811, Learning Rate: 0

In [14]:
for ds_name, best_lr, best_n_est, best_max_d, acc in best_params:
    print(f"Dataset: {ds_name}, Test accuracy: {acc}, Learning Rate: {best_lr}, N Estimators: {best_n_est}, Max Depth: {best_max_d}")

Dataset: match_data, Test accuracy: 0.62426614481409, Learning Rate: 0.05, N Estimators: 150, Max Depth: 2
Dataset: rounds_data, Test accuracy: 0.8061617458279846, Learning Rate: 0.1, N Estimators: 22, Max Depth: 9
Dataset: rounds_data_with_stats, Test accuracy: 0.8160034231921267, Learning Rate: 0.05, N Estimators: 52, Max Depth: 10
