In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from time import time

In [3]:
# Per-player identifier columns (team name and player name for player slots
# 1..10) that are removed from both rounds files before modelling.
player_id_columns = [f'player_{i}_{suffix}' for i in range(1, 11) for suffix in ['team_name', 'name']]

# Dataset 1: match-level data. 'Score' is the target; 'Date', 'Team1' and
# 'Team2' are dropped from the feature matrix along with it.
ds1_name = "match_data"
df1 = pd.read_csv('../data/match_data.csv')
y1 = df1['Score']
X1 = df1.drop(['Date', 'Team1', 'Team2', 'Score'], axis=1)

# Dataset 2: round-level data. 'round_winner' is the target.
ds2_name = "rounds_data"
df2 = pd.read_csv('../data/cleaned_rounds_data.csv').drop(columns=player_id_columns)
y2 = df2['round_winner']
X2 = df2.drop(columns=['round_winner'])

# Dataset 3: round-level data from the "with_stats" file, same target.
ds3_name = "rounds_data_with_stats"
df3 = pd.read_csv('../data/cleaned_rounds_data_with_stats.csv').drop(columns=player_id_columns)
y3 = df3['round_winner']
X3 = df3.drop(['round_winner'], axis=1)

# One (features, target, label) tuple per dataset, consumed by the search loop.
datasets = list(zip((X1, X2, X3), (y1, y2, y3), (ds1_name, ds2_name, ds3_name)))

In [10]:
# Hyperparameter grids for the random-forest search.
# Number of trees per forest.
n_estimators = [10, 50, 100, 150, 200, 300, 500, 1000]
# Tree depth limits: every depth from 1 to 10, then coarser steps, and
# finally None (no depth limit).
max_depths = [*range(1, 11), 13, 15, 17, 20, 25, 30, 40, 50, 60, None]
# Minimum number of samples required to split an internal node: 2..10.
min_samples_splits = list(range(2, 11))

In [11]:
# Exhaustive grid search over random-forest hyperparameters, run independently
# for every (features, target, name) entry in `datasets`.
#
# For each dataset:
#   1. split 60/20/20 into train / test / validation (stratified on the target),
#   2. fit the scaler on the training partition only and apply it to all three,
#   3. pick the hyperparameter combination with the best validation accuracy,
#   4. refit with that combination and report accuracy on the held-out test set.
#
# Results are collected in `best_params` as
# (dataset name, n_estimators, max_depth, min_samples_split, test accuracy).
best_params = []
for X, y, ds_name in datasets:
    # Split BEFORE scaling: fitting the scaler on the full dataset would let
    # validation/test statistics leak into the preprocessing step.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42, stratify=y)
    # The 40% hold-out is halved into a test set and a validation set.
    X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.5, random_state=42, stratify=y_test)

    scaler = StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_val = scaler.transform(X_val)
    X_test = scaler.transform(X_test)

    # Start below any reachable accuracy so the first candidate is always
    # recorded; the placeholders below are therefore never used for the final
    # fit as long as the grids are non-empty.
    best_acc = float("-inf")
    best_n_est = None
    best_max_d = None
    best_min_s_s = None
    start_time = time()
    for min_samples_split in min_samples_splits:
        for n_est in n_estimators:
            for max_depth in max_depths:
                model = RandomForestClassifier(criterion="log_loss", n_estimators=n_est, max_depth=max_depth, min_samples_split=min_samples_split, random_state=42, n_jobs=-1, verbose=0)
                model.fit(X_train, y_train)
                y_pred = model.predict(X_val)
                acc = accuracy_score(y_val, y_pred)
                # Strict comparison: on ties the earliest combination wins.
                if acc > best_acc:
                    print(f"Dataset: {ds_name}, New best acuracy: {acc}, N Estimators: {n_est}, Max Depth: {max_depth}, Min Samples Split: {min_samples_split}")
                    best_acc = acc
                    best_n_est = n_est
                    best_max_d = max_depth
                    best_min_s_s = min_samples_split

    # Refit with the selected hyperparameters and score once on the test set,
    # which played no part in model selection.
    model = RandomForestClassifier(criterion="log_loss", n_estimators=best_n_est, max_depth=best_max_d, min_samples_split=best_min_s_s, random_state=42, n_jobs=-1, verbose=0)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    end_time = time()
    # Duration covers the whole search plus the final refit, in seconds.
    print(f"Dataset: {ds_name}, Test accuracy: {acc}, N Estimators: {best_n_est}, Max Depth: {best_max_d}, Min Samples Split {best_min_s_s}, Duration: {end_time - start_time}'s")
    best_params.append((ds_name, best_n_est, best_max_d, best_min_s_s, acc))

Dataset: match_data, New best acuracy: 0.589041095890411, N Estimators: 10, Max Depth: 1, Min Samples Split: 2
Dataset: match_data, New best acuracy: 0.5909980430528375, N Estimators: 10, Max Depth: 2, Min Samples Split: 2
Dataset: match_data, New best acuracy: 0.6066536203522505, N Estimators: 10, Max Depth: 4, Min Samples Split: 2
Dataset: match_data, New best acuracy: 0.6144814090019569, N Estimators: 50, Max Depth: 6, Min Samples Split: 2
Dataset: match_data, New best acuracy: 0.6183953033268101, N Estimators: 200, Max Depth: 8, Min Samples Split: 2
Dataset: match_data, New best acuracy: 0.6223091976516634, N Estimators: 500, Max Depth: 8, Min Samples Split: 2
Dataset: match_data, New best acuracy: 0.6281800391389433, N Estimators: 300, Max Depth: 9, Min Samples Split: 5
Dataset: match_data, Test accuracy: 0.5792563600782779, N Estimators: 300, Max Depth: 9, Min Samples Split 5, Duration: 577.0836751461029's
Dataset: rounds_data, New best acuracy: 0.6629597946963216, N Estimators: 

In [13]:
# Recap: one line per dataset with its test accuracy and the hyperparameters
# that the search selected.
for record in best_params:
    # Each record is (name, n_estimators, max_depth, min_samples_split, accuracy).
    ds_name, n_est, max_d, min_s_s, acc = record
    print(f"Dataset: {ds_name}, Test Accuracy: {acc}, N Estimators: {n_est}, Max Depth: {max_d}, Min Samples Split: {min_s_s}")

Dataset: match_data, Test Accuracy: 0.5792563600782779, N Estimators: 300, Max Depth: 9, Min Samples Split: 5
Dataset: rounds_data, Test Accuracy: 0.834403080872914, N Estimators: 150, Max Depth: 40, Min Samples Split: 3
Dataset: rounds_data_with_stats, Test Accuracy: 0.8258451005562687, N Estimators: 300, Max Depth: 25, Min Samples Split: 7
