In [8]:
from time import time

import pandas as pd
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [9]:
from sklearn.naive_bayes import GaussianNB
# Read the processed match table from disk into a DataFrame.
matches_csv_path = './data/processed_data/matches_sorted.csv'
df = pd.read_csv(matches_csv_path)

In [10]:
# Clean the loaded frame in one non-inplace chain:
#   1. drop the first column (the saved index column),
#   2. drop every row that contains a Null / NaN value,
#   3. recode the 'win' column from 1/0 to True/False.
#
# The recode is assigned back explicitly. Calling
# `df['win'].replace(..., inplace=True)` acts on a selection of the frame and,
# under pandas copy-on-write, leaves `df` itself unchanged.
# `infer_objects()` restores a bool dtype on pandas versions where `replace`
# no longer downcasts the resulting object column automatically.
#
# NOTE: the first column is dropped by position, so running this cell again on
# the already-cleaned `df` would remove another column. Reload `df` first.
df = (
    df
    .drop(columns=df.columns[0])
    .dropna()
    .assign(win=lambda frame: frame['win'].replace({1: True, 0: False}).infer_objects())
)


In [11]:
# Every column except the last is used as a feature; the last column is the label.
features = df.iloc[:, :-1]
target = df.iloc[:, -1]

# Hold out 40% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.4,
    random_state=0,
)

# Report the size of each partition and their total as a sanity check.
n_train = len(X_train)
n_test = len(X_test)
print(f"Training data length: {n_train}")
print(f"Test data lenght: {n_test}")
print(f"Sum: {n_test + n_train}")

Training data length: 93806
Test data lenght: 62538
Sum: 156344


In [12]:
# Candidate classifiers to benchmark on the same train/test split.
# Every estimator that accepts a seed gets random_state=0 (matching the split
# and LogisticRegression), so repeated runs report the same accuracies.
models = [
    GaussianNB(),
    tree.DecisionTreeClassifier(min_samples_split=10, criterion='entropy', random_state=0),
    RandomForestClassifier(random_state=0),
    LogisticRegression(solver='liblinear', max_iter=500, random_state=0)
]

# One slot per model, in the same order as `models`. Sized from the list so
# adding or removing a model cannot cause an IndexError or leave stale slots.
model_times = [0.0] * len(models)
model_accuracies = [0.0] * len(models)

for index, model in enumerate(models):
    print(type(model).__name__)
    start_time = time()

    # The timed section covers both fitting and predicting on the test set.
    model.fit(X_train, y_train)
    predicted = model.predict(X_test)

    # Elapsed wall-clock seconds for fit + predict.
    model_time = time() - start_time
    model_times[index] = model_time

    # Fraction of test rows whose predicted label equals the true label.
    acc = accuracy_score(y_test, predicted)
    model_accuracies[index] = acc

    print(f'Test accuracy score: {acc}')
    print(f'Time: {model_time}')

GaussianNB
Test accuracy score: 0.7788064856567207
Time: 1.0881574153900146
DecisionTreeClassifier
Test accuracy score: 0.7668777383350923
Time: 40.487165451049805
RandomForestClassifier
Test accuracy score: 0.8138411845597876
Time: 131.50372552871704
LogisticRegression
Test accuracy score: 0.8227477693562314
Time: 365.98045802116394
