In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from time import time

In [2]:
def euclidean_distance(x1, x2):
    return np.sqrt(np.sum((x1 - x2) ** 2))

In [5]:
class KNN:
    def __init__(self, k=5):
        self.k = k

    def fit(self, X_train, y_train):
        self.X_train = X_train
        self.y_train = y_train

    def predict(self, x):
        distances = [euclidean_distance(x, x_train) for x_train in self.X_train]
        k_indices = np.argsort(distances)[:self.k]
        k_nearest_labels = [self.y_train[i] for i in k_indices]
        most_common = np.bincount(k_nearest_labels).argmax()
        return most_common

In [3]:
df1 = pd.read_csv('../data/match_data.csv')
X1 = df1.drop(columns=['Date', 'Team1', 'Team2', 'Score'])
y1 = df1['Score']
ds1_name = "match_data"

df2 = pd.read_csv('../data/cleaned_rounds_data.csv')
df2 = df2.drop(columns=[f'player_{i}_{suffix}' for i in range(1, 11) for suffix in ['team_name', 'name']])
X2 = df2.drop(['round_winner'], axis=1)
y2 = df2['round_winner']
ds2_name = "rounds_data"

df3 = pd.read_csv('../data/cleaned_rounds_data_with_stats.csv')
df3 = df3.drop(columns=[f'player_{i}_{suffix}' for i in range(1, 11) for suffix in ['team_name', 'name']])
X3 = df3.drop(columns=['round_winner'])
y3 = df3['round_winner']
ds3_name = "rounds_data_with_stats"

datasets = [(X1, y1, ds1_name), (X2, y2, ds2_name), (X3, y3, ds3_name)]

In [7]:
K = [5, 7, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60]
best_params = []
for X, y, ds_name in datasets:
    y = y.to_numpy()
    X = StandardScaler().fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42, stratify=y)
    X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.5, random_state=42, stratify=y_test)
    best_k = 0
    best_acc = 0
    start_time = time()
    for k in K:
        knn = KNN(k=k)
        knn.fit(X_train, y_train)
        y_pred = [knn.predict(x) for x in X_val]
        acc = accuracy_score(y_val, y_pred)
        if acc - best_acc > 0.01:
            print(f'For dataset: {ds_name}, New best accuracy: {acc}, K: {k}')
            best_acc = acc
            best_k = k

    knn = KNN(k=best_k)
    knn.fit(X_train, y_train)
    y_pred = [knn.predict(x) for x in X_test]
    acc = accuracy_score(y_test, y_pred)
    end_time = time()
    best_params.append((ds_name, best_k, acc))
    print(f'For dataset: {ds_name}, Best K: {best_k}, Accuracy: {acc}, Duration: {end_time - start_time}')


For dataset: match_data, New best accuracy: 0.5949119373776908, K: 5
For dataset: match_data, Best K: 5, Accuracy: 0.5557729941291585, Duration: 50.23921465873718
For dataset: rounds_data, New best accuracy: 0.688622754491018, K: 5
For dataset: rounds_data, New best accuracy: 0.6988879384088965, K: 7
For dataset: rounds_data, New best accuracy: 0.7095808383233533, K: 10
For dataset: rounds_data, New best accuracy: 0.7198460222412318, K: 20
For dataset: rounds_data, New best accuracy: 0.7301112061591104, K: 30
For dataset: rounds_data, New best accuracy: 0.7425149700598802, K: 45
For dataset: rounds_data, Best K: 45, Accuracy: 0.7539580658964484, Duration: 1230.824372291565
For dataset: rounds_data_with_stats, New best accuracy: 0.6723695466210436, K: 5
For dataset: rounds_data_with_stats, New best accuracy: 0.6864841745081266, K: 7
For dataset: rounds_data_with_stats, New best accuracy: 0.7005988023952096, K: 10
For dataset: rounds_data_with_stats, New best accuracy: 0.7108639863130881

In [8]:
for ds_name, best_k, accuracy in best_params:
    print(f'For dataset: {ds_name}, Best K: {best_k}, Accuracy: {accuracy}')

For dataset: match_data, Best K: 5, Accuracy: 0.5557729941291585
For dataset: rounds_data, Best K: 45, Accuracy: 0.7539580658964484
For dataset: rounds_data_with_stats, Best K: 40, Accuracy: 0.7458279845956355
