In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [3]:
def euclidean_distance(x1, x2):
    return np.sqrt(np.sum((x1 - x2) ** 2))

In [4]:
class KNN:
    def __init__(self, k=5):
        self.k = k

    def fit(self, X_train, y_train):
        self.X_train = X_train
        self.y_train = y_train

    def predict(self, x):
        distances = [euclidean_distance(x, x_train) for x_train in self.X_train]
        k_indices = np.argsort(distances)[:self.k]
        k_nearest_labels = [self.y_train[i] for i in k_indices]
        most_common = np.bincount(k_nearest_labels).argmax()
        return most_common

In [7]:
df1 = pd.read_csv('../data/match_data.csv')
X1 = df1.drop(columns=['Date', 'Team1', 'Team2', 'Score'])
y1 = df1['Score']
ds1_name = "match_data"

df2 = pd.read_csv('../data/cleaned_rounds_data.csv')
df2 = df2.drop(columns=[f'player_{i}_{suffix}' for i in range(1, 11) for suffix in ['team_name', 'name']])
X2 = df2.drop(['round_winner'], axis=1)
y2 = df2['round_winner']
ds2_name = "rounds_data"

df3 = pd.read_csv('../data/cleaned_rounds_data_with_stats.csv')
df3 = df3.drop(columns=[f'player_{i}_{suffix}' for i in range(1, 11) for suffix in ['team_name', 'name']])
X3 = df3.drop(columns=['round_winner'])
y3 = df3['round_winner']
ds3_name = "rounds_data_with_stats"

datasets = [(X1, y1, ds1_name), (X2, y2, ds2_name), (X3, y3, ds3_name)]

In [8]:
K = [5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 150, 200, 250, 300, 350, 400, 450, 500]
best_params = []
for X, y, ds_name in datasets:
    y = y.to_numpy()
    X = StandardScaler().fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42, stratify=y)
    X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.5, random_state=42, stratify=y_test)
    best_k = 0
    best_accuracy = 0
    for k in K:
        knn = KNN(k=k)
        knn.fit(X_train, y_train)
        y_pred = [knn.predict(x) for x in X_val]
        print(f'For dataset: {ds_name}, K: {k}, Accuracy: {metrics.accuracy_score(y_val, y_pred)}')
        if metrics.accuracy_score(y_val, y_pred) > best_accuracy:
            best_accuracy = metrics.accuracy_score(y_val, y_pred)
            best_k = k

    knn = KNN(k=best_k)
    knn.fit(X_train, y_train)
    y_pred = [knn.predict(x) for x in X_test]
    best_params.append((ds_name, best_k, metrics.accuracy_score(y_test, y_pred)))
    print(f'For dataset: {ds_name}, Best K: {best_k}, Accuracy: {metrics.accuracy_score(y_test, y_pred)}')


For dataset: match_data, K: 5, Accuracy: 0.5949119373776908
For dataset: match_data, K: 10, Accuracy: 0.5557729941291585
For dataset: match_data, K: 20, Accuracy: 0.5753424657534246
For dataset: match_data, K: 30, Accuracy: 0.5772994129158513
For dataset: match_data, K: 40, Accuracy: 0.5831702544031311
For dataset: match_data, K: 50, Accuracy: 0.5518590998043053
For dataset: match_data, K: 60, Accuracy: 0.5557729941291585
For dataset: match_data, K: 70, Accuracy: 0.5518590998043053
For dataset: match_data, K: 80, Accuracy: 0.5616438356164384
For dataset: match_data, K: 90, Accuracy: 0.5538160469667319
For dataset: match_data, K: 100, Accuracy: 0.5675146771037182
For dataset: match_data, K: 150, Accuracy: 0.5596868884540117
For dataset: match_data, K: 200, Accuracy: 0.5675146771037182
For dataset: match_data, K: 250, Accuracy: 0.5577299412915852
For dataset: match_data, K: 300, Accuracy: 0.547945205479452
For dataset: match_data, K: 350, Accuracy: 0.5557729941291585
For dataset: match_d

In [10]:
for ds_name, best_k, accuracy in best_params:
    print(f'For dataset: {ds_name}, Best K: {best_k}, Accuracy: {accuracy}')

For dataset: match_data, Best K: 5, Accuracy: 0.5557729941291585
For dataset: rounds_data, Best K: 200, Accuracy: 0.7697903294822422
For dataset: rounds_data_with_stats, Best K: 250, Accuracy: 0.7603765511339324
