In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.metrics import classification_report, roc_auc_score, roc_curve
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [2]:
def euclidean_distance(x1, x2):
    return np.sqrt(np.sum((x1 - x2) ** 2))

In [3]:
class KNN:
    def __init__(self, k=5):
        self.k = k

    def fit(self, X_train, y_train):
        self.X_train = X_train
        self.y_train = y_train

    def predict(self, x):
        distances = [euclidean_distance(x, x_train) for x_train in self.X_train]
        k_indices = np.argsort(distances)[:self.k]
        k_nearest_labels = [self.y_train[i] for i in k_indices]
        most_common = np.bincount(k_nearest_labels).argmax()
        return most_common
    
    def predict_proba(self, x):
        distances = [euclidean_distance(x, x_train) for x_train in self.X_train]
        k_indices = np.argsort(distances)[:self.k]
        k_nearest_labels = [self.y_train[i] for i in k_indices]
        probs = np.bincount(k_nearest_labels, minlength=np.max(self.y_train)+1) / self.k
        return probs

In [17]:
df = pd.read_csv('../data/match_data.csv')
X = df.drop(columns=['Date', 'Team1', 'Team2', 'Score'])
y = df['Score']
y = y.to_numpy()
X = StandardScaler().fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42, stratify=y)
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.5, random_state=42, stratify=y_test)

In [20]:
df = pd.read_csv('../data/cleaned_rounds_data.csv')
df = df.drop(columns=[f'player_{i}_{suffix}' for i in range(1, 11) for suffix in ['team_name', 'name']])
X = df.drop(['round_winner'], axis=1)
y = df['round_winner']
y = y.to_numpy()
X = StandardScaler().fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42, stratify=y)
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.5, random_state=42, stratify=y_test)

In [4]:
df = pd.read_csv('../data/cleaned_rounds_data_with_stats.csv')
df = df.drop(columns=[f'player_{i}_{suffix}' for i in range(1, 11) for suffix in ['team_name', 'name']])
X = df.drop(columns=['round_winner'])
y = df['round_winner']
y = y.to_numpy()
X = StandardScaler().fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42, stratify=y)
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.5, random_state=42, stratify=y_test)

In [16]:
K = [5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 150, 200, 250, 300, 350, 400, 450, 500]

In [21]:
best_k = 0
best_accuracy = 0
for k in K:
    knn = KNN(k=k)
    knn.fit(X_train, y_train)
    y_pred = [knn.predict(x) for x in X_val]
    print(f'K: {k}, Accuracy: {metrics.accuracy_score(y_val, y_pred)}')
    if metrics.accuracy_score(y_val, y_pred) > best_accuracy:
        best_accuracy = metrics.accuracy_score(y_val, y_pred)
        best_k = k

knn = KNN(k=best_k)
knn.fit(X_train, y_train)
y_pred = [knn.predict(x) for x in X_test]
print(f'Best K: {best_k}, Accuracy: {metrics.accuracy_score(y_test, y_pred)}')

K: 5, Accuracy: 0.688622754491018
K: 10, Accuracy: 0.7095808383233533
K: 20, Accuracy: 0.7198460222412318
K: 30, Accuracy: 0.7301112061591104
K: 40, Accuracy: 0.7395209580838323
K: 50, Accuracy: 0.7446535500427716
K: 60, Accuracy: 0.7489307100085543
K: 70, Accuracy: 0.7532078699743371
K: 80, Accuracy: 0.7557741659538066
K: 90, Accuracy: 0.7527801539777588
K: 100, Accuracy: 0.7510692899914457
K: 150, Accuracy: 0.7549187339606501
K: 200, Accuracy: 0.7591958939264328
K: 250, Accuracy: 0.7557741659538066
K: 300, Accuracy: 0.7544910179640718
K: 350, Accuracy: 0.7557741659538066
K: 400, Accuracy: 0.7467921300256629
K: 450, Accuracy: 0.7476475620188195
K: 500, Accuracy: 0.7455089820359282


In [22]:
knn = KNN(k=best_k)
knn.fit(X_train, y_train)
y_pred = [knn.predict(x) for x in X_test]
print(f'Best K: {best_k}, Accuracy: {metrics.accuracy_score(y_test, y_pred)}')

Best K: 200, Accuracy: 0.7697903294822422


In [19]:
print(f'Best K: {best_k}, Best Accuracy: {best_accuracy}')

Best K: 5, Best Accuracy: 0.5949119373776908


FOR MATCH DATA BEST ACCURACY WAS FOUND FOR 250 NEIGHBORS
Best K: 5, Best Accuracy: 0.5949119373776908

FOR ROUNDS DATA BEST ACCURACY WAS FOUND FOR 250 NEIGHBORS

FOR ROUNDS DATA WITH STATS BEST ACCURACY WAS FOUND FOR 250 NEIGHBORS
Best K: 250, Best Accuracy: 0.7497861420017109