### Import Libraries

In [31]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from loguru import logger

In [32]:
# Read the training and test tables from the working directory.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# The "label" column holds the targets; every other column is a feature.
y = train["label"].to_numpy()
X = train.drop(columns="label").to_numpy().astype(np.float32)
X_test = test.to_numpy().astype(np.float32)

# Divide by 255 in place. astype() returned fresh copies above, so the
# DataFrames themselves are left untouched.
X_test /= 255.0
X /= 255.0

# Stratified 85/15 train/validation split with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.15,
    stratify=y,
    random_state=42,
)

### PCA

In [33]:
# Reduce the features to 100 principal components.
# The projection is fitted on the training split only and then applied
# unchanged to the validation and test features.
# random_state is pinned because, depending on the scikit-learn version and
# the data shape, svd_solver="auto" may select the randomized solver, whose
# result would otherwise differ from run to run.
pca = PCA(n_components=100, random_state=42)
X_train_pca = pca.fit_transform(X_train)
X_val_pca = pca.transform(X_val)
X_test_pca = pca.transform(X_test)

### Tuning K

In [34]:
# Try a handful of neighbour counts and keep the one with the highest
# validation accuracy. The comparison is strict, so on a tie the earlier
# (smaller) k is retained.
best_k, best_acc = None, 0
for k in (1, 3, 5, 7, 11):
    # Distance-weighted k-NN fitted on the PCA-reduced training split.
    clf = KNeighborsClassifier(weights='distance', n_neighbors=k).fit(X_train_pca, y_train)
    yv = clf.predict(X_val_pca)
    acc = accuracy_score(y_val, yv)
    print("k =", k, "val acc =", acc)
    if not acc > best_acc:
        continue
    best_k, best_acc = k, acc

print("Best k:", best_k)

k = 1 val acc = 0.9693650793650793
k = 3 val acc = 0.9747619047619047
k = 5 val acc = 0.9739682539682539
k = 7 val acc = 0.9720634920634921
k = 11 val acc = 0.9687301587301588
Best k: 3


In [35]:
# Refit the whole pipeline on all labelled rows (X, y) with the k selected on
# the validation split, then predict the test set and write the submission.
#
# X and X_test were already divided by 255 in place when they were loaded, so
# they are used as-is here. Dividing again would put the final model on a
# different feature scale from the pipeline that was validated.
# random_state is pinned so the projection is reproducible if svd_solver="auto"
# resolves to the randomized solver.
pca_full = PCA(n_components=100, random_state=42)
X_full_pca = pca_full.fit_transform(X)
X_test_full_pca = pca_full.transform(X_test)

clf = KNeighborsClassifier(n_neighbors=best_k, weights='distance')
clf.fit(X_full_pca, y)
y_pred = clf.predict(X_test_full_pca)

# ImageId is a 1-based row counter over the test predictions.
submission = pd.DataFrame({"ImageId": np.arange(1, len(y_pred)+1), "Label": y_pred})
submission.to_csv("knn_submission.csv", index=False)