In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt


class KNN:
    """Brute-force k-nearest-neighbours classifier.

    Supports two metrics:

    * ``"euclidean"`` - straight-line distance.
    * ``"cosine"`` - cosine distance, i.e. ``1 - cosine_similarity``.

    Prediction is a majority vote over the ``k`` closest training samples.
    When several labels tie, the smallest label (in sorted order) wins.
    """

    def __init__(self, k, metric="euclidean"):
        """Store the hyper-parameters.

        Args:
            k: Number of neighbours that take part in the vote (>= 1).
            metric: ``"euclidean"`` or ``"cosine"``. The value is validated
                lazily in :meth:`predict`.

        Raises:
            ValueError: If ``k`` is smaller than 1.
        """
        if k < 1:
            raise ValueError("k must be a positive integer.")
        self.k = k
        self.metric = metric

    def fit(self, X_train, y_train):
        """Memorise the training set (k-NN has no real training step).

        Args:
            X_train: Array-like of shape (n_samples, n_features).
            y_train: Array-like of n_samples labels.

        Raises:
            ValueError: If the number of samples and labels differ.
        """
        # Coerce to arrays so positional indexing in predict() is reliable
        # even when the caller passes lists or pandas objects.
        X_train = np.asarray(X_train, dtype=float)
        y_train = np.asarray(y_train)
        if len(X_train) != len(y_train):
            raise ValueError("X_train and y_train must have the same length.")
        self.X_train = X_train
        self.y_train = y_train

    def predict(self, X_test):
        """Predict a label for every row of ``X_test``.

        Args:
            X_test: Array-like of shape (n_queries, n_features).

        Returns:
            np.ndarray with one predicted label per query row.

        Raises:
            ValueError: If ``self.metric`` is not a supported metric.
        """
        X_test = np.asarray(X_test, dtype=float)
        if self.metric == "euclidean":
            distances = self.euclidean_distance(X_test)
        elif self.metric == "cosine":
            # Similarity is "larger is closer"; turn it into a distance so the
            # ascending argsort below really selects the nearest neighbours.
            distances = 1.0 - self.cosine_similarity(X_test)
        else:
            raise ValueError("Invalid metric. Choose from 'euclidean' or 'cosine'.")

        y_pred = []
        for distance in distances:
            nearest_neighbors = np.argsort(distance)[: self.k]
            nearest_labels = self.y_train[nearest_neighbors]
            # np.unique works for any label dtype (not only non-negative ints)
            # and returns labels sorted, so argmax breaks ties towards the
            # smallest label.
            labels, counts = np.unique(nearest_labels, return_counts=True)
            y_pred.append(labels[np.argmax(counts)])
        return np.array(y_pred)

    def euclidean_distance(self, X_test):
        """Return the (n_queries, n_train) matrix of Euclidean distances."""
        X_train = np.asarray(self.X_train, dtype=float)
        X_test = np.asarray(X_test, dtype=float)
        distances = [
            np.sqrt(np.sum((X_train - x) ** 2, axis=1)) for x in X_test
        ]
        return np.array(distances)

    def cosine_similarity(self, X_test):
        """Return the (n_queries, n_train) matrix of cosine similarities.

        Pairs that involve a zero-norm vector get similarity 0 instead of
        NaN/inf, so they never poison the neighbour ranking.
        """
        X_train = np.asarray(self.X_train, dtype=float)
        X_test = np.asarray(X_test, dtype=float)
        # Training norms do not depend on the query; compute them once.
        train_norms = np.linalg.norm(X_train, axis=1)

        similarities = []
        for x in X_test:
            dot_product = np.dot(X_train, x)
            norm_product = train_norms * np.linalg.norm(x)
            cosine_sim = np.divide(
                dot_product,
                norm_product,
                out=np.zeros_like(dot_product),
                where=norm_product > 0,
            )
            similarities.append(cosine_sim)
        return np.array(similarities)

In [None]:
# Load the dataset: every column but the last is a feature, the last column
# is the class label.
data = pd.read_csv("Q3.csv", header=None)
X = data.iloc[:, :-1].values
y = data.iloc[:, -1].values

# (train, validation, test) fractions to compare.
splits = [(0.8, 0.1, 0.1), (0.34, 0.33, 0.33), (0.25, 0.25, 0.5)]

k_values = [1, 3, 5, 11]
# NOTE(review): errors_dict is never filled in this cell; kept in case a
# later cell relies on the name.
errors_dict = {"0.8/0.1/0.1": [], "0.34/0.33/0.33": [], "0.25/0.25/0.5": []}

# Validation errors, appended in (split, k) order: all k values for the first
# split, then all k values for the second split, and so on.
custom_euclidean_errors = []
custom_cosine_errors = []
sklearn_euclidean_errors = []
sklearn_cosine_errors = []

for train_size, val_size, test_size in splits:
    # The split depends only on the ratio (random_state is fixed), so build it
    # once per ratio instead of once per k.
    X_train, X_temp, y_train, y_temp = train_test_split(
        X, y, train_size=train_size, random_state=42
    )
    # Carve the held-out remainder into validation and test parts; test_size
    # is expressed relative to that remainder.
    X_val, X_test, y_val, y_test = train_test_split(
        X_temp,
        y_temp,
        test_size=test_size / (val_size + test_size),
        random_state=42,
    )

    for k_value in k_values:
        # Custom implementation, Euclidean metric.
        knn_euclidean = KNN(k=k_value, metric="euclidean")
        knn_euclidean.fit(X_train, y_train)
        y_pred_euclidean_custom = knn_euclidean.predict(X_val)
        val_error_custom = 1 - accuracy_score(y_val, y_pred_euclidean_custom)
        custom_euclidean_errors.append(val_error_custom)

        # Custom implementation, cosine metric.
        knn_cosine = KNN(k=k_value, metric="cosine")
        knn_cosine.fit(X_train, y_train)
        y_pred_cosine_custom = knn_cosine.predict(X_val)
        val_error_cosine_custom = 1 - accuracy_score(y_val, y_pred_cosine_custom)
        custom_cosine_errors.append(val_error_cosine_custom)

        # scikit-learn reference, Euclidean metric.
        knn_sklearn_euclidean = KNeighborsClassifier(
            n_neighbors=k_value, metric="euclidean"
        )
        knn_sklearn_euclidean.fit(X_train, y_train)
        y_pred_euclidean_sklearn = knn_sklearn_euclidean.predict(X_val)
        val_error_euclidean_sklearn = 1 - accuracy_score(
            y_val, y_pred_euclidean_sklearn
        )
        sklearn_euclidean_errors.append(val_error_euclidean_sklearn)

        # scikit-learn reference, cosine metric.
        knn_sklearn_cosine = KNeighborsClassifier(n_neighbors=k_value, metric="cosine")
        knn_sklearn_cosine.fit(X_train, y_train)
        y_pred_cosine_sklearn = knn_sklearn_cosine.predict(X_val)
        val_error_cosine_sklearn = 1 - accuracy_score(y_val, y_pred_cosine_sklearn)
        sklearn_cosine_errors.append(val_error_cosine_sklearn)

        # Report both metrics with the same precision so they are comparable.
        print(
            f"Validation error for k={k_value} and train/val/test ratio={train_size}/{val_size}/{test_size}: (Sklearn KNN, Euclidean) {round(val_error_euclidean_sklearn, 4)}"
        )
        print(
            f"Validation error for k={k_value} and train/val/test ratio={train_size}/{val_size}/{test_size}: (Sklearn KNN, Cosine) {round(val_error_cosine_sklearn, 4)}"
        )
        print()
    print()

The train/validation/test split of 80/10/10 yielded the best performance among the provided ratios for several reasons. Primarily, allocating 80% of the dataset for training ensures that the model has ample data to learn from, facilitating better generalization and predictive performance. With a larger training set, the model can capture more diverse patterns and relationships within the data, resulting in a more robust learned representation. While the validation set comprises only 10% of the data, it still offers a sufficient sample size for evaluating the model's performance and tuning hyperparameters effectively. This balance between training and validation data enables thorough model assessment without sacrificing substantial amounts of training data. Furthermore, the 10% test set provides a reliable measure of the model's generalization performance on unseen data while remaining manageable in size for efficient evaluation. By having a larger training set and separate validation and test sets, the 80/10/10 split mitigates the risk of overfitting, as the model is less likely to memorize noise in the data and instead learns more meaningful patterns. Overall, the 80/10/10 split strikes a favorable balance between data allocation for training, validation, and testing, leading to optimal model performance.


In [None]:
# One tick label per (split, k) combination, in the same order in which the
# error lists were filled: all k values for the first split, then the next.
labels = [
    "k=1 split=0.8/0.1/0.1",
    "k=3 split=0.8/0.1/0.1",
    "k=5 split=0.8/0.1/0.1",
    "k=11 split=0.8/0.1/0.1",
    "k=1 split=0.34/0.33/0.33",
    "k=3 split=0.34/0.33/0.33",
    "k=5 split=0.34/0.33/0.33",
    "k=11 split=0.34/0.33/0.33",
    "k=1 split=0.25/0.25/0.5",
    "k=3 split=0.25/0.25/0.5",
    "k=5 split=0.25/0.25/0.5",
    "k=11 split=0.25/0.25/0.5",
]
x = np.arange(len(labels))
width = 0.2

fig, ax = plt.subplots(figsize=(15, 10))

# Four bars per group, centred on the tick: offsets -1.5, -0.5, +0.5 and +1.5
# bar widths, so that no two series are drawn on top of each other.
rects1 = ax.bar(
    x - 1.5 * width, custom_euclidean_errors, width, label="Custom Euclidean"
)
rects2 = ax.bar(
    x - 0.5 * width, sklearn_euclidean_errors, width, label="Sklearn Euclidean"
)
rects3 = ax.bar(x + 0.5 * width, custom_cosine_errors, width, label="Custom Cosine")
rects4 = ax.bar(x + 1.5 * width, sklearn_cosine_errors, width, label="Sklearn Cosine")

ax.set_ylabel("Validation Error")
ax.set_title("Validation Error by Train/Val/Test Ratio and Method")
ax.set_xticks(x, labels, rotation=45, ha="right")
ax.legend()

plt.show()

Based on the provided graphs, the combination with the largest value of k (k=11)
for the KNN and a train/val/test split of 0.25/0.25/0.5 performs the best. It
has the lowest Euclidean error (0.2031) and a reasonably low cosine error (0.396).
