In [4]:
import numpy as np
import os
from PIL import Image
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

def load_images_from_folder(folder):
    """Load a labelled image dataset stored as one subfolder per class.

    Every regular, non-hidden file found directly inside each class subfolder
    is opened with PIL, converted to grayscale (mode 'L') and flattened into a
    1-D pixel vector.

    Parameters
    ----------
    folder : str
        Path to the dataset root. Each non-hidden subdirectory is one class.

    Returns
    -------
    (images, labels) : tuple of np.ndarray
        ``images`` stacks the flattened pixel vectors, one row per image.
        ``labels`` holds, for each row, the integer index of its class
        subfolder in alphabetically sorted order.

    Notes
    -----
    Images are not resized here, so a clean 2-D ``images`` array presumably
    requires all files to share the same dimensions -- confirm for your data.
    Files that PIL cannot decode still raise; they are not silently dropped.
    """
    images = []
    labels = []

    # os.listdir returns entries in arbitrary order, so sort to keep the
    # label <-> class mapping (and the sample order) reproducible across
    # machines. Hidden entries (e.g. .ipynb_checkpoints, .DS_Store) and stray
    # top-level files are not classes and are skipped.
    class_dirs = sorted(
        entry for entry in os.listdir(folder)
        if not entry.startswith('.')
        and os.path.isdir(os.path.join(folder, entry))
    )

    for label, subfolder in enumerate(class_dirs):
        subfolder_path = os.path.join(folder, subfolder)
        for filename in sorted(os.listdir(subfolder_path)):
            img_path = os.path.join(subfolder_path, filename)
            # Only regular, non-hidden files are candidate images.
            if filename.startswith('.') or not os.path.isfile(img_path):
                continue
            with Image.open(img_path) as img:
                img_gray = img.convert('L')
                images.append(np.array(img_gray).flatten())
                labels.append(label)
    return np.array(images), np.array(labels)

# Load the training set from the 'train' directory (relative to the working
# directory): one row of flattened grayscale pixels per image, plus the integer
# label of the class subfolder each image came from.
train_images, train_labels = load_images_from_folder('train')


In [5]:
# k-NN Implementation
def euclidean_distance(x1, x2):
    """Return the Euclidean (L2) distance between two equally shaped arrays.

    Both inputs are converted to float64 before subtracting. Without this,
    unsigned integer inputs such as uint8 pixel vectors wrap around on
    subtraction (0 - 255 becomes 1) and overflow again when squared, which
    yields a wrong distance.

    Parameters
    ----------
    x1, x2 : array_like
        Numeric values of matching (or broadcastable) shape.

    Returns
    -------
    np.float64
        Square root of the sum of squared element-wise differences.
    """
    diff = np.asarray(x1, dtype=np.float64) - np.asarray(x2, dtype=np.float64)
    return np.sqrt(np.sum(diff ** 2))

def manhattan_distance(x1, x2):
    """Return the Manhattan (L1) distance between two equally shaped arrays.

    Both inputs are converted to float64 before subtracting. Without this,
    unsigned integer inputs such as uint8 pixel vectors wrap around on
    subtraction (10 - 20 becomes 246), which yields a wrong distance.

    Parameters
    ----------
    x1, x2 : array_like
        Numeric values of matching (or broadcastable) shape.

    Returns
    -------
    np.float64
        Sum of the absolute element-wise differences.
    """
    diff = np.asarray(x1, dtype=np.float64) - np.asarray(x2, dtype=np.float64)
    return np.sum(np.abs(diff))

def knn_predict(X_train, y_train, X_test, k=3, distance='l2'):
    """Classify each test sample by majority vote among its k nearest neighbours.

    Parameters
    ----------
    X_train : array_like
        Training samples, one per leading index. Samples with more than one
        axis are flattened before distances are computed.
    y_train : sequence
        Label of each training sample, indexable by integer position. Labels
        must be hashable.
    X_test : iterable of array_like
        Samples to classify, each with as many values as a training sample.
    k : int, default 3
        Number of neighbours that vote. If k exceeds the number of training
        samples, all of them vote.
    distance : {'l2', 'l1'}, default 'l2'
        'l2' for Euclidean distance, 'l1' for Manhattan distance.

    Returns
    -------
    list
        One predicted label per test sample, in input order.

    Raises
    ------
    ValueError
        If ``distance`` is not 'l1' or 'l2', if ``k`` is smaller than 1, or if
        ``X_train`` is empty.

    Notes
    -----
    Vote ties are resolved in favour of the tied label whose closest
    representative is nearest to the test sample. Equal distances are ordered
    by training index, so results are deterministic.
    """
    if distance not in ('l1', 'l2'):
        raise ValueError(
            "Unknown distance {!r}; expected 'l1' or 'l2'".format(distance)
        )
    if k < 1:
        raise ValueError("k must be a positive integer, got {!r}".format(k))

    # Work in float64: unsigned integer pixel data (uint8) would otherwise
    # wrap around on subtraction and produce wrong distances.
    train = np.asarray(X_train, dtype=np.float64)
    if len(train) == 0:
        raise ValueError("X_train must contain at least one sample")
    train = train.reshape(len(train), -1)

    predictions = []
    for x_test in X_test:
        # One vectorised pass over all training rows per test sample.
        diff = train - np.asarray(x_test, dtype=np.float64).reshape(1, -1)
        if distance == 'l2':
            distances = np.sqrt(np.sum(diff ** 2, axis=1))
        else:
            distances = np.sum(np.abs(diff), axis=1)

        # Stable sort keeps equal distances in training-index order.
        k_indices = np.argsort(distances, kind='stable')[:k]

        # Count votes in nearest-first order; dicts keep insertion order and
        # max() returns the first maximum, so ties go to the nearest label.
        votes = {}
        for i in k_indices:
            label = y_train[i]
            votes[label] = votes.get(label, 0) + 1
        predictions.append(max(votes, key=votes.get))
    return predictions


In [None]:
# 5-Fold Cross Validation Setup
distance_metrics = ['l1', 'l2']
k_values = [1, 3, 5, 7, 9]
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# accuracy_scores[metric][k] collects one accuracy value per fold.
accuracy_scores = {}
for metric in distance_metrics:
    accuracy_scores[metric] = {k: [] for k in k_values}

for train_index, test_index in kf.split(train_images):
    # Slice out this fold's training and held-out portions.
    X_train = train_images[train_index]
    y_train = train_labels[train_index]
    X_test = train_images[test_index]
    y_test = train_labels[test_index]

    # Score every (distance metric, k) combination on the held-out fold.
    for distance in distance_metrics:
        for k in k_values:
            y_pred = knn_predict(X_train, y_train, X_test, k=k, distance=distance)
            accuracy = accuracy_score(y_test, y_pred)
            accuracy_scores[distance][k].append(accuracy)

# Calculate the average accuracy for each combination of k and distance metric
average_accuracies = {}
for metric, ks in accuracy_scores.items():
    average_accuracies[metric] = {k: np.mean(scores) for k, scores in ks.items()}
