In [79]:
from typing import List
from collections import Counter
import math as m
import numpy as np
import random as rd

In [80]:
def majority_vote(labels: List[str]) -> str:
    """Assumes that labels are ordered from nearest to farthest."""
    vote_counts = Counter(labels)
    winner, winner_count = vote_counts.most_common(1)[0]
    num_winners = len(
        [count for count in vote_counts.values() if count == winner_count])

    if num_winners == 1:
        return winner                     # unique winner, so return it
    else:
        return majority_vote(labels[:-1])  # try again without the farthest


In [81]:
Vector = List[float]

def subtract(v: Vector, w: Vector) -> Vector:
    """Subtracts corresponding elements"""    
    return [v_i - w_i for v_i, w_i in zip(v, w)]

def dot(v: Vector, w: Vector) -> float:
    """Computes v_1 * w_1 + ... + v_n * w_n"""
    return sum(v_i * w_i for v_i, w_i in zip(v, w))

def sum_of_squares(v: Vector) -> float:
    """Returns v_1 * v_1 + ... + v_n * v_n"""
    return dot(v, v)

def squared_distance(v: Vector, w: Vector) -> float:
    """Computes (v_1 - w_1) ** 2 + ... + (v_n - w_n) ** 2"""
    return sum_of_squares(subtract(v, w))


In [82]:
# Distance
def distance(v: Vector, w: Vector) -> float:
    """Computes the distance between v and w"""
    return m.sqrt(squared_distance(v, w))

In [83]:
def knn_classifier(k: int, X_train: list, X_test: list, x_unknown: list) -> str:
    distances = [distance(x_unknown, sample) for sample in X_train]
    
    nearest_neighbor = np.argsort(distances)
    
    labels = [X_test[i] for i in nearest_neighbor[:k]]

    return majority_vote(labels)

In [84]:
# Create function reading dataset.
def read_data(file_path: str) -> list:
    data = []
    with open(file_path, "r") as file:
        lines = file.readlines()
        lines = lines[1:]
        for line in lines:
            values = line.strip().split()
            data.append([float(val) for val in values])
    return data


In [85]:
# Define function train_test_split to split the data and labels.
def train_test_split(dataset:list)->list:
    data = [d[:-1] for d in dataset]
    labels = [int(label[-1]) for label in dataset]
    return data, labels

In [86]:
def hold_out(train, test, num_train = 80):
    length = len(train)
    d = length*num_train // 100
    
    X_train = train[:d]
    X_test = test[:d]

    y_train = train[d:]
    y_test = test[d:]

    return X_train, X_test, y_train, y_test

In [87]:
# สร้างฟังก์ชันที่ทำ Cross Validation
def cross_validation(X, y, k_folds, k):
    fold_size = len(X) // k_folds
    accuracy_scores = []
    
    for i in range(k_folds):
        start, end = i * fold_size, (i + 1) * fold_size
        X_val_fold = X[start:end]
        y_val_fold = y[start:end]
        
        X_train_fold = np.concatenate((X[:start], X[end:]), axis=0)
        y_train_fold = np.concatenate((y[:start], y[end:]), axis=0)

        correct_predictions = 0
        for j in range(len(X_val_fold)):
            predicted_label = knn_classifier(k, X_train_fold, y_train_fold, X_val_fold[j])
            if predicted_label == y_val_fold[j]:
                correct_predictions += 1

        accuracy = correct_predictions / len(X_val_fold)
        accuracy_scores.append(accuracy)

    avg_accuracy = np.mean(accuracy_scores)
    return avg_accuracy

In [88]:
def fisher_yates_shuffle(arr):
    # rd.seed(0)
    rd.shuffle(arr)

## MAIN CODE

In [89]:
data = read_data("iris.pat")

In [90]:
fisher_yates_shuffle(data)

In [91]:
X, y = train_test_split(data)
k_folds = 10

In [92]:
k_dict = {}
X_train, X_test, Y_train, Y_test = hold_out(X,y)

for k in range(1,20,1):
    avg_accuracy = cross_validation(np.array(X_train), np.array(X_test), k_folds, k)
    k_dict[k] = avg_accuracy
    print(f"k = {k}, Average Accuracy: {avg_accuracy:.4f}")

max_k = max(k_dict.values())
foo = list(k_dict.values())
index = list(k_dict.keys())
max_k = index[foo.index(max_k)]

print(max_k)

k = 1, Average Accuracy: 0.9667
k = 2, Average Accuracy: 0.9667
k = 3, Average Accuracy: 0.9500
k = 4, Average Accuracy: 0.9500
k = 5, Average Accuracy: 0.9500
k = 6, Average Accuracy: 0.9500
k = 7, Average Accuracy: 0.9417
k = 8, Average Accuracy: 0.9417
k = 9, Average Accuracy: 0.9333
k = 10, Average Accuracy: 0.9333
k = 11, Average Accuracy: 0.9500
k = 12, Average Accuracy: 0.9500
k = 13, Average Accuracy: 0.9417
k = 14, Average Accuracy: 0.9417
k = 15, Average Accuracy: 0.9500
k = 16, Average Accuracy: 0.9500
k = 17, Average Accuracy: 0.9417
k = 18, Average Accuracy: 0.9417
k = 19, Average Accuracy: 0.9417
1


In [93]:
cross_validation(np.array(Y_train), np.array(Y_test), k_folds, k=max_k)

0.9333333333333332