In [36]:
import numpy as np
import random as rd# Only used for generating random numbers

##### Import and Read the data.

In [37]:
# Create function reading dataset.
def read_data(file_path:str)->list:
    data = []
    with open(file_path, "r") as file:
        lines = file.readlines()
        lines = lines[1:]
        for line in lines:
            values = line.strip().split()
            data.append([float(val) for val in values])
    return data

In [38]:
def most_frequent(List):
    return max(set(List), key = List.count)

In [39]:
def argSort(distance:list)->list:
    sort_index = [i for i, x in sorted(enumerate(distance), key=lambda x: x[1])]
    return sort_index

In [40]:
def fisher_yates_shuffle(arr):
    rd.shuffle(arr)

In [41]:
dataset = read_data('iris.pat')

fisher_yates_shuffle(dataset)

In [42]:
# Define function train_test_split to split the data and labels.
def train_test_split(dataset:list)->list:
    data = [d[:-1] for d in dataset]
    labels = [int(label[-1]) for label in dataset]
    return data, labels

In [43]:
X, y = train_test_split(dataset)
X = np.array(X)
y = np.array(y)

In [44]:
def hold_out(train, test, num_train = 70):
    length = len(train)
    d = length*num_train // 100
    X_train = train[:d]
    X_test = test[:d]

    y_train = train[d:]
    y_test = test[d:]

    return X_train, X_test, y_train, y_test

In [53]:
hold_out(X,y)

(array([[6.3, 3.3, 4.7, 1.6],
        [7.9, 3.8, 6.4, 2. ],
        [5.8, 2.7, 5.1, 1.9],
        [6.3, 2.5, 4.9, 1.5],
        [6.6, 3. , 4.4, 1.4],
        [5.2, 3.5, 1.5, 0.2],
        [4.4, 2.9, 1.4, 0.2],
        [5.4, 3.9, 1.7, 0.4],
        [6.7, 3. , 5. , 1.7],
        [6.5, 3.2, 5.1, 2. ],
        [5.7, 2.5, 5. , 2. ],
        [6.9, 3.1, 5.1, 2.3],
        [6.3, 2.5, 5. , 1.9],
        [6. , 3. , 4.8, 1.8],
        [5.5, 4.2, 1.4, 0.2],
        [4.4, 3.2, 1.3, 0.2],
        [6.5, 3. , 5.5, 1.8],
        [4.9, 3.1, 1.5, 0.1],
        [7.3, 2.9, 6.3, 1.8],
        [5. , 3.6, 1.4, 0.2],
        [6.3, 2.7, 4.9, 1.8],
        [5. , 3.2, 1.2, 0.2],
        [5.5, 2.4, 3.8, 1.1],
        [5.8, 2.8, 5.1, 2.4],
        [5.5, 2.6, 4.4, 1.2],
        [5.7, 4.4, 1.5, 0.4],
        [5.2, 3.4, 1.4, 0.2],
        [5.7, 2.6, 3.5, 1. ],
        [6.9, 3.1, 4.9, 1.5],
        [5.5, 2.4, 3.7, 1. ],
        [6.7, 3.1, 4.4, 1.4],
        [5.7, 3. , 4.2, 1.2],
        [6.6, 2.9, 4.6, 1.3],
        [5

#### Distance

In [46]:
# Function calculate Distance
def euclidean_distance(x1, x2):
    return np.sqrt(np.sum((x1 - x2)**2))

#### Modeling

In [47]:
# สร้างฟังก์ชัน K-NN
def k_nearest_neighbors(X_train, y_train, x_query, k):

    distances = [euclidean_distance(x_query, x) for x in X_train]

    sorted_indices = argSort(distances)

    k_indices = sorted_indices[:k]

    k_nearest_labels = [y_train[i] for i in k_indices]

    most_common = most_frequent(k_nearest_labels)

    return most_common

#### Cross validation

In [48]:
# สร้างฟังก์ชันที่ทำ Cross Validation
def cross_validation(X, y, k_folds, k):
    fold_size = len(X) // k_folds
    accuracy_scores = []
    
    for i in range(k_folds):
        start, end = i * fold_size, (i + 1) * fold_size
        X_val_fold = X[start:end]
        y_val_fold = y[start:end]
        X_train_fold = np.concatenate((X[:start], X[end:]), axis=0)
        y_train_fold = np.concatenate((y[:start], y[end:]), axis=0)

        correct_predictions = 0
        for j in range(len(X_val_fold)):
            predicted_label = k_nearest_neighbors(X_train_fold, y_train_fold, X_val_fold[j], k)
            if predicted_label == y_val_fold[j]:
                correct_predictions += 1

        accuracy = correct_predictions / len(X_val_fold)
        accuracy_scores.append(accuracy)

    avg_accuracy = np.mean(accuracy_scores)
    return avg_accuracy

#### Set Parameters

In [49]:
k_folds = 10

#### Main Program

In [50]:
k_dict = {}
X_train, X_test, Y_train, Y_test = hold_out(X,y)
for k in range(1,20,2):
    avg_accuracy = cross_validation(np.array(X_train), np.array(X_test), k_folds, k)
    k_dict[k] = avg_accuracy
    print(f"k = {k}, Average Accuracy: {avg_accuracy:.4f}")

max_k = max(k_dict.values())
foo = list(k_dict.values())
index = list(k_dict.keys())
max_k = index[foo.index(max_k)]

k = 1, Average Accuracy: 0.9400
k = 3, Average Accuracy: 0.9600
k = 5, Average Accuracy: 0.9700
k = 7, Average Accuracy: 0.9500
k = 9, Average Accuracy: 0.9600
k = 11, Average Accuracy: 0.9600


k = 13, Average Accuracy: 0.9600
k = 15, Average Accuracy: 0.9400
k = 17, Average Accuracy: 0.9500
k = 19, Average Accuracy: 0.9400


## TESTING

In [51]:
cross_validation(np.array(Y_train), np.array(Y_test), k_folds, k=max_k)

0.975