In [None]:
import numpy as np
import pandas as pd
from sklearn .model_selection import train_test_split
import random
from collections import Counter

# Fixed seed for reproducibility: divide_data passes it as `random_state` to every
# train_test_split call and to random.seed() before shuffling the merged sets.
SEED = 2460686032




In [249]:
def load_data(path="iris.data"):
    """Read the iris measurements from a headerless CSV file.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the CSV file. Defaults to "iris.data", resolved against
        the current working directory.

    Returns
    -------
    pandas.DataFrame
        The file contents with the five columns named 'Sepal Length',
        'Sepal Width', 'Petal Length', 'Petal Width' and 'Class'.

    Raises
    ------
    ValueError
        If the file does not contain exactly five columns (raised by the
        column assignment).
    """
    # The file carries no header row, so the column names are attached afterwards.
    df = pd.read_csv(path, header=None)
    df.columns = ['Sepal Length', 'Sepal Width', 'Petal Length', 'Petal Width', 'Class']
    return df

In [250]:
def divide_data(df):
    """Split the iris dataframe into class-balanced 50/50 train and test sets.

    Each of the three iris classes is split in half on its own (with
    `random_state=SEED`), so both halves contain the same share of every
    class. The per-class halves are then merged and shuffled.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose first four columns are the features and whose fifth
        column is the class label. Rows whose label is not one of
        "Iris-setosa", "Iris-versicolor" or "Iris-virginica" are ignored.

    Returns
    -------
    tuple
        `(X_train, Y_train, X_test, Y_test)`. Each element is a tuple;
        the X entries are 4-element lists of features and the Y entries are
        the matching class labels.

    Notes
    -----
    Calls `random.seed(SEED)`, i.e. it reseeds the global `random` module.
    """
    class_labels = ("Iris-setosa", "Iris-versicolor", "Iris-virginica")

    # Bucket the rows by class label; any other label is silently skipped.
    rows_by_class = {label: [] for label in class_labels}
    for row in df.values.tolist():
        if row[4] in rows_by_class:
            rows_by_class[row[4]].append(row)

    X_train, Y_train, X_test, Y_test = [], [], [], []
    for label in class_labels:
        class_rows = rows_by_class[label]
        features = [row[:4] for row in class_rows]  # first 4 columns are the inputs
        targets = [row[4] for row in class_rows]  # last column is the class label

        # Splitting one class at a time keeps train and test balanced per class.
        x_part_train, x_part_test, y_part_train, y_part_test = train_test_split(
            features, targets, test_size=0.5, random_state=SEED, shuffle=True
        )
        X_train += x_part_train
        Y_train += y_part_train
        X_test += x_part_test
        Y_test += y_part_test

    random.seed(SEED)

    # Shuffle (features, label) pairs together so every row stays aligned with
    # its label while the class-by-class ordering of the merged lists is broken up.
    combined_train = list(zip(X_train, Y_train))
    random.shuffle(combined_train)
    X_train, Y_train = zip(*combined_train)

    combined_test = list(zip(X_test, Y_test))
    random.shuffle(combined_test)
    X_test, Y_test = zip(*combined_test)

    return X_train, Y_train, X_test, Y_test

In [251]:

# euclidean distance calculation function 
def calculate_euclidean_dst(x1, x2):
    """Return the Euclidean distance between the points `x1` and `x2`.

    Coordinates are paired with `zip`, so if the inputs differ in length the
    surplus coordinates of the longer one are ignored.
    """
    squared_total = 0
    for coord_a, coord_b in zip(x1, x2):
        delta = coord_a - coord_b
        squared_total = squared_total + delta ** 2
    distance = np.sqrt(squared_total)
    return distance


In [None]:
def knn_classify(X_train, Y_train, X_test, Y_test, K):
    """Classify every test point with K-nearest-neighbours and report accuracy.

    For each point in `X_test` the Euclidean distance to every training
    point is computed, the `K` closest training points are selected, and
    the most frequent label among them becomes the prediction. The
    predictions are then compared with `Y_test` and the accuracy is printed.

    Parameters
    ----------
    X_train, Y_train : sequence
        Training feature vectors and their labels (same order).
    X_test, Y_test : sequence
        Test feature vectors and their ground-truth labels (same order).
    K : int
        Number of neighbours that vote; must be at least 1.

    Returns
    -------
    float
        Fraction of test points predicted correctly, or `nan` when
        `Y_test` is empty.

    Raises
    ------
    ValueError
        If `K` is smaller than 1 or the training set is empty.
    """
    # A non-positive K would either leave no voters or, for negative values,
    # silently slice off the wrong end of the sorted neighbour list.
    if K < 1:
        raise ValueError(f"K must be at least 1, got {K}")
    if len(X_train) == 0:
        raise ValueError("X_train is empty; at least one training point is required")

    # KNN logic
    predictions = []
    for x_test in X_test:
        # (training index, distance) pairs for this test point
        euclidean_dsts = [
            (index, calculate_euclidean_dst(x_test, x_train))
            for index, x_train in enumerate(X_train)
        ]
        # sorted() is stable, so equally distant points keep training-set order
        nearest = sorted(euclidean_dsts, key=lambda pair: pair[1])[:K]
        nearest_neighbor_label = [Y_train[index] for index, _ in nearest]

        # Majority vote; on a tied count the label encountered first
        # (i.e. belonging to the closer neighbour) wins.
        counter = Counter(nearest_neighbor_label)
        predictions.append(counter.most_common(1)[0][0])

    # evaluation part
    if len(Y_test) == len(predictions):
        print("Prediction size matches with ground truth labels")
    else:
        print("Prediction size did not match with ground truth labels!")

    sample_size = len(Y_test)
    correct_prediction = sum(
        1 for prediction, y_test in zip(predictions, Y_test) if prediction == y_test
    )

    # Accuracy is undefined without any ground-truth labels; avoid dividing by zero.
    accuracy = correct_prediction / sample_size if sample_size else float("nan")
    print(f"KNN Accuracy:{accuracy}")
    return accuracy

Prediction size matches with ground truth labels
KNN Accuracy:0.9733333333333334


In [None]:
# Load the dataset and separate features from labels.
df = load_data()
X = df.iloc[:, :-1]
Y = df.iloc[:, -1]

# Build the class-balanced train/test split. Without this call the names
# below are undefined on a fresh kernel (Restart & Run All raises NameError).
X_train, Y_train, X_test, Y_test = divide_data(df)

# Sanity check: sizes of the train and test halves.
print(len(X_train))
print(len(Y_train))

print(len(X_test))
print(len(Y_test))

# Spot-check that features and labels are still aligned after shuffling.
print(X_train[74])
print(Y_train[74])

print(X_test[5])
print(Y_test[5])
