In [72]:
import numpy as np
import pandas as pd
from collections import Counter

# KNN Algorithm

In [73]:
def Knn(X_train, Y_train, X_test, k):
    '''
    Performs K-nearest neighbours classification: every test instance receives the label that is
    most common among its k closest training examples (euclidean distance).
    Arguments
    ---------
    X_train : 2d-array
        The train dataset of shape (m, n), where m is the number of training examples and n is the number of features.
    Y_train : array
        The labels of train data, one per row of X_train. Any sequence is accepted; it is converted
        to a numpy array so that neighbours are always looked up by position.
    X_test : 2d-array
        The test dataset of shape (p, n), where p is the number of test instances and n is the number of features.
    k : int
      The number of nearest neighbours to be used. Must be at least 1. A value larger than m simply
      uses all m training examples.
    Returns
    -------
    Y_test :  array
        The predicted labels of the test data, one per test instance.
    Raises
    ------
    ValueError
        If k is smaller than 1, if there are no training examples, or if X_train and Y_train
        do not contain the same number of examples.

    '''
    X_train = np.asarray(X_train)
    Y_train = np.asarray(Y_train)
    X_test = np.asarray(X_test)

    if k < 1:
        # k == 0 would leave nothing to vote on; a negative k would silently slice off the farthest points.
        raise ValueError('k must be at least 1, got {}'.format(k))
    if len(X_train) == 0:
        raise ValueError('X_train must contain at least one training example')
    if len(X_train) != len(Y_train):
        raise ValueError('X_train has {} examples but Y_train has {} labels'.format(len(X_train), len(Y_train)))

    Y_test = []
    for instance in X_test:
        distance = np.sqrt(np.sum(np.square(X_train - instance), axis=1))  # euclidean distance
        # A stable sort keeps equally distant training examples in their original order,
        # which makes the selected neighbours (and therefore the prediction) deterministic.
        k_neigh_index = np.argsort(distance, kind='stable')[:k]
        k_neigh = Y_train[k_neigh_index]  # labels of the k nearest examples, closest first
        # On a tie in votes, most_common returns the label that was encountered first,
        # i.e. the one belonging to the closest of the tied neighbours.
        Y_test.append(Counter(k_neigh).most_common(1)[0][0])
    return np.array(Y_test)

# Evaluation Metrics

In [74]:
def confusion_matrix(actual_values, predicted_values, n_classes=3):
    '''
    Generates confusion matrix for classification evaluation.
    Arguments
    ---------
    actual_values : array
        The actual decoded labels of test data: 1 for "Iris-setosa", 2 for "Iris-versicolor" and 3 for "Iris-virginica".

    predicted_values : array
        The predicted decoded labels of test data, aligned element by element with actual_values.

    n_classes : int, optional
        The number of classes; labels are expected to be 1..n_classes. Defaults to 3.

    Returns
    -------
    conf_matrix : 2d array
        The confusion matrix of shape (n_classes, n_classes). Rows index the PREDICTED class and
        columns index the ACTUAL class, so conf_matrix[p - 1, a - 1] counts the instances of
        class a that were predicted as class p.

    Raises
    ------
    ValueError
        If actual_values and predicted_values have different lengths.

    '''
    if len(actual_values) != len(predicted_values):
        raise ValueError('actual_values has {} entries but predicted_values has {}'.format(
            len(actual_values), len(predicted_values)))

    conf_matrix = np.zeros((n_classes, n_classes))
    valid_labels = range(1, n_classes + 1)
    for actual, predicted in zip(actual_values, predicted_values):
        # Pairs containing a label outside 1..n_classes are left out of the matrix.
        if actual in valid_labels and predicted in valid_labels:
            conf_matrix[int(predicted) - 1, int(actual) - 1] += 1
    return conf_matrix

In [75]:
def _safe_ratio(numerator, denominator):
    '''Divides numerator by denominator element-wise, returning 0.0 wherever the denominator is 0.'''
    numerator = np.asarray(numerator, dtype=float)
    denominator = np.asarray(denominator, dtype=float)
    with np.errstate(divide='ignore', invalid='ignore'):
        ratio = numerator / denominator
    return np.where(denominator == 0, 0.0, ratio)


def classification_report(conf_matrix):
    '''
    Generates micro and macro average scores for classification evaluation using the confusion matrix.
    Arguments
    ---------
    conf_matrix : 2d array
        The square confusion matrix with rows indexing the predicted class and columns indexing
        the actual class (the layout produced by confusion_matrix). Any number of classes is accepted.

    Returns
    -------
    classification_report : dataframe
        A dataframe containing micro and macro average precision, recall, accuracy and F1-scores.
        Rows are labelled 'macro_average' and 'micro_avg'. A score whose denominator is zero
        (for example the precision of a class that was never predicted) is reported as 0.0
        instead of NaN, so that a single empty class does not turn the averages into NaN.

    Raises
    ------
    ValueError
        If conf_matrix is not a non-empty square 2d array.

    '''
    conf_matrix = np.asarray(conf_matrix, dtype=float)
    if conf_matrix.ndim != 2 or conf_matrix.shape[0] != conf_matrix.shape[1] or conf_matrix.shape[0] == 0:
        raise ValueError('conf_matrix must be a non-empty square 2d array')

    # One-vs-rest counts for every class at once.
    tp = np.diag(conf_matrix)
    fp = conf_matrix.sum(axis=1) - tp   # rest of the row: predicted as this class, actually another
    fn = conf_matrix.sum(axis=0) - tp   # rest of the column: actually this class, predicted as another
    tn = conf_matrix.sum() - tp - fp - fn

    precision = _safe_ratio(tp, tp + fp)
    recall = _safe_ratio(tp, tp + fn)
    accuracy = _safe_ratio(tp + tn, tp + fp + tn + fn)
    f1_score = _safe_ratio(2 * precision * recall, precision + recall)

    # Macro average: unweighted mean of the per-class scores.
    macro_scores = [precision.mean(), recall.mean(), accuracy.mean(), f1_score.mean()]

    # Micro average: pool the counts of all classes first, then compute each score once.
    micro_prec = float(_safe_ratio(tp.sum(), tp.sum() + fp.sum()))
    micro_recall = float(_safe_ratio(tp.sum(), tp.sum() + fn.sum()))
    micro_acc = float(_safe_ratio(tp.sum() + tn.sum(), tp.sum() + tn.sum() + fp.sum() + fn.sum()))
    micro_F1 = float(_safe_ratio(2 * micro_prec * micro_recall, micro_prec + micro_recall))

    data = np.array([macro_scores, [micro_prec, micro_recall, micro_acc, micro_F1]])
    report = pd.DataFrame(data, columns=['precision', 'recall', 'accuracy', 'F1_score'],
                          index=['macro_average', 'micro_avg'])
    return report

# Decoding String Labels

In [76]:
def decode(Y):
    '''
    This function decodes the string class labels into integer codes.
    Arguments
    ---------
    Y : array
        The string labels, one per example: "Iris-setosa", "Iris-versicolor" or "Iris-virginica".

    Returns
    -------
    Y_d : list
          Decoded values, in the same order and of the same length as Y:
          1 for "Iris-setosa" class, 2 for "Iris-versicolor" and 3 for "Iris-virginica".

    Raises
    ------
    ValueError
        If Y contains a value that is not one of the three known labels. Skipping such a value
        would make the result shorter than Y and shift every following label out of alignment
        with the corresponding predictions.

    '''
    label_codes = {'Iris-setosa': 1, 'Iris-versicolor': 2, 'Iris-virginica': 3}
    Y_d = []
    for position, label in enumerate(Y):
        try:
            Y_d.append(label_codes[label])
        except (KeyError, TypeError):
            # TypeError covers unhashable entries, KeyError everything else that is unknown.
            raise ValueError('unknown label {!r} at position {}'.format(label, position)) from None
    return Y_d

# Training and Testing

In [77]:
# Read the train and test splits. The files are read without a header row, so
# pandas labels the columns with integers; column 4 is taken as the label
# column and every other column as a feature.
df_train = pd.read_csv('train.csv', header=None)
df_test = pd.read_csv('test.csv', header=None)

X_train, Y_train = np.array(df_train.drop(columns=4)), np.array(df_train[4])
X_test, Y_test = np.array(df_test.drop(columns=4)), np.array(df_test[4])

In [78]:
# Decode the true test labels straight from df_test rather than from Y_test itself,
# so that re-running this cell never feeds already-decoded integers back into decode().
Y_test = decode(np.array(df_test[4]))

k = 1

In [79]:
# Predict the test labels from the single nearest training example (k = 1).
Y_hat1 = Knn(X_train=X_train, Y_train=Y_train, X_test=X_test, k=1)

In [80]:
# Decode the k = 1 predictions from a fresh Knn call instead of from Y_hat1 itself,
# so that re-running this cell never passes already-decoded integers to decode().
Y_hat1 = decode(Knn(X_train, Y_train, X_test, 1))

In [81]:
# Compare the true test labels with the k = 1 predictions.
conf_matrix1 = confusion_matrix(actual_values=Y_test, predicted_values=Y_hat1)

In [82]:
# Confusion matrix for k = 1: rows index the predicted class, columns the actual class.
conf_matrix1

array([[5., 0., 0.],
       [0., 5., 0.],
       [0., 0., 5.]])

In [83]:
# Summarise the k = 1 confusion matrix as macro / micro averaged scores.
cf_report1 = classification_report(conf_matrix=conf_matrix1)

In [84]:
# Macro and micro averaged precision, recall, accuracy and F1-score for k = 1.
cf_report1

Unnamed: 0,precision,recall,accuracy,F1_score
macro_average,1.0,1.0,1.0,1.0
micro_avg,1.0,1.0,1.0,1.0


k = 3

In [85]:
# Predict the test labels by majority vote among the 3 nearest training examples.
Y_hat2 = Knn(X_train=X_train, Y_train=Y_train, X_test=X_test, k=3)

In [86]:
# Decode the k = 3 predictions from a fresh Knn call instead of from Y_hat2 itself,
# so that re-running this cell never passes already-decoded integers to decode().
Y_hat2 = decode(Knn(X_train, Y_train, X_test, 3))

In [87]:
# Compare the true test labels with the k = 3 predictions.
conf_matrix2 = confusion_matrix(actual_values=Y_test, predicted_values=Y_hat2)

In [88]:
# Confusion matrix for k = 3: rows index the predicted class, columns the actual class.
conf_matrix2

array([[5., 0., 0.],
       [0., 5., 0.],
       [0., 0., 5.]])

In [89]:
# Summarise the k = 3 confusion matrix as macro / micro averaged scores.
cf_report2 = classification_report(conf_matrix=conf_matrix2)

In [90]:
# Macro and micro averaged precision, recall, accuracy and F1-score for k = 3.
cf_report2

Unnamed: 0,precision,recall,accuracy,F1_score
macro_average,1.0,1.0,1.0,1.0
micro_avg,1.0,1.0,1.0,1.0


k = 5

In [91]:
# Predict the test labels by majority vote among the 5 nearest training examples.
Y_hat3 = Knn(X_train=X_train, Y_train=Y_train, X_test=X_test, k=5)

In [92]:
# Decode the k = 5 predictions from a fresh Knn call instead of from Y_hat3 itself,
# so that re-running this cell never passes already-decoded integers to decode().
Y_hat3 = decode(Knn(X_train, Y_train, X_test, 5))

In [93]:
# Compare the true test labels with the k = 5 predictions.
conf_matrix3 = confusion_matrix(actual_values=Y_test, predicted_values=Y_hat3)

In [94]:
# Confusion matrix for k = 5: rows index the predicted class, columns the actual class.
conf_matrix3

array([[5., 0., 0.],
       [0., 5., 0.],
       [0., 0., 5.]])

In [95]:
# Summarise the k = 5 confusion matrix as macro / micro averaged scores.
cf_report3 = classification_report(conf_matrix=conf_matrix3)

In [96]:
# Macro and micro averaged precision, recall, accuracy and F1-score for k = 5.
cf_report3

Unnamed: 0,precision,recall,accuracy,F1_score
macro_average,1.0,1.0,1.0,1.0
micro_avg,1.0,1.0,1.0,1.0
