In [1]:
import numpy as np
import pandas as pd
from collections import Counter

# KNN Algorithm


def knn(xtrain, ytrain, xtest, k):
    distance = np.sum(xtest**2, axis=1)[:, np.newaxis] + np.sum(xtrain**2, axis=1) - 2*np.dot(xtest, xtrain.T)
    near_idx = np.argsort(distance, axis=1)[:, :k]
    near_classes = ytrain[near_idx]
    most_voted_class = [Counter(i).most_common(1)[0][0] for i in near_classes]
    pred_arr = np.array(most_voted_class)
    return pred_arr

# Calculate performance function


def performance(pred_arr, ytest):
    y_actual_series = pd.Series(ytest, name="Actual")
    y_pred_series = pd.Series(pred_arr, name="Predicted")
    df_cm = pd.crosstab(y_actual_series, y_pred_series)  # Confusion matrix as DataFrame
    cm_arr = np.array(df_cm)  # Confusion matrix as array

    tp = np.diag(cm_arr)  # True positive
    fn = np.sum(cm_arr, axis=1) - tp  # False negative
    fp = np.sum(cm_arr, axis=0) - tp  # False positive
    tn = cm_arr.sum() - (tp + fp + fn)  # True negative

    accuracies = (tp+tn)/(tp+tn+fp+fn)
    precisions = tp/(tp+fp)
    recalls = tp/(tp+fn)

    macro_accuracy = np.sum(accuracies)/len(accuracies)
    macro_precision = np.sum(precisions)/len(precisions)
    macro_recall = np.sum(recalls)/len(precisions)

    print("Accuracy: %%%f" % (macro_accuracy*100))
    print("Precision: %%%f" % (macro_precision*100))
    print("Recall: %%%f\n" % (macro_recall*100))


p_dict = {"ESTJ": 0, "ENTJ": 1, "ESFJ": 2, "ENFJ": 3, "ISTJ": 4, "ISFJ": 5, "INTJ": 6, "INFJ": 7, "ESTP": 8,
          "ESFP": 9, "ENTP": 10, "ENFP": 11, "ISTP": 12, "ISFP": 13, "INTP": 14, "INFP": 15}

# Pre-processing data

df = pd.read_csv("16P.csv", encoding="cp1252")
df.loc[len(df)] = df.loc[2997].copy()  # Add a random already existing row to make an even row of 60000
df.drop(columns=["Response Id"], inplace=True)
df["Personality"].replace(to_replace=p_dict, inplace=True)


# Numpy array
array = df.to_numpy()

# Normalised array
n_array = (array - array.min()) / (array.max() - array.min())

# Split the arrays into 5 folds
array = np.array(np.split(array, 5))
n_array = np.array(np.split(n_array, 5))


for i in range(5):
    k = (2 * i) + 1  # K nearest neighboorhood
    for j in range(5):

        x = array[j][:, :-1]  # Predictor array
        x_nor = n_array[j][:, :-1]  # Normalised predictor array
        y = array[j][:, -1]  # Target array

        # Train and test arrays

        idx_arr = np.array(range(5))
        idx_arr = idx_arr[idx_arr != j]
        # Get the tests as the current fold and the trains as the combination of other folds
        x_train, x_test = np.concatenate(array[idx_arr], axis=0)[:, :-1], x
        x_nor_train, x_nor_test = np.concatenate(n_array[idx_arr], axis=0)[:, :-1], x_nor
        y_train, y_test = np.concatenate(array[idx_arr], axis=0)[:, -1], y

        print("For k={} and fold {} as test, with feature normalization".format(k, j + 1))
        arr_nor = knn(x_nor_train, y_train, x_nor_test, k)
        performance(arr_nor, y_test)

        print("For k={} and fold {} as test, without feature normalization".format(k, j + 1))
        arr = knn(x_train, y_train, x_test, k)
        performance(arr, y_test)


For k=1 and fold 1 as test, with feature normalization
Accuracy: %99.732292
Precision: %97.858117
Recall: %97.853506

For k=1 and fold 1 as test, without feature normalization
Accuracy: %99.731250
Precision: %97.848693
Recall: %97.844731

For k=1 and fold 2 as test, with feature normalization
Accuracy: %99.703125
Precision: %97.627219
Recall: %97.623477

For k=1 and fold 2 as test, without feature normalization
Accuracy: %99.712500
Precision: %97.700491
Recall: %97.699203

For k=1 and fold 3 as test, with feature normalization
Accuracy: %99.708333
Precision: %97.673275
Recall: %97.670665

For k=1 and fold 3 as test, without feature normalization
Accuracy: %99.709375
Precision: %97.682650
Recall: %97.680257

For k=1 and fold 4 as test, with feature normalization
Accuracy: %99.734375
Precision: %97.881460
Recall: %97.875732

For k=1 and fold 4 as test, without feature normalization
Accuracy: %99.738542
Precision: %97.913647
Recall: %97.909411

For k=1 and fold 5 as test, with feature nor