In [16]:
import pandas as pd
import numpy as np

from collections import Counter

def FeatureNormalization(Data):
    """Rescale *Data* element-wise with the affine map ``(x + 3) / 6``.

    Works on scalars and NumPy arrays alike (anything supporting ``+`` and
    ``/``); a new object is returned and the input is left untouched.

    NOTE(review): presumably the answers lie in [-3, 3], which this map sends
    to [0, 1] -- confirm against the dataset.
    """
    shifted = Data + 3
    return shifted / 6

def kNN(TrainData, TestData, Personalities, k):
    """Classify every test row by majority vote among its k nearest training rows.

    Args:
        TrainData: 2-D array, one training sample per row.
        TestData: 2-D array, one sample to classify per row; must have the
            same number of columns as ``TrainData``.
        Personalities: 1-D array with the label of each ``TrainData`` row.
        k: number of nearest neighbours that vote.

    Returns:
        A list with one predicted label per ``TestData`` row. On a tied vote
        the tied label that occurs first among the neighbours (which are
        ordered by increasing distance) is returned.
    """
    # Pairwise squared Euclidean distances for all (test, train) pairs at once,
    # using ||a - b||^2 = ||a||^2 - 2 a.b + ||b||^2.
    TestDataSum = np.sum(TestData**2, axis=1, keepdims=True)
    TrainDataSum = np.sum(TrainData**2, axis=1, keepdims=True)
    squared = -2 * TestData.dot(TrainData.T) + TestDataSum + TrainDataSum.T
    del TestDataSum, TrainDataSum

    # Floating-point rounding can leave a (near-)zero squared distance slightly
    # negative. sqrt would then yield NaN, and argsort places NaN last, which
    # would rank an identical sample as the *farthest* neighbour. Clamp first.
    np.maximum(squared, 0, out=squared)
    distances = np.sqrt(squared)
    del squared  # the full distance matrix is large; release it early

    closest = np.argsort(distances)[:, :k]
    del distances

    neighbour_labels = Personalities[closest]
    return [Counter(row).most_common(1)[0][0] for row in neighbour_labels]
      
        
def main(k, FeatureNormal):
    """Run 5-fold cross-validated kNN on "16P.csv" and print per-fold metrics.

    The CSV is expected to contain a "Response Id" column (dropped) and the
    personality type as its last column; every other column is used as a
    feature. The 16 type names are encoded as the integers 0..15.

    Args:
        k: number of neighbours passed to ``kNN``.
        FeatureNormal: when truthy, the features are rescaled with
            ``FeatureNormalization`` before classification.

    Prints, for each fold, the accuracy and the macro-averaged precision and
    recall. Returns nothing.
    """
    NumFolds = 5
    NumClasses = 16

    csvdata = pd.read_csv("16P.csv",encoding= "ISO 8859-1")
    csvdata = csvdata.drop(["Response Id"], axis = 1)

    csvdata = csvdata.replace(["ESTJ","ENTJ","ESFJ","ENFJ","ISTJ","ISFJ","INTJ","INFJ","ESTP","ESFP","ENTP","ENFP","ISTP","ISFP","INTP","INFP"],[0, 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15])

    Data = csvdata.to_numpy()
    del csvdata

    Personalities = Data[:,-1]   # last column holds the encoded type
    Data = Data[:,:-1]           # everything else is a feature

    if FeatureNormal:
        print("Feature Normalization: True")
        Data = FeatureNormalization(Data)
    else:
        print("Feature Normalization: False")

    print("k for kNN algorithm : ", k)

    # Split once; every fold reuses the same partitions (array_split returns
    # views, so this costs no extra memory).
    DataFolds = np.array_split(Data, NumFolds)
    LabelFolds = np.array_split(Personalities, NumFolds)

    for fold in range(1, NumFolds + 1):

        print(20*"-")
        print("Fold: ", fold)

        # The test fold is classified in two halves because the full
        # test-by-train distance matrix uses too much RAM at once.
        TestData, TestData2 = np.array_split(DataFolds[fold-1], 2)
        TrainData = np.concatenate(DataFolds[:fold-1] + DataFolds[fold:])

        PersonalitiesTest = LabelFolds[fold-1]
        PersonalitiesTrain = np.concatenate(LabelFolds[:fold-1] + LabelFolds[fold:])

        guesses = kNN(TrainData, TestData, PersonalitiesTrain, k)
        guesses.extend(kNN(TrainData, TestData2, PersonalitiesTrain, k))

        # Release this fold's training copy before the next fold builds its own.
        del TrainData, PersonalitiesTrain, TestData, TestData2

        # One-vs-rest confusion counts for every class.
        Classes = {label: {"TP": 0, "TN": 0, "FP": 0, "FN": 0}
                   for label in range(NumClasses)}

        for guess, truth in zip(guesses, PersonalitiesTest):
            if guess == truth:
                Classes[guess]["TP"] += 1
            else:
                Classes[guess]["FP"] += 1
                Classes[truth]["FN"] += 1

        # A sample is a true negative for every class that is neither its
        # guess nor its true label, i.e. whatever is left of the total.
        total = len(guesses)
        for counts in Classes.values():
            counts["TN"] = total - counts["TP"] - counts["FP"] - counts["FN"]

        TrueP = 0
        TrueN = 0
        FalseP = 0
        FalseN = 0
        Precision = 0
        Recall = 0

        for counts in Classes.values():
            predicted = counts["TP"] + counts["FP"]
            actual = counts["TP"] + counts["FN"]
            # A class that was never predicted (or never occurs in this fold)
            # contributes 0 instead of raising ZeroDivisionError.
            Precision += counts["TP"] / predicted if predicted else 0.0
            Recall += counts["TP"] / actual if actual else 0.0
            TrueP += counts["TP"]
            TrueN += counts["TN"]
            FalseP += counts["FP"]
            FalseN += counts["FN"]

        Precision /= len(Classes)  # Macro Average
        Recall /= len(Classes)     # Macro Average

        # NOTE: this accuracy pools the one-vs-rest counts of all classes, so
        # true negatives are included; it is therefore higher than the plain
        # fraction of correctly classified samples (TrueP / total).
        print("Accuracy: " + str( (TrueP + TrueN) / ( TrueP + TrueN + FalseP + FalseN) ))
        print("Precision Macro Average: ", Precision)
        print("Recall Macro Average: ", Recall)
       


In [18]:
main(1, True)

Feature Normalization: True
k for kNN algorithm :  1
--------------------
Fold:  1
Accuracy: 0.99734375
Precision Macro Average:  0.9787516347624936
Recall Macro Average:  0.9787014884993274
--------------------
Fold:  2
Accuracy: 0.997125
Precision Macro Average:  0.9770028738716551
Recall Macro Average:  0.9769772191651214
--------------------
Fold:  3
Accuracy: 0.99715625
Precision Macro Average:  0.9773176453096315
Recall Macro Average:  0.977296371315472
--------------------
Fold:  4
Accuracy: 0.9973541666666667
Precision Macro Average:  0.9788670364022395
Recall Macro Average:  0.9788323697191104
--------------------
Fold:  5
Accuracy: 0.9972914409534128
Precision Macro Average:  0.9783011637024195
Recall Macro Average:  0.9782918222544101


In [4]:
main(3, True)

Feature Normalization: True
k for kNN algorithm :  3
--------------------
Fold:  1
Accuracy: 0.9985833333333334
Precision Macro Average:  0.9887305688010263
Recall Macro Average:  0.9886265958332332
--------------------
Fold:  2
Accuracy: 0.9986041666666666
Precision Macro Average:  0.9888020919840492
Recall Macro Average:  0.9888452186091746
--------------------
Fold:  3
Accuracy: 0.998625
Precision Macro Average:  0.9890459741870914
Recall Macro Average:  0.9890386431339114
--------------------
Fold:  4
Accuracy: 0.99846875
Precision Macro Average:  0.9877770818368403
Recall Macro Average:  0.9877359654675865
--------------------
Fold:  5
Accuracy: 0.9985207100591716
Precision Macro Average:  0.9881572342937556
Recall Macro Average:  0.9881322885328716


In [5]:
main(5, True)

Feature Normalization: True
k for kNN algorithm :  5
--------------------
Fold:  1
Accuracy: 0.9986666666666667
Precision Macro Average:  0.9893754041767349
Recall Macro Average:  0.989294562998331
--------------------
Fold:  2
Accuracy: 0.998625
Precision Macro Average:  0.9889891067913978
Recall Macro Average:  0.9889955606083022
--------------------
Fold:  3
Accuracy: 0.9986666666666667
Precision Macro Average:  0.989372189945719
Recall Macro Average:  0.9893715517746581
--------------------
Fold:  4
Accuracy: 0.9985833333333334
Precision Macro Average:  0.98869652249881
Recall Macro Average:  0.988652205763824
--------------------
Fold:  5
Accuracy: 0.9986040503375281
Precision Macro Average:  0.9888223889279684
Recall Macro Average:  0.9888130675661366


In [6]:
main(7, True)

Feature Normalization: True
k for kNN algorithm :  7
--------------------
Fold:  1
Accuracy: 0.9986979166666666
Precision Macro Average:  0.989620689026629
Recall Macro Average:  0.989548183495552
--------------------
Fold:  2
Accuracy: 0.9986458333333333
Precision Macro Average:  0.9891444591086107
Recall Macro Average:  0.9891623777331094
--------------------
Fold:  3
Accuracy: 0.99871875
Precision Macro Average:  0.9897836937846075
Recall Macro Average:  0.9897898504263223
--------------------
Fold:  4
Accuracy: 0.9986041666666666
Precision Macro Average:  0.9888524088488525
Recall Macro Average:  0.9888225974299621
--------------------
Fold:  5
Accuracy: 0.9986353029419118
Precision Macro Average:  0.989075461069711
Recall Macro Average:  0.9890651510510345


In [7]:
main(9, True)

Feature Normalization: True
k for kNN algorithm :  9
--------------------
Fold:  1
Accuracy: 0.9986979166666666
Precision Macro Average:  0.9896157250326025
Recall Macro Average:  0.9895442453465451
--------------------
Fold:  2
Accuracy: 0.9986979166666666
Precision Macro Average:  0.9895587181696679
Recall Macro Average:  0.9895867840582319
--------------------
Fold:  3
Accuracy: 0.9987291666666667
Precision Macro Average:  0.9898592156126281
Recall Macro Average:  0.9898785485402842
--------------------
Fold:  4
Accuracy: 0.99859375
Precision Macro Average:  0.9887652579541343
Recall Macro Average:  0.9887392640966288
--------------------
Fold:  5
Accuracy: 0.9986353029419118
Precision Macro Average:  0.9890807266646019
Recall Macro Average:  0.9890673618595891


In [8]:
main(1, False)

Feature Normalization: False
k for kNN algorithm :  1
--------------------
Fold:  1
Accuracy: 0.9973333333333333
Precision Macro Average:  0.9786650790937739
Recall Macro Average:  0.9786130922017358
--------------------
Fold:  2
Accuracy: 0.9971041666666667
Precision Macro Average:  0.9768596257116923
Recall Macro Average:  0.9768243513136036
--------------------
Fold:  3
Accuracy: 0.9971041666666667
Precision Macro Average:  0.976886150833187
Recall Macro Average:  0.976863112242346
--------------------
Fold:  4
Accuracy: 0.9973541666666667
Precision Macro Average:  0.9788819344080226
Recall Macro Average:  0.978838510886218
--------------------
Fold:  5
Accuracy: 0.9971976831402617
Precision Macro Average:  0.9775490168820061
Recall Macro Average:  0.9775348412685985


In [9]:
main(3, False)

Feature Normalization: False
k for kNN algorithm :  3
--------------------
Fold:  1
Accuracy: 0.99859375
Precision Macro Average:  0.9888118406708534
Recall Macro Average:  0.9887116298468387
--------------------
Fold:  2
Accuracy: 0.99859375
Precision Macro Average:  0.9887169391342778
Recall Macro Average:  0.9887654992214195
--------------------
Fold:  3
Accuracy: 0.998625
Precision Macro Average:  0.9890396257840732
Recall Macro Average:  0.989040853942466
--------------------
Fold:  4
Accuracy: 0.9984479166666667
Precision Macro Average:  0.9876060322036853
Recall Macro Average:  0.9875768745584956
--------------------
Fold:  5
Accuracy: 0.9985311275939661
Precision Macro Average:  0.9882406725852245
Recall Macro Average:  0.9882137457296719


In [10]:
main(5, False)

Feature Normalization: False
k for kNN algorithm :  5
--------------------
Fold:  1
Accuracy: 0.9986770833333334
Precision Macro Average:  0.9894606720943213
Recall Macro Average:  0.9893797128620911
--------------------
Fold:  2
Accuracy: 0.9986458333333333
Precision Macro Average:  0.9891507824590609
Recall Macro Average:  0.9891643035033381
--------------------
Fold:  3
Accuracy: 0.9986666666666667
Precision Macro Average:  0.9893674232024661
Recall Macro Average:  0.9893671971144008
--------------------
Fold:  4
Accuracy: 0.9985833333333334
Precision Macro Average:  0.9886928025695548
Recall Macro Average:  0.9886601161934211
--------------------
Fold:  5
Accuracy: 0.9986040503375281
Precision Macro Average:  0.9888269990785715
Recall Macro Average:  0.9888119474623345


In [11]:
main(7, False)

Feature Normalization: False
k for kNN algorithm :  7
--------------------
Fold:  1
Accuracy: 0.9987083333333333
Precision Macro Average:  0.9897059569442155
Recall Macro Average:  0.9896333333593121
--------------------
Fold:  2
Accuracy: 0.9986770833333334
Precision Macro Average:  0.9893962738742985
Recall Macro Average:  0.9894262306664395
--------------------
Fold:  3
Accuracy: 0.99871875
Precision Macro Average:  0.9897794256409921
Recall Macro Average:  0.9897860461843534
--------------------
Fold:  4
Accuracy: 0.9985833333333334
Precision Macro Average:  0.9886809104944942
Recall Macro Average:  0.9886657763003783
--------------------
Fold:  5
Accuracy: 0.9986248854071172
Precision Macro Average:  0.9889912773764763
Recall Macro Average:  0.9889770522364538


In [12]:
main(9, False)

Feature Normalization: False
k for kNN algorithm :  9
--------------------
Fold:  1
Accuracy: 0.9986979166666666
Precision Macro Average:  0.9896157250326025
Recall Macro Average:  0.9895442453465451
--------------------
Fold:  2
Accuracy: 0.9986979166666666
Precision Macro Average:  0.9895587181696679
Recall Macro Average:  0.9895867840582319
--------------------
Fold:  3
Accuracy: 0.9987291666666667
Precision Macro Average:  0.989860459550069
Recall Macro Average:  0.9898787910124921
--------------------
Fold:  4
Accuracy: 0.9985833333333334
Precision Macro Average:  0.9886797271805853
Recall Macro Average:  0.9886635065208712
--------------------
Fold:  5
Accuracy: 0.9986353029419118
Precision Macro Average:  0.9890744890633166
Recall Macro Average:  0.9890667980590139



General PERFORMANCE ANALYSIS : :

As the outputs above show, accuracy, precision and recall vary with k, with feature normalization, and across folds.

In general, the higher k is, the higher the program's accuracy, precision and recall are. There are some exceptions; for example, the accuracy for folds 1 and 5 did not change between k = 7 and k = 9.

Another example is again between k = 7 and k = 9, but this time without feature normalization: the accuracy did not change at fold 4.

Feature normalization also changes accuracy, precision and recall. I also noticed that, without feature normalization, the process takes much longer:
it takes almost twice as long as with feature normalization.


We get different accuracy, precision and recall in each fold.

When k equals 1 (fold 1, with feature normalization):

Accuracy: 0.99734375 ---------------------- %99.7343

Precision Macro Average:  0.9787516347624936  ---  %97.8751
 
Recall Macro Average:  0.9787014884993274 -------- %97.8701 


When k equals 9 (fold 1), the difference is clear:

Accuracy: 0.9986979166666666 -------------------- %99.8697

Precision Macro Average:  0.9896157250326025 ---- %98.9616

Recall Macro Average:  0.9895442453465451 ------- %98.9544

Precision and recall are around 1% higher than when k was 1.






                                    


ERROR ANALYSIS : :

Data No 1169: The closest personality types for this person when k is 5 were [2,15,2,15,2]; when k is 9 they became [2,15,2,15,2,15,2,2,15].

The counts of 15 and 2 are quite close in both cases; if we increase k even further, the problem might be solved.

Data No 3829: The closest personalities when k == 5 were [7,8,8,7,7]; this problem could potentially be solved with a weighted kNN algorithm.

Data No 4578: The closest personalities when k == 5 were [5,15,5,15,5]; as with Data No 1169, this could potentially be solved with a higher k value.

For many of the misclassified samples, the closest types were unanimous, e.g. [8,8,8,8,8] or [3,3,3,3,3], yet the prediction was still incorrect.

There could also be cases where all the closest types are different, for example [3,8,7]; in that case the program picks the first one (the nearest neighbour's type), but the result might not be as accurate.