***Importing Libraries***

In [1]:
import pandas as pd
from random import randrange
import operator
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier as NearestNeighbors
from sklearn.metrics import accuracy_score
from scipy.stats import ttest_rel

***Neighbour Distance Calculation***

In [2]:
class DistMetrics:
    def euclidean(self, vector1, vector2):
        distance = 0.0
        for i in range(len(vector1)-1):
            distance += (vector1[i] - vector2[i])**2
        return (distance)**0.5

    def manhattan(self, vector1, vector2):
        return sum(abs(a - b) for a, b in zip(vector1, vector2))

    def hamming(self, vector1, vector2):
        return sum(a != b for a, b in zip(vector1, vector2))

***KNN Classification Model***

In [3]:
class knnclassifier:
    def _init_(self, k=3, distanceMetric='euclidean'):
        self.k = k
        self.distanceMetric = distanceMetric

    def fit(self, xtrain, ytrain):
        assert len(xtrain) == len(ytrain)
        self.trainData = xtrain
        self.trainLabels = ytrain

    def calculateNeighborsDistance(self, test_data):
        dist_metrics = DistMetrics()
        distances = []
        for i, train_data in enumerate(self.trainData):
            if self.distanceMetric == 'euclidean':
                distances.append([train_data, dist_metrics.euclidean(test_data, train_data), self.trainLabels[i]])
            elif self.distanceMetric == 'manhattan':
                distances.append([train_data, dist_metrics.manhattan(test_data, train_data), self.trainLabels[i]])
            elif self.distanceMetric == 'hamming':
                distances.append([train_data, dist_metrics.hamming(test_data, train_data), self.trainLabels[i]])
            else:
                raise ValueError(f"Unknown distance metric: {self.distanceMetric}")

        distances.sort(key=operator.itemgetter(1))

        neighbors = []
        for index in range(self.k):
            neighbors.append(distances[index])
        return neighbors

    # Multilabeled KNN with value and weight
    def weighted_voting(self, neighbors):
        label_scores = {}
        for neighbor in neighbors:
            label = neighbor[2]
            distance = neighbor[1]
            weight = 1 / (distance + 1e-5)
            
            # Weighted voting
            if isinstance(label, list):
                for l in label:
                    if l in label_scores:
                        label_scores[l] += weight
                    else:
                        label_scores[l] = weight
            else:
                if label in label_scores:
                    label_scores[label] += weight
                else:
                    label_scores[label] = weight

        # Return the label with the highest score
        sorted_labels = sorted(label_scores.items(), key=lambda item: item[1], reverse=True)
        top_label = sorted_labels[0][0]
        return top_label

    def predict(self, xtest, k=None, distanceMetric=None):
        if k:
            self.k = k
        if distanceMetric:
            self.distanceMetric = distanceMetric

        predictions = []
        for testCase in xtest:
            neighbors = self.calculateNeighborsDistance(testCase)
            prediction = self.weighted_voting(neighbors)
            predictions.append(prediction)

        return predictions

***Accuracy Calculation.***

In [4]:
def print_prediction(actual, predictions):
    correct = 0
    for i in range(len(actual)):
        if actual[i] == predictions[i]:
            correct += 1
    return (correct / float(len(actual)))

***K-Fold Cross Validation.***

In [5]:
class kFoldCV:
    def crossValSplit(self, dataset, numFolds):
        dataSplit = list()
        dataCopy = list(dataset)
        foldSize = int(len(dataset) / numFolds)
        for x in range(numFolds):
            fold = list()
            while len(fold) < foldSize:
                index = randrange(len(dataCopy))
                fold.append(dataCopy.pop(index))
            dataSplit.append(fold)
        return dataSplit

    def kFCVEvaluate(self, dataset, numFolds, k, distanceMetrics):
        knn = knnclassifier()
        folds = self.crossValSplit(dataset, numFolds)
        
        custom_scores = {metric: [] for metric in distanceMetrics}
        sckit_scores = {metric: [] for metric in distanceMetrics}

        # Perform KNN evaluation for each distance metric
        for fold in folds:
            trainSet = list(folds)
            trainSet.remove(fold)
            trainSet = sum(trainSet, [])
            testSet = list()
            for row in fold:
                rowCopy = list(row)
                testSet.append(rowCopy)

            trainLabels = [row[-1] for row in trainSet]
            trainSet = [train[:-1] for train in trainSet]
            actual = [row[-1] for row in testSet]
            testSet = [test[:-1] for test in testSet]

            for metric in distanceMetrics:
                knn.fit(trainSet, trainLabels)
                predicted = knn.predict(testSet, k, metric)
                accuracy = print_prediction(actual, predicted)
                custom_scores[metric].append(accuracy)

                # Scikit-learn KNN evaluation for each metric
                knn_classifier = NearestNeighbors(n_neighbors=k, metric=metric)
                knn_classifier.fit(trainSet, trainLabels)
                y_pred = knn_classifier.predict(testSet)
                sklearn_accuracy = accuracy_score(actual, y_pred)
                sckit_scores[metric].append(sklearn_accuracy)

        return custom_scores, sckit_scores


***Reading Data Files.***

In [6]:
def readData(fileName):
    data = []
    labels = []

    with open(fileName, "r") as file:
        lines = file.readlines()
    for line in lines:
        splitline = line.strip().split(',')
        data.append(splitline)
        labels.append(splitline[-1])
    return data, labels

***Hayes Roth Data Set***

In [7]:
trainFile = './hayes+roth/hayes-roth.data'
trainData, trainLabel = readData(trainFile)
trainFeatures = []
for row in trainData:
    index = row[1:]
    temp = [int(item) for item in index]
    trainFeatures.append(temp)

trainLabels = [int(label) for label in trainLabel]

***Initializing KFold Cross Validation instance.***

In [8]:
kfcv = kFoldCV()

***Calculating Both Custom and Sckit Learn KNN Accuracy.***

In [9]:
distance_metrics = ['euclidean', 'hamming', 'manhattan']
hayes_custom_scores, hayes_sckit_scores = kfcv.kFCVEvaluate(trainFeatures, 10, 3, distance_metrics)

for metric in distance_metrics:
    # Calculate mean accuracies
    mean_custom_knn = sum(hayes_custom_scores[metric]) / len(hayes_custom_scores[metric])
    mean_sckit_knn = sum(hayes_sckit_scores[metric]) / len(hayes_sckit_scores[metric])

    print(f"\n{metric.capitalize()}:")
    print("Custom KNN values:")
    print([f"{score:}" for score in hayes_custom_scores[metric]])

    print("Scikit-learn KNN values:")
    print([f"{score:}" for score in hayes_sckit_scores[metric]])


    print(f"Mean Accuracy of Custom KNN: {mean_custom_knn * 100:}%")
    print(f"Mean Accuracy of Scikit-learn KNN: {mean_sckit_knn * 100:}%")




Euclidean:
Custom KNN values:
['0.46153846153846156', '0.46153846153846156', '0.8461538461538461', '0.8461538461538461', '0.7692307692307693', '0.3076923076923077', '0.46153846153846156', '0.5384615384615384', '0.46153846153846156', '0.7692307692307693']
Scikit-learn KNN values:
['0.6923076923076923', '0.6153846153846154', '0.6923076923076923', '0.6923076923076923', '0.6923076923076923', '0.8461538461538461', '0.8461538461538461', '0.5384615384615384', '0.6923076923076923', '0.38461538461538464']
Mean Accuracy of Custom KNN: 59.23076923076923%
Mean Accuracy of Scikit-learn KNN: 66.92307692307693%

Hamming:
Custom KNN values:
['0.5384615384615384', '0.7692307692307693', '0.7692307692307693', '0.6153846153846154', '0.7692307692307693', '0.6923076923076923', '0.6153846153846154', '0.46153846153846156', '0.38461538461538464', '0.46153846153846156']
Scikit-learn KNN values:
['0.6923076923076923', '0.6923076923076923', '0.6923076923076923', '0.6153846153846154', '0.6153846153846154', '0.692

***Paired Sample Hypothesis test Calculation***

In [10]:
for metric in distance_metrics:
    print(f"\n{metric.capitalize()}:")
    t_statistic, p_value = ttest_rel(hayes_custom_scores[metric], hayes_sckit_scores[metric])
    print(f'T-statistic: {t_statistic:.4f}')
    print(f'P-value: {p_value:.4f}')
    if p_value > 0.05:
      print("Fail to reject the null hypothesis (No significant difference between the models).")
    else:
      print("Reject the null hypothesis (Significant difference between the models).")        
    print("--------------------------------------------------------------------------------------")


Euclidean:
T-statistic: -0.8660
P-value: 0.4090
Fail to reject the null hypothesis (No significant difference between the models).
--------------------------------------------------------------------------------------

Hamming:
T-statistic: 0.2873
P-value: 0.7804
Fail to reject the null hypothesis (No significant difference between the models).
--------------------------------------------------------------------------------------

Manhattan:
T-statistic: 2.3333
P-value: 0.0445
Reject the null hypothesis (Significant difference between the models).
--------------------------------------------------------------------------------------


***Car Evaluation Data File***

In [11]:
carFile = './car+evaluation/car.data'
carData, carLabel = readData(carFile)
df = pd.DataFrame(carData)
df = df.apply(preprocessing.LabelEncoder().fit_transform)
carFeatures = df.values.tolist()
carLabels = [car[-1] for car in carFeatures]

***Car Evaluation Custom and Sckit Learn KNN Classification Accuracy***

In [12]:
distance_metrics = ['euclidean', 'hamming', 'manhattan']
car_custom_scores, car_sckit_scores = kfcv.kFCVEvaluate(trainFeatures, 10, 3, distance_metrics)

for metric in distance_metrics:
    # Calculate mean accuracies
    mean_custom_knn = sum(car_custom_scores[metric]) / len(car_custom_scores[metric])
    mean_sckit_knn = sum(car_sckit_scores[metric]) / len(car_sckit_scores[metric])
    
    print(f"\n{metric.capitalize()}:")
    print("Custom KNN values:")
    print([f"{score:}" for score in car_custom_scores[metric]])

    print("Scikit-learn KNN values:")
    print([f"{score:}" for score in car_sckit_scores[metric]])

    print(f"Mean Accuracy of Custom KNN: {mean_custom_knn * 100:}%")
    print(f"Mean Accuracy of Scikit-learn KNN: {mean_sckit_knn * 100:}%")


Euclidean:
Custom KNN values:
['0.5384615384615384', '0.5384615384615384', '0.3076923076923077', '0.46153846153846156', '0.6153846153846154', '0.38461538461538464', '0.6923076923076923', '0.6153846153846154', '0.7692307692307693', '0.38461538461538464']
Scikit-learn KNN values:
['0.6923076923076923', '0.6153846153846154', '0.6153846153846154', '0.38461538461538464', '0.7692307692307693', '0.6153846153846154', '0.6153846153846154', '0.7692307692307693', '0.5384615384615384', '0.7692307692307693']
Mean Accuracy of Custom KNN: 53.07692307692308%
Mean Accuracy of Scikit-learn KNN: 63.846153846153854%

Hamming:
Custom KNN values:
['0.6923076923076923', '0.5384615384615384', '0.5384615384615384', '0.3076923076923077', '0.6153846153846154', '0.38461538461538464', '0.46153846153846156', '0.6153846153846154', '0.8461538461538461', '0.7692307692307693']
Scikit-learn KNN values:
['0.6923076923076923', '0.5384615384615384', '0.6153846153846154', '0.3076923076923077', '0.6153846153846154', '0.5384

***Paired Sample Hypothesis Test Calculation***

In [13]:
for metric in distance_metrics:
    print(f"\n{metric.capitalize()}:")
    t_statistic, p_value = ttest_rel(car_custom_scores[metric], car_sckit_scores[metric])
    print(f'T-statistic: {t_statistic:.4f}')
    print(f'P-value: {p_value:.4f}')
    if p_value > 0.05:
      print("Fail to reject the null hypothesis (No significant difference between the models).")
    else:
      print("Reject the null hypothesis (Significant difference between the models).")        
    print("--------------------------------------------------------------------------------------")


Euclidean:
T-statistic: -1.8007
P-value: 0.1053
Fail to reject the null hypothesis (No significant difference between the models).
--------------------------------------------------------------------------------------

Hamming:
T-statistic: 1.0324
P-value: 0.3288
Fail to reject the null hypothesis (No significant difference between the models).
--------------------------------------------------------------------------------------

Manhattan:
T-statistic: 1.2999
P-value: 0.2259
Fail to reject the null hypothesis (No significant difference between the models).
--------------------------------------------------------------------------------------


***Breast Cancer DataSet***

In [14]:
cancerFile = './breast+cancer/breast-cancer.data'
cancerData, cancerLabel = readData(cancerFile)
cdf = pd.DataFrame(cancerData)
cdf = cdf.apply(preprocessing.LabelEncoder().fit_transform)
cdf.dropna(inplace=True)
cancerFeatures = cdf.values.tolist()
cancerLabels = [cancer[-1] for cancer in cancerFeatures]

***Calculation of Custom and Sckit KNN Algorithm Accuracy***

In [15]:
distance_metrics = ['euclidean', 'hamming', 'manhattan']
cancer_custom_scores, cancer_sckit_scores = kfcv.kFCVEvaluate(trainFeatures, 10, 3, distance_metrics)

for metric in distance_metrics:
    # Calculate mean accuracies
    mean_custom_knn = sum(cancer_custom_scores[metric]) / len(cancer_custom_scores[metric])
    mean_sckit_knn = sum(cancer_sckit_scores[metric]) / len(cancer_sckit_scores[metric])
    
    print(f"\n{metric.capitalize()}:")
    print("Custom KNN values:")
    print([f"{score:}" for score in cancer_custom_scores[metric]])

    print("Scikit-learn KNN values:")
    print([f"{score:}" for score in cancer_sckit_scores[metric]])

    print(f"Mean Accuracy of Custom KNN: {mean_custom_knn * 100:}%")
    print(f"Mean Accuracy of Scikit-learn KNN: {mean_sckit_knn * 100:}%")



Euclidean:
Custom KNN values:
['0.6923076923076923', '0.6153846153846154', '0.46153846153846156', '0.5384615384615384', '0.38461538461538464', '0.3076923076923077', '0.6153846153846154', '0.46153846153846156', '0.38461538461538464', '0.6153846153846154']
Scikit-learn KNN values:
['0.7692307692307693', '0.7692307692307693', '0.6153846153846154', '0.8461538461538461', '0.6923076923076923', '0.6153846153846154', '0.6153846153846154', '0.6153846153846154', '0.6153846153846154', '0.8461538461538461']
Mean Accuracy of Custom KNN: 50.76923076923077%
Mean Accuracy of Scikit-learn KNN: 70.0%

Hamming:
Custom KNN values:
['0.6153846153846154', '0.8461538461538461', '0.6153846153846154', '0.8461538461538461', '0.5384615384615384', '0.46153846153846156', '0.6153846153846154', '0.6153846153846154', '0.6153846153846154', '0.6153846153846154']
Scikit-learn KNN values:
['0.7692307692307693', '0.7692307692307693', '0.5384615384615384', '0.6923076923076923', '0.5384615384615384', '0.6153846153846154', 

***Paired Sample Hypothesis test calculation.***

In [16]:
for metric in distance_metrics:
    print(f"\n{metric.capitalize()}:")
    t_statistic, p_value = ttest_rel(cancer_custom_scores[metric], cancer_sckit_scores[metric])
    print(f'T-statistic: {t_statistic:.4f}')
    print(f'P-value: {p_value:.4f}')
    if p_value > 0.05:
      print("Fail to reject the null hypothesis (No significant difference between the models).")
    else:
      print("Reject the null hypothesis (Significant difference between the models).")        
    print("--------------------------------------------------------------------------------------")


Euclidean:
T-statistic: -5.8387
P-value: 0.0002
Reject the null hypothesis (Significant difference between the models).
--------------------------------------------------------------------------------------

Hamming:
T-statistic: 1.0476
P-value: 0.3221
Fail to reject the null hypothesis (No significant difference between the models).
--------------------------------------------------------------------------------------

Manhattan:
T-statistic: 1.0000
P-value: 0.3434
Fail to reject the null hypothesis (No significant difference between the models).
--------------------------------------------------------------------------------------
