In [1]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from scipy.stats import mode, ttest_rel


class CustomKNN:
    @staticmethod
    def euclidean_distance(row1, row2):
        distance = 0.0
        for i in range(len(row1)):
            distance += (row1[i] - row2[i]) ** 2
        return distance ** 0.5

    @staticmethod
    def get_neighbors(train, test_row, num_neighbors):
        distances = []
        for train_row in train:
            dist = CustomKNN.euclidean_distance(test_row, train_row[0])
            distances.append((train_row, dist))
        distances.sort(key=lambda tup: tup[1])
        neighbors = [distances[i][0] for i in range(num_neighbors)]
        return neighbors

    @staticmethod
    def predict_classification(train, test_row, num_neighbors):
        neighbors = CustomKNN.get_neighbors(train, test_row, num_neighbors)
        output_values = [row[-1] for row in neighbors]
        prediction = mode(output_values)[0][0]
        return prediction

    @staticmethod
    def k_fold_cross_validation(X, y, num_folds, num_neighbors):
        kf = KFold(n_splits=num_folds, shuffle=True, random_state=1)
        scores = []
        for train_index, test_index in kf.split(X):
            train_X, test_X = [X[i] for i in train_index], [X[i] for i in test_index]
            train_y, test_y = [y[i] for i in train_index], [y[i] for i in test_index]
            train = list(zip(train_X, train_y))
            test = list(zip(test_X, test_y))
            predictions, actuals = [], []
            for row in test:
                prediction = CustomKNN.predict_classification(train, row[0], num_neighbors)
                predictions.append(prediction)
                actuals.append(row[1])
            accuracy = (sum(int(pred == actual) for pred, actual in zip(predictions, actuals)) / len(actuals)) * 100.0
            scores.append(accuracy)
        return scores


class DatasetLoader:
    def __init__(self, filepath, dataset_name):
        self.filepath = filepath
        self.dataset_name = dataset_name

    def load(self):
        if self.dataset_name == "breast-cancer":
            column_names = [
                "Class",
                "age",
                "menopause",
                "tumor-size",
                "inv-nodes",
                "node-caps",
                "deg-malig",
                "breast",
                "breast-quad",
                "irradiat",
            ]
        elif self.dataset_name == "car":
            column_names = [
                "buying",
                "maint",
                "doors",
                "persons",
                "lug_boot",
                "safety",
                "Class",
            ]
        elif self.dataset_name == "hayes-roth":
            column_names = [
                "name",
                "hobby",
                "age",
                "educational level",
                "marital status",
                "Class",
            ]
        else:
            raise ValueError("Unsupported dataset")

        data = []
        with open(self.filepath, 'r') as file:
            for line in file:
                row = line.strip().split(',')
                if len(row) == len(column_names):
                    data.append(row)
        return data


class Preprocessor:
    @staticmethod
    def preprocess(data):
        for i in range(len(data)):
            for j in range(len(data[i])):
                if data[i][j] == '?':
                    data[i][j] = Preprocessor.most_frequent(data, j)
        label_encoders = {}
        for i in range(len(data[0])):
            if isinstance(data[0][i], str):
                le = LabelEncoder()
                le.fit([row[i] for row in data])
                label_encoders[i] = le
        for row in data:
            for i in range(len(row)):
                if isinstance(row[i], str):
                    row[i] = label_encoders[i].transform([row[i]])[0]
                row[i] = float(row[i])
        return data

    @staticmethod
    def most_frequent(data, col):
        freq_map = {}
        for row in data:
            if row[col] not in freq_map:
                freq_map[row[col]] = 1
            else:
                freq_map[row[col]] += 1
        return max(freq_map, key=freq_map.get)


def main():
    datasets = {
        "breast-cancer": "dataset/breast-cancer.data",
        "car": "dataset/car.data",
        "hayes-roth": "dataset/hayes-roth.data",
    }

    summary_results = []

    for dataset_name, filepath in datasets.items():
        print(f"\nProcessing dataset: {dataset_name}")
        loader = DatasetLoader(filepath, dataset_name)
        data = loader.load()
        preprocessor = Preprocessor()
        data = preprocessor.preprocess(data)

        X = [row[:-1] for row in data]
        y = [row[-1] for row in data]

        num_folds = 10
        num_neighbors = 5
        custom_knn_scores = CustomKNN.k_fold_cross_validation(
            X, y, num_folds, num_neighbors
        )

        sklearn_knn = KNeighborsClassifier(n_neighbors=num_neighbors)
        sklearn_scores = cross_val_score(sklearn_knn, X, y, cv=num_folds) * 100

        print("Fold | Custom KNN Score (%) | Scikit-learn KNN Score (%)")
        print("-" * 57)
        for i, (custom_score, sklearn_score) in enumerate(
            zip(custom_knn_scores, sklearn_scores), 1
        ):
            print(f"{i:<4} | {custom_score:<20} | {sklearn_score:<25}")

        t_stat, p_value = ttest_rel(custom_knn_scores, sklearn_scores)
        print(f"\nT-Test Results Table for {dataset_name}:")
        print(f"{'Metric':<15}{'Value':<10}")
        print(f"{'-'*24}")
        print(f"{'t-statistic':<15}{t_stat:<10.3f}")
        print(f"{'p-value':<15}{p_value:<10.3f}")

        if p_value < 0.05:
            print(
                "There is a significant difference between the two classifiers' accuracies on this dataset."
            )
        else:
            print(
                "There is no significant difference between the two classifiers' accuracies on this dataset."
            )

        custom_mean_accuracy = sum(custom_knn_scores) / len(custom_knn_scores)
        sklearn_mean_accuracy = sum(sklearn_scores) / len(sklearn_scores)

        summary_results.append(
            (dataset_name, custom_mean_accuracy, sklearn_mean_accuracy)
        )

    print(
        "\nDataset Name | Custom KNN Mean Accuracy (%) | Scikit-learn KNN Mean Accuracy (%)"
    )
    print("-" * 78)
    for dataset_name, custom_mean, sklearn_mean in summary_results:
        print(
            f"{dataset_name:<13}| {custom_mean:<28.2f} | {sklearn_mean:<25.2f}"
        )

if __name__ == "__main__":
    main()



Processing dataset: breast-cancer
Fold | Custom KNN Score (%) | Scikit-learn KNN Score (%)
---------------------------------------------------------
1    | 79.3103448275862     | 79.3103448275862         
2    | 68.96551724137932    | 79.3103448275862         
3    | 72.41379310344827    | 72.41379310344827        

  prediction = mode(output_values)[0][0]
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)



4    | 79.3103448275862     | 79.3103448275862         
5    | 79.3103448275862     | 82.75862068965517        
6    | 65.51724137931035    | 68.96551724137932        
7    | 85.71428571428571    | 60.71428571428571        
8    | 57.14285714285714    | 92.85714285714286        
9    | 78.57142857142857    | 60.71428571428571        
10   | 89.28571428571429    | 60.71428571428571        

T-Test Results Table for breast-cancer:
Metric         Value     
------------------------
t-statistic    0.313     
p-value        0.761     
There is no significant difference between the two classifiers' accuracies on this dataset.

Processing dataset: car


  prediction = mode(output_values)[0][0]
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


Fold | Custom KNN Score (%) | Scikit-learn KNN Score (%)
---------------------------------------------------------
1    | 90.17341040462428    | 56.64739884393064        
2    | 83.8150289017341     | 68.78612716763006        
3    | 81.5028901734104     | 72.25433526011561        
4    | 87.28323699421965    | 70.52023121387283        
5    | 84.39306358381504    | 75.14450867052022        
6    | 87.28323699421965    | 70.52023121387283        
7    | 86.70520231213872    | 74.56647398843931        
8    | 88.4393063583815     | 79.1907514450867         
9    | 86.04651162790698    | 84.30232558139535        
10   | 82.55813953488372    | 86.04651162790698        

T-Test Results Table for car:
Metric         Value     
------------------------
t-statistic    3.832     
p-value        0.004     
There is a significant difference between the two classifiers' accuracies on this dataset.

Processing dataset: hayes-roth
Fold | Custom KNN Score (%) | Scikit-learn KNN Score (%)
-----------

  prediction = mode(output_values)[0][0]
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
