### Note: Ensure `wdbc.data.mb.csv` is in the same directory as this script, or pass the full path to the file via the `data_file` parameter of `knn`.

In [None]:
import pandas as pd
import heapq
import math
import random

In [None]:
def split_sets(data_list, training_ratio):
    """Randomly partition *data_list* into a training part and a test part.

    A shallow copy of the input is shuffled, so the caller's list keeps its
    original order. The first ``int(len(data_list) * training_ratio)``
    shuffled rows form the training set; the remaining rows form the test set.

    Args:
        data_list: List of rows to partition.
        training_ratio: Fraction of the rows that goes to the training set.

    Returns:
        A ``(training_set, test_set)`` tuple of lists.
    """
    shuffled_rows = data_list.copy()
    random.shuffle(shuffled_rows)

    cutoff = int(len(shuffled_rows) * training_ratio)
    return shuffled_rows[:cutoff], shuffled_rows[cutoff:]

def compute_dist(testX, trainingX):
    """Return the Euclidean distance between two rows, ignoring the last column.

    Every position of *testX* except the final one (the label column in this
    file's row layout) is compared with the same position of *trainingX*.

    Args:
        testX: Row whose final element is excluded from the distance.
        trainingX: Row compared against; must be at least as long as the
            compared part of *testX*.

    Returns:
        The square root of the sum of squared per-position differences.
    """
    squared_sum = 0
    # An explicit accumulation loop is kept on purpose: it adds the terms
    # strictly left to right, one plain addition at a time.
    for position, test_value in enumerate(testX[:-1]):
        delta = test_value - trainingX[position]
        squared_sum += delta ** 2
    return math.sqrt(squared_sum)

def normalize_minmax(data_list):  # min-max scaling
    """Return a copy of *data_list* with every feature column scaled to [0, 1].

    Each column except the last (the label column) is rescaled with
    ``(value - min) / (max - min)``, where min and max are taken over all rows
    of that column. A constant column cannot be scaled that way, so every
    value in it is set to 0.5 instead. The input rows are not modified.

    Args:
        data_list: List of rows (lists); the last element of each row is the
            label and is copied through unchanged.

    Returns:
        A new list of new row lists holding the scaled values. An empty input
        yields an empty list.
    """
    normalized_data = [row[:] for row in data_list]
    if not normalized_data:
        # Nothing to scale; also avoids indexing data_list[0] below.
        return normalized_data

    for column in range(len(data_list[0]) - 1):  # last column is the label
        col_values = [row[column] for row in data_list]
        minimum = min(col_values)
        maximum = max(col_values)

        if maximum == minimum:
            # Constant column: avoid dividing by zero and use the midpoint.
            for scaled_row in normalized_data:
                scaled_row[column] = 0.5
        else:
            value_range = maximum - minimum
            for scaled_row, value in zip(normalized_data, col_values):
                scaled_row[column] = (value - minimum) / value_range

    return normalized_data
def print_confusion_matrix(knn_results):
    """Print a 2x2 confusion matrix and the accuracy for a list of predictions.

    Labels are expected to be ``1`` (positive) or ``-1`` (negative). A pair
    with any other label is not counted in the matrix but still counts toward
    the total used for accuracy.

    Args:
        knn_results: Sequence of ``(predicted, actual)`` label pairs.

    Returns:
        None. The matrix (rows are predictions, columns are actual labels) and
        the accuracy percentage are written to standard output. With no
        results the accuracy is reported as 0.00% instead of dividing by zero.
    """
    true_positives = true_negatives = false_positives = false_negatives = 0

    for pred, actual in knn_results:
        if pred == 1 and actual == 1:
            true_positives += 1
        elif pred == -1 and actual == -1:
            true_negatives += 1
        elif pred == 1 and actual == -1:
            false_positives += 1
        elif pred == -1 and actual == 1:
            false_negatives += 1

    total = len(knn_results)
    # Guard the empty case: there is no meaningful ratio without any results.
    accuracy = (true_positives + true_negatives) / total * 100 if total else 0.0

    print("\nConfusion Matrix:\n          Actual\n          Neg  Pos")
    print("Pred Neg   {}    {}".format(true_negatives, false_negatives))
    print("     Pos   {}    {}".format(false_positives, true_positives))
    print("\nAccuracy = {:.2f}%".format(accuracy))

In [None]:
def knn(k=3, data_file="wdbc.data.mb.csv", training_ratio=0.7):
    """Run k-nearest-neighbour classification on a random split of a CSV file.

    The file is read without a header row, min-max normalized with
    ``normalize_minmax``, and randomly divided by ``split_sets``. Each test
    row is then labelled by a majority vote among its ``k`` closest training
    rows, with distance given by ``compute_dist``. The last column of every
    row is the class label: ``1`` counts as a malignant vote, anything else as
    benign. A tied vote is resolved in favour of class ``1``.

    NOTE: normalization runs on the full data set before the split, so the
    per-column minimum and maximum also reflect the test rows.

    Args:
        k: Number of neighbours that vote; must be at least 1.
        data_file: Path to the header-less CSV file.
        training_ratio: Fraction of the rows used as the training set.

    Returns:
        A list of ``(predicted_label, actual_label)`` tuples, one per test
        row, where the predicted label is ``1`` or ``-1``.

    Raises:
        ValueError: If ``k`` is smaller than 1. With no voters every row
            would otherwise be labelled ``1`` by the tie rule.
    """
    if k < 1:
        raise ValueError("k must be at least 1, got {}".format(k))

    dataframe = pd.read_csv(data_file, header=None)
    normalized_rows = normalize_minmax(dataframe.values.tolist())
    training_set, test_set = split_sets(normalized_rows, training_ratio)

    predictions = []
    for test in test_set:
        # nsmallest accepts any iterable, so there is no need to build a heap
        # of every distance first.
        neighbors = heapq.nsmallest(
            k,
            ((compute_dist(test, row), row) for row in training_set),
        )

        malignant = sum(1 for _, row in neighbors if row[-1] == 1)
        benign = len(neighbors) - malignant

        classification = 1 if malignant >= benign else -1
        predictions.append((classification, test[-1]))

    return predictions

In [None]:
# Evaluate the classifier for every odd neighbour count from 1 through 9.
# Each run re-reads the file and draws a fresh random split.
for k in (1, 3, 5, 7, 9):
    print("\nK = {}".format(k))
    knn_results = knn(k)
    print_confusion_matrix(knn_results)
