In [5]:
import pandas as pd
import heapq
import math
import random

In [6]:
def split_sets(data_list, training_ratio, seed=None):
    """Randomly partition a dataset into a training set and a test set.

    The instances are copied into a new list and that copy is shuffled, so
    ``data_list`` itself is never reordered.

    Args:
        data_list: Iterable of instances (rows) to split.
        training_ratio: Fraction of the instances, in ``[0, 1]``, that goes
            into the training set. The training-set size is
            ``int(len(data) * training_ratio)``, i.e. rounded down.
        seed: Optional seed for a reproducible split. When given, a private
            ``random.Random(seed)`` does the shuffling and the module-level
            random state is left untouched. When ``None`` (the default) the
            module-level generator is used.

    Returns:
        A ``(training_set, test_set)`` tuple of lists. Together they contain
        every instance of ``data_list`` exactly once.

    Raises:
        ValueError: If ``training_ratio`` is outside ``[0, 1]``.
    """
    # A negative ratio would turn into a negative slice index and silently
    # produce a wrong partition, so reject out-of-range values up front.
    if not 0 <= training_ratio <= 1:
        raise ValueError(
            f"training_ratio must be between 0 and 1, got {training_ratio}"
        )

    shuffled = list(data_list)
    if seed is None:
        random.shuffle(shuffled)
    else:
        random.Random(seed).shuffle(shuffled)

    split_index = int(len(shuffled) * training_ratio)
    return shuffled[:split_index], shuffled[split_index:]

def compute_dist(instance1, instance2):
    """Return the Euclidean distance between two instances.

    The last element of ``instance1`` is left out of the calculation; only
    the first ``len(instance1) - 1`` positions of both instances are
    compared.

    Args:
        instance1: Indexable sequence of numeric values plus one trailing
            element that is ignored.
        instance2: Indexable sequence with at least as many leading numeric
            values as ``instance1``.

    Returns:
        The square root of the summed squared differences, as a float.
    """
    feature_count = len(instance1) - 1
    squared_gaps = [
        (instance1[pos] - instance2[pos]) ** 2 for pos in range(feature_count)
    ]

    # Accumulate left to right, starting from integer zero.
    total = 0
    for gap in squared_gaps:
        total += gap
    return math.sqrt(total)

def normalize_minmax(data_list):
    """Min-max scale every feature column of a dataset into ``[0, 1]``.

    All columns except the last one are treated as features; the last column
    is copied through unchanged. The input rows are not modified.

    Args:
        data_list: Sequence of rows. Each row is a sequence whose leading
            elements are numeric features. The number of feature columns is
            taken from the first row.

    Returns:
        A new list of lists with the same shape as ``data_list``. Each
        feature value ``v`` becomes ``(v - min) / (max - min)`` using that
        column's minimum and maximum. A column whose values are all equal is
        set to ``0.5`` everywhere. An empty input yields an empty list.
    """
    # Nothing to scale; also avoids indexing data_list[0] on an empty input.
    if len(data_list) == 0:
        return []

    # list(row) gives each output row its own mutable copy, even when the
    # input rows are tuples.
    normalized_data = [list(row) for row in data_list]
    num_features = len(data_list[0]) - 1

    for col in range(num_features):
        col_values = [row[col] for row in data_list]
        min_val = min(col_values)
        max_val = max(col_values)

        if max_val == min_val:
            # Constant column: the formula would divide by zero, so use the
            # midpoint of the target range instead.
            for row in normalized_data:
                row[col] = 0.5
        else:
            value_range = max_val - min_val
            for row, value in zip(normalized_data, col_values):
                row[col] = (value - min_val) / value_range

    return normalized_data

In [None]:
def knn(k=3, data_file="wdbc.data.mb.csv", training_ratio=0.7, seed=None):
    """Run k-nearest-neighbours on a CSV dataset and score each test instance.

    The file is read without a header row, the feature columns are min-max
    normalised, the rows are randomly split into training and test sets, and
    every test row is classified by majority vote among its ``k`` nearest
    training rows.

    NOTE(review): normalisation runs on the full dataset *before* the split,
    so the test rows contribute to the min/max used for scaling. Confirm
    this is intended before changing it.

    Args:
        k: Number of neighbours that vote. Must be at least 1.
        data_file: Path of the CSV file. The last column of each row is the
            class label; a label equal to ``1`` is counted as malignant and
            any other label as benign.
        training_ratio: Fraction of rows used for training (default 0.7).
        seed: Optional seed. When given, the module-level ``random``
            generator is seeded before the split so the run is repeatable.

    Returns:
        A list with one boolean per test instance: ``True`` when the
        predicted label (``1`` or ``-1``) equals the instance's label.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    # With k <= 0 there are no voters and the tie rule below would label
    # every instance as 1, so reject it explicitly.
    if k < 1:
        raise ValueError(f"k must be at least 1, got {k}")

    if seed is not None:
        random.seed(seed)

    dataframe = pd.read_csv(data_file, header=None)
    df_list = dataframe.values.tolist()

    normalized_df = normalize_minmax(df_list)
    training_set, test_set = split_sets(normalized_df, training_ratio)

    predictions = []
    for test in test_set:
        # nsmallest accepts any iterable, so no heap needs to be built first.
        # Ranking by a key also means equal distances are never resolved by
        # comparing the instance lists themselves.
        neighbors = heapq.nsmallest(
            k,
            training_set,
            key=lambda training_instance: compute_dist(test, training_instance),
        )

        malignant = sum(1 for neighbor in neighbors if neighbor[-1] == 1)
        benign = len(neighbors) - malignant

        # A tied vote is resolved in favour of label 1.
        classification = 1 if malignant >= benign else -1
        predictions.append(classification == test[-1])

    return predictions

# Score 3-nearest-neighbours on one random train/test split.
# `results` holds one boolean per test instance (True = correct prediction),
# so the mean of the list is the classification accuracy.
results = knn(k=3)
accuracy = sum(results) / len(results)
print(f"Accuracy: {accuracy:.2%}")

Accuracy: 96.74%


In [8]:
# Repeat the evaluation. knn() reshuffles the rows on every call, so this
# scores a fresh random train/test split rather than reusing the previous one.
results = knn(k=3)
accuracy = sum(results) / len(results)
print(f"Accuracy: {accuracy:.2%}")

Accuracy: 96.74%
