In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score
from math import sqrt


In [2]:
# Load from online link
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
columns = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
           'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome']

data = pd.read_csv(url, names=columns)
print(data.head())


   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  


In [3]:
# Replace 0s with median values in important columns
cols_with_zero = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

for col in cols_with_zero:
    data[col] = data[col].replace(0, np.nan)
    data[col] = data[col].fillna(data[col].median()) 

# Normalize the data (values between 0 and 1)
scaler = MinMaxScaler()
X = scaler.fit_transform(data.drop('Outcome', axis=1))
y = data['Outcome'].values


In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)


In [5]:
def euclidean_distance(a, b):
    return sqrt(np.sum((a - b) ** 2))

def manhattan_distance(a, b):
    return np.sum(np.abs(a - b))

def knn_predict(X_train, y_train, x_test, k=3, distance_type='euclidean'):
    distances = []
    for i in range(len(X_train)):
        if distance_type == 'euclidean':
            dist = euclidean_distance(X_train[i], x_test)
        elif distance_type == 'manhattan':
            dist = manhattan_distance(X_train[i], x_test)
        distances.append((dist, y_train[i]))
    
    distances.sort(key=lambda x: x[0])  # Sort by distance
    k_nearest = [label for (_, label) in distances[:k]]  # Get top k labels
    prediction = max(set(k_nearest), key=k_nearest.count)  # Majority vote
    return prediction


In [6]:
def test_knn_model(k, distance_type='euclidean'):
    predictions = []
    for i in range(len(X_test)):
        pred = knn_predict(X_train, y_train, X_test[i], k=k, distance_type=distance_type)
        predictions.append(pred)
    return accuracy_score(y_test, predictions)

# Try different k values
for k in range(1, 11):
    acc_euclidean = test_knn_model(k, distance_type='euclidean')
    acc_manhattan = test_knn_model(k, distance_type='manhattan')
    print(f"K = {k} | Euclidean Accuracy = {acc_euclidean:.3f} | Manhattan Accuracy = {acc_manhattan:.3f}")


K = 1 | Euclidean Accuracy = 0.688 | Manhattan Accuracy = 0.714
K = 2 | Euclidean Accuracy = 0.773 | Manhattan Accuracy = 0.753
K = 3 | Euclidean Accuracy = 0.786 | Manhattan Accuracy = 0.753
K = 4 | Euclidean Accuracy = 0.786 | Manhattan Accuracy = 0.766
K = 5 | Euclidean Accuracy = 0.779 | Manhattan Accuracy = 0.766
K = 6 | Euclidean Accuracy = 0.792 | Manhattan Accuracy = 0.779
K = 7 | Euclidean Accuracy = 0.766 | Manhattan Accuracy = 0.818
K = 8 | Euclidean Accuracy = 0.792 | Manhattan Accuracy = 0.799
K = 9 | Euclidean Accuracy = 0.792 | Manhattan Accuracy = 0.812
K = 10 | Euclidean Accuracy = 0.831 | Manhattan Accuracy = 0.825
