In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt

In [None]:
# Pima Indians Diabetes dataset, fetched as a raw CSV from GitHub.
# The file name is hyphenated as "pima-indians-diabetes"; the previous value
# was missing the hyphen between "indians" and "diabetes".
url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv'

# Column names are supplied by hand because the file is read with header=None.
columns = [
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
    'Age',
    'Outcome',
]


In [None]:
# Read the CSV at `url` into a DataFrame; header=None tells pandas the first
# row is data, and the labels in `columns` are used as the column names.
data = pd.read_csv(
    url,
    names=columns,
    header=None,
)


In [None]:
# Treat 0 as "missing" only in the measurement columns where a zero reading is
# not a plausible value. A frame-wide replace would also wipe legitimate zeros:
# 'Pregnancies' == 0 is a real count, and 'Outcome' == 0 is the negative class.
# Replacing those and mean-filling turns every label into 1.
zero_as_missing = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

# Work on a copy of the affected columns and assign the result back, rather
# than mutating with inplace=True; re-running this cell gives the same result.
measurements = data[zero_as_missing].replace(0, np.nan)
data[zero_as_missing] = measurements.fillna(measurements.mean())


In [None]:
# Target vector: the 'Outcome' column.
y = data['Outcome']
# Feature matrix: every column except the target.
X = data.drop(columns='Outcome')

In [None]:
# Split BEFORE scaling so the test rows never influence the scaler's
# mean/variance. Same test_size and random_state as before, so the same rows
# land in the train and test sets.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only, then apply that same
# transformation to the held-out rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Kept so any later code that reads X_scaled still works; it is the full
# feature matrix transformed with the train-fitted scaler.
X_scaled = scaler.transform(X)

In [None]:
def optimal_k(X_train, X_test, y_train, y_test, max_k=20):
    """Find the K (1..max_k) whose KNN classifier scores best on the held-out set.

    For each K, a KNeighborsClassifier is fitted on (X_train, y_train) and
    scored on (X_test, y_test). The accuracy-vs-K curve is plotted, and the
    best K is returned together with its accuracy.

    Parameters
    ----------
    X_train, y_train : training features and labels passed to ``fit``.
    X_test, y_test : held-out features and labels passed to ``score``.
    max_k : int, default 20
        Largest number of neighbours to try (inclusive).

    Returns
    -------
    (best_k, best_accuracy) : tuple
        When several K values tie for the best accuracy, the smallest K wins.

    Raises
    ------
    ValueError
        If ``max_k`` is smaller than 1.

    NOTE(review): K is chosen on the same held-out data that is later used to
    report performance, so the reported accuracy is optimistic. Consider
    passing a separate validation split here.
    """
    if max_k < 1:
        raise ValueError("max_k must be at least 1")

    k_values = list(range(1, max_k + 1))
    accuracies = [
        KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train).score(X_test, y_test)
        for k in k_values
    ]

    plt.plot(k_values, accuracies, marker='o')
    plt.xlabel('Value of K')
    plt.ylabel('Accuracy')
    plt.title('Accuracy vs K')
    plt.show()

    # list.index returns the first match, so ties resolve to the smallest K.
    best_accuracy = max(accuracies)
    best_k = k_values[accuracies.index(best_accuracy)]
    return best_k, best_accuracy

In [None]:
# Run the K search on the current split and report the winning K with its
# accuracy on the held-out set.
optimal_k_value, max_accuracy = optimal_k(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)
print(f"Optimal value of K: {optimal_k_value} with accuracy: {max_accuracy}")

In [None]:
# Build the final classifier with the selected K and fit it on the training
# split. The fit call stays as the cell's last expression, so the fitted
# estimator is still echoed as the cell output.
knn_optimal = KNeighborsClassifier(n_neighbors=optimal_k_value)
knn_optimal.fit(X=X_train, y=y_train)

In [None]:
# Predict on the held-out rows, then print the confusion matrix and the
# per-class precision/recall/F1 report, each under its own heading.
y_pred = knn_optimal.predict(X_test)
print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")
print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")