<a href="https://colab.research.google.com/github/Avi-47/diabetes-knn-classification/blob/main/KNN_implementation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"

columns = ["Pregnancies", "Glucose", "BloodPressure", "SkinThickness",
           "Insulin", "BMI", "DiabetesPedigreeFunction", "Age", "Outcome"]
df = pd.read_csv(url, names=columns)

df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [5]:
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import GridSearchCV, cross_validate
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

y = df["Outcome"]
X = df.drop(["Outcome"],axis=1)

X_scaled = StandardScaler().fit_transform(X)
X = pd.DataFrame(X_scaled, columns=X.columns)
knn_model = KNeighborsClassifier().fit(X, y)

# Evaluation
y_pred = knn_model.predict(X)
y_prob = knn_model.predict_proba(X)[:, 1]

print(classification_report(y, y_pred))
print("AUC:", roc_auc_score(y, y_prob))


              precision    recall  f1-score   support

           0       0.85      0.90      0.87       500
           1       0.79      0.70      0.74       268

    accuracy                           0.83       768
   macro avg       0.82      0.80      0.81       768
weighted avg       0.83      0.83      0.83       768

AUC: 0.9017686567164179


In [6]:
cv_results = cross_validate(knn_model, X, y, cv=5,
                            scoring=["accuracy", "f1", "roc_auc"])

In [7]:
print("CV Accuracy:", cv_results['test_accuracy'].mean())
print("CV F1:", cv_results['test_f1'].mean())
print("CV ROC AUC:", cv_results['test_roc_auc'].mean())

CV Accuracy: 0.733112638994992
CV F1: 0.5905780011534191
CV ROC AUC: 0.7805279524807827


In [8]:
from sklearn.pipeline import Pipeline

pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier())
])

cv_results = cross_validate(pipe, X, y, cv=5,
                            scoring=["accuracy", "f1", "roc_auc"])

print("CV Accuracy:", cv_results['test_accuracy'].mean())

CV Accuracy: 0.7357100415923945
