
# 🩺 Diabetes Prediction Project using K-Nearest Neighbors (KNN)
This project predicts whether a person has diabetes using the **Pima Indians Diabetes dataset**. We'll use the KNN algorithm and go through:
- Data cleaning
- Exploratory data analysis (EDA)
- Feature scaling
- Model training and tuning
- Evaluation with metrics
- Model export


In [None]:

import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, classification_report
import joblib

# Apply seaborn's white-grid theme to every figure drawn below.
sns.set(style="whitegrid")


## 🔍 Load and Inspect the Data

In [None]:

# Read the diabetes CSV from the working directory into a DataFrame.
data = pd.read_csv(filepath_or_buffer='diabetes.csv')
# Preview the first five rows to confirm the file was parsed as expected.
data.head(n=5)


## 🧹 Data Cleaning

In [None]:

# In these measurement columns a literal 0 is treated as "value not recorded",
# so convert it to NaN and let the summary statistics ignore it.
cols_with_zeros = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
zeros_as_nan = data[cols_with_zeros].replace(to_replace=0, value=np.nan)
data[cols_with_zeros] = zeros_as_nan
# Summary statistics after the replacement (NaNs are excluded from the counts).
data.describe()


In [None]:

# Impute each NaN with the median of its own column.
# NOTE(review): the medians are computed over all rows, including rows that
# are later placed in the test split.
column_medians = data[cols_with_zeros].median()
data[cols_with_zeros] = data[cols_with_zeros].fillna(column_medians)


## 📊 Exploratory Data Analysis

In [None]:

# Bar chart of how many rows fall into each Outcome class.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=data, x='Outcome', ax=ax)
ax.set_title('Class Distribution')
plt.show()


## ⚙️ Feature Scaling

In [None]:

# Target is the Outcome column; every other column is a feature.
y = data['Outcome']
X = data.drop(columns='Outcome')

# Standardize the features and wrap the resulting array back into a DataFrame
# so the original column names are kept.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(X)
X_scaled = pd.DataFrame(scaled_values, columns=X.columns)


In [None]:

# Split the raw (unscaled) features so the scaler never sees the test rows.
# A scaler fit on the full dataset carries test-set mean/std into the training
# features, which makes the held-out metrics optimistic (data leakage).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Refit the scaler on the training rows only (fit discards any previously
# learned statistics), then apply that same transform to both splits.
# The original row index is preserved so features stay aligned with y_train / y_test.
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X.columns, index=X_train.index)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X.columns, index=X_test.index)


## 🤖 Train KNN and Tune K

In [None]:

# Candidate neighbour counts to compare: k = 1 .. 14.
k_values = list(range(1, 15))
train_scores, test_scores = [], []

# Fit one classifier per k and record its accuracy on both splits.
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    train_scores.append(knn.score(X_train, y_train))
    test_scores.append(knn.score(X_test, y_test))

# Plot accuracy
fig, ax = plt.subplots(figsize=(10, 6))
sns.lineplot(x=k_values, y=train_scores, marker='*', label='Train Accuracy', ax=ax)
sns.lineplot(x=k_values, y=test_scores, marker='o', label='Test Accuracy', ax=ax)
ax.set_xlabel("K Value")
ax.set_ylabel("Accuracy")
ax.set_title("Train vs Test Accuracy for Different K Values")
ax.legend()
plt.show()


## 📈 Evaluate Best Model

In [None]:

# Choose k by 5-fold cross-validation on the training split only. Picking the
# k that maximizes test accuracy would tune the model on the very rows used to
# report its performance, making the metrics below optimistically biased.
cv_scores = {
    k: cross_val_score(KNeighborsClassifier(n_neighbors=k), X_train, y_train, cv=5).mean()
    for k in range(1, 15)
}
# max() returns the first maximal key, so ties resolve to the smallest k.
best_k = max(cv_scores, key=cv_scores.get)
print(f"Best k value: {best_k}")

# Refit on the full training split with the selected k, then score the
# held-out test split exactly once.
knn = KNeighborsClassifier(n_neighbors=best_k)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


## 💾 Save Model and Scaler

In [None]:

# joblib.dump does not create missing parent directories, so make sure the
# output folder exists first (no-op when it is already there).
os.makedirs('models', exist_ok=True)

# Persist the fitted classifier together with the scaler it was trained with;
# new inputs must go through the same scaler before prediction.
joblib.dump(knn, 'models/diabetes_knn_model.pkl')
joblib.dump(scaler, 'models/scaler.pkl')


## ✅ Conclusion
We built a clean KNN classifier that predicts diabetes with solid performance. The saved model and scaler are ready for deployment or integration into an app.