In [None]:
# ===============================
# 1. IMPORT LIBRARIES
# ===============================
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib




In [None]:
# ===============================
# 2. LOAD DATA
# ===============================
# Location of the raw spreadsheet. NOTE(review): "/content" looks like a Google
# Colab upload directory -- change the path here when running elsewhere.
DATA_PATH = "/content/Diabetes_dataset.xlsx"

# pandas is already imported as `pd` in the imports cell at the top of the
# notebook, so it is not imported again here.
df = pd.read_excel(DATA_PATH)

# Show the first rows as a quick sanity check of the columns that were read.
print("Initial Dataset:")
display(df.head())



Initial Dataset:


Unnamed: 0,age,BMI,glucose,physical_activity,diabetes
0,45,32.1,165.0,3,yes
1,50,28.4,178.0,2,no
2,29,22.7,130.0,5,no
3,61,35.2,195.0,1,yes
4,48,31.4,0.0,2,yes


In [None]:
# ===============================
# 3. EDA
# ===============================
# Number of missing entries in each column.
print("\nMissing Values:")
null_counts = df.isna().sum()
print(null_counts)

# Output of DataFrame.describe(), rendered as a rich table.
print("\nSummary Statistics:")
summary_table = df.describe()
display(summary_table)

# Number of rows for each value of the target column.
print("\nClass Balance:")
label_counts = df['diabetes'].value_counts()
print(label_counts)




Missing Values:
age                  0
BMI                  0
glucose              1
physical_activity    0
diabetes             0
dtype: int64

Summary Statistics:


Unnamed: 0,age,BMI,glucose,physical_activity
count,20.0,20.0,19.0,20.0
mean,47.8,30.245,160.473684,2.5
std,12.898796,4.996996,49.995965,1.432701
min,23.0,21.8,0.0,1.0
25%,40.5,27.125,144.0,1.0
50%,47.5,29.85,160.0,2.0
75%,57.5,34.075,192.5,3.0
max,70.0,38.9,230.0,6.0



Class Balance:
diabetes
yes    10
no     10
Name: count, dtype: int64


In [None]:
# ===============================
# 4. DATA CLEANING
# ===============================

# Glucose readings below this value (including the 0.0 placeholder present in
# the data) are treated as invalid, i.e. as missing.
MIN_VALID_GLUCOSE = 40

# Turn invalid readings into NaN *before* computing the median, so the value
# used for imputation is derived from valid readings only. Missing and invalid
# entries are then filled in a single step.
valid_glucose = df['glucose'].mask(df['glucose'] < MIN_VALID_GLUCOSE)
df['glucose'] = valid_glucose.fillna(valid_glucose.median())

# Encode target variable (no -> 0, yes -> 1).
# The guard keeps the cell idempotent: mapping an already-encoded column a
# second time would turn every label into NaN.
target_codes = {'no': 0, 'yes': 1}
if not df['diabetes'].isin(list(target_codes.values())).all():
    df['diabetes'] = df['diabetes'].map(target_codes)
if df['diabetes'].isna().any():
    raise ValueError("Unexpected label in 'diabetes' column; expected only 'yes' or 'no'.")

# Features and target.
# .copy() gives X its own data, so assigning the scaled column below does not
# raise pandas' SettingWithCopyWarning and never touches df.
X = df[['age', 'BMI', 'glucose', 'physical_activity']].copy()
y = df['diabetes']

# Scale BMI only (example of selective scaling).
# NOTE(review): the scaler is fit on all rows, before the train/test split, so
# test-set BMI values influence the scaling statistics. Consider fitting it on
# the training rows only (for example inside a Pipeline/ColumnTransformer).
scaler = StandardScaler()
# fit_transform returns an (n_rows, 1) array; flatten it to a single column.
X['BMI'] = scaler.fit_transform(X[['BMI']]).ravel()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['BMI'] = scaler.fit_transform(X[['BMI']])


In [None]:
# ===============================
# 5. TRAIN/TEST SPLIT
# ===============================
# Hold out 25% of the rows for evaluation, with a fixed seed for repeatability.
# stratify=y keeps the yes/no proportions as close as possible in the train and
# test partitions, which matters when the test set is only a handful of rows.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)



In [None]:
# ===============================
# 6. TRAIN 3 MODELS FOR COMPARISON
# ===============================

# Seed for the tree-based estimators. Both use randomness while fitting, so
# without a fixed random_state their scores change from one run to the next.
# 42 is the same seed value the train/test split uses.
RANDOM_STATE = 42

# Logistic Regression
log_model = LogisticRegression()
log_model.fit(X_train, y_train)

# Decision Tree
tree_model = DecisionTreeClassifier(max_depth=4, random_state=RANDOM_STATE)
tree_model.fit(X_train, y_train)

# Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE)
rf_model.fit(X_train, y_train)



In [None]:
# ===============================
# 7. EVALUATION
# ===============================
# Display name -> fitted estimator, in the order the reports are printed.
models = dict([
    ("Logistic Regression", log_model),
    ("Decision Tree", tree_model),
    ("Random Forest", rf_model),
])

# For each estimator: predict on the held-out rows, then print the accuracy
# followed by the per-class precision/recall/F1 report.
for label, estimator in models.items():
    y_pred = estimator.predict(X_test)
    score = accuracy_score(y_test, y_pred)
    print(f"\n{label} Accuracy: {score:.3f}")
    print(classification_report(y_test, y_pred))

# ===============================
# 8. EXPORT MODEL (Mini Deployment)
# ===============================
# Persist the Random Forest together with the BMI scaler.
# NOTE(review): rf_model is exported unconditionally, whatever the scores
# printed above were -- confirm this is the model intended for deployment.
export_targets = (
    (rf_model, "diabetes_model.pkl"),
    (scaler, "bmi_scaler.pkl"),
)
for artifact, destination in export_targets:
    joblib.dump(artifact, destination)

print("\nModel exported as diabetes_model.pkl")
print("Scaler exported as bmi_scaler.pkl")


Logistic Regression Accuracy: 0.800
              precision    recall  f1-score   support

           0       1.00      0.50      0.67         2
           1       0.75      1.00      0.86         3

    accuracy                           0.80         5
   macro avg       0.88      0.75      0.76         5
weighted avg       0.85      0.80      0.78         5


Decision Tree Accuracy: 1.000
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         2
           1       1.00      1.00      1.00         3

    accuracy                           1.00         5
   macro avg       1.00      1.00      1.00         5
weighted avg       1.00      1.00      1.00         5


Random Forest Accuracy: 0.600
              precision    recall  f1-score   support

           0       0.50      0.50      0.50         2
           1       0.67      0.67      0.67         3

    accuracy                           0.60         5
   macro avg       0.58      