# Early-Stage Diabetes Risk Prediction

**Project objective:** Build a model to predict early-stage diabetes using simple clinical features. This notebook includes step-by-step teaching notes.


## Step 0 — Setup

Run the next cell to import required packages. If a package is missing in Colab, uncomment the pip install line.

In [None]:
# !pip install --quiet seaborn scikit-learn
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, roc_curve
from sklearn.inspection import permutation_importance
# Apply seaborn's 'whitegrid' style to the plots drawn later in the notebook.
sns.set(style='whitegrid')
# IPython magic (not plain Python): show matplotlib figures inline in the
# notebook output.
%matplotlib inline
# Confirmation message printed once the statements above have run.
print('Packages loaded')

## Step 1 — Load data

Place `diabetes_data.csv` in the same folder as this notebook and run the cell below. If using Colab, upload the file to the session first.

In [None]:
# Path of the raw CSV, relative to the notebook's working directory.
DATA_PATH = 'diabetes_data.csv'

# Read the whole file into a DataFrame and report its (rows, columns) shape.
df = pd.read_csv(filepath_or_buffer=DATA_PATH)
print('Rows,Cols:', df.shape)

# Last expression of the cell: the notebook displays the first five rows.
df.head(5)

## Step 2 — Inspect & Clean

Standardize column names and inspect unique values.

In [None]:
def _to_snake_case(name):
    """Trim *name*, lower-case it, and turn spaces and hyphens into underscores."""
    cleaned = name.strip().lower()
    for separator in (' ', '-'):
        cleaned = cleaned.replace(separator, '_')
    return cleaned


# Rename every column in place so later cells can rely on snake_case names.
cols = list(map(_to_snake_case, df.columns))
df.columns = cols
print('Columns:', cols)

# Per-column count of missing cells in the raw data.
print('\nMissing values:')
print(df.isnull().sum())

## Step 3 — Encode categorical variables

Map Yes/No to 1/0, Male/Female to 1/0, and class to 1/0.

In [None]:
# Lookup tables from raw category labels to 0/1 codes. The mixed-case keys are
# kept as they were; matching below is case-insensitive, so any capitalisation
# of these labels is recognised.
yes_no_map = {'yes':1, 'no':0, 'Yes':1, 'No':0, 'YES':1, 'NO':0}
gender_map = {'male':1, 'female':0, 'Male':1, 'Female':0, 'M':1, 'F':0}
class_map = {'positive':1, 'negative':0, 'Positive':1, 'Negative':0}


def _lowercase_keys(mapping):
    """Return a copy of *mapping* with every key lower-cased."""
    return {key.lower(): code for key, code in mapping.items()}


def _is_text_column(series):
    """Return True for object-dtype columns and for dedicated string dtypes.

    Checking only ``dtype == 'object'`` misses pandas' string dtype, which
    would leave text columns un-encoded.
    """
    return (
        pd.api.types.is_object_dtype(series)
        or pd.api.types.is_string_dtype(series)
    )


_yes_no_lookup = _lowercase_keys(yes_no_map)
_gender_lookup = _lowercase_keys(gender_map)
_class_lookup = _lowercase_keys(class_map)

df_clean = df.copy()
for col in df_clean.columns:
    if not _is_text_column(df_clean[col]):
        continue

    # Strip surrounding spaces; keep the stripped text for columns that end up
    # not being encoded (same as before).
    stripped = df_clean[col].str.strip()
    df_clean[col] = stripped

    # Compare case-insensitively so 'YES', 'Yes' and 'yes' are all recognised.
    lowered = stripped.str.lower()
    observed = set(lowered.dropna().unique())

    # Yes/No and gender columns are detected by their values; the target
    # column is detected by its name.
    if observed.intersection(_yes_no_lookup):
        lookup = _yes_no_lookup
    elif observed.intersection(_gender_lookup):
        lookup = _gender_lookup
    elif 'class' in col:
        lookup = _class_lookup
    else:
        continue

    encoded = lowered.map(lookup)

    # Values missing from the lookup turn into NaN; report them instead of
    # letting them vanish silently.
    unmapped = sorted(set(lowered[encoded.isna() & lowered.notna()]))
    if unmapped:
        print(f"Warning: column '{col}' has unmapped values {unmapped}; "
              "they are treated as missing")

    df_clean[col] = encoded

print('Types after mapping:')
print(df_clean.dtypes)

print('Types after mapping:')
print(df_clean.dtypes)

## Step 4 — Handle missing values

Convert every column to numeric where possible, then fill any remaining missing values using simple imputation rules.

In [None]:
# Name of the label column; it must never be imputed.
target_col = 'class'

# Force every column to a numeric dtype. Anything that cannot be parsed as a
# number becomes NaN and is handled below.
for col in df_clean.columns:
    df_clean[col] = pd.to_numeric(df_clean[col], errors='coerce')

print('Missing before impute:\n', df_clean.isnull().sum())

# Rows without a label cannot be used for supervised learning. Filling the
# label with 0 would invent "negative" patients, so drop those rows instead.
if target_col in df_clean.columns:
    n_unlabelled = int(df_clean[target_col].isnull().sum())
    if n_unlabelled:
        print(f"Dropping {n_unlabelled} row(s) with a missing '{target_col}' label")
        df_clean = df_clean.dropna(subset=[target_col])

for col in df_clean.columns:
    if col == target_col:
        continue
    if not df_clean[col].isnull().any():
        continue

    observed = df_clean[col].dropna()
    if observed.empty:
        # Nothing to learn a fill value from. Fall back to 0 so later cells
        # still run, but make the problem visible.
        print(f"Warning: column '{col}' has no valid values; filling with 0")
        df_clean[col] = df_clean[col].fillna(0)
    elif observed.isin([0, 1]).all():
        # Binary indicator: treat a missing answer as 0.
        df_clean[col] = df_clean[col].fillna(0)
    else:
        # Other numeric feature: use the median of the observed values.
        df_clean[col] = df_clean[col].fillna(observed.median())

print('Missing after impute:\n', df_clean.isnull().sum())

## Step 5 — EDA

Plot the class balance, the age distribution by class, and a heatmap of symptom prevalence.

In [None]:
# 1) How many samples fall in each class.
_, balance_ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=df_clean, x='class', ax=balance_ax)
balance_ax.set_title('Class balance (0=Neg,1=Pos)')
plt.show()

# 2) Age distribution for each class.
_, age_ax = plt.subplots(figsize=(8, 4))
sns.boxplot(data=df_clean, x='class', y='age', ax=age_ax)
age_ax.set_title('Age by class')
plt.show()

# 3) Mean of every remaining column per class: one row per column, one
#    heatmap column per class value.
non_symptom = {'age', 'class', 'gender'}
symptom_cols = [name for name in df_clean.columns if name not in non_symptom]
per_class_means = df_clean.groupby('class')[symptom_cols].mean()
symptom_means = per_class_means.transpose()

_, heat_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(symptom_means, annot=True, fmt='.2f', ax=heat_ax)
heat_ax.set_title('Mean symptom prevalence by class')
plt.show()

## Step 6 — Prepare data and split

Do a stratified train-test split, then scale age using a scaler fitted on the training set only.

In [None]:
# Separate the label from the predictors.
y = df_clean['class']
X = df_clean.drop(columns='class')

# Hold out 20% of the rows, keeping the class proportions equal in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Standardise age. The scaler is fitted on the training rows only and then
# applied to both parts, so the test set does not influence the scaling.
scaler = StandardScaler()
if 'age' in X.columns:
    # Work on copies so the assignments below do not write into slices of X.
    X_train, X_test = X_train.copy(), X_test.copy()
    scaler.fit(X_train[['age']])
    X_train['age'] = scaler.transform(X_train[['age']])
    X_test['age'] = scaler.transform(X_test[['age']])

print('Train shape:', X_train.shape, 'Test shape:', X_test.shape)

## Step 7 — Train models

Fit Logistic Regression and Random Forest, then evaluate.

In [None]:
# Fit both classifiers on the training split (`fit` returns the estimator).
lr = LogisticRegression(max_iter=1000, random_state=42).fit(X_train, y_train)
rf = RandomForestClassifier(n_estimators=200, random_state=42).fit(X_train, y_train)

# Hard class predictions on the held-out rows.
y_pred_lr = lr.predict(X_test)
y_pred_rf = rf.predict(X_test)

# Second column of predict_proba, used later as the score for ROC/AUC.
y_proba_lr = lr.predict_proba(X_test)[:, 1]
y_proba_rf = rf.predict_proba(X_test)[:, 1]

def evaluate(y_true, y_pred, y_proba):
    """Return a dict of classification metrics for one model.

    Args:
        y_true: True labels.
        y_pred: Predicted labels, scored with accuracy, precision, recall
            and F1.
        y_proba: Predicted scores, used for ROC AUC.

    Returns:
        Dict with keys 'accuracy', 'precision', 'recall', 'f1' and 'roc_auc',
        in that order.
    """
    # Metrics computed from the hard predictions, in output order.
    label_scorers = {
        'accuracy': accuracy_score,
        'precision': precision_score,
        'recall': recall_score,
        'f1': f1_score,
    }
    scores = {name: scorer(y_true, y_pred) for name, scorer in label_scorers.items()}

    # ROC AUC is computed from the scores rather than the labels.
    scores['roc_auc'] = roc_auc_score(y_true, y_proba)
    return scores

# Score each model's test-set predictions.
lr_metrics = evaluate(y_true=y_test, y_pred=y_pred_lr, y_proba=y_proba_lr)
rf_metrics = evaluate(y_true=y_test, y_pred=y_pred_rf, y_proba=y_proba_rf)

# Last expression of the cell: the notebook displays one row per model and
# one column per metric.
pd.DataFrame(
    [lr_metrics, rf_metrics],
    index=['LogisticRegression', 'RandomForest'],
)

## Step 8 — Confusion matrices & ROC

Visualize confusion matrices and ROC curves for both models.

In [None]:
# Confusion matrices side by side, one panel per model.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
_panels = [
    ('LR Confusion Matrix', y_pred_lr),
    ('RF Confusion Matrix', y_pred_rf),
]
for _ax, (_title, _pred) in zip(axes, _panels):
    sns.heatmap(confusion_matrix(y_test, _pred), annot=True, fmt='d', ax=_ax)
    _ax.set_title(_title)
    # confusion_matrix puts true labels on rows and predictions on columns;
    # label the axes so the cells cannot be misread.
    _ax.set_xlabel('Predicted label')
    _ax.set_ylabel('True label')
plt.show()

# ROC curves for both models on one set of axes.
fpr_lr, tpr_lr, _ = roc_curve(y_test, y_proba_lr)
fpr_rf, tpr_rf, _ = roc_curve(y_test, y_proba_rf)
plt.figure(figsize=(7,6))
plt.plot(fpr_lr, tpr_lr, label=f'LR AUC={lr_metrics["roc_auc"]:.3f}')
plt.plot(fpr_rf, tpr_rf, label=f'RF AUC={rf_metrics["roc_auc"]:.3f}')
# Diagonal reference line: where the true and false positive rates are equal.
plt.plot([0,1],[0,1],'k--', label='Chance')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curves')
plt.legend()
plt.show()

## Step 9 — Interpretability

Show Random Forest importances and permutation importance.

In [None]:
def _plot_top_features(scores, title, top_n=15):
    """Draw a horizontal bar chart of the last *top_n* entries of *scores*.

    *scores* is expected to be sorted ascending, so the last entries are the
    largest ones.
    """
    plt.figure(figsize=(8, 6))
    scores.tail(top_n).plot(kind='barh')
    plt.title(title)
    plt.show()


# Importances stored on the fitted forest, sorted ascending.
imp = pd.Series(rf.feature_importances_, index=X.columns).sort_values()
_plot_top_features(imp, 'RF Feature Importances')

# Permutation importance on the test set, averaged over 20 repeats.
perm = permutation_importance(rf, X_test, y_test, n_repeats=20, random_state=42)
perm_imp = pd.Series(perm.importances_mean, index=X.columns).sort_values()
_plot_top_features(perm_imp, 'Permutation Importance (RF)')

## Step 10 — Save outputs & next steps

Save cleaned data and model metrics. Next: hyperparameter tuning, CV, SHAP, and a Power BI dashboard showing predicted risk by patient.

In [None]:
# Create the output folder; no error if it already exists.
os.makedirs('outputs', exist_ok=True)

# Cleaned dataset, written without the index column.
df_clean.to_csv('outputs/data_clean.csv', index=False)

# One row per model; the model names are written as the index column.
metrics_table = pd.DataFrame(
    [lr_metrics, rf_metrics],
    index=['LogisticRegression', 'RandomForest'],
)
metrics_table.to_csv('outputs/model_metrics.csv')

print('Saved outputs to outputs/')