
# Diabetes Prediction — Decision Tree  
**Task date:** 2025-08-31

This notebook follows the required steps:
1. EDA (pandas, matplotlib)  
2. Data preprocessing  
3. Feature extraction & selection  
4. Train/Test split  
5. Train the model  
6. Evaluate with **F1 score** and **confusion matrix** (*bonus*)  
7. (*bonus*) Export a multi-page PDF report of key figures and metrics  
8. Save the trained model to `./model/decision_tree_model.pkl` for the GUI app

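> **Dataset path:** `./data/diabetes_prediction_dataset.csv`  
> Place the CSV there before running. The task expects the following column names; they are matched case-insensitively, because the notebook trims and lower-cases the headers right after loading:  
> `gender, age, hypertension, heart_disease, smoking_history, bmi, HbA1c_level, blood_glucose_level, diabetes`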


In [None]:

# Imports
import os, json, math, pickle, warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, confusion_matrix, classification_report, roc_auc_score, roc_curve
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

from matplotlib.backends.backend_pdf import PdfPages

# Utility: create the output folders up front (the pickled model goes to
# ./model and the PDF report to ./figures later in the notebook)
for _out_dir in ("./model", "./figures"):
    os.makedirs(_out_dir, exist_ok=True)

# Location of the input CSV
DATA_PATH = "./data/diabetes_prediction_dataset.csv"

# Helper to show value counts nicely
def vc(df, col):
    """Return the value counts of ``df[col]`` as a one-column DataFrame.

    Missing values are counted too (``dropna=False``). The resulting frame has
    a single column named ``"count"`` and its index is named after ``col``.
    """
    counts = df[col].value_counts(dropna=False)
    labelled = counts.rename_axis(col)
    return labelled.to_frame(name="count")


In [None]:

# Load data (fail early with a clear message when the CSV is not in place)
if not os.path.exists(DATA_PATH):
    raise FileNotFoundError(f"Dataset not found at {DATA_PATH}. Please add the CSV first.")

# Read the CSV and normalise the headers in one step: surrounding whitespace
# is trimmed and every name is lower-cased
df = pd.read_csv(DATA_PATH).rename(columns=lambda name: name.strip().lower())

# Columns the rest of the notebook relies on (already lower-cased)
expected = [
    "gender",
    "age",
    "hypertension",
    "heart_disease",
    "smoking_history",
    "bmi",
    "hba1c_level",
    "blood_glucose_level",
    "diabetes",
]
missing = [name for name in expected if name not in df.columns]
if len(missing) > 0:
    raise ValueError(f"Missing columns: {missing}. Please ensure the dataset has the correct headers.")

# Basic check: first rows, overall shape and class proportions of the target
print(df.head())
print("\nShape:", df.shape)
print("\nTarget distribution:")
print(df["diabetes"].value_counts(normalize=True).round(3))


In [None]:

# === EDA ===
print("Info:")
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
df.info()

print("\nDescribe:")
print(df.describe(include='all'))

# Missing values
print("\nMissing per column:")
print(df.isna().sum())

# Distributions: one histogram per numeric feature
num_cols = ["age","bmi","hba1c_level","blood_glucose_level"]
for c in num_cols:
    plt.figure()
    df[c].hist(bins=30)
    plt.title(f"Distribution of {c}")
    plt.xlabel(c)
    plt.ylabel("Frequency")
    plt.show()

# Categorical summaries (value counts, including missing values, via vc)
for c in ["gender","smoking_history","hypertension","heart_disease","diabetes"]:
    display(vc(df, c))


In [None]:

# === Preprocessing & Split ===
# Target is the integer-cast `diabetes` column; every other column is a feature
y = df["diabetes"].astype(int)
X = df.drop(columns="diabetes")

num_cols = ["age", "bmi", "hba1c_level", "blood_glucose_level"]
cat_cols = ["gender", "smoking_history", "hypertension", "heart_disease"]

# The two binary flags are treated as categorical: cast them to `category`
# unless they are already stored as objects
X = X.astype({
    flag: "category"
    for flag in ("hypertension", "heart_disease")
    if X[flag].dtype != 'O'
})

# Numeric branch: median imputation followed by standardisation
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical branch: mode imputation followed by one-hot encoding
# (categories unseen during fit are ignored at transform time)
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

preprocessor = ColumnTransformer([
    ("num", numeric_transformer, num_cols),
    ("cat", categorical_transformer, cat_cols),
])

# Train/test split: 80/20, stratified on the target, fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

print("Train shape:", X_train.shape, " Test shape:", X_test.shape)
print("Train target balance:", y_train.mean().round(3), " Test target balance:", y_test.mean().round(3))


In [None]:

# === Evaluation helpers ===
def evaluate_and_plots(model, X_test, y_test, model_name="Model", pdf_path=None):
    """Print test-set metrics, draw the confusion matrix and ROC curve, and
    optionally export them to a multi-page PDF.

    Parameters
    ----------
    model : fitted estimator
        Must expose ``predict``; ``predict_proba`` is used for the ROC curve
        when it is available.
    X_test, y_test
        Held-out features and true labels.
    model_name : str
        Label used in printed output, figure titles and the PDF text page.
    pdf_path : str or None
        When given, a PDF is written there containing a text summary page,
        the confusion matrix and (if it could be computed) the ROC curve.

    Returns
    -------
    None

    Notes
    -----
    The figures are written to the PDF *before* ``plt.show()`` is called.
    Jupyter's inline backend closes figures as soon as they have been shown,
    so saving after ``show`` would leave the PDF with the text page only.
    As a consequence both plots are displayed after all metrics are printed.
    """
    y_pred = model.predict(X_test)
    f1 = f1_score(y_test, y_pred)
    print(f"{model_name} F1 score:", round(f1, 4))
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, digits=4))

    # Confusion Matrix (drawn now, displayed at the end)
    cm = confusion_matrix(y_test, y_pred)
    fig_cm = plt.figure()
    plt.imshow(cm, interpolation='nearest')
    plt.title(f'Confusion Matrix - {model_name}')
    plt.colorbar()
    tick_marks = np.arange(2)
    plt.xticks(tick_marks, ['No Diabetes', 'Diabetes'])
    plt.yticks(tick_marks, ['No Diabetes', 'Diabetes'])
    # Cells darker than half the maximum get white text so counts stay readable
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            plt.text(j, i, format(cm[i, j], 'd'),
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()

    # ROC Curve (best effort: skipped when predict_proba is unavailable or fails)
    auc = None
    fig_roc = None
    try:
        y_proba = model.predict_proba(X_test)[:, 1]
        auc = roc_auc_score(y_test, y_proba)
        print(f"ROC-AUC: {auc:.4f}")
        fpr, tpr, _ = roc_curve(y_test, y_proba)
        fig_roc = plt.figure()
        plt.plot(fpr, tpr, label=f"AUC={auc:.3f}")
        plt.plot([0, 1], [0, 1], linestyle="--")
        plt.title(f"ROC Curve - {model_name}")
        plt.xlabel("False Positive Rate")
        plt.ylabel("True Positive Rate")
        plt.legend(loc="lower right")
        plt.tight_layout()
    except Exception as e:
        # Drop a half-built ROC figure so it is neither shown nor exported
        if fig_roc is not None:
            plt.close(fig_roc)
            fig_roc = None
        print("predict_proba not available or ROC failed:", e)

    report_figs = [fig for fig in (fig_cm, fig_roc) if fig is not None]

    try:
        # Save figures to multipage PDF (bonus) while they are still open
        if pdf_path is not None:
            with PdfPages(pdf_path) as pdf:
                # Text summary page; closed before plt.show() so it is not displayed
                fig_text = plt.figure()
                try:
                    plt.axis('off')
                    lines = [
                        f"Report: {model_name}",
                        "",
                        f"F1 score: {f1:.4f}",
                    ]
                    if auc is not None:
                        lines.append(f"ROC-AUC: {auc:.4f}")
                    plt.text(0.01, 0.99, "\n".join(lines), va='top')
                    pdf.savefig(fig_text)
                finally:
                    plt.close(fig_text)

                for fig in report_figs:
                    pdf.savefig(fig)

        # Display the confusion matrix and ROC curve
        plt.show()
    finally:
        # Release the figures on every path, including pdf_path=None
        for fig in report_figs:
            plt.close(fig)

    if pdf_path is not None:
        print(f"\nSaved PDF report to: {pdf_path}")


In [None]:

from sklearn.tree import DecisionTreeClassifier

# Chain preprocessing and the classifier into one estimator, so the object
# that gets pickled below contains both steps
dt = Pipeline([
    ("preprocess", preprocessor),
    ("clf", DecisionTreeClassifier(max_depth=None, class_weight=None, random_state=42)),
])
dt.fit(X_train, y_train)

# Persist the fitted pipeline
with open("./model/decision_tree_model.pkl", mode="wb") as model_file:
    pickle.dump(dt, model_file)

# Report test-set metrics and export the PDF report
evaluate_and_plots(
    dt,
    X_test,
    y_test,
    model_name="Decision Tree",
    pdf_path="./figures/decision_tree_report.pdf",
)


In [None]:

# Feature importance (Decision Tree / Random Forest) - approximate via one-hot names
def get_feature_names(preprocessor, X):
    """List the output feature names of a fitted two-branch preprocessor.

    The first entry of ``preprocessor.transformers_`` is taken as the numeric
    branch (its column list is returned as-is) and the second as the
    categorical branch, whose ``"onehot"`` step supplies the expanded names.
    ``X`` is accepted but not used.

    Returns a plain list: numeric column names followed by one-hot names.
    """
    numeric_entry = preprocessor.transformers_[0]
    categorical_entry = preprocessor.transformers_[1]

    _, _, numeric_columns = numeric_entry
    _, categorical_pipeline, categorical_columns = categorical_entry

    encoder = categorical_pipeline.named_steps["onehot"]
    encoded_names = encoder.get_feature_names_out(categorical_columns)
    return [*numeric_columns, *encoded_names]

def plot_importances(pipeline, X, topn=20, title="Feature Importance"):
    """Draw a horizontal bar chart of the ``topn`` largest feature importances.

    ``pipeline`` must have ``"preprocess"`` and ``"clf"`` steps. When the
    classifier exposes no ``feature_importances_`` a message is printed
    instead. Any exception raised along the way is caught and reported with
    ``print`` rather than propagated. Returns None.
    """
    try:
        estimator = pipeline.named_steps["clf"]
        if not hasattr(estimator, "feature_importances_"):
            print("Classifier has no feature_importances_.")
            return

        labels = get_feature_names(pipeline.named_steps["preprocess"], X)
        scores = estimator.feature_importances_

        # Indices of the topn largest importances, biggest first
        top_idx = np.argsort(scores)[::-1][:topn]
        top_labels = [labels[k] for k in top_idx]
        top_scores = scores[top_idx]

        plt.figure()
        # Reversed positions put the most important feature at the top
        positions = np.arange(len(top_labels))[::-1]
        plt.barh(positions, top_scores)
        plt.yticks(positions, top_labels)
        plt.title(title)
        plt.xlabel("Importance")
        plt.tight_layout()
        plt.show()
    except Exception as err:
        print("Failed to plot importances:", err)


In [None]:
# Chart up to 20 of the largest feature importances of the fitted `dt` pipeline
plot_importances(dt, X_train, topn=20, title='Decision Tree Feature Importance')