In [1]:
# 02_feature_engineering.ipynb
# I build engineered features, handle categorical encoding safely,
# create train/val/test splits, and save engineered CSVs.

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore")


# Default file locations (the original hard-coded paths). Callers may override
# them through the parameters of main().
_DEFAULT_CLEANED_PATH = r"C:\Users\User\Desktop\OSIRI UNIVERSITY Files\diabetes_prediction_dashboard\cleaned_diabetes_data.csv"
_DEFAULT_TRAIN_OUT = r"C:\Users\User\Desktop\OSIRI UNIVERSITY Files\diabetes_prediction_dashboard\engineered_train_data.csv"
_DEFAULT_VAL_OUT = r"C:\Users\User\Desktop\OSIRI UNIVERSITY Files\diabetes_prediction_dashboard\engineered_val_data.csv"
_DEFAULT_TEST_OUT = r"C:\Users\User\Desktop\OSIRI UNIVERSITY Files\diabetes_prediction_dashboard\engineered_test_data.csv"

# Columns that must be present in the cleaned dataset before any processing.
_REQUIRED_COLS = [
    "gender",
    "age",
    "hypertension",
    "heart_disease",
    "bmi",
    "HbA1c_level",
    "blood_glucose_level",
    "diabetes",
]

# Integer codes for the gender labels; unknown or missing values get the
# fallback code (the same code as "Other").
_GENDER_MAP = {"Female": 0, "Male": 1, "Other": 2}
_GENDER_FALLBACK = 2

# One-hot smoking columns that the engineered CSVs always contain, in order.
_EXPECTED_SMOKING_COLS = [
    "smoking_No Info",
    "smoking_current",
    "smoking_ever",
    "smoking_former",
    "smoking_never",
    "smoking_not current",
]


def _encode_gender(df: pd.DataFrame) -> pd.DataFrame:
    """Convert the ``gender`` column of *df* to integer codes in place."""
    gender = df["gender"]
    # Decide on "is it numeric?" rather than "is it object?": string-typed and
    # categorical columns are not object dtype but still hold labels that
    # need mapping before the integer cast.
    if not pd.api.types.is_numeric_dtype(gender):
        gender = gender.astype(object).map(_GENDER_MAP)
    df["gender"] = gender.fillna(_GENDER_FALLBACK).astype(int)
    return df


def _encode_smoking(df: pd.DataFrame) -> pd.DataFrame:
    """Return *df* with every expected one-hot smoking column present.

    If a raw ``smoking_history`` column exists it is one-hot encoded and
    dropped. Otherwise any expected dummy column that is absent is created
    and filled with zeros.
    """
    if "smoking_history" in df.columns:
        # NOTE: astype(str) turns missing values into the string "nan"; that
        # category is not in the expected list, so such rows end up with all
        # smoking columns equal to 0.
        categories = df["smoking_history"].astype(str).str.strip()

        # dtype=int keeps the encoded columns as 0/1, consistent with the
        # integer zeros used for absent categories (the default dtype is bool
        # on recent pandas versions). reindex both adds the missing expected
        # columns and discards unexpected categories.
        dummies = pd.get_dummies(categories, prefix="smoking", dtype=int).reindex(
            columns=_EXPECTED_SMOKING_COLS, fill_value=0
        )

        df = pd.concat([df.drop(columns=["smoking_history"]), dummies], axis=1)
        print("Smoking history encoded from 'smoking_history' column.")
        return df

    absent = [col for col in _EXPECTED_SMOKING_COLS if col not in df.columns]
    if not absent:
        print("Smoking dummy columns already exist. Using them as-is.")
        return df

    for col in absent:
        df[col] = 0
    print("No smoking_history column found; created missing smoking dummy columns as zeros.")
    return df


def _add_engineered_features(df: pd.DataFrame) -> pd.DataFrame:
    """Add interaction, squared, and weighted-sum features to *df* in place."""
    age = df["age"]
    bmi = df["bmi"]
    hba1c = df["HbA1c_level"]
    glucose = df["blood_glucose_level"]

    df["age_bmi_interaction"] = age * bmi
    df["hba1c_glucose_interaction"] = hba1c * glucose

    df["age_squared"] = age ** 2
    df["bmi_squared"] = bmi ** 2
    df["hba1c_squared"] = hba1c ** 2

    # Weighted sum of the raw (unscaled) values.
    df["risk_score"] = (age * 0.3) + (bmi * 0.2) + (hba1c * 0.4) + (glucose * 0.1)
    return df


def _join_features_and_target(features: pd.DataFrame, target: pd.Series) -> pd.DataFrame:
    """Return one frame with *target* appended as the last column."""
    return pd.concat(
        [features.reset_index(drop=True), target.reset_index(drop=True)], axis=1
    )


def main(
    cleaned_path: str = _DEFAULT_CLEANED_PATH,
    train_out: str = _DEFAULT_TRAIN_OUT,
    val_out: str = _DEFAULT_VAL_OUT,
    test_out: str = _DEFAULT_TEST_OUT,
) -> None:
    """Build engineered features and write train/val/test CSV splits.

    Steps: load the cleaned dataset, verify the required columns, encode
    ``gender`` and smoking history, add engineered features, perform a
    stratified train/val/test split, and save each split as a CSV.

    Args:
        cleaned_path: CSV file containing the cleaned dataset.
        train_out: Destination CSV for the training split.
        val_out: Destination CSV for the validation split.
        test_out: Destination CSV for the test split.

    Raises:
        ValueError: If any required column is missing from the dataset.
    """
    print("\n=== 02: FEATURE ENGINEERING + SPLITS ===")

    # Load cleaned dataset
    df = pd.read_csv(cleaned_path)
    print("Loaded cleaned dataset:", df.shape)
    print("Columns:", list(df.columns))

    # Minimal required columns check
    missing_cols = [c for c in _REQUIRED_COLS if c not in df.columns]
    if missing_cols:
        raise ValueError(f"Missing required columns: {missing_cols}")

    # Categorical encoding
    df = _encode_gender(df)
    df = _encode_smoking(df)

    # Feature engineering
    df = _add_engineered_features(df)
    print("After feature engineering:", df.shape)

    # Stratified train/val/test split
    X = df.drop(columns=["diabetes"])
    y = df["diabetes"].astype(int)

    X_train, X_temp, y_train, y_temp = train_test_split(
        X, y, test_size=0.30, random_state=42, stratify=y
    )

    # test_size here is a fraction of the 30% hold-out, so the overall
    # proportions are about 70% train / 21% val / 9% test.
    X_val, X_test, y_val, y_test = train_test_split(
        X_temp, y_temp, test_size=0.3, random_state=42, stratify=y_temp
    )

    print("Train:", X_train.shape, y_train.shape)
    print("Val:  ", X_val.shape, y_val.shape)
    print("Test: ", X_test.shape, y_test.shape)

    # Save engineered splits (target is the last column of each file)
    engineered_train = _join_features_and_target(X_train, y_train)
    engineered_val = _join_features_and_target(X_val, y_val)
    engineered_test = _join_features_and_target(X_test, y_test)

    engineered_train.to_csv(train_out, index=False)
    engineered_val.to_csv(val_out, index=False)
    engineered_test.to_csv(test_out, index=False)

    print("\nSaved engineered datasets:")
    print(train_out)
    print(val_out)
    print(test_out)

    # Sanity check: report explicitly whether each expected column was saved.
    print("\nSanity Check: Expected smoking dummy columns present?")
    for col in _EXPECTED_SMOKING_COLS:
        print(col, "OK" if col in engineered_train.columns else "MISSING")

    print("\nFeature engineering complete. Next I run 03_model_development.ipynb to train LightGBM and save artifacts.")


# Entry point: run the full pipeline only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()



=== 02: FEATURE ENGINEERING + SPLITS ===
Loaded cleaned dataset: (96146, 14)
Columns: ['gender', 'age', 'hypertension', 'heart_disease', 'bmi', 'HbA1c_level', 'blood_glucose_level', 'smoking_No Info', 'smoking_current', 'smoking_ever', 'smoking_former', 'smoking_never', 'smoking_not current', 'diabetes']
Smoking dummy columns already exist. Using them as-is.
After feature engineering: (96146, 20)
Train: (67302, 19) (67302,)
Val:   (20190, 19) (20190,)
Test:  (8654, 19) (8654,)

Saved engineered datasets:
C:\Users\User\Desktop\OSIRI UNIVERSITY Files\diabetes_prediction_dashboard\engineered_train_data.csv
C:\Users\User\Desktop\OSIRI UNIVERSITY Files\diabetes_prediction_dashboard\engineered_val_data.csv
C:\Users\User\Desktop\OSIRI UNIVERSITY Files\diabetes_prediction_dashboard\engineered_test_data.csv

Sanity Check: Expected smoking dummy columns present?
smoking_No Info 
smoking_current 
smoking_ever 
smoking_former 
smoking_never 
smoking_not current 

Feature engineering complete. Nex