"# Pipeline: load → preprocess → feature engineering → train → predict\n",
This notebook runs the same steps as main.py but shows intermediate results and computes ROC AUC on the test set.

In [3]:
from process_data.data.data_preprocessing_all import CSVDataLoader, DataPreprocessor
from process_data.features.feature_transform import BMICalculator, EthnicityEncoder, GenderBinaryEncoder
from sklearn.metrics import roc_auc_score
import pandas as pd
import numpy as np

ModuleNotFoundError: No module named 'process_data.data'

In [4]:
# File path to data
file_name = "C:/EC/BSE/DSDM/Term 1/21DM004 Computing for Data Science/hw5/process_data_project/process_data/data/sample_diabetes_mellitus_data.csv"
loader = CSVDataLoader(file_name)
train_df, test_df = loader.split_data()
print("Loaded: \ntrain rows=", len(train_df), "\ntest rows=", len(test_df))
  


Data loaded. Shape: (10000, 53)

Data split: 
Train ((8000, 53)), 
Test ((2000, 53))

Loaded: 
train rows= 8000 
test rows= 2000


In [None]:
# Preprocess (remove required nans, fill mean for numeric cols)
preprocessor = DataPreprocessor(train_df, test_df)
train_clean, test_clean = preprocessor.remove_nans(['age', 'gender', 'ethnicity']).fill_nans_with_mean(['height', 'weight']).get_data()
print("After preprocessing: train columns=", train_clean.columns.tolist())  

Data after removing rows with NaNs in ['age', 'gender', 'ethnicity']: 
Train ((7494, 53)), 
Test ((1874, 53))

Filled NaNs in ['height', 'weight'] with mean.

After preprocessing: train columns= ['Unnamed: 0', 'encounter_id', 'hospital_id', 'age', 'bmi', 'elective_surgery', 'ethnicity', 'gender', 'height', 'hospital_admit_source', 'icu_admit_source', 'icu_id', 'icu_stay_type', 'icu_type', 'pre_icu_los_days', 'readmission_status', 'weight', 'albumin_apache', 'apache_2_diagnosis', 'apache_3j_diagnosis', 'apache_post_operative', 'arf_apache', 'bilirubin_apache', 'bun_apache', 'creatinine_apache', 'fio2_apache', 'gcs_eyes_apache', 'gcs_motor_apache', 'gcs_unable_apache', 'gcs_verbal_apache', 'glucose_apache', 'heart_rate_apache', 'hematocrit_apache', 'intubated_apache', 'map_apache', 'paco2_apache', 'paco2_for_ph_apache', 'pao2_apache', 'ph_apache', 'resprate_apache', 'sodium_apache', 'temp_apache', 'urineoutput_apache', 'ventilated_apache', 'wbc_apache', 'aids', 'cirrhosis', 'hepatic_fail

In [7]:
# Apply feature transformers
transformers = [BMICalculator(), EthnicityEncoder(), GenderBinaryEncoder()]
for t in transformers:
    train_clean = t.transform(train_clean)
    test_clean = t.transform(test_clean)
    print(f"Applied {t.__class__.__name__}: produced -> {t.get_feature_names()}")
    print('\nFinal train columns:', train_clean.columns.tolist())  

Applied BMICalculator: produced -> ['bmi']

Final train columns: ['Unnamed: 0', 'encounter_id', 'hospital_id', 'age', 'bmi', 'elective_surgery', 'ethnicity', 'gender', 'height', 'hospital_admit_source', 'icu_admit_source', 'icu_id', 'icu_stay_type', 'icu_type', 'pre_icu_los_days', 'readmission_status', 'weight', 'albumin_apache', 'apache_2_diagnosis', 'apache_3j_diagnosis', 'apache_post_operative', 'arf_apache', 'bilirubin_apache', 'bun_apache', 'creatinine_apache', 'fio2_apache', 'gcs_eyes_apache', 'gcs_motor_apache', 'gcs_unable_apache', 'gcs_verbal_apache', 'glucose_apache', 'heart_rate_apache', 'hematocrit_apache', 'intubated_apache', 'map_apache', 'paco2_apache', 'paco2_for_ph_apache', 'pao2_apache', 'ph_apache', 'resprate_apache', 'sodium_apache', 'temp_apache', 'urineoutput_apache', 'ventilated_apache', 'wbc_apache', 'aids', 'cirrhosis', 'hepatic_failure', 'immunosuppression', 'leukemia', 'lymphoma', 'solid_tumor_with_metastasis', 'diabetes_mellitus']
Applied EthnicityEncoder: p

In [8]:
# Define features and target
feature_cols = ['age', 'bmi', 'gender_M', 'gender_F'] + [c for c in train_clean.columns if c.startswith('ethnicity_')]
target_col = 'diabetes_mellitus'
print('Feature cols used:', feature_cols)
print('Target:', target_col)
 

Feature cols used: ['age', 'bmi', 'gender_M', 'gender_F', 'ethnicity_African American', 'ethnicity_Asian', 'ethnicity_Caucasian', 'ethnicity_Hispanic', 'ethnicity_Native American', 'ethnicity_Other/Unknown']
Target: diabetes_mellitus


In [None]:
# Initialize and train model
from model.predictor import DiabetesModel
model = DiabetesModel(feature_columns=feature_cols, target_column=target_col, hyperparameters={'n_estimators':200, 'max_depth':10, 'random_state':42})
model.train(train_clean)
print('Model trained.') 

Model trained.


In [13]:
# Predict probabilities on test set
probs = model.predict(test_clean)  # shape (n_samples, n_classes)
if probs.ndim == 2:
    pos_probs = probs[:, 1]
else:
    #fallback: if single-column probabilities provided
    pos_probs = probs
test_with_preds = test_clean.copy()
test_with_preds['predictions_prob'] = pos_probs
test_with_preds['predictions'] = (pos_probs >= 0.5).astype(int)
print('Predictions added to test dataframe.')

Predictions added to test dataframe.


In [14]:
# Compute ROC AUC on test set (and train set if desired)
y_test = test_with_preds[target_col]
test_auc = roc_auc_score(y_test, test_with_preds['predictions_prob'])
print(f"Test ROC AUC: {test_auc:.4f}")
# Optionally compute train AUC as well
train_probs = model.predict(train_clean)
train_pos = train_probs[:, 1] if train_probs.ndim == 2 else train_probs
train_auc = roc_auc_score(train_clean[target_col], train_pos)
print(f"Train ROC AUC: {train_auc:.4f}")

Test ROC AUC: 0.6618
Train ROC AUC: 0.8454


In [None]:
# Show sample of test_with_preds\n",
test_with_preds[['predictions_prob','predictions', target_col]].head(10)