In [8]:
"""
Diabetes Prediction Pipeline
Demonstrates the complete workflow using our OOP library.
"""

# Imports
import pandas as pd
from sklearn.metrics import roc_auc_score

# Import our library classes
from hw5lib import (
    DataLoader,
    NaNRowRemover,
    NaNMeanFiller,
    BMICalculator,
    GenderEncoder,
    AgeSquared,
    DiabetesModel
)


In [9]:
# --- Step 1: read the CSV and obtain the train / test frames ---
rule = "=" * 60
print(rule, "STEP 1: Loading Data", rule, sep="\n")

# The loader receives the CSV path, the target column name, the test
# fraction and a fixed random_state; load() returns the two frames.
loader = DataLoader(
    csv_path="../sample_diabetes_mellitus_data.csv",
    target="diabetes_mellitus",
    test_size=0.2,
    random_state=42,
)
train_df, test_df = loader.load()

# Report the size of each split.
for split_name, split_frame in (("Train", train_df), ("Test", test_df)):
    print(f"{split_name} shape: {split_frame.shape}")
print()


STEP 1: Loading Data
Train shape: (8000, 53)
Test shape: (2000, 53)



In [10]:

# --- Step 2: drop rows that have NaN in age, gender or ethnicity ---
rule = "=" * 60
print(rule, "STEP 2: Removing NaN Rows (age, gender, ethnicity)", rule, sep="\n")

# fit_transform on the training frame, then transform on the test frame
# with the same remover instance.
required_columns = ["age", "gender", "ethnicity"]
nan_remover = NaNRowRemover(columns_to_check=required_columns)
train_df = nan_remover.fit_transform(train_df)
test_df = nan_remover.transform(test_df)

print("After removing NaN rows:")
for split_name, split_frame in (("Train", train_df), ("Test", test_df)):
    print(f"{split_name} shape: {split_frame.shape}")
print()

STEP 2: Removing NaN Rows (age, gender, ethnicity)
After removing NaN rows:
Train shape: (7503, 53)
Test shape: (1865, 53)



In [11]:


# --- Step 3: fill missing height / weight values ---
rule = "=" * 60
print(rule, "STEP 3: Filling NaN with Mean (height, weight)", rule, sep="\n")

# The filler is fitted on the training frame only; the test frame is
# transformed with that same fitted instance.
mean_filled_columns = ["height", "weight"]
nan_filler = NaNMeanFiller(columns_to_fill=mean_filled_columns)
train_df = nan_filler.fit_transform(train_df)
test_df = nan_filler.transform(test_df)

# means_ is indexed by column name; show the value stored for each column.
print("Learned means from training data:")
for column_name in ("height", "weight"):
    print(f"  {column_name}: {nan_filler.means_[column_name]:.2f}")
print()


STEP 3: Filling NaN with Mean (height, weight)
Learned means from training data:
  height: 170.06
  weight: 86.79



In [12]:


# --- Step 4: derive bmi, gender_numeric and age_squared ---
rule = "=" * 60
print(rule, "STEP 4: Creating Features", rule, sep="\n")

# Every transformer below follows the same pattern: fit_transform on the
# training frame, then transform on the test frame.

# BMI from the height and weight columns.
# NOTE(review): the printed column list shows 'bmi' among the original
# dataset columns, so this step appears to overwrite an existing column —
# confirm that is intended.
print("Creating BMI feature...")
bmi_calculator = BMICalculator(
    height_col="height",
    weight_col="weight",
    output_col="bmi",
)
train_df = bmi_calculator.fit_transform(train_df)
test_df = bmi_calculator.transform(test_df)

# Numeric encoding of the gender column; the fitted mapping is printed.
print("Encoding gender...")
gender_encoder = GenderEncoder(
    gender_col="gender",
    output_col="gender_numeric",
)
train_df = gender_encoder.fit_transform(train_df)
test_df = gender_encoder.transform(test_df)
print(f"Gender mapping: {gender_encoder.gender_mapping_}")

# Squared age term.
print("Creating age squared feature...")
age_squared = AgeSquared(
    age_col="age",
    output_col="age_squared",
)
train_df = age_squared.fit_transform(train_df)
test_df = age_squared.transform(test_df)

print("\nNew features created: bmi, gender_numeric, age_squared")
print(f"Train columns: {list(train_df.columns)}", end="\n\n")


STEP 4: Creating Features
Creating BMI feature...
Encoding gender...
Gender mapping: {'F': 0, 'M': 1}
Creating age squared feature...

New features created: bmi, gender_numeric, age_squared
Train columns: ['Unnamed: 0', 'encounter_id', 'hospital_id', 'age', 'bmi', 'elective_surgery', 'ethnicity', 'gender', 'height', 'hospital_admit_source', 'icu_admit_source', 'icu_id', 'icu_stay_type', 'icu_type', 'pre_icu_los_days', 'readmission_status', 'weight', 'albumin_apache', 'apache_2_diagnosis', 'apache_3j_diagnosis', 'apache_post_operative', 'arf_apache', 'bilirubin_apache', 'bun_apache', 'creatinine_apache', 'fio2_apache', 'gcs_eyes_apache', 'gcs_motor_apache', 'gcs_unable_apache', 'gcs_verbal_apache', 'glucose_apache', 'heart_rate_apache', 'hematocrit_apache', 'intubated_apache', 'map_apache', 'paco2_apache', 'paco2_for_ph_apache', 'pao2_apache', 'ph_apache', 'resprate_apache', 'sodium_apache', 'temp_apache', 'urineoutput_apache', 'ventilated_apache', 'wbc_apache', 'aids', 'cirrhosis', 'he

In [13]:

# --- Step 5: choose the feature columns and train the model ---
print("=" * 60)
print("STEP 5: Training Model")
print("=" * 60)

# Candidate features (pick columns that exist and make sense).
# Adjust these based on what columns are actually in your diabetes dataset.
candidate_features = [
    'age',
    'gender_numeric',
    'bmi',
    'age_squared',
    # Add more columns from your dataset as needed
    # e.g., 'hypertension', 'heart_disease', 'smoking_history_numeric', etc.
]

# Keep only the candidates present in the training frame. Dropped names are
# reported rather than silently discarded, so a typo or a skipped
# feature-engineering cell is visible in the output.
feature_columns = [col for col in candidate_features if col in train_df.columns]
missing_features = [col for col in candidate_features if col not in train_df.columns]
if missing_features:
    print(f"Warning: skipping features not found in train_df: {missing_features}")
print(f"Using features: {feature_columns}")
print()

# Initialize model with the selected features, the target column and the
# hyperparameter dictionary.
model = DiabetesModel(
    feature_columns=feature_columns,
    target_column='diabetes_mellitus',
    hyperparameters={
        'n_estimators': 100,
        'max_depth': 10,
        'random_state': 42
    }
)

# Train model on the training frame.
print("Training Random Forest model...")
model.train(train_df)
print("✓ Model trained successfully!")
print()

# Optional: show feature importance. This stays best-effort (a failure does
# not stop the notebook), but only ordinary exceptions are caught — a bare
# `except:` would also swallow KeyboardInterrupt / SystemExit — and the
# reason is printed instead of being discarded.
try:
    importance_df = model.get_feature_importance()
except Exception as exc:
    print(f"Feature importances unavailable: {exc!r}")
    print()
else:
    print("Feature Importances:")
    print(importance_df)
    print()


STEP 5: Training Model
Using features: ['age', 'gender_numeric', 'bmi', 'age_squared']

Training Random Forest model...
✓ Model trained successfully!

Feature Importances:
          feature  importance
2             bmi    0.663353
0             age    0.147752
3     age_squared    0.147017
1  gender_numeric    0.041878



In [14]:

# --- Step 6: score the test frame ---
rule = "=" * 60
print(rule, "STEP 6: Making Predictions on Test Set", rule, sep="\n")

# predict() returns a frame of predicted probabilities; report its shape
# and column names.
predictions_df = model.predict(test_df)
print(
    f"Predictions shape: {predictions_df.shape}",
    f"Prediction columns: {list(predictions_df.columns)}",
    "",
    sep="\n",
)

# Attach the class-1 probability to the test frame.
# Assuming binary classification: prob_class_1 is probability of diabetes.
# The raw array (.values) is assigned, so rows are matched by position
# rather than by index label.
positive_proba = predictions_df['prob_class_1'].values
test_df['predictions'] = positive_proba

print("Sample predictions:")
print(test_df[['diabetes_mellitus', 'predictions']].head(10), end="\n\n")


STEP 6: Making Predictions on Test Set
Predictions shape: (1865, 2)
Prediction columns: ['prob_class_0', 'prob_class_1']

Sample predictions:
      diabetes_mellitus  predictions
5012                  0     0.295345
9412                  0     0.011179
5597                  0     0.026377
1143                  0     0.167516
7148                  1     0.503860
6421                  1     0.314951
1435                  0     0.378985
4907                  1     0.154092
9910                  0     0.118071
7239                  1     0.218949



In [15]:

# --- Step 7: evaluate the predictions with ROC AUC ---
rule = "=" * 60
print(rule, "STEP 7: Computing ROC AUC Score", rule, sep="\n")

# True labels and the predicted probabilities stored on the test frame.
y_true, y_pred_proba = test_df['diabetes_mellitus'], test_df['predictions']

# roc_auc_score takes the true labels first, then the scores.
roc_auc = roc_auc_score(y_true, y_pred_proba)

print(f"ROC AUC Score: {roc_auc:.4f}", end="\n\n")


STEP 7: Computing ROC AUC Score
ROC AUC Score: 0.6683



In [16]:

# --- Summary: recap the completed stages and the final metric ---
rule = "=" * 60
print(rule, "PIPELINE COMPLETE!", rule, sep="\n")

# One check-marked line per stage; the last line repeats the ROC AUC value
# held in `roc_auc`.
completed_stages = (
    "Data loaded and split",
    "Preprocessing applied",
    "Features engineered",
    "Model trained",
    "Predictions generated",
    f"ROC AUC Score: {roc_auc:.4f}",
)
for stage in completed_stages:
    print(f"✓ {stage}")
print()


PIPELINE COMPLETE!
✓ Data loaded and split
✓ Preprocessing applied
✓ Features engineered
✓ Model trained
✓ Predictions generated
✓ ROC AUC Score: 0.6683

