In [10]:
from pathlib import Path
import os, sys, pandas as pd

# Show the working directory and interpreter in use; the relative data path
# defined later in this cell is resolved against this working directory.
print(
    f"[env] CWD  : {os.getcwd()}",
    f"[env] Py   : {sys.executable}",
    sep="\n",
)

from diabetes_library.data_loader import load_data                    
from diabetes_library.preprocessing import (                           
    drop_invalid_rows,
    fill_missing_values,
    encode_gender,
    encode_ethnicity,
)
from diabetes_library import modeling                                  
from diabetes_library.evaluation import compute_auc                    

# Dataset location, relative to the current working directory.
DATA_PATH = Path("data/sample_diabetes_mellitus_data.csv")
# Outcome column name; presumably the label used by modeling/evaluation —
# confirm in diabetes_library.
TARGET = "diabetes_mellitus"
print("[imports] OK")

# Fail fast with an actionable message: the path is relative, so launching the
# notebook from another directory would otherwise surface as a less obvious
# error from inside the loader.
if not DATA_PATH.is_file():
    raise FileNotFoundError(
        f"Data file not found at {DATA_PATH.resolve()} - "
        "start the notebook from the project root or adjust DATA_PATH."
    )

df_raw = load_data(DATA_PATH)

# Every later step needs the outcome column; check it once, right after loading.
if TARGET not in df_raw.columns:
    raise KeyError(f"Target column {TARGET!r} missing from {DATA_PATH}")

print(f"[load] Shape={df_raw.shape}")
print(f"[load] Columns ({len(df_raw.columns)}): {list(df_raw.columns)}")
# NOTE(review): the column listing includes 'Unnamed: 0', which looks like a
# saved index column; it is not part of the feature list used later.
display(df_raw.head(3))



[env] CWD  : /Users/tizianschenk/Documents/BSE/Computing for Data Science/ComputingForDataScienceHW4/diabetes_project
[env] Py   : /Users/tizianschenk/Documents/BSE/Computing for Data Science/ComputingForDataScienceHW4/.venv/bin/python
[imports] OK
[load] Shape=(10000, 53)
[load] Columns (53): ['Unnamed: 0', 'encounter_id', 'hospital_id', 'age', 'bmi', 'elective_surgery', 'ethnicity', 'gender', 'height', 'hospital_admit_source', 'icu_admit_source', 'icu_id', 'icu_stay_type', 'icu_type', 'pre_icu_los_days', 'readmission_status', 'weight', 'albumin_apache', 'apache_2_diagnosis', 'apache_3j_diagnosis', 'apache_post_operative', 'arf_apache', 'bilirubin_apache', 'bun_apache', 'creatinine_apache', 'fio2_apache', 'gcs_eyes_apache', 'gcs_motor_apache', 'gcs_unable_apache', 'gcs_verbal_apache', 'glucose_apache', 'heart_rate_apache', 'hematocrit_apache', 'intubated_apache', 'map_apache', 'paco2_apache', 'paco2_for_ph_apache', 'pao2_apache', 'ph_apache', 'resprate_apache', 'sodium_apache', 'temp_

Unnamed: 0.1,Unnamed: 0,encounter_id,hospital_id,age,bmi,elective_surgery,ethnicity,gender,height,hospital_admit_source,...,ventilated_apache,wbc_apache,aids,cirrhosis,hepatic_failure,immunosuppression,leukemia,lymphoma,solid_tumor_with_metastasis,diabetes_mellitus
0,0,214826,118,68.0,22.732803,0,Caucasian,M,180.3,Floor,...,0,14.1,0,0,0,0,0,0,0,1
1,1,246060,81,77.0,27.421875,0,Caucasian,F,160.0,Floor,...,1,12.7,0,0,0,0,0,0,0,1
2,2,276985,118,25.0,31.952749,0,Caucasian,F,172.7,Emergency Department,...,0,,0,0,0,0,0,0,0,0


Clean and engineer features before split

In [11]:
# Cleaning/encoding pipeline applied to the full frame (before the split).
#
# .copy() detaches the filtered frame from df_raw. Without it, the column
# assignment inside fill_missing_values (`df[col] = df[col].fillna(...)`) is
# made on a slice and pandas emits SettingWithCopyWarning.
#
# NOTE(review): fill_missing_values imputes with column means computed here on
# all rows, i.e. before the train/test split, so test rows contribute to the
# imputation values. Consider fitting the imputation on the train split only.
df_clean = (
    drop_invalid_rows(df_raw)
    .copy()
    .pipe(fill_missing_values)
    .pipe(encode_ethnicity)
    .pipe(encode_gender)
)
print(f"[preprocess] After cleaning/encoding: {df_clean.shape}")
display(df_clean.head(3))


[preprocess] After cleaning/encoding: (9368, 57)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].fillna(df[col].mean())


Unnamed: 0.1,Unnamed: 0,encounter_id,hospital_id,age,bmi,elective_surgery,gender,height,hospital_admit_source,icu_admit_source,...,immunosuppression,leukemia,lymphoma,solid_tumor_with_metastasis,diabetes_mellitus,ethnicity_Asian,ethnicity_Caucasian,ethnicity_Hispanic,ethnicity_Native American,ethnicity_Other/Unknown
0,0,214826,118,68.0,22.732803,0,1,180.3,Floor,Floor,...,0,0,0,0,1,False,True,False,False,False
1,1,246060,81,77.0,27.421875,0,0,160.0,Floor,Floor,...,0,0,0,0,1,False,True,False,False,False
2,2,276985,118,25.0,31.952749,0,0,172.7,Emergency Department,Accident & Emergency,...,0,0,0,0,0,False,True,False,False,False


Create the train and test split

In [12]:
# Hold out 20% of the cleaned rows for evaluation; the fixed random_state
# keeps the split reproducible across runs.
train_df, test_df = modeling.split_data(df_clean, test_size=0.2, random_state=42)

# Sanity check: the two parts together must account for every cleaned row.
assert len(train_df) + len(test_df) == len(df_clean), (
    f"split lost or duplicated rows: {len(train_df)} + {len(test_df)} != {len(df_clean)}"
)
print(f"[split] Train={train_df.shape} | Test={test_df.shape}")


[split] Train=(7494, 57) | Test=(1874, 57)


Modeling features

In [13]:
# Assemble the model's input columns: the library's base feature list, the
# one-hot "ethnicity_*" columns present in df_clean, and "gender" when present.
base_features = list(modeling.FEATURES)
eth_cols = [c for c in df_clean.columns if c.startswith("ethnicity_")]
gender_cols = ["gender"] if "gender" in df_clean.columns else []

# dict.fromkeys removes duplicates while preserving order, in case
# modeling.FEATURES already lists one of the encoded columns.
features = list(dict.fromkeys(base_features + eth_cols + gender_cols))
print(f"[features] total={len(features)}")
print(features)

# The model is fitted in the dedicated training cell that follows; fitting it
# here as well only repeated the same work.


[features] total=16
['age', 'height', 'weight', 'aids', 'cirrhosis', 'hepatic_failure', 'immunosuppression', 'leukemia', 'lymphoma', 'solid_tumor_with_metastasis', 'ethnicity_Asian', 'ethnicity_Caucasian', 'ethnicity_Hispanic', 'ethnicity_Native American', 'ethnicity_Other/Unknown', 'gender']


Train the model on the training split

In [14]:
# Fit the model on the training split, restricted to the columns in `features`.
model = modeling.train_model(train_df, features)
print("[train] done")


[train] done


Run prediction

In [15]:
# Score both splits with the fitted model. Each split is copied first so that
# train_df / test_df themselves are not handed to add_predictions.
train_pred, test_pred = (
    modeling.add_predictions(split.copy(), model, features)
    for split in (train_df, test_df)
)

print("[predict] added 'predictions' to train/test")


[predict] added 'predictions' to train/test


Evaluate AUC on the train and test predictions

In [16]:
# Compute AUC for each scored split (train first, then test) and report both
# side by side so a train/test gap is easy to spot.
train_auc, test_auc = map(compute_auc, (train_pred, test_pred))
print(f"[AUC] train={train_auc:.3f} | test={test_auc:.3f}")


[AUC] train=0.689 | test=0.679
