This notebook involves new preprocessing steps (added feature engeneering, which was implemented on 28/11 at 21:10)   met de nieuwe stappen aangerdan door de ta 

In [1]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import pickle
from pathlib import Path

data_path = Path("../data/")   # full path to the HEF folder

train = pd.read_csv(data_path / "mimic_train_HEF.csv", low_memory=False)
test  = pd.read_csv(data_path / "mimic_test_HEF.csv",  low_memory=False)

train.shape, test.shape


((20885, 44), (5221, 39))

In [3]:
# =============================================================================
# ENHANCED PREPROCESSING - Hospital History + ICD9 Diagnoses
# Based on TA hints for significant improvement
# =============================================================================

import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import pickle
from pathlib import Path

data_path = Path("../data/")

print("="*70)
print("ENHANCED PREPROCESSING WITH HOSPITAL HISTORY + ICD9")
print("="*70)

# =============================================================================
# 1. LOAD ALL DATA SOURCES
# =============================================================================

print("\n--- Loading data sources ---")

# Main datasets (keep IDs this time!)
train = pd.read_csv(data_path / "mimic_train_HEF.csv", low_memory=False)
test = pd.read_csv(data_path / "mimic_test_HEF.csv", low_memory=False)

# Diagnoses
diagnoses = pd.read_csv(data_path / "extra_data" / "MIMIC_diagnoses.csv")

print(f"Train: {train.shape}")
print(f"Test: {test.shape}")
print(f"Diagnoses: {diagnoses.shape}")

# Save test IDs
test_ids = test['icustay_id'].copy()

# =============================================================================
# 2. CREATE HOSPITAL HISTORY FEATURES
# =============================================================================

print("\n" + "="*70)
print("CREATING HOSPITAL HISTORY FEATURES")
print("="*70)

def create_history_features(df):
    """
    Create features based on patient's hospital history
    """
    df = df.copy()
    
    # Check actual column names (they might be lowercase)
    subject_col = 'subject_id' if 'subject_id' in df.columns else 'SUBJECT_ID'
    hadm_col = 'hadm_id' if 'hadm_id' in df.columns else 'HADM_ID'
    admit_col = 'ADMITTIME' if 'ADMITTIME' in df.columns else 'admittime'
    
    # Sort by patient and admission time to get chronological order
    if admit_col in df.columns:
        df[admit_col] = pd.to_datetime(df[admit_col])
        df = df.sort_values([subject_col, admit_col])
    else:
        df = df.sort_values([subject_col, hadm_col])
    
    # --- Feature 1: Previous ICU stays for this patient ---
    df['n_previous_icu_stays'] = df.groupby(subject_col).cumcount()
    
    # --- Feature 2: Is this the patient's first ICU visit? ---
    df['is_first_icu_visit'] = (df['n_previous_icu_stays'] == 0).astype(int)
    
    # --- Feature 3: Previous hospital admissions ---
    # Count unique HADM_IDs before current row for each patient
    df['n_previous_admissions'] = df.groupby(subject_col)[hadm_col].transform(
        lambda x: pd.Series(range(len(x)), index=x.index).map(
            lambda i: x.iloc[:i].nunique() if i > 0 else 0
        )
    )
    
    # --- Feature 4: Time since last admission (if applicable) ---
    if admit_col in df.columns:
        df['days_since_last_admission'] = df.groupby(subject_col)[admit_col].diff().dt.total_seconds() / (24*3600)
        df['days_since_last_admission'] = df['days_since_last_admission'].fillna(-1)  # -1 for first visit
    
    # --- Feature 5: Frequent flyer indicator ---
    # Patients with 3+ ICU stays in dataset are high risk
    icu_counts = df.groupby(subject_col).size()
    frequent_flyers = icu_counts[icu_counts >= 3].index
    df['is_frequent_flyer'] = df[subject_col].isin(frequent_flyers).astype(int)
    
    print(f"\n  ‚úì Created history features:")
    print(f"    - n_previous_icu_stays (mean: {df['n_previous_icu_stays'].mean():.2f})")
    print(f"    - is_first_icu_visit ({df['is_first_icu_visit'].sum()} first visits)")
    print(f"    - n_previous_admissions (mean: {df['n_previous_admissions'].mean():.2f})")
    if 'days_since_last_admission' in df.columns:
        valid_days = df[df['days_since_last_admission'] >= 0]['days_since_last_admission']
        if len(valid_days) > 0:
            print(f"    - days_since_last_admission (median: {valid_days.median():.1f} days)")
    print(f"    - is_frequent_flyer ({df['is_frequent_flyer'].sum()} frequent flyers)")
    
    return df

# Create history features for train and test
train = create_history_features(train)
test = create_history_features(test)

# =============================================================================
# 3. CREATE ICD9 DIAGNOSIS FEATURES
# =============================================================================

print("\n" + "="*70)
print("CREATING ICD9 DIAGNOSIS FEATURES")
print("="*70)

# Check column names in diagnoses file (might also be lowercase)
if 'HADM_ID' not in diagnoses.columns and 'hadm_id' in diagnoses.columns:
    diagnoses.columns = diagnoses.columns.str.upper()

# Now use consistent column names
hadm_col = 'hadm_id' if 'hadm_id' in train.columns else 'HADM_ID'
diagnoses_hadm_col = 'HADM_ID' if 'HADM_ID' in diagnoses.columns else 'hadm_id'

# --- Feature 1: Number of diagnoses per admission ---
n_diagnoses = diagnoses.groupby(diagnoses_hadm_col).size().to_dict()
train['n_diagnoses'] = train[hadm_col].map(n_diagnoses).fillna(0)
test['n_diagnoses'] = test[hadm_col].map(n_diagnoses).fillna(0)

print(f"\n  ‚úì n_diagnoses (mean: {train['n_diagnoses'].mean():.1f})")

# --- Feature 2: Primary diagnosis (SEQ_NUM = 1) ---
seq_col = 'SEQ_NUM' if 'SEQ_NUM' in diagnoses.columns else 'seq_num'
icd_col = 'ICD9_CODE' if 'ICD9_CODE' in diagnoses.columns else 'icd9_code'

primary_diagnoses = diagnoses[diagnoses[seq_col] == 1].set_index(diagnoses_hadm_col)[icd_col]
train['primary_diagnosis'] = train[hadm_col].map(primary_diagnoses)
test['primary_diagnosis'] = test[hadm_col].map(primary_diagnoses)

print(f"  ‚úì primary_diagnosis ({train['primary_diagnosis'].notna().sum()} found)")

# --- Feature 3: Extract category (first 3 digits) ---
def extract_icd9_category(code):
    if pd.isna(code):
        return 'UNKNOWN'
    code_str = str(code).strip().replace('.', '')
    if len(code_str) >= 3:
        return code_str[:3]
    elif len(code_str) > 0:
        return code_str
    else:
        return 'UNKNOWN'

train['primary_diag_category'] = train['primary_diagnosis'].apply(extract_icd9_category)
test['primary_diag_category'] = test['primary_diagnosis'].apply(extract_icd9_category)

print(f"  ‚úì primary_diag_category ({train['primary_diag_category'].nunique()} categories)")

# --- Feature 4: Major disease categories ---
def get_major_category(code):
    if pd.isna(code):
        return 'UNKNOWN'
    code_str = str(code).strip().replace('.', '')
    if len(code_str) == 0:
        return 'UNKNOWN'
    
    first_digit = code_str[0]
    
    if first_digit in ['0', '1']:
        return 'INFECTIOUS'
    elif first_digit == '2':
        return 'NEOPLASM'
    elif first_digit == '3':
        return 'ENDOCRINE'
    elif first_digit == '4':
        return 'BLOOD'
    elif first_digit == '5':
        return 'MENTAL'
    elif first_digit in ['6', '7']:
        return 'NERVOUS'
    elif first_digit == '8':
        return 'CIRCULATORY'
    elif first_digit == '9':
        return 'RESPIRATORY'
    elif first_digit == 'V' or first_digit == 'E':
        return 'EXTERNAL'
    else:
        return 'OTHER'

train['primary_major_category'] = train['primary_diagnosis'].apply(get_major_category)
test['primary_major_category'] = test['primary_diagnosis'].apply(get_major_category)

print(f"  ‚úì primary_major_category:")
for cat, count in train['primary_major_category'].value_counts().head().items():
    print(f"    {cat}: {count} ({count/len(train)*100:.1f}%)")

# --- Feature 5: Condition flags ---
print("\n  Creating condition flags...")

hadm_diagnoses = diagnoses.groupby(diagnoses_hadm_col)[icd_col].apply(lambda x: set(x.astype(str)))

def check_conditions(hadm_id):
    if hadm_id not in hadm_diagnoses.index:
        return {
            'has_sepsis': 0,
            'has_heart_failure': 0,
            'has_resp_failure': 0,
            'has_aki': 0,
            'has_diabetes_comp': 0
        }
    
    codes = hadm_diagnoses[hadm_id]
    
    return {
        'has_sepsis': int(any(c.startswith(('99591', '99592', '78552')) for c in codes)),
        'has_heart_failure': int(any(c.startswith('428') for c in codes)),
        'has_resp_failure': int(any(c.startswith('518') for c in codes)),
        'has_aki': int(any(c.startswith('584') for c in codes)),
        'has_diabetes_comp': int(any(c.startswith('250') for c in codes))
    }

# Apply to train
condition_flags_train = train[hadm_col].apply(check_conditions).apply(pd.Series)
train = pd.concat([train, condition_flags_train], axis=1)

# Apply to test
condition_flags_test = test[hadm_col].apply(check_conditions).apply(pd.Series)
test = pd.concat([test, condition_flags_test], axis=1)

print(f"  ‚úì Condition flags:")
for col in ['has_sepsis', 'has_heart_failure', 'has_resp_failure', 'has_aki', 'has_diabetes_comp']:
    count = train[col].sum()
    print(f"    {col}: {count} ({count/len(train)*100:.1f}%)")
    
# =============================================================================
# 4. NOW CONTINUE WITH REGULAR PREPROCESSING
# =============================================================================

print("\n" + "="*70)
print("STANDARD PREPROCESSING")
print("="*70)

# Drop leakage columns (but keep subject_id, hadm_id for now)
columns_to_drop = [
    'DISCHTIME', 'DEATHTIME', 'DOD', 'LOS',
    'ADMITTIME', 'Diff',
    'icustay_id',  # Drop after using it
    'primary_diagnosis'  # Drop the raw diagnosis, keep encoded versions
]

train_clean = train.drop(columns=[c for c in columns_to_drop if c in train.columns], errors='ignore')
test_clean = test.drop(columns=[c for c in columns_to_drop if c in test.columns], errors='ignore')

# Separate target
y = train_clean['HOSPITAL_EXPIRE_FLAG']
X = train_clean.drop('HOSPITAL_EXPIRE_FLAG', axis=1)
X_test = test_clean.copy()

# Now we can drop IDs (we've used them for feature engineering)
X = X.drop(['SUBJECT_ID', 'HADM_ID'], axis=1, errors='ignore')
X_test = X_test.drop(['SUBJECT_ID', 'HADM_ID'], axis=1, errors='ignore')

print(f"\nAfter ID removal: {X.shape[1]} features")

# Identify feature types
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = X.select_dtypes(include=['object']).columns.tolist()

print(f"Numeric: {len(numeric_features)}, Categorical: {len(categorical_features)}")

# =============================================================================
# 5. IMPUTATION
# =============================================================================

print("\n--- Imputation ---")

numeric_imputer = SimpleImputer(strategy='median')
X[numeric_features] = numeric_imputer.fit_transform(X[numeric_features])
X_test[numeric_features] = numeric_imputer.transform(X_test[numeric_features])

categorical_imputer = SimpleImputer(strategy='most_frequent')
X[categorical_features] = categorical_imputer.fit_transform(X[categorical_features])
X_test[categorical_features] = categorical_imputer.transform(X_test[categorical_features])

print("‚úì Imputation complete")

# =============================================================================
# 6. HANDLE DOB ‚Üí age
# =============================================================================

print("\n--- Converting DOB to age ---")

if 'DOB' in categorical_features:
    train_original = pd.read_csv(data_path / 'mimic_train_HEF.csv')
    test_original = pd.read_csv(data_path / 'mimic_test_HEF.csv')
    
    dob_train = pd.to_datetime(X['DOB'], errors='coerce')
    dob_test = pd.to_datetime(X_test['DOB'], errors='coerce')
    admit_train = pd.to_datetime(train_original['ADMITTIME'], errors='coerce')
    admit_test = pd.to_datetime(test_original['ADMITTIME'], errors='coerce')
    
    def calculate_age(admit_time, dob):
        if pd.isna(admit_time) or pd.isna(dob):
            return np.nan
        try:
            age_days = (admit_time - dob).days
            age_years = age_days / 365.25
            return age_years
        except:
            return np.nan
    
    X['age'] = [calculate_age(admit, dob) for admit, dob in zip(admit_train, dob_train)]
    X_test['age'] = [calculate_age(admit, dob) for admit, dob in zip(admit_test, dob_test)]
    
    X['age'] = pd.to_numeric(X['age'], errors='coerce')
    X_test['age'] = pd.to_numeric(X_test['age'], errors='coerce')
    
    X.loc[(X['age'] < 0) | (X['age'] > 120), 'age'] = np.nan
    X_test.loc[(X_test['age'] < 0) | (X_test['age'] > 120), 'age'] = np.nan
    
    age_median = X['age'].median()
    X['age'].fillna(age_median, inplace=True)
    X_test['age'].fillna(age_median, inplace=True)
    
    X = X.drop('DOB', axis=1)
    X_test = X_test.drop('DOB', axis=1)
    categorical_features.remove('DOB')
    numeric_features.append('age')
    
    print(f"‚úì Age: {X['age'].min():.1f} - {X['age'].max():.1f} years")

# =============================================================================
# 7. TARGET ENCODE primary_diag_category (PROPERLY!)
# =============================================================================

print("\n--- Target encoding primary diagnosis ---")

if 'primary_diag_category' in X.columns:
    # Target encode using mortality rate per category
    encoding_map = y.groupby(X['primary_diag_category']).mean().to_dict()
    global_mean = y.mean()
    
    X['primary_diag_encoded'] = X['primary_diag_category'].map(encoding_map)
    X_test['primary_diag_encoded'] = X_test['primary_diag_category'].map(encoding_map).fillna(global_mean)
    
    numeric_features.append('primary_diag_encoded')
    
    X = X.drop('primary_diag_category', axis=1)
    X_test = X_test.drop('primary_diag_category', axis=1)
    categorical_features.remove('primary_diag_category')
    
    print(f"‚úì Primary diagnosis encoded (mortality rates: {X['primary_diag_encoded'].min():.3f} - {X['primary_diag_encoded'].max():.3f})")

# =============================================================================
# 8. HANDLE OTHER CATEGORICALS (same as before)
# =============================================================================

print("\n--- Processing categorical features ---")

# ICD9_diagnosis (from original data)
if 'ICD9_diagnosis' in X.columns:
    def extract_icd9_category(code):
        if pd.isna(code):
            return 'UNKNOWN'
        code_str = str(code).strip().replace('.', '')
        if len(code_str) >= 3:
            return code_str[:3]
        elif len(code_str) > 0:
            return code_str
        else:
            return 'UNKNOWN'
    
    X['ICD9_category'] = X['ICD9_diagnosis'].apply(extract_icd9_category)
    X_test['ICD9_category'] = X_test['ICD9_diagnosis'].apply(extract_icd9_category)
    
    encoding_map = y.groupby(X['ICD9_category']).mean().to_dict()
    
    X['ICD9_encoded'] = X['ICD9_category'].map(encoding_map)
    X_test['ICD9_encoded'] = X_test['ICD9_category'].map(encoding_map).fillna(global_mean)
    
    numeric_features.append('ICD9_encoded')
    
    X = X.drop(['ICD9_diagnosis', 'ICD9_category'], axis=1)
    X_test = X_test.drop(['ICD9_diagnosis', 'ICD9_category'], axis=1)
    categorical_features.remove('ICD9_diagnosis')
    
    print("‚úì ICD9_diagnosis encoded")

# Drop DIAGNOSIS (free text)
if 'DIAGNOSIS' in categorical_features:
    X = X.drop('DIAGNOSIS', axis=1)
    X_test = X_test.drop('DIAGNOSIS', axis=1)
    categorical_features.remove('DIAGNOSIS')

# Group ethnicity
if 'ETHNICITY' in categorical_features:
    def group_ethnicity(ethnicity):
        if pd.isna(ethnicity):
            return 'UNKNOWN'
        ethnicity = str(ethnicity).upper()
        if 'WHITE' in ethnicity:
            return 'WHITE'
        elif 'BLACK' in ethnicity or 'AFRICAN' in ethnicity:
            return 'BLACK'
        elif 'HISPANIC' in ethnicity or 'LATINO' in ethnicity:
            return 'HISPANIC'
        elif 'ASIAN' in ethnicity:
            return 'ASIAN'
        elif 'AMERICAN INDIAN' in ethnicity or 'ALASKA NATIVE' in ethnicity:
            return 'NATIVE'
        elif 'HAWAIIAN' in ethnicity or 'PACIFIC ISLANDER' in ethnicity:
            return 'PACIFIC_ISLANDER'
        elif any(x in ethnicity for x in ['UNKNOWN', 'UNABLE', 'DECLINED', 'NOT SPECIFIED']):
            return 'UNKNOWN'
        else:
            return 'OTHER'
    
    X['ETHNICITY'] = X['ETHNICITY'].apply(group_ethnicity)
    X_test['ETHNICITY'] = X_test['ETHNICITY'].apply(group_ethnicity)

# Group religion
if 'RELIGION' in categorical_features:
    def group_religion(religion):
        if pd.isna(religion):
            return 'UNKNOWN'
        religion = str(religion).upper()
        if 'CATHOLIC' in religion:
            return 'CATHOLIC'
        elif any(x in religion for x in ['PROTESTANT', 'EPISCOPALIAN', 'QUAKER']):
            return 'PROTESTANT'
        elif 'JEWISH' in religion or 'HEBREW' in religion:
            return 'JEWISH'
        elif 'MUSLIM' in religion:
            return 'MUSLIM'
        elif 'ORTHODOX' in religion:
            return 'ORTHODOX'
        elif any(x in religion for x in ['BUDDHIST', 'HINDU', 'JEHOVAH', 'CHRISTIAN SCIENTIST', 
                                          '7TH DAY ADVENTIST', 'UNITARIAN']):
            return 'OTHER_RELIGION'
        elif any(x in religion for x in ['UNOBTAINABLE', 'NOT SPECIFIED', 'UNKNOWN']):
            return 'UNKNOWN'
        else:
            return 'OTHER'
    
    X['RELIGION'] = X['RELIGION'].apply(group_religion)
    X_test['RELIGION'] = X_test['RELIGION'].apply(group_religion)

# Group marital status
if 'MARITAL_STATUS' in categorical_features:
    def group_marital_status(status):
        if pd.isna(status):
            return 'UNKNOWN'
        status = str(status).upper()
        if 'MARRIED' in status or 'LIFE PARTNER' in status:
            return 'MARRIED'
        elif 'SINGLE' in status:
            return 'SINGLE'
        elif 'WIDOWED' in status:
            return 'WIDOWED'
        elif 'DIVORCED' in status or 'SEPARATED' in status:
            return 'DIVORCED_SEPARATED'
        elif 'UNKNOWN' in status:
            return 'UNKNOWN'
        else:
            return 'UNKNOWN'
    
    X['MARITAL_STATUS'] = X['MARITAL_STATUS'].apply(group_marital_status)
    X_test['MARITAL_STATUS'] = X_test['MARITAL_STATUS'].apply(group_marital_status)

# One-hot encode major category
if 'primary_major_category' in X.columns:
    X_combined = pd.concat([X, X_test], keys=['train', 'test'])
    X_encoded = pd.get_dummies(X_combined, columns=['primary_major_category'], drop_first=True, prefix='diag')
    X = X_encoded.xs('train')
    X_test = X_encoded.xs('test')

# One-hot encode remaining
remaining_categorical = [col for col in categorical_features if col in X.columns]

if len(remaining_categorical) > 0:
    X_combined = pd.concat([X, X_test], keys=['train', 'test'])
    X_encoded = pd.get_dummies(X_combined, columns=remaining_categorical, drop_first=True)
    X = X_encoded.xs('train')
    X_test = X_encoded.xs('test')

print(f"‚úì Categorical encoding complete")

# =============================================================================
# 9. FEATURE ENGINEERING (same as before - vitals-based)
# =============================================================================

print("\n--- Feature engineering (vitals) ---")

original_features = X.shape[1]

# Blood pressure
if all(col in X.columns for col in ['SysBP_Mean', 'DiasBP_Mean']):
    X['PulsePressure'] = X['SysBP_Mean'] - X['DiasBP_Mean']
    X_test['PulsePressure'] = X_test['SysBP_Mean'] - X_test['DiasBP_Mean']

if all(col in X.columns for col in ['SysBP_Min', 'SysBP_Max']):
    X['SysBP_Range'] = X['SysBP_Max'] - X['SysBP_Min']
    X_test['SysBP_Range'] = X_test['SysBP_Max'] - X_test['SysBP_Min']

# Shock indices
if all(col in X.columns for col in ['HeartRate_Mean', 'SysBP_Mean']):
    X['ShockIndex'] = (X['HeartRate_Mean'] / (X['SysBP_Mean'] + 1)).clip(0, 3)
    X_test['ShockIndex'] = (X_test['HeartRate_Mean'] / (X_test['SysBP_Mean'] + 1)).clip(0, 3)

if all(col in X.columns for col in ['HeartRate_Mean', 'MeanBP_Mean']):
    X['ModifiedShockIndex'] = (X['HeartRate_Mean'] / (X['MeanBP_Mean'] + 1)).clip(0, 3)
    X_test['ModifiedShockIndex'] = (X_test['HeartRate_Mean'] / (X_test['MeanBP_Mean'] + 1)).clip(0, 3)

# Respiratory
if 'SpO2_Min' in X.columns:
    X['Hypoxemia'] = (X['SpO2_Min'] < 90).astype(int)
    X_test['Hypoxemia'] = (X_test['SpO2_Min'] < 90).astype(int)

if 'RespRate_Mean' in X.columns:
    X['RespRate_Abnormal'] = ((X['RespRate_Mean'] < 12) | (X['RespRate_Mean'] > 20)).astype(int)
    X_test['RespRate_Abnormal'] = ((X_test['RespRate_Mean'] < 12) | (X_test['RespRate_Mean'] > 20)).astype(int)

# Temperature
if 'TempC_Max' in X.columns:
    X['Fever'] = (X['TempC_Max'] > 38).astype(int)
    X_test['Fever'] = (X_test['TempC_Max'] > 38).astype(int)

if 'TempC_Min' in X.columns:
    X['Hypothermia'] = (X['TempC_Min'] < 36).astype(int)
    X_test['Hypothermia'] = (X_test['TempC_Min'] < 36).astype(int)

if all(col in X.columns for col in ['TempC_Min', 'TempC_Max']):
    X['Temp_Range'] = X['TempC_Max'] - X['TempC_Min']
    X_test['Temp_Range'] = X_test['TempC_Max'] - X_test['TempC_Min']

# Glucose
if 'Glucose_Max' in X.columns:
    X['Hyperglycemia'] = (X['Glucose_Max'] > 180).astype(int)
    X_test['Hyperglycemia'] = (X_test['Glucose_Max'] > 180).astype(int)

if 'Glucose_Min' in X.columns:
    X['Hypoglycemia'] = (X['Glucose_Min'] < 70).astype(int)
    X_test['Hypoglycemia'] = (X_test['Glucose_Min'] < 70).astype(int)

if all(col in X.columns for col in ['Glucose_Min', 'Glucose_Max']):
    X['Glucose_Range'] = X['Glucose_Max'] - X['Glucose_Min']
    X_test['Glucose_Range'] = X_test['Glucose_Max'] - X_test['Glucose_Min']

# Age
if 'age' in X.columns:
    X['Elderly'] = (X['age'] > 65).astype(int)
    X_test['Elderly'] = (X_test['age'] > 65).astype(int)
    
    X['age_squared'] = X['age'] ** 2
    X_test['age_squared'] = X_test['age'] ** 2
    
    # TA hint: more age features!
    X['age_risk_group'] = pd.cut(X['age'], 
                                   bins=[0, 18, 45, 65, 80, 120],
                                   labels=['pediatric', 'young_adult', 'middle_age', 'elderly', 'very_old'])
    X_test['age_risk_group'] = pd.cut(X_test['age'],
                                        bins=[0, 18, 45, 65, 80, 120],
                                        labels=['pediatric', 'young_adult', 'middle_age', 'elderly', 'very_old'])
    
    # One-hot encode age groups
    X = pd.get_dummies(X, columns=['age_risk_group'], drop_first=True, prefix='age')
    X_test = pd.get_dummies(X_test, columns=['age_risk_group'], drop_first=True, prefix='age')

# Heart rate
if all(col in X.columns for col in ['HeartRate_Min', 'HeartRate_Max']):
    X['HeartRate_Range'] = X['HeartRate_Max'] - X['HeartRate_Min']
    X_test['HeartRate_Range'] = X_test['HeartRate_Max'] - X_test['HeartRate_Min']

# Severity score
severity_components = []
if 'ShockIndex' in X.columns:
    severity_components.append((X['ShockIndex'] > 0.9).astype(int))
if 'Hypoxemia' in X.columns:
    severity_components.append(X['Hypoxemia'])
if 'RespRate_Abnormal' in X.columns:
    severity_components.append(X['RespRate_Abnormal'])
if 'Fever' in X.columns:
    severity_components.append(X['Fever'])
if 'Hypothermia' in X.columns:
    severity_components.append(X['Hypothermia'])

if severity_components:
    X['Severity_Score'] = sum(severity_components)
    severity_components_test = []
if 'ShockIndex' in X_test.columns:
    severity_components_test.append((X_test['ShockIndex'] > 0.9).astype(int))
if 'Hypoxemia' in X_test.columns:
    severity_components_test.append(X_test['Hypoxemia'])
if 'RespRate_Abnormal' in X_test.columns:
    severity_components_test.append(X_test['RespRate_Abnormal'])
if 'Fever' in X_test.columns:
    severity_components_test.append(X_test['Fever'])
if 'Hypothermia' in X_test.columns:
    severity_components_test.append(X_test['Hypothermia'])

X_test['Severity_Score'] = sum(severity_components_test)
print(f"‚úì Added {X.shape[1] - original_features} engineered features")
#=============================================================================
#10. SCALING
#=============================================================================
print("\n--- Scaling ---")
all_numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
binary_features = [
'Hypoxemia', 'RespRate_Abnormal', 'Fever', 'Hypothermia',
'Hyperglycemia', 'Hypoglycemia', 'Elderly',
'is_first_icu_visit', 'is_frequent_flyer',
'has_sepsis', 'has_heart_failure', 'has_resp_failure', 'has_aki', 'has_diabetes_comp'
]
one_hot_features = [col for col in X.columns if ('_' in col and X[col].nunique() <= 2)]
exclude_from_scaling = binary_features + one_hot_features
features_to_scale = [col for col in all_numeric_cols if col not in exclude_from_scaling]
scaler = StandardScaler()
X[features_to_scale] = scaler.fit_transform(X[features_to_scale])
X_test[features_to_scale] = scaler.transform(X_test[features_to_scale])
print(f"‚úì Scaled {len(features_to_scale)} continuous features")
print(f"‚úì Left {len(exclude_from_scaling)} binary features unscaled")
#=============================================================================
#11. SAVE
#=============================================================================
print("\n--- Saving ---")
import os
os.makedirs('../data/processed_enhanced', exist_ok=True)
X.to_pickle('../data/processed_enhanced/X_train_processed.pkl')
y.to_pickle('../data/processed_enhanced/y_train.pkl')
X_test.to_pickle('../data/processed_enhanced/X_test_processed.pkl')
test_ids.to_pickle('../data/processed_enhanced/test_ids.pkl')
with open('../data/processed_enhanced/numeric_imputer.pkl', 'wb') as f:
    pickle.dump(numeric_imputer, f)
with open('../data/processed_enhanced/categorical_imputer.pkl', 'wb') as f:
    pickle.dump(categorical_imputer, f)
with open('../data/processed_enhanced/scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)
print("‚úì Saved to ../data/processed_enhanced/")
print("\n" + "="*70)
print("PREPROCESSING COMPLETE!")
print("="*70)
print(f"\nFinal feature count: {X.shape[1]}")
print(f"  Original vitals: ~35")
print(f"  Hospital history: ~5")
print(f"  ICD9 diagnoses: ~6")
print(f"  Condition flags: 5")
print(f"  Engineered: ~20")
print(f"  One-hot encoded: ~{X.shape[1] - 71}")
print("\nüéØ Ready to train models with enhanced features!")
print("   Expected improvement: significant boost from hospital history + diagnoses")

ENHANCED PREPROCESSING WITH HOSPITAL HISTORY + ICD9

--- Loading data sources ---
Train: (20885, 44)
Test: (5221, 39)
Diagnoses: (651047, 4)

CREATING HOSPITAL HISTORY FEATURES

  ‚úì Created history features:
    - n_previous_icu_stays (mean: 0.40)
    - is_first_icu_visit (16317 first visits)
    - n_previous_admissions (mean: 0.37)
    - days_since_last_admission (median: 46.6 days)
    - is_frequent_flyer (3388 frequent flyers)

  ‚úì Created history features:
    - n_previous_icu_stays (mean: 0.09)
    - is_first_icu_visit (4847 first visits)
    - n_previous_admissions (mean: 0.09)
    - days_since_last_admission (median: 93.5 days)
    - is_frequent_flyer (172 frequent flyers)

CREATING ICD9 DIAGNOSIS FEATURES

  ‚úì n_diagnoses (mean: 14.8)
  ‚úì primary_diagnosis (20885 found)
  ‚úì primary_diag_category (530 categories)
  ‚úì primary_major_category:
    BLOOD: 7507 (35.9%)
    MENTAL: 3912 (18.7%)
    INFECTIOUS: 3208 (15.4%)
    CIRCULATORY: 1795 (8.6%)
    RESPIRATORY: 1578

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X['age'].fillna(age_median, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_test['age'].fillna(age_median, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in 

‚úì Age: 0.0 - 120.0 years

--- Target encoding primary diagnosis ---
‚úì Primary diagnosis encoded (mortality rates: 0.000 - 1.000)

--- Processing categorical features ---
‚úì ICD9_diagnosis encoded
‚úì Categorical encoding complete

--- Feature engineering (vitals) ---
‚úì Added 20 engineered features

--- Scaling ---
‚úì Scaled 42 continuous features
‚úì Left 63 binary features unscaled

--- Saving ---
‚úì Saved to ../data/processed_enhanced/

PREPROCESSING COMPLETE!

Final feature count: 97
  Original vitals: ~35
  Hospital history: ~5
  ICD9 diagnoses: ~6
  Condition flags: 5
  Engineered: ~20
  One-hot encoded: ~26

üéØ Ready to train models with enhanced features!
   Expected improvement: significant boost from hospital history + diagnoses


In [2]:
# =============================================================================
# SAVE TEST IDs FIRST (BEFORE DROPPING!)
# =============================================================================

test_ids = test['icustay_id'].copy()
print(f"\n‚úì Saved {len(test_ids)} test IDs for submission")


‚úì Saved 5221 test IDs for submission


In [3]:
# =============================================================================
# 2. DROP LEAKAGE COLUMNS
# =============================================================================
columns_to_drop = [
    'DISCHTIME', 'DEATHTIME', 'DOD', 'LOS',
    'subject_id', 'hadm_id', 'icustay_id',
    'ADMITTIME', 'Diff'
]

train_clean = train.drop(columns=columns_to_drop, errors='ignore')
test_clean = test.drop(columns=columns_to_drop, errors='ignore')

In [4]:
# =============================================================================
# 3. SEPARATE TARGET
# =============================================================================
y = train_clean['HOSPITAL_EXPIRE_FLAG']
X = train_clean.drop('HOSPITAL_EXPIRE_FLAG', axis=1)
X_test = test_clean.copy()

In [5]:
# =============================================================================
# 4. IDENTIFY FEATURE TYPES
# =============================================================================
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = X.select_dtypes(include=['object']).columns.tolist()

print(f"\nNumeric features: {len(numeric_features)}")
print(f"Categorical features: {len(categorical_features)}")


Numeric features: 24
Categorical features: 10


In [6]:
# =============================================================================
# 5. IMPUTATION
# =============================================================================
print("\n--- Imputing missing values ---")

# Numeric
numeric_imputer = SimpleImputer(strategy='median')
X[numeric_features] = numeric_imputer.fit_transform(X[numeric_features])
X_test[numeric_features] = numeric_imputer.transform(X_test[numeric_features])

# Categorical
categorical_imputer = SimpleImputer(strategy='most_frequent')
X[categorical_features] = categorical_imputer.fit_transform(X[categorical_features])
X_test[categorical_features] = categorical_imputer.transform(X_test[categorical_features])


--- Imputing missing values ---


In [7]:
# -----------------------------------------------------------------------------
# STEP 1: Convert DOB to age (handling MIMIC-III date shifting)
# -----------------------------------------------------------------------------
print("\n--- Step 1: Converting DOB to age ---")

if 'DOB' in X.columns and 'DOB' in categorical_features:
    # MIMIC-III shifts all dates forward by ~200 years for anonymization
    # But the RELATIVE age is preserved
    # Strategy: Calculate age = ADMITTIME - DOB
    
    # Reload original data to get ADMITTIME
    train_original = pd.read_csv('../data/mimic_train_HEF.csv')
    test_original = pd.read_csv('../data/mimic_test_HEF.csv')
    
    # Convert to datetime
    dob_train = pd.to_datetime(X['DOB'], errors='coerce')
    dob_test = pd.to_datetime(X_test['DOB'], errors='coerce')
    admit_train = pd.to_datetime(train_original['ADMITTIME'], errors='coerce')
    admit_test = pd.to_datetime(test_original['ADMITTIME'], errors='coerce')
    
    # Calculate age using timedelta and convert to years
    # Use .apply() to avoid overflow
    def calculate_age(admit_time, dob):
        if pd.isna(admit_time) or pd.isna(dob):
            return np.nan
        try:
            # Calculate difference in days, then convert to years
            age_days = (admit_time - dob).days
            age_years = age_days / 365.25
            return age_years
        except:
            return np.nan
    
    # Calculate age for train
    X['age'] = [calculate_age(admit, dob) for admit, dob in zip(admit_train, dob_train)]
    
    # Calculate age for test
    X_test['age'] = [calculate_age(admit, dob) for admit, dob in zip(admit_test, dob_test)]
    
    # Convert to numeric (in case of any issues)
    X['age'] = pd.to_numeric(X['age'], errors='coerce')
    X_test['age'] = pd.to_numeric(X_test['age'], errors='coerce')
    
    # Clean up
    X = X.drop('DOB', axis=1)
    X_test = X_test.drop('DOB', axis=1)
    categorical_features.remove('DOB')
    
    print(f"  ‚úì Calculated age from DOB and ADMITTIME")
    print(f"    Age range: {X['age'].min():.1f} - {X['age'].max():.1f} years")
    print(f"    Mean age: {X['age'].mean():.1f} years")
    print(f"    Missing ages: {X['age'].isna().sum()}")
    
    # Sanity check: ages should be reasonable (0-120 years)
    if X['age'].max() > 120 or X['age'].min() < 0:
        print(f"    ‚ö†Ô∏è WARNING: Unusual age range detected!")
        print(f"    Sample ages: {X['age'].head(10).tolist()}")
    
    # Handle missing or invalid ages
    if X['age'].isna().sum() > 0 or (X['age'] < 0).any() or (X['age'] > 120).any():
        # Set invalid ages to NaN
        X.loc[(X['age'] < 0) | (X['age'] > 120), 'age'] = np.nan
        X_test.loc[(X_test['age'] < 0) | (X_test['age'] > 120), 'age'] = np.nan
        
        # Impute with median
        age_median = X['age'].median()
        X['age'].fillna(age_median, inplace=True)
        X_test['age'].fillna(age_median, inplace=True)
        print(f"    ‚úì Imputed invalid ages with median: {age_median:.1f}")
    
    # Add to numeric features for scaling later
    if 'age' not in numeric_features:
        numeric_features.append('age')


--- Step 1: Converting DOB to age ---
  ‚úì Calculated age from DOB and ADMITTIME
    Age range: 15.0 - 89.0 years
    Mean age: 62.7 years
    Missing ages: 1107
    ‚úì Imputed invalid ages with median: 64.5


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X['age'].fillna(age_median, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_test['age'].fillna(age_median, inplace=True)


In [8]:
# =============================================================================
# 5.5. DIAGNOSE HIGH-CARDINALITY CATEGORICAL FEATURES
# =============================================================================
print("\n" + "="*70)
print("DIAGNOSING CATEGORICAL FEATURES")
print("="*70)

print(f"\nTotal categorical features: {len(categorical_features)}")

# Check cardinality (number of unique values) for each categorical feature
for cat_col in categorical_features:
    n_unique = X[cat_col].nunique()
    print(f"  {cat_col}: {n_unique} unique values")
    
    # Show distribution if few unique values
    if n_unique <= 10:
        print(f"    Distribution: {X[cat_col].value_counts().to_dict()}")
    else:
        print(f"    ‚ö†Ô∏è HIGH CARDINALITY - will create {n_unique} one-hot columns!")

# Estimate final feature count after one-hot encoding
estimated_features = len(numeric_features)
for cat_col in categorical_features:
    estimated_features += X[cat_col].nunique() - 1  # -1 because drop_first=True

print(f"\n‚ö†Ô∏è ESTIMATED TOTAL FEATURES AFTER ONE-HOT ENCODING: {estimated_features}")
print(f"Current numeric features: {len(numeric_features)}")


DIAGNOSING CATEGORICAL FEATURES

Total categorical features: 9
  GENDER: 2 unique values
    Distribution: {'M': 11759, 'F': 9126}
  ADMISSION_TYPE: 3 unique values
    Distribution: {'EMERGENCY': 17817, 'ELECTIVE': 2848, 'URGENT': 220}
  INSURANCE: 5 unique values
    Distribution: {'Medicare': 11718, 'Private': 6245, 'Medicaid': 2117, 'Government': 611, 'Self Pay': 194}
  RELIGION: 17 unique values
    ‚ö†Ô∏è HIGH CARDINALITY - will create 17 one-hot columns!
  MARITAL_STATUS: 7 unique values
    Distribution: {'MARRIED': 10386, 'SINGLE': 5910, 'WIDOWED': 2819, 'DIVORCED': 1413, 'SEPARATED': 240, 'UNKNOWN (DEFAULT)': 103, 'LIFE PARTNER': 14}
  ETHNICITY: 41 unique values
    ‚ö†Ô∏è HIGH CARDINALITY - will create 41 one-hot columns!
  DIAGNOSIS: 6193 unique values
    ‚ö†Ô∏è HIGH CARDINALITY - will create 6193 one-hot columns!
  ICD9_diagnosis: 1853 unique values
    ‚ö†Ô∏è HIGH CARDINALITY - will create 1853 one-hot columns!
  FIRST_CAREUNIT: 5 unique values
    Distribution: {'MICU

In [9]:
# =============================================================================
# 7. SMART CATEGORICAL ENCODING
# =============================================================================
print("\n" + "="*70)
print("SMART CATEGORICAL ENCODING")
print("="*70)

# -----------------------------------------------------------------------------
# STEP 2: Handle ICD9_diagnosis (extract category, then target encode)
# -----------------------------------------------------------------------------
print("\n--- Step 2: Processing ICD9_diagnosis codes ---")

if 'ICD9_diagnosis' in X.columns:
    # ICD9 codes have hierarchical structure:
    # First digit = broad category (e.g., 4XX = circulatory system)
    # First 3 digits = more specific category
    
    def extract_icd9_category(code):
        """Extract first 3 characters from ICD9 code"""
        if pd.isna(code):
            return 'UNKNOWN'
        code_str = str(code).strip()
        # Remove decimal point and take first 3 characters
        code_str = code_str.replace('.', '')
        if len(code_str) >= 3:
            return code_str[:3]
        elif len(code_str) > 0:
            return code_str
        else:
            return 'UNKNOWN'
    
    X['ICD9_category'] = X['ICD9_diagnosis'].apply(extract_icd9_category)
    X_test['ICD9_category'] = X_test['ICD9_diagnosis'].apply(extract_icd9_category)
    
    n_icd9_categories = X['ICD9_category'].nunique()
    print(f"  ‚úì Extracted ICD9 categories: {n_icd9_categories} unique categories")
    
    # Target encode (because still likely 100+ categories)
    print(f"  ‚Üí Using target encoding for ICD9 categories")
    encoding_map = y.groupby(X['ICD9_category']).mean().to_dict()
    global_mean = y.mean()
    
    X['ICD9_encoded'] = X['ICD9_category'].map(encoding_map)
    X_test['ICD9_encoded'] = X_test['ICD9_category'].map(encoding_map).fillna(global_mean)
    
    # Add to numeric features (target encoding creates numeric feature)
    numeric_features.append('ICD9_encoded')
    
    # Drop originals
    X = X.drop(['ICD9_diagnosis', 'ICD9_category'], axis=1)
    X_test = X_test.drop(['ICD9_diagnosis', 'ICD9_category'], axis=1)
    categorical_features.remove('ICD9_diagnosis')
    
    print(f"  ‚úì ICD9_diagnosis ‚Üí ICD9_encoded (numeric)")

# -----------------------------------------------------------------------------
# STEP 3: Handle DIAGNOSIS (free text - extract keywords or drop)
# -----------------------------------------------------------------------------
print("\n--- Step 3: Processing DIAGNOSIS (free text) ---")

if 'DIAGNOSIS' in X.columns:
    # Option 1: Drop it (safest - free text is very high cardinality)
    # Option 2: Extract common keywords (more complex)
    
    # For now, let's DROP it to keep things simple
    # (We already have ICD9 codes which are more structured)
    
    print(f"  ‚úì Dropping DIAGNOSIS (free text, {X['DIAGNOSIS'].nunique()} unique values)")
    print(f"    ‚Üí Keeping ICD9_encoded instead (more structured)")
    
    X = X.drop('DIAGNOSIS', axis=1)
    X_test = X_test.drop('DIAGNOSIS', axis=1)
    categorical_features.remove('DIAGNOSIS')

# -----------------------------------------------------------------------------
# STEP 4: Group ETHNICITY into broader categories
# -----------------------------------------------------------------------------
print("\n--- Step 4: Grouping ETHNICITY ---")

if 'ETHNICITY' in X.columns:
    def group_ethnicity(ethnicity):
        if pd.isna(ethnicity):
            return 'UNKNOWN'
        ethnicity = str(ethnicity).upper()
        
        # WHITE (includes variants like WHITE - RUSSIAN, WHITE - BRAZILIAN, etc.)
        if 'WHITE' in ethnicity:
            return 'WHITE'
        
        # BLACK (includes BLACK/AFRICAN AMERICAN, BLACK/HAITIAN, BLACK/CAPE VERDEAN, etc.)
        elif 'BLACK' in ethnicity or 'AFRICAN' in ethnicity:
            return 'BLACK'
        
        # HISPANIC/LATINO (all variants)
        elif 'HISPANIC' in ethnicity or 'LATINO' in ethnicity:
            return 'HISPANIC'
        
        # ASIAN (includes ASIAN - CHINESE, ASIAN - VIETNAMESE, etc.)
        elif 'ASIAN' in ethnicity:
            return 'ASIAN'
        
        # NATIVE/INDIGENOUS (American Indian/Alaska Native)
        elif 'AMERICAN INDIAN' in ethnicity or 'ALASKA NATIVE' in ethnicity:
            return 'NATIVE'
        
        # PACIFIC ISLANDER
        elif 'HAWAIIAN' in ethnicity or 'PACIFIC ISLANDER' in ethnicity:
            return 'PACIFIC_ISLANDER'
        
        # UNKNOWN/NOT SPECIFIED/DECLINED
        elif any(x in ethnicity for x in ['UNKNOWN', 'UNABLE', 'DECLINED', 'NOT SPECIFIED']):
            return 'UNKNOWN'
        
        # OTHER (includes MULTI RACE, MIDDLE EASTERN, CARIBBEAN, PORTUGUESE, etc.)
        else:
            return 'OTHER'
    
    X['ETHNICITY'] = X['ETHNICITY'].apply(group_ethnicity)
    X_test['ETHNICITY'] = X_test['ETHNICITY'].apply(group_ethnicity)
    
    print(f"  ‚úì Grouped ETHNICITY: 41 ‚Üí {X['ETHNICITY'].nunique()} categories")
    print(f"    New categories: {sorted(X['ETHNICITY'].unique())}")
    print(f"    Distribution:")
    for cat, count in X['ETHNICITY'].value_counts().items():
        print(f"      {cat}: {count} ({count/len(X)*100:.1f}%)")

# -----------------------------------------------------------------------------
# STEP 5: Group RELIGION into broader categories
# -----------------------------------------------------------------------------
print("\n--- Step 5: Grouping RELIGION ---")

if 'RELIGION' in X.columns:
    def group_religion(religion):
        if pd.isna(religion):
            return 'UNKNOWN'
        religion = str(religion).upper()
        
        # CATHOLIC
        if 'CATHOLIC' in religion:
            return 'CATHOLIC'
        
        # PROTESTANT/CHRISTIAN (includes PROTESTANT QUAKER, EPISCOPALIAN, etc.)
        elif any(x in religion for x in ['PROTESTANT', 'EPISCOPALIAN', 'QUAKER']):
            return 'PROTESTANT'
        
        # JEWISH (includes HEBREW)
        elif 'JEWISH' in religion or 'HEBREW' in religion:
            return 'JEWISH'
        
        # MUSLIM
        elif 'MUSLIM' in religion:
            return 'MUSLIM'
        
        # ORTHODOX (GREEK ORTHODOX, ROMANIAN ORTHODOX)
        elif 'ORTHODOX' in religion:
            return 'ORTHODOX'
        
        # OTHER RELIGIONS (Buddhist, Hindu, Christian Scientist, Jehovah's Witness, etc.)
        elif any(x in religion for x in ['BUDDHIST', 'HINDU', 'JEHOVAH', 'CHRISTIAN SCIENTIST', 
                                          '7TH DAY ADVENTIST', 'UNITARIAN']):
            return 'OTHER_RELIGION'
        
        # UNKNOWN/NOT SPECIFIED
        elif any(x in religion for x in ['UNOBTAINABLE', 'NOT SPECIFIED', 'UNKNOWN']):
            return 'UNKNOWN'
        
        # OTHER
        else:
            return 'OTHER'
    
    X['RELIGION'] = X['RELIGION'].apply(group_religion)
    X_test['RELIGION'] = X_test['RELIGION'].apply(group_religion)
    
    print(f"  ‚úì Grouped RELIGION: 17 ‚Üí {X['RELIGION'].nunique()} categories")
    print(f"    New categories: {sorted(X['RELIGION'].unique())}")
    print(f"    Distribution:")
    for cat, count in X['RELIGION'].value_counts().items():
        print(f"      {cat}: {count} ({count/len(X)*100:.1f}%)")

# -----------------------------------------------------------------------------
# STEP 6: Group MARITAL_STATUS
# -----------------------------------------------------------------------------
print("\n--- Step 6: Grouping MARITAL_STATUS ---")

if 'MARITAL_STATUS' in X.columns:
    def group_marital_status(status):
        if pd.isna(status):
            return 'UNKNOWN'
        status = str(status).upper()
        
        # MARRIED (includes LIFE PARTNER)
        if 'MARRIED' in status or 'LIFE PARTNER' in status:
            return 'MARRIED'
        
        # SINGLE
        elif 'SINGLE' in status:
            return 'SINGLE'
        
        # WIDOWED
        elif 'WIDOWED' in status:
            return 'WIDOWED'
        
        # DIVORCED/SEPARATED (group together - both indicate ended relationship)
        elif 'DIVORCED' in status or 'SEPARATED' in status:
            return 'DIVORCED_SEPARATED'
        
        # UNKNOWN
        elif 'UNKNOWN' in status:
            return 'UNKNOWN'
        
        else:
            return 'UNKNOWN'
    
    X['MARITAL_STATUS'] = X['MARITAL_STATUS'].apply(group_marital_status)
    X_test['MARITAL_STATUS'] = X_test['MARITAL_STATUS'].apply(group_marital_status)
    
    print(f"  ‚úì Grouped MARITAL_STATUS: 7 ‚Üí {X['MARITAL_STATUS'].nunique()} categories")
    print(f"    New categories: {sorted(X['MARITAL_STATUS'].unique())}")
    print(f"    Distribution:")
    for cat, count in X['MARITAL_STATUS'].value_counts().items():
        print(f"      {cat}: {count} ({count/len(X)*100:.1f}%)")
# -----------------------------------------------------------------------------
# STEP 7: One-hot encode remaining low-cardinality features
# -----------------------------------------------------------------------------
print("\n--- Step 7: One-hot encoding remaining categorical features ---")

# Update categorical_features list
remaining_categorical = [col for col in categorical_features if col in X.columns]
print(f"\nFeatures to one-hot encode ({len(remaining_categorical)}):")

# Verify cardinality
total_new_features = 0
for col in remaining_categorical:
    n_unique = X[col].nunique()
    total_new_features += (n_unique - 1)  # drop_first=True
    print(f"  {col}: {n_unique} categories ‚Üí {n_unique-1} binary features")

print(f"\nEstimated new binary features from one-hot encoding: {total_new_features}")

if len(remaining_categorical) > 0:
    # One-hot encode
    X_combined = pd.concat([X, X_test], keys=['train', 'test'])
    X_encoded = pd.get_dummies(X_combined, columns=remaining_categorical, drop_first=True)
    X = X_encoded.xs('train')
    X_test = X_encoded.xs('test')
    
    print(f"‚úì One-hot encoding complete")

# -----------------------------------------------------------------------------
# FINAL SUMMARY
# -----------------------------------------------------------------------------
print("\n" + "="*70)
print("ENCODING COMPLETE - SUMMARY")
print("="*70)
print(f"Original numeric features: {len(numeric_features)}")
print(f"Target-encoded features: 1 (ICD9_encoded)")
print(f"Binary features from one-hot encoding: {total_new_features}")
print(f"\nFinal shapes:")
print(f"  X: {X.shape}")
print(f"  X_test: {X_test.shape}")
print(f"  Total features: {X.shape[1]}")

# Check if reasonable
if X.shape[1] > 200:
    print(f"\n‚ö†Ô∏è WARNING: {X.shape[1]} features might still be too many")
    print("Consider more aggressive grouping or feature selection")
elif X.shape[1] < 50:
    print(f"\n‚ö†Ô∏è WARNING: Only {X.shape[1]} features - might be too few")
    print("Consider keeping more granular categories")
else:
    print(f"\n‚úì Feature count looks good ({X.shape[1]} features)")

# Show a sample of the final feature names
print(f"\nSample of final features (first 20):")
print(list(X.columns[:20]))


SMART CATEGORICAL ENCODING

--- Step 2: Processing ICD9_diagnosis codes ---
  ‚úì Extracted ICD9 categories: 530 unique categories
  ‚Üí Using target encoding for ICD9 categories
  ‚úì ICD9_diagnosis ‚Üí ICD9_encoded (numeric)

--- Step 3: Processing DIAGNOSIS (free text) ---
  ‚úì Dropping DIAGNOSIS (free text, 6193 unique values)
    ‚Üí Keeping ICD9_encoded instead (more structured)

--- Step 4: Grouping ETHNICITY ---
  ‚úì Grouped ETHNICITY: 41 ‚Üí 8 categories
    New categories: ['ASIAN', 'BLACK', 'HISPANIC', 'NATIVE', 'OTHER', 'PACIFIC_ISLANDER', 'UNKNOWN', 'WHITE']
    Distribution:
      WHITE: 15330 (73.4%)
      BLACK: 2201 (10.5%)
      UNKNOWN: 1320 (6.3%)
      HISPANIC: 852 (4.1%)
      OTHER: 616 (2.9%)
      ASIAN: 545 (2.6%)
      NATIVE: 15 (0.1%)
      PACIFIC_ISLANDER: 6 (0.0%)

--- Step 5: Grouping RELIGION ---
  ‚úì Grouped RELIGION: 17 ‚Üí 8 categories
    New categories: ['CATHOLIC', 'JEWISH', 'MUSLIM', 'ORTHODOX', 'OTHER', 'OTHER_RELIGION', 'PROTESTANT', 'UNK

In [10]:
# =============================================================================
# 7.5. TRANSFORM SKEWED FEATURES
# =============================================================================
print("\n" + "="*70)
print("TRANSFORMING SKEWED FEATURES")
print("="*70)

# FIRST: Create glucose binary indicators BEFORE transformation
print("\n--- Creating glucose indicators (before transformation) ---")

if 'Glucose_Max' in X.columns:
    X['Hyperglycemia'] = (X['Glucose_Max'] > 180).astype(int)
    X_test['Hyperglycemia'] = (X_test['Glucose_Max'] > 180).astype(int)
    print(f"  ‚úì Hyperglycemia: {X['Hyperglycemia'].sum()} cases ({X['Hyperglycemia'].mean()*100:.1f}%)")

if 'Glucose_Min' in X.columns:
    X['Hypoglycemia'] = (X['Glucose_Min'] < 70).astype(int)
    X_test['Hypoglycemia'] = (X_test['Glucose_Min'] < 70).astype(int)
    print(f"  ‚úì Hypoglycemia: {X['Hypoglycemia'].sum()} cases ({X['Hypoglycemia'].mean()*100:.1f}%)")


# Apply log transformation to highly skewed features
# This helps models learn better from skewed distributions

skewed_features_to_transform = [
    'Glucose_Max', 'Glucose_Mean', 'Glucose_Min', 'Glucose_Range',
    'MeanBP_Max', 'Temp_Range', 'age_squared'
]

for feat in skewed_features_to_transform:
    if feat in X.columns:
        # Add 1 to handle any zeros, then log transform
        X[feat] = np.log1p(X[feat] - X[feat].min() + 1)
        X_test[feat] = np.log1p(X_test[feat] - X_test[feat].min() + 1)
        print(f"  ‚úì Log-transformed {feat}")

print(f"\nTransformed {len([f for f in skewed_features_to_transform if f in X.columns])} skewed features")


TRANSFORMING SKEWED FEATURES

--- Creating glucose indicators (before transformation) ---
  ‚úì Hyperglycemia: 7537 cases (36.1%)
  ‚úì Hypoglycemia: 1722 cases (8.2%)
  ‚úì Log-transformed Glucose_Max
  ‚úì Log-transformed Glucose_Mean
  ‚úì Log-transformed Glucose_Min
  ‚úì Log-transformed MeanBP_Max

Transformed 4 skewed features


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['Hyperglycemia'] = (X['Glucose_Max'] > 180).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test['Hyperglycemia'] = (X_test['Glucose_Max'] > 180).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['Hypoglycemia'] = (X['Glucose_Min'] < 70).astype(int)
A value is trying to be

In [11]:
# =============================================================================
# 8.5. FEATURE ENGINEERING - MEDICAL DOMAIN KNOWLEDGE
# =============================================================================
print("\n" + "="*70)
print("FEATURE ENGINEERING")
print("="*70)

original_feature_count = X.shape[1]

# -----------------------------------------------------------------------------
# Blood Pressure Features
# -----------------------------------------------------------------------------
print("\n--- Blood Pressure Features ---")

# Pulse Pressure (SysBP - DiasBP) - cardiovascular health indicator
if all(col in X.columns for col in ['SysBP_Mean', 'DiasBP_Mean']):
    X['PulsePressure'] = X['SysBP_Mean'] - X['DiasBP_Mean']
    X_test['PulsePressure'] = X_test['SysBP_Mean'] - X_test['DiasBP_Mean']
    print("  ‚úì Pulse pressure")

# Blood pressure variability
if all(col in X.columns for col in ['SysBP_Min', 'SysBP_Max']):
    X['SysBP_Range'] = X['SysBP_Max'] - X['SysBP_Min']
    X_test['SysBP_Range'] = X_test['SysBP_Max'] - X_test['SysBP_Min']
    print("  ‚úì Systolic BP range")

# -----------------------------------------------------------------------------
# Shock Indices (Critical for ICU mortality prediction)
# -----------------------------------------------------------------------------
print("\n--- Shock Indices ---")

# Shock Index = HR / SysBP (>0.9 indicates shock)
if all(col in X.columns for col in ['HeartRate_Mean', 'SysBP_Mean']):
    X['ShockIndex'] = X['HeartRate_Mean'] / (X['SysBP_Mean'] + 1)
    X_test['ShockIndex'] = X_test['HeartRate_Mean'] / (X_test['SysBP_Mean'] + 1)
    
    # Cap extreme values (clinical range: 0.5-2.0 is reasonable)
    X['ShockIndex'] = X['ShockIndex'].clip(0, 3)
    X_test['ShockIndex'] = X_test['ShockIndex'].clip(0, 3)
    print("  ‚úì Shock index (HR/SysBP) - capped at [0, 3]")

# Modified Shock Index = HR / MAP
if all(col in X.columns for col in ['HeartRate_Mean', 'MeanBP_Mean']):
    X['ModifiedShockIndex'] = X['HeartRate_Mean'] / (X['MeanBP_Mean'] + 1)
    X_test['ModifiedShockIndex'] = X_test['HeartRate_Mean'] / (X_test['MeanBP_Mean'] + 1)
    
    # Cap extreme values
    X['ModifiedShockIndex'] = X['ModifiedShockIndex'].clip(0, 3)
    X_test['ModifiedShockIndex'] = X_test['ModifiedShockIndex'].clip(0, 3)
    print("  ‚úì Modified shock index (HR/MAP) - capped at [0, 3]")
    
# -----------------------------------------------------------------------------
# Respiratory Features
# -----------------------------------------------------------------------------
print("\n--- Respiratory Features ---")

# Hypoxemia indicator (SpO2 < 90% is clinically significant)
if 'SpO2_Min' in X.columns:
    X['Hypoxemia'] = (X['SpO2_Min'] < 90).astype(int)
    X_test['Hypoxemia'] = (X_test['SpO2_Min'] < 90).astype(int)
    print("  ‚úì Hypoxemia (SpO2 < 90%)")

# Respiratory rate abnormality (normal: 12-20 breaths/min)
if 'RespRate_Mean' in X.columns:
    X['RespRate_Abnormal'] = ((X['RespRate_Mean'] < 12) | (X['RespRate_Mean'] > 20)).astype(int)
    X_test['RespRate_Abnormal'] = ((X_test['RespRate_Mean'] < 12) | (X_test['RespRate_Mean'] > 20)).astype(int)
    print("  ‚úì Abnormal respiratory rate")

# Respiratory distress score (high RR + low SpO2)
if all(col in X.columns for col in ['RespRate_Mean', 'SpO2_Mean']):
    X['RespDistress_Score'] = X['RespRate_Mean'] * (100 - X['SpO2_Mean'])
    X_test['RespDistress_Score'] = X_test['RespRate_Mean'] * (100 - X_test['SpO2_Mean'])
    print("  ‚úì Respiratory distress score")

# -----------------------------------------------------------------------------
# Temperature Features
# -----------------------------------------------------------------------------
print("\n--- Temperature Features ---")

# Fever (>38¬∞C)
if 'TempC_Max' in X.columns:
    X['Fever'] = (X['TempC_Max'] > 38).astype(int)
    X_test['Fever'] = (X_test['TempC_Max'] > 38).astype(int)
    print("  ‚úì Fever indicator")

# Hypothermia (<36¬∞C)
if 'TempC_Min' in X.columns:
    X['Hypothermia'] = (X['TempC_Min'] < 36).astype(int)
    X_test['Hypothermia'] = (X_test['TempC_Min'] < 36).astype(int)
    print("  ‚úì Hypothermia indicator")

# Temperature instability
if all(col in X.columns for col in ['TempC_Min', 'TempC_Max']):
    X['Temp_Range'] = X['TempC_Max'] - X['TempC_Min']
    X_test['Temp_Range'] = X_test['TempC_Max'] - X_test['TempC_Min']
    print("  ‚úì Temperature range")

# -----------------------------------------------------------------------------
# Glucose Features
# -----------------------------------------------------------------------------
print("\n--- Glucose Features ---")



# Glucose variability
if all(col in X.columns for col in ['Glucose_Min', 'Glucose_Max']):
    X['Glucose_Range'] = X['Glucose_Max'] - X['Glucose_Min']
    X_test['Glucose_Range'] = X_test['Glucose_Max'] - X_test['Glucose_Min']
    print("  ‚úì Glucose variability")

# -----------------------------------------------------------------------------
# Age-Related Features
# -----------------------------------------------------------------------------
print("\n--- Age-Related Features ---")

# Elderly indicator (>65 years)
if 'age' in X.columns:
    X['Elderly'] = (X['age'] > 65).astype(int)
    X_test['Elderly'] = (X_test['age'] > 65).astype(int)
    print("  ‚úì Elderly indicator (>65 years)")
    
    # Age squared (capture non-linear effects)
    X['age_squared'] = X['age'] ** 2
    X_test['age_squared'] = X_test['age'] ** 2
    print("  ‚úì Age squared")

# -----------------------------------------------------------------------------
# Vital Sign Variability
# -----------------------------------------------------------------------------
print("\n--- Vital Sign Variability ---")

# Heart rate variability
if all(col in X.columns for col in ['HeartRate_Min', 'HeartRate_Max']):
    X['HeartRate_Range'] = X['HeartRate_Max'] - X['HeartRate_Min']
    X_test['HeartRate_Range'] = X_test['HeartRate_Max'] - X_test['HeartRate_Min']
    print("  ‚úì Heart rate range")

# -----------------------------------------------------------------------------
# Composite Risk Score
# -----------------------------------------------------------------------------
print("\n--- Composite Risk Score ---")

# Count abnormal vital signs
severity_components = []

if 'ShockIndex' in X.columns:
    severity_components.append((X['ShockIndex'] > 0.9).astype(int))
if 'Hypoxemia' in X.columns:
    severity_components.append(X['Hypoxemia'])
if 'RespRate_Abnormal' in X.columns:
    severity_components.append(X['RespRate_Abnormal'])
if 'Fever' in X.columns:
    severity_components.append(X['Fever'])
if 'Hypothermia' in X.columns:
    severity_components.append(X['Hypothermia'])

if severity_components:
    X['Severity_Score'] = sum(severity_components)
    
    # Repeat for test set
    severity_components_test = []
    if 'ShockIndex' in X_test.columns:
        severity_components_test.append((X_test['ShockIndex'] > 0.9).astype(int))
    if 'Hypoxemia' in X_test.columns:
        severity_components_test.append(X_test['Hypoxemia'])
    if 'RespRate_Abnormal' in X_test.columns:
        severity_components_test.append(X_test['RespRate_Abnormal'])
    if 'Fever' in X_test.columns:
        severity_components_test.append(X_test['Fever'])
    if 'Hypothermia' in X_test.columns:
        severity_components_test.append(X_test['Hypothermia'])
    
    X_test['Severity_Score'] = sum(severity_components_test)
    print("  ‚úì Composite severity score (0-5)")

# -----------------------------------------------------------------------------
# Summary
# -----------------------------------------------------------------------------
new_feature_count = X.shape[1]
added_features = new_feature_count - original_feature_count

print(f"\n{'='*70}")
print("FEATURE ENGINEERING COMPLETE")
print(f"{'='*70}")
print(f"Original features: {original_feature_count}")
print(f"New features: {new_feature_count}")
print(f"Added: {added_features} engineered features")

# Update numeric_features list to include new engineered features
new_engineered_features = [col for col in X.columns if col not in numeric_features + ['ICD9_encoded']]
numeric_features.extend(new_engineered_features)

print(f"\nEngineered features added to numeric_features list for scaling")

# -----------------------------------------------------------------------------
# Remove Redundant Features
# -----------------------------------------------------------------------------
print("\n--- Removing Redundant Features ---")

redundant_features = []

# RespDistress_Score is 99.99% correlated with RespRate_Mean
# Keep RespRate_Mean (original feature) and drop engineered one
if 'RespDistress_Score' in X.columns:
    X = X.drop('RespDistress_Score', axis=1)
    X_test = X_test.drop('RespDistress_Score', axis=1)
    redundant_features.append('RespDistress_Score')
    print("  ‚úì Dropped RespDistress_Score (redundant with RespRate_Mean)")

# MeanBP_Mean is 90% correlated with DiasBP_Mean
# Keep DiasBP_Mean and drop MeanBP_Mean
if 'MeanBP_Mean' in X.columns:
    X = X.drop('MeanBP_Mean', axis=1)
    X_test = X_test.drop('MeanBP_Mean', axis=1)
    redundant_features.append('MeanBP_Mean')
    print("  ‚úì Dropped MeanBP_Mean (redundant with DiasBP_Mean)")

if redundant_features:
    print(f"\nRemoved {len(redundant_features)} redundant features")



FEATURE ENGINEERING

--- Blood Pressure Features ---
  ‚úì Pulse pressure
  ‚úì Systolic BP range

--- Shock Indices ---
  ‚úì Shock index (HR/SysBP) - capped at [0, 3]
  ‚úì Modified shock index (HR/MAP) - capped at [0, 3]

--- Respiratory Features ---
  ‚úì Hypoxemia (SpO2 < 90%)
  ‚úì Abnormal respiratory rate
  ‚úì Respiratory distress score

--- Temperature Features ---
  ‚úì Fever indicator
  ‚úì Hypothermia indicator
  ‚úì Temperature range

--- Glucose Features ---
  ‚úì Glucose variability

--- Age-Related Features ---
  ‚úì Elderly indicator (>65 years)
  ‚úì Age squared

--- Vital Sign Variability ---
  ‚úì Heart rate range

--- Composite Risk Score ---
  ‚úì Composite severity score (0-5)

FEATURE ENGINEERING COMPLETE
Original features: 57
New features: 72
Added: 15 engineered features

Engineered features added to numeric_features list for scaling

--- Removing Redundant Features ---
  ‚úì Dropped RespDistress_Score (redundant with RespRate_Mean)
  ‚úì Dropped MeanBP_Mean

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['PulsePressure'] = X['SysBP_Mean'] - X['DiasBP_Mean']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test['PulsePressure'] = X_test['SysBP_Mean'] - X_test['DiasBP_Mean']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['SysBP_Range'] = X['SysBP_Max'] - X['SysBP_Min']
A value is trying to be set

In [12]:
# =============================================================================
# 8. FEATURE SCALING
# =============================================================================
print("\n" + "="*70)
print("SCALING NUMERIC FEATURES")
print("="*70)

# Get all numeric columns
all_numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()

# Identify binary features (created in feature engineering)
binary_indicator_features = [
    'Hypoxemia', 'RespRate_Abnormal', 'Fever', 'Hypothermia',
    'Hyperglycemia', 'Hypoglycemia', 'Elderly'
]

# Also identify one-hot encoded features (they're binary too)
one_hot_features = [col for col in X.columns if '_' in col and X[col].nunique() <= 2]

# Combine all binary features to EXCLUDE from scaling
exclude_from_scaling = binary_indicator_features + one_hot_features

# Features to scale = all numeric EXCEPT binary indicators and one-hot encoded
features_to_scale = [col for col in all_numeric_cols if col not in exclude_from_scaling]

print(f"\nTotal numeric features: {len(all_numeric_cols)}")
print(f"Binary features (will NOT scale): {len(exclude_from_scaling)}")
print(f"Continuous features (will scale): {len(features_to_scale)}")

# Scale only continuous features
scaler = StandardScaler()
X[features_to_scale] = scaler.fit_transform(X[features_to_scale])
X_test[features_to_scale] = scaler.transform(X_test[features_to_scale])

print(f"\n‚úì Scaled {len(features_to_scale)} continuous features")
print(f"‚úì Left {len(exclude_from_scaling)} binary features unscaled")

# Verify binary features still have variance
print("\nVerifying binary features have variance:")
for feat in binary_indicator_features:
    if feat in X.columns:
        var = X[feat].var()
        unique = X[feat].nunique()
        print(f"  {feat}: variance={var:.4f}, unique_values={unique}")
        if var == 0:
            print(f"    ‚ö†Ô∏è WARNING: {feat} has zero variance!")


SCALING NUMERIC FEATURES

Total numeric features: 41
Binary features (will NOT scale): 37
Continuous features (will scale): 34

‚úì Scaled 34 continuous features
‚úì Left 37 binary features unscaled

Verifying binary features have variance:
  Hypoxemia: variance=0.1580, unique_values=2
  RespRate_Abnormal: variance=0.2120, unique_values=2
  Fever: variance=0.1424, unique_values=2
  Hypothermia: variance=0.2331, unique_values=2
  Hyperglycemia: variance=0.2307, unique_values=2
  Hypoglycemia: variance=0.0757, unique_values=2
  Elderly: variance=0.2485, unique_values=2


In [14]:
# =============================================================================
# 8. SAVE PROCESSED DATA
# =============================================================================
print("\n--- Saving processed data ---")

import os
os.makedirs('../data/processed', exist_ok=True)

# Save as pickle (preserves dtypes and column names)
X.to_pickle('../data/processed/X_train_processed.pkl')
y.to_pickle('../data/processed/y_train.pkl')
X_test.to_pickle('../data/processed/X_test_processed.pkl')
test_ids.to_pickle('../data/processed/test_ids.pkl')

# Also save preprocessing objects (to use on new data if needed)
with open('../data/processed/numeric_imputer.pkl', 'wb') as f:
    pickle.dump(numeric_imputer, f)
with open('../data/processed/categorical_imputer.pkl', 'wb') as f:
    pickle.dump(categorical_imputer, f)
with open('../data/processed/scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

print("‚úì Processed data saved to ../data/processed/")
print("\n" + "="*70)
print("PREPROCESSING COMPLETE!")
print("="*70)
print("\nYou can now run modeling notebooks without repeating preprocessing.")


--- Saving processed data ---
‚úì Processed data saved to ../data/processed/

PREPROCESSING COMPLETE!

You can now run modeling notebooks without repeating preprocessing.


In [13]:
# =============================================================================
# 9. VALIDATION - CHECK PREPROCESSING QUALITY
# =============================================================================
print("\n" + "="*70)
print("PREPROCESSING VALIDATION")
print("="*70)

validation_passed = True
issues = []

# -----------------------------------------------------------------------------
# Check 1: Binary features should have variance > 0
# -----------------------------------------------------------------------------
print("\n--- Check 1: Binary Feature Variance ---")

binary_features_to_check = [
    'Hypoxemia', 'RespRate_Abnormal', 'Fever', 'Hypothermia',
    'Hyperglycemia', 'Hypoglycemia', 'Elderly'
]

for feat in binary_features_to_check:
    if feat in X.columns:
        variance = X[feat].var()
        unique_vals = X[feat].nunique()
        unique_set = set(X[feat].unique())
        
        print(f"  {feat}:")
        print(f"    Variance: {variance:.4f}")
        print(f"    Unique values: {unique_vals} {sorted(unique_set)}")
        
        if variance == 0:
            issues.append(f"‚ùå {feat} has ZERO variance (constant)")
            validation_passed = False
        elif unique_vals == 1:
            issues.append(f"‚ùå {feat} only has one value: {list(unique_set)[0]}")
            validation_passed = False
        elif not unique_set.issubset({0, 1, 0.0, 1.0}):
            issues.append(f"‚ùå {feat} is not binary: {sorted(unique_set)[:5]}")
            validation_passed = False
        else:
            print(f"    ‚úì Valid binary feature")

# -----------------------------------------------------------------------------
# Check 2: No NaN or Inf values
# -----------------------------------------------------------------------------
print("\n--- Check 2: Invalid Values ---")

nan_count = X.isnull().sum().sum()
inf_count = np.isinf(X.select_dtypes(include=[np.number])).sum().sum()

print(f"  NaN values: {nan_count}")
print(f"  Infinite values: {inf_count}")

if nan_count > 0:
    nan_cols = X.isnull().sum()[X.isnull().sum() > 0]
    issues.append(f"‚ùå {nan_count} NaN values in: {list(nan_cols.index)}")
    validation_passed = False
else:
    print("  ‚úì No NaN values")

if inf_count > 0:
    inf_cols = X.select_dtypes(include=[np.number]).columns[
        np.isinf(X.select_dtypes(include=[np.number])).any()
    ]
    issues.append(f"‚ùå {inf_count} Infinite values in: {list(inf_cols)}")
    validation_passed = False
else:
    print("  ‚úì No infinite values")

# -----------------------------------------------------------------------------
# Check 3: Shock indices were clipped
# -----------------------------------------------------------------------------
print("\n--- Check 3: Shock Indices Clipped ---")

shock_features = ['ShockIndex', 'ModifiedShockIndex']
for feat in shock_features:
    if feat in X.columns:
        min_val = X[feat].min()
        max_val = X[feat].max()
        print(f"  {feat}: [{min_val:.2f}, {max_val:.2f}]")
        
        if max_val > 10:
            issues.append(f"‚ùå {feat} not clipped: max={max_val:.2f}")
            validation_passed = False
        else:
            print(f"    ‚úì Properly clipped")

# -----------------------------------------------------------------------------
# Check 4: Redundant features were dropped
# -----------------------------------------------------------------------------
print("\n--- Check 4: Redundant Features Dropped ---")

should_be_dropped = ['RespDistress_Score', 'MeanBP_Mean']
still_present = [f for f in should_be_dropped if f in X.columns]

if still_present:
    issues.append(f"‚ùå Redundant features still present: {still_present}")
    validation_passed = False
else:
    print(f"  ‚úì Redundant features dropped: {should_be_dropped}")

# -----------------------------------------------------------------------------
# Check 5: Feature count is reasonable
# -----------------------------------------------------------------------------
print("\n--- Check 5: Feature Count ---")

n_features = X.shape[1]
print(f"  Total features: {n_features}")

if n_features < 50:
    issues.append(f"‚ö†Ô∏è Only {n_features} features - might be too few")
elif n_features > 100:
    issues.append(f"‚ö†Ô∏è {n_features} features - might be too many")
else:
    print(f"  ‚úì Feature count looks good")

# -----------------------------------------------------------------------------
# Check 6: Train and test have same features
# -----------------------------------------------------------------------------
print("\n--- Check 6: Train/Test Consistency ---")

if X.shape[1] != X_test.shape[1]:
    issues.append(f"‚ùå Shape mismatch: Train {X.shape[1]} vs Test {X_test.shape[1]} features")
    validation_passed = False
else:
    print(f"  ‚úì Train and test have same number of features: {X.shape[1]}")

if list(X.columns) != list(X_test.columns):
    issues.append(f"‚ùå Train and test have different column names")
    validation_passed = False
else:
    print(f"  ‚úì Train and test have identical column names")

# -----------------------------------------------------------------------------
# Check 7: Scaled features look normalized
# -----------------------------------------------------------------------------
print("\n--- Check 7: Scaling Quality ---")

# Sample a few continuous features
continuous_sample = ['HeartRate_Mean', 'SysBP_Mean', 'age', 'Glucose_Mean']
for feat in continuous_sample:
    if feat in X.columns:
        mean = X[feat].mean()
        std = X[feat].std()
        print(f"  {feat}: mean={mean:.4f}, std={std:.4f}")
        
        # Scaled features should have mean‚âà0, std‚âà1
        if abs(mean) > 0.1 or abs(std - 1.0) > 0.1:
            issues.append(f"‚ö†Ô∏è {feat} might not be properly scaled")

# -----------------------------------------------------------------------------
# Check 8: Target distribution unchanged
# -----------------------------------------------------------------------------
print("\n--- Check 8: Target Distribution ---")

target_mean = y.mean()
target_count = y.sum()
print(f"  Mortality rate: {target_mean:.3f} ({target_count}/{len(y)})")

expected_mortality = 0.112  # From your original data
if abs(target_mean - expected_mortality) > 0.01:
    issues.append(f"‚ö†Ô∏è Target distribution changed: {target_mean:.3f} vs expected {expected_mortality:.3f}")
else:
    print(f"  ‚úì Target distribution matches expected: ~11.2%")

# -----------------------------------------------------------------------------
# FINAL VERDICT
# -----------------------------------------------------------------------------
print("\n" + "="*70)
if validation_passed and len(issues) == 0:
    print("‚úÖ ALL VALIDATION CHECKS PASSED!")
    print("="*70)
    print("\nüéâ Preprocessing completed successfully!")
    print("   Ready to train models with confidence.")
else:
    print("üö® VALIDATION ISSUES FOUND")
    print("="*70)
    
    if issues:
        print("\nIssues to fix:")
        for i, issue in enumerate(issues, 1):
            print(f"{i}. {issue}")
    
    print("\n‚ö†Ô∏è DO NOT train models until these issues are resolved!")
    print("   Review the preprocessing pipeline and fix the issues above.")

# -----------------------------------------------------------------------------
# Quick feature preview
# -----------------------------------------------------------------------------
print("\n" + "="*70)
print("FEATURE PREVIEW")
print("="*70)

print(f"\nDataset shapes:")
print(f"  X_train: {X.shape}")
print(f"  y_train: {y.shape}")
print(f"  X_test: {X_test.shape}")
print(f"  test_ids: {len(test_ids)}")

print(f"\nSample features (first 15):")
for i, col in enumerate(X.columns[:15], 1):
    print(f"  {i:2d}. {col}")

print(f"\nFeature types:")
binary_count = sum(X[col].nunique() == 2 for col in X.columns)
print(f"  Binary features: {binary_count}")
print(f"  Continuous features: {X.shape[1] - binary_count}")


PREPROCESSING VALIDATION

--- Check 1: Binary Feature Variance ---
  Hypoxemia:
    Variance: 0.1580
    Unique values: 2 [np.int64(0), np.int64(1)]
    ‚úì Valid binary feature
  RespRate_Abnormal:
    Variance: 0.2120
    Unique values: 2 [np.int64(0), np.int64(1)]
    ‚úì Valid binary feature
  Fever:
    Variance: 0.1424
    Unique values: 2 [np.int64(0), np.int64(1)]
    ‚úì Valid binary feature
  Hypothermia:
    Variance: 0.2331
    Unique values: 2 [np.int64(0), np.int64(1)]
    ‚úì Valid binary feature
  Hyperglycemia:
    Variance: 0.2307
    Unique values: 2 [np.int64(0), np.int64(1)]
    ‚úì Valid binary feature
  Hypoglycemia:
    Variance: 0.0757
    Unique values: 2 [np.int64(0), np.int64(1)]
    ‚úì Valid binary feature
  Elderly:
    Variance: 0.2485
    Unique values: 2 [np.int64(0), np.int64(1)]
    ‚úì Valid binary feature

--- Check 2: Invalid Values ---
  NaN values: 0
  Infinite values: 0
  ‚úì No NaN values
  ‚úì No infinite values

--- Check 3: Shock Indices C