Volledig nieuwe poging to preproc na tips van guillem  (28/11 21:19)

In [1]:
# =============================================================================
# FINAL PREPROCESSING PIPELINE - COMPREHENSIVE & VALIDATED
# Built with all lessons learned + TA hints
# =============================================================================

import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import pickle
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

print("="*80)
print("COMPREHENSIVE PREPROCESSING PIPELINE - FINAL VERSION")
print("="*80)

COMPREHENSIVE PREPROCESSING PIPELINE - FINAL VERSION


In [2]:
# =============================================================================
# SECTION 1: LOAD DATA AND BASIC INSPECTION
# =============================================================================

print("\n" + "="*80)
print("SECTION 1: LOADING DATA")
print("="*80)

data_path = Path("../data/")

# Load main datasets
train_raw = pd.read_csv(data_path / "mimic_train_HEF.csv", low_memory=False)
test_raw = pd.read_csv(data_path / "mimic_test_HEF.csv", low_memory=False)

# Load diagnoses
diagnoses_raw = pd.read_csv(data_path / "extra_data" / "MIMIC_diagnoses.csv")

print(f"\n‚úì Loaded successfully:")
print(f"  Train: {train_raw.shape}")
print(f"  Test: {test_raw.shape}")
print(f"  Diagnoses: {diagnoses_raw.shape}")

# Normalize column names (handle both upper/lower case)
train = train_raw.copy()
test = test_raw.copy()
diagnoses = diagnoses_raw.copy()

# Convert diagnoses columns to uppercase for consistency
diagnoses.columns = diagnoses.columns.str.upper()

print(f"\n‚úì Column names standardized")



# Display key info
print(f"\n--- Dataset Overview ---")
print(f"Train samples: {len(train):,}")
print(f"Test samples: {len(test):,}")
print(f"Diagnosis records: {len(diagnoses):,}")

# Check for duplicates
print(f"\n--- Duplicate Check ---")
train_dupes = train['icustay_id'].duplicated().sum()
test_dupes = test['icustay_id'].duplicated().sum()
print(f"Train duplicates: {train_dupes}")
print(f"Test duplicates: {test_dupes}")

if train_dupes > 0 or test_dupes > 0:
    print("‚ö†Ô∏è WARNING: Duplicates found!")
else:
    print("‚úì No duplicates")

# Check ID structure
print(f"\n--- ID Structure ---")
print(f"Unique patients (subject_id): {train['subject_id'].nunique():,}")
print(f"Unique admissions (hadm_id): {train['hadm_id'].nunique():,}")
print(f"Unique ICU stays (icustay_id): {train['icustay_id'].nunique():,}")

# Patient visit statistics
visits_per_patient = train.groupby('subject_id').size()
print(f"\n--- Visit Statistics ---")
print(f"Mean ICU stays per patient: {visits_per_patient.mean():.2f}")
print(f"Median: {visits_per_patient.median():.0f}")
print(f"Max: {visits_per_patient.max():.0f}")
print(f"Patients with multiple visits: {(visits_per_patient > 1).sum():,} ({(visits_per_patient > 1).sum()/len(visits_per_patient)*100:.1f}%)")

print("\n" + "="*80)
print("‚úì SECTION 1 COMPLETE")
print("="*80)


SECTION 1: LOADING DATA

‚úì Loaded successfully:
  Train: (20885, 44)
  Test: (5221, 39)
  Diagnoses: (651047, 4)

‚úì Column names standardized

--- Dataset Overview ---
Train samples: 20,885
Test samples: 5,221
Diagnosis records: 651,047

--- Duplicate Check ---
Train duplicates: 0
Test duplicates: 0
‚úì No duplicates

--- ID Structure ---
Unique patients (subject_id): 16,317
Unique admissions (hadm_id): 19,749
Unique ICU stays (icustay_id): 20,885

--- Visit Statistics ---
Mean ICU stays per patient: 1.28
Median: 1
Max: 25
Patients with multiple visits: 2,940 (18.0%)

‚úì SECTION 1 COMPLETE


In [3]:
# =============================================================================
# SECTION 2: HOSPITAL HISTORY FEATURES
# =============================================================================

print("\n" + "="*80)
print("SECTION 2: CREATING HOSPITAL HISTORY FEATURES")
print("="*80)

def create_hospital_history_features(df, df_name="dataset"):
    """
    Create features based on patient's visit history
    
    Features created:
    - n_previous_icu_stays: Number of previous ICU visits for this patient
    - is_first_icu_visit: Binary flag for first-time ICU patients
    - is_frequent_flyer: Binary flag for patients with 3+ visits
    """
    print(f"\n--- Processing {df_name} ---")
    
    df = df.copy()
    
    # Sort by patient and time
    if 'ADMITTIME' in df.columns:
        df['ADMITTIME'] = pd.to_datetime(df['ADMITTIME'], errors='coerce')
        df = df.sort_values(['subject_id', 'ADMITTIME'])
        print("  ‚úì Sorted by patient and admission time")
    else:
        df = df.sort_values(['subject_id', 'hadm_id', 'icustay_id'])
        print("  ‚úì Sorted by patient and IDs")
    
    # Feature 1: Previous ICU stays
    df['n_previous_icu_stays'] = df.groupby('subject_id').cumcount()
    
    # Feature 2: First visit flag
    df['is_first_icu_visit'] = (df['n_previous_icu_stays'] == 0).astype(int)
    
    # Feature 3: Frequent flyer (3+ visits in entire dataset)
    total_visits = df.groupby('subject_id').size()
    frequent_patients = total_visits[total_visits >= 3].index
    df['is_frequent_flyer'] = df['subject_id'].isin(frequent_patients).astype(int)
    
    # Validation
    print(f"\n  Validation:")
    print(f"    n_previous_icu_stays - Min: {df['n_previous_icu_stays'].min()}, Max: {df['n_previous_icu_stays'].max()}, Mean: {df['n_previous_icu_stays'].mean():.2f}")
    print(f"    is_first_icu_visit - First visits: {df['is_first_icu_visit'].sum()} ({df['is_first_icu_visit'].mean()*100:.1f}%)")
    print(f"    is_frequent_flyer - Frequent flyers: {df['is_frequent_flyer'].sum()} ({df['is_frequent_flyer'].mean()*100:.1f}%)")
    
    # Check: Every patient's first row should have n_previous = 0
    first_rows = df.groupby('subject_id').first()
    assert (first_rows['n_previous_icu_stays'] == 0).all(), "ERROR: Not all first visits have n_previous = 0!"
    print(f"    ‚úì Check passed: All first visits correctly marked")
    
    return df

# Apply to train and test
train = create_hospital_history_features(train, "train")
test = create_hospital_history_features(test, "test")

# Save test IDs NOW (after sorting!)
test_ids = test['icustay_id'].copy()
print(f"\n‚úì Saved {len(test_ids)} test IDs in correct order")

print("\n" + "="*80)
print("‚úì SECTION 2 COMPLETE - Hospital history features created")
print("="*80)


SECTION 2: CREATING HOSPITAL HISTORY FEATURES

--- Processing train ---
  ‚úì Sorted by patient and admission time

  Validation:
    n_previous_icu_stays - Min: 0, Max: 24, Mean: 0.40
    is_first_icu_visit - First visits: 16317 (78.1%)
    is_frequent_flyer - Frequent flyers: 3388 (16.2%)
    ‚úì Check passed: All first visits correctly marked

--- Processing test ---
  ‚úì Sorted by patient and admission time

  Validation:
    n_previous_icu_stays - Min: 0, Max: 4, Mean: 0.09
    is_first_icu_visit - First visits: 4847 (92.8%)
    is_frequent_flyer - Frequent flyers: 172 (3.3%)
    ‚úì Check passed: All first visits correctly marked

‚úì Saved 5221 test IDs in correct order

‚úì SECTION 2 COMPLETE - Hospital history features created


In [4]:
# =============================================================================
# SECTION 3: ICD9 DIAGNOSIS FEATURES
# =============================================================================

print("\n" + "="*80)
print("SECTION 3: CREATING ICD9 DIAGNOSIS FEATURES")
print("="*80)

# Verify diagnoses data structure
print(f"\n--- Diagnoses Data Structure ---")
print(f"Columns: {diagnoses.columns.tolist()}")
print(f"Sample:")
print(diagnoses.head(3))

# Check for missing values
print(f"\n--- Missing Values in Diagnoses ---")
print(diagnoses.isnull().sum())

# Build diagnosis lookup
print(f"\n--- Building Diagnosis Features ---")

# Feature 1: Number of diagnoses per admission
n_diagnoses_per_admission = diagnoses.groupby('HADM_ID').size()
print(f"\n1. Number of diagnoses per admission:")
print(f"   Mean: {n_diagnoses_per_admission.mean():.1f}")
print(f"   Median: {n_diagnoses_per_admission.median():.0f}")
print(f"   Max: {n_diagnoses_per_admission.max():.0f}")

train['n_diagnoses'] = train['hadm_id'].map(n_diagnoses_per_admission).fillna(0).astype(int)
test['n_diagnoses'] = test['hadm_id'].map(n_diagnoses_per_admission).fillna(0).astype(int)

print(f"   Train - Admissions with diagnoses: {(train['n_diagnoses'] > 0).sum()} ({(train['n_diagnoses'] > 0).mean()*100:.1f}%)")
print(f"   Test - Admissions with diagnoses: {(test['n_diagnoses'] > 0).sum()} ({(test['n_diagnoses'] > 0).mean()*100:.1f}%)")

# Feature 2: Primary diagnosis (SEQ_NUM = 1)
primary_diagnoses = diagnoses[diagnoses['SEQ_NUM'] == 1][['HADM_ID', 'ICD9_CODE']].set_index('HADM_ID')['ICD9_CODE']
print(f"\n2. Primary diagnoses:")
print(f"   Unique primary diagnoses: {primary_diagnoses.nunique()}")

train['primary_diagnosis_raw'] = train['hadm_id'].map(primary_diagnoses)
test['primary_diagnosis_raw'] = test['hadm_id'].map(primary_diagnoses)

print(f"   Train - Matched: {train['primary_diagnosis_raw'].notna().sum()} ({train['primary_diagnosis_raw'].notna().mean()*100:.1f}%)")
print(f"   Test - Matched: {test['primary_diagnosis_raw'].notna().sum()} ({test['primary_diagnosis_raw'].notna().mean()*100:.1f}%)")

# Feature 3: ICD9 category (first 3 characters)
def extract_icd9_category(code):
    """Extract first 3 characters from ICD9 code"""
    if pd.isna(code):
        return 'UNKNOWN'
    code_str = str(code).strip().replace('.', '').replace(' ', '')
    if len(code_str) >= 3:
        return code_str[:3]
    elif len(code_str) > 0:
        return code_str
    return 'UNKNOWN'

train['primary_diag_cat'] = train['primary_diagnosis_raw'].apply(extract_icd9_category)
test['primary_diag_cat'] = test['primary_diagnosis_raw'].apply(extract_icd9_category)

print(f"\n3. ICD9 categories (3-digit):")
print(f"   Unique categories: {train['primary_diag_cat'].nunique()}")
print(f"   Top 5 categories:")
for cat, count in train['primary_diag_cat'].value_counts().head().items():
    print(f"     {cat}: {count} ({count/len(train)*100:.1f}%)")

# Feature 4: Major disease category (first digit)
def get_disease_category(code):
    """Map ICD9 code to major disease category"""
    if pd.isna(code):
        return 'UNKNOWN'
    
    code_str = str(code).strip().replace('.', '').replace(' ', '')
    if len(code_str) == 0:
        return 'UNKNOWN'
    
    first_char = code_str[0].upper()
    
    # ICD9 structure
    if first_char in ['0', '1']:
        return 'INFECTIOUS'
    elif first_char == '2':
        return 'NEOPLASM'
    elif first_char == '3':
        return 'ENDOCRINE'
    elif first_char == '4':
        return 'BLOOD'
    elif first_char == '5':
        return 'MENTAL'
    elif first_char in ['6', '7']:
        return 'NERVOUS'
    elif first_char == '8':
        return 'CIRCULATORY'
    elif first_char == '9':
        return 'RESPIRATORY'
    elif first_char == 'V':
        return 'V_CODE'  # Supplementary classification
    elif first_char == 'E':
        return 'E_CODE'  # External causes
    else:
        return 'OTHER'

train['disease_category'] = train['primary_diagnosis_raw'].apply(get_disease_category)
test['disease_category'] = test['primary_diagnosis_raw'].apply(get_disease_category)

print(f"\n4. Major disease categories:")
for cat, count in train['disease_category'].value_counts().items():
    print(f"   {cat}: {count} ({count/len(train)*100:.1f}%)")

# Feature 5: Specific high-risk condition flags
print(f"\n5. High-risk condition flags:")

# Build efficient lookup: hadm_id -> set of all ICD9 codes
all_diagnoses_per_admission = diagnoses.groupby('HADM_ID')['ICD9_CODE'].apply(
    lambda x: set(str(code).replace('.', '').replace(' ', '') for code in x)
)

def check_condition_presence(hadm_id, code_patterns):
    """Check if any diagnosis matches the pattern"""
    if hadm_id not in all_diagnoses_per_admission.index:
        return 0
    
    codes = all_diagnoses_per_admission[hadm_id]
    
    for pattern in code_patterns:
        if any(code.startswith(pattern) for code in codes):
            return 1
    return 0

# Define condition patterns
conditions = {
    'has_sepsis': ['99591', '99592', '78552'],  # Sepsis codes
    'has_heart_failure': ['428'],  # Heart failure
    'has_respiratory_failure': ['518'],  # Respiratory failure
    'has_aki': ['584'],  # Acute kidney injury
    'has_diabetes': ['250'],  # Diabetes
    'has_copd': ['491', '492', '496'],  # COPD
    'has_pneumonia': ['480', '481', '482', '483', '484', '485', '486']  # Pneumonia
}

for condition_name, patterns in conditions.items():
    train[condition_name] = train['hadm_id'].apply(lambda x: check_condition_presence(x, patterns))
    test[condition_name] = test['hadm_id'].apply(lambda x: check_condition_presence(x, patterns))
    
    count = train[condition_name].sum()
    print(f"   {condition_name}: {count} ({count/len(train)*100:.1f}%)")

# Validation checks
print(f"\n--- Validation Checks ---")

# Check 1: n_diagnoses should be >= 1 if we have a primary diagnosis
has_primary = train['primary_diagnosis_raw'].notna()
has_n_diag = train['n_diagnoses'] > 0
mismatch = has_primary & ~has_n_diag
if mismatch.sum() > 0:
    print(f"  ‚ö†Ô∏è Warning: {mismatch.sum()} cases have primary diagnosis but n_diagnoses=0")
else:
    print(f"  ‚úì Check passed: n_diagnoses consistent with primary diagnosis")

# Check 2: Condition flags should be <= n_diagnoses
condition_cols = [col for col in train.columns if col.startswith('has_')]
for col in condition_cols:
    # If has_condition=1, should have n_diagnoses >= 1
    invalid = (train[col] == 1) & (train['n_diagnoses'] == 0)
    if invalid.sum() > 0:
        print(f"  ‚ö†Ô∏è Warning: {invalid.sum()} cases with {col}=1 but n_diagnoses=0")

print(f"  ‚úì Condition flag validation complete")

print("\n" + "="*80)
print("‚úì SECTION 3 COMPLETE - ICD9 features created and validated")
print("="*80)


SECTION 3: CREATING ICD9 DIAGNOSIS FEATURES

--- Diagnoses Data Structure ---
Columns: ['SUBJECT_ID', 'HADM_ID', 'SEQ_NUM', 'ICD9_CODE']
Sample:
   SUBJECT_ID  HADM_ID  SEQ_NUM ICD9_CODE
0         256   108811      1.0     53240
1         256   108811      2.0     41071
2         256   108811      3.0     53560

--- Missing Values in Diagnoses ---
SUBJECT_ID     0
HADM_ID        0
SEQ_NUM       47
ICD9_CODE     47
dtype: int64

--- Building Diagnosis Features ---

1. Number of diagnoses per admission:
   Mean: 11.0
   Median: 9
   Max: 39
   Train - Admissions with diagnoses: 20885 (100.0%)
   Test - Admissions with diagnoses: 5221 (100.0%)

2. Primary diagnoses:
   Unique primary diagnoses: 2789
   Train - Matched: 20885 (100.0%)
   Test - Matched: 5221 (100.0%)

3. ICD9 categories (3-digit):
   Unique categories: 530
   Top 5 categories:
     038: 1595 (7.6%)
     414: 1115 (5.3%)
     410: 948 (4.5%)
     424: 744 (3.6%)
     428: 686 (3.3%)

4. Major disease categories:
   BLOOD: 

In [5]:
# =============================================================================
# SECTION 4: DROP LEAKAGE COLUMNS
# =============================================================================

print("\n" + "="*80)
print("SECTION 4: REMOVING LEAKAGE COLUMNS")
print("="*80)

# Columns that leak information about the target
leakage_columns = [
    'DISCHTIME',      # Discharge time (only known after outcome)
    'DEATHTIME',      # Death time (IS the target!)
    'DOD',            # Date of death (IS the target!)
    'LOS',            # Length of stay (correlated with outcome)
    'Diff',           # Some time difference (likely leakage)
    'ADMITTIME',      # We already used it for history features, now drop
]

# IDs - we've extracted all useful info, now drop
id_columns = [
    'icustay_id',     # Already saved as test_ids
    'subject_id',     # Used for history features, now drop
    'hadm_id',        # Used for diagnosis matching, now drop
]

# Diagnosis column - we've extracted all features from it
diagnosis_columns = [
    'primary_diagnosis_raw'  # Keep the encoded versions only
]

all_columns_to_drop = leakage_columns + id_columns + diagnosis_columns

print(f"\n--- Columns to Drop ---")
for col in all_columns_to_drop:
    train_has = "‚úì" if col in train.columns else "‚úó"
    test_has = "‚úì" if col in test.columns else "‚úó"
    print(f"  {col:25s} Train:{train_has}  Test:{test_has}")

# Drop from train
train_clean = train.drop(columns=[c for c in all_columns_to_drop if c in train.columns], errors='ignore')

# Drop from test  
test_clean = test.drop(columns=[c for c in all_columns_to_drop if c in test.columns], errors='ignore')

print(f"\n--- Shape Changes ---")
print(f"  Train: {train.shape} ‚Üí {train_clean.shape}")
print(f"  Test:  {test.shape} ‚Üí {test_clean.shape}")

# Separate target from train
print(f"\n--- Separating Target ---")
if 'HOSPITAL_EXPIRE_FLAG' not in train_clean.columns:
    print("  ‚ùå ERROR: Target column not found!")
    raise ValueError("HOSPITAL_EXPIRE_FLAG column missing!")

y = train_clean['HOSPITAL_EXPIRE_FLAG'].copy()
X = train_clean.drop('HOSPITAL_EXPIRE_FLAG', axis=1)
X_test = test_clean.copy()

print(f"  ‚úì Target separated")
print(f"  ‚úì y shape: {y.shape}")
print(f"  ‚úì X shape: {X.shape}")
print(f"  ‚úì X_test shape: {X_test.shape}")

# Validate target
print(f"\n--- Target Validation ---")
print(f"  Target name: HOSPITAL_EXPIRE_FLAG")
print(f"  Unique values: {y.unique()}")
print(f"  Mortality rate: {y.mean():.3f} ({y.sum()}/{len(y)})")
print(f"  Class balance: 0={y.value_counts()[0]}, 1={y.value_counts()[1]}")

expected_mortality = 0.112
if abs(y.mean() - expected_mortality) > 0.01:
    print(f"  ‚ö†Ô∏è Warning: Mortality rate {y.mean():.3f} differs from expected {expected_mortality:.3f}")
else:
    print(f"  ‚úì Mortality rate matches expected (~11.2%)")

# Verify train and test have same columns (except target)
print(f"\n--- Column Consistency Check ---")
train_cols = set(X.columns)
test_cols = set(X_test.columns)

cols_only_in_train = train_cols - test_cols
cols_only_in_test = test_cols - train_cols

if cols_only_in_train:
    print(f"  ‚ö†Ô∏è Columns only in train: {cols_only_in_train}")
if cols_only_in_test:
    print(f"  ‚ö†Ô∏è Columns only in test: {cols_only_in_test}")

if train_cols == test_cols:
    print(f"  ‚úì Train and test have identical columns ({len(train_cols)} columns)")
else:
    print(f"  ‚ùå ERROR: Train and test column mismatch!")

print("\n" + "="*80)
print("‚úì SECTION 4 COMPLETE - Leakage columns removed, target separated")
print("="*80)


SECTION 4: REMOVING LEAKAGE COLUMNS

--- Columns to Drop ---
  DISCHTIME                 Train:‚úì  Test:‚úó
  DEATHTIME                 Train:‚úì  Test:‚úó
  DOD                       Train:‚úì  Test:‚úó
  LOS                       Train:‚úì  Test:‚úó
  Diff                      Train:‚úì  Test:‚úì
  ADMITTIME                 Train:‚úì  Test:‚úì
  icustay_id                Train:‚úì  Test:‚úì
  subject_id                Train:‚úì  Test:‚úì
  hadm_id                   Train:‚úì  Test:‚úì
  primary_diagnosis_raw     Train:‚úì  Test:‚úì

--- Shape Changes ---
  Train: (20885, 58) ‚Üí (20885, 48)
  Test:  (5221, 53) ‚Üí (5221, 47)

--- Separating Target ---
  ‚úì Target separated
  ‚úì y shape: (20885,)
  ‚úì X shape: (20885, 47)
  ‚úì X_test shape: (5221, 47)

--- Target Validation ---
  Target name: HOSPITAL_EXPIRE_FLAG
  Unique values: [0 1]
  Mortality rate: 0.112 (2345/20885)
  Class balance: 0=18540, 1=2345
  ‚úì Mortality rate matches expected (~11.2%)

--- Column Consistency Chec

In [6]:
# =============================================================================
# SECTION 5: CONVERT DOB TO AGE
# =============================================================================

print("\n" + "="*80)
print("SECTION 5: CONVERTING DOB TO AGE")
print("="*80)

if 'DOB' not in X.columns:
    print("  ‚ö†Ô∏è DOB column not found, skipping age calculation")
else:
    print("\n--- Loading original data for ADMITTIME ---")
    
    # Need to reload to get ADMITTIME (we dropped it earlier)
    train_original = pd.read_csv(data_path / 'mimic_train_HEF.csv')
    test_original = pd.read_csv(data_path / 'mimic_test_HEF.csv')
    
    print("  ‚úì Original data loaded")
    
    # Convert to datetime
    print("\n--- Converting dates ---")
    dob_train = pd.to_datetime(X['DOB'], errors='coerce')
    dob_test = pd.to_datetime(X_test['DOB'], errors='coerce')
    admit_train = pd.to_datetime(train_original['ADMITTIME'], errors='coerce')
    admit_test = pd.to_datetime(test_original['ADMITTIME'], errors='coerce')
    
    print(f"  Train - DOB parsed: {dob_train.notna().sum()}/{len(dob_train)}")
    print(f"  Train - ADMITTIME parsed: {admit_train.notna().sum()}/{len(admit_train)}")
    print(f"  Test - DOB parsed: {dob_test.notna().sum()}/{len(dob_test)}")
    print(f"  Test - ADMITTIME parsed: {admit_test.notna().sum()}/{len(admit_test)}")
    
    # Calculate age
    print("\n--- Calculating ages ---")
    
    def calculate_age(admit_time, dob):
        """Calculate age in years from admission time and DOB"""
        if pd.isna(admit_time) or pd.isna(dob):
            return np.nan
        try:
            age_days = (admit_time - dob).days
            age_years = age_days / 365.25
            return age_years
        except:
            return np.nan
    
    X['age'] = [calculate_age(admit, dob) for admit, dob in zip(admit_train, dob_train)]
    X_test['age'] = [calculate_age(admit, dob) for admit, dob in zip(admit_test, dob_test)]
    
    # Convert to numeric
    X['age'] = pd.to_numeric(X['age'], errors='coerce')
    X_test['age'] = pd.to_numeric(X_test['age'], errors='coerce')
    
    print(f"  ‚úì Ages calculated")
    
    # Analyze age distribution
    print(f"\n--- Age Distribution (Before Cleaning) ---")
    print(f"  Train:")
    print(f"    Min: {X['age'].min():.1f}")
    print(f"    Max: {X['age'].max():.1f}")
    print(f"    Mean: {X['age'].mean():.1f}")
    print(f"    Median: {X['age'].median():.1f}")
    print(f"    Missing: {X['age'].isna().sum()}")
    
    # Clean invalid ages
    print(f"\n--- Cleaning Invalid Ages ---")
    
    # Flag invalid ages (< 0 or > 120)
    invalid_train = (X['age'] < 0) | (X['age'] > 120)
    invalid_test = (X_test['age'] < 0) | (X_test['age'] > 120)
    
    print(f"  Train - Invalid ages: {invalid_train.sum()}")
    print(f"  Test - Invalid ages: {invalid_test.sum()}")
    
    if invalid_train.sum() > 0:
        print(f"    Sample invalid ages: {X.loc[invalid_train, 'age'].head().tolist()}")
    
    # Set invalid to NaN
    X.loc[invalid_train, 'age'] = np.nan
    X_test.loc[invalid_test, 'age'] = np.nan
    
    # Impute missing ages with median
    age_median = X['age'].median()
    n_missing_train = X['age'].isna().sum()
    n_missing_test = X_test['age'].isna().sum()
    
    X['age'].fillna(age_median, inplace=True)
    X_test['age'].fillna(age_median, inplace=True)
    
    print(f"  ‚úì Imputed {n_missing_train} train + {n_missing_test} test missing ages with median: {age_median:.1f}")
    
    # Final age distribution
    print(f"\n--- Age Distribution (After Cleaning) ---")
    print(f"  Train:")
    print(f"    Range: {X['age'].min():.1f} - {X['age'].max():.1f} years")
    print(f"    Mean: {X['age'].mean():.1f} years")
    print(f"    Std: {X['age'].std():.1f} years")
    
    # Age percentiles
    percentiles = X['age'].quantile([0.25, 0.5, 0.75])
    print(f"    25th percentile: {percentiles[0.25]:.1f}")
    print(f"    50th percentile: {percentiles[0.5]:.1f}")
    print(f"    75th percentile: {percentiles[0.75]:.1f}")
    
    # Drop DOB column
    X = X.drop('DOB', axis=1)
    X_test = X_test.drop('DOB', axis=1)
    
    print(f"\n  ‚úì Dropped DOB column")
    
    # Validation
    print(f"\n--- Validation ---")
    assert X['age'].notna().all(), "ERROR: Still have NaN ages in train!"
    assert X_test['age'].notna().all(), "ERROR: Still have NaN ages in test!"
    assert (X['age'] >= 0).all() and (X['age'] <= 120).all(), "ERROR: Invalid ages in train!"
    assert (X_test['age'] >= 0).all() and (X_test['age'] <= 120).all(), "ERROR: Invalid ages in test!"
    
    print(f"  ‚úì All validation checks passed")

print("\n" + "="*80)
print("‚úì SECTION 5 COMPLETE - DOB converted to age")
print("="*80)


SECTION 5: CONVERTING DOB TO AGE

--- Loading original data for ADMITTIME ---
  ‚úì Original data loaded

--- Converting dates ---
  Train - DOB parsed: 20885/20885
  Train - ADMITTIME parsed: 20885/20885
  Test - DOB parsed: 5221/5221
  Test - ADMITTIME parsed: 5221/5221

--- Calculating ages ---
  ‚úì Ages calculated

--- Age Distribution (Before Cleaning) ---
  Train:
    Min: -71.9
    Max: 292.0
    Mean: 67.4
    Median: 64.6
    Missing: 635

--- Cleaning Invalid Ages ---
  Train - Invalid ages: 4340
  Test - Invalid ages: 1063
    Sample invalid ages: [242.56810403832992, -11.915126625598905, -11.118412046543463, -15.780971937029431, -13.05407255304586]
  ‚úì Imputed 4975 train + 1222 test missing ages with median: 62.0

--- Age Distribution (After Cleaning) ---
  Train:
    Range: 0.0 - 120.0 years
    Mean: 61.4 years
    Std: 27.3 years
    25th percentile: 44.9
    50th percentile: 62.0
    75th percentile: 78.2

  ‚úì Dropped DOB column

--- Validation ---
  ‚úì All valid

In [7]:
# =============================================================================
# SECTION 6: IDENTIFY FEATURE TYPES AND HANDLE MISSING VALUES
# =============================================================================

print("\n" + "="*80)
print("SECTION 6: FEATURE TYPE IDENTIFICATION & IMPUTATION")
print("="*80)

# Identify feature types
print("\n--- Identifying Feature Types ---")

numeric_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = X.select_dtypes(include=['object']).columns.tolist()

print(f"  Numeric features: {len(numeric_features)}")
print(f"  Categorical features: {len(categorical_features)}")

# Show sample of each type
print(f"\n--- Sample Features ---")
print(f"  Numeric (first 10): {numeric_features[:10]}")
print(f"  Categorical: {categorical_features}")

# Check missing values
print(f"\n--- Missing Value Analysis ---")

missing_numeric = X[numeric_features].isnull().sum()
missing_numeric = missing_numeric[missing_numeric > 0].sort_values(ascending=False)

if len(missing_numeric) > 0:
    print(f"  Numeric features with missing values:")
    for feat, count in missing_numeric.items():
        pct = count / len(X) * 100
        print(f"    {feat:30s} {count:6d} ({pct:5.1f}%)")
else:
    print(f"  ‚úì No missing values in numeric features")

missing_categorical = X[categorical_features].isnull().sum()
missing_categorical = missing_categorical[missing_categorical > 0].sort_values(ascending=False)

if len(missing_categorical) > 0:
    print(f"\n  Categorical features with missing values:")
    for feat, count in missing_categorical.items():
        pct = count / len(X) * 100
        print(f"    {feat:30s} {count:6d} ({pct:5.1f}%)")
else:
    print(f"  ‚úì No missing values in categorical features")

# Imputation
print(f"\n--- Imputation Strategy ---")

# Numeric: median imputation
if len(numeric_features) > 0:
    print(f"  Numeric features: Median imputation")
    numeric_imputer = SimpleImputer(strategy='median')
    X[numeric_features] = numeric_imputer.fit_transform(X[numeric_features])
    X_test[numeric_features] = numeric_imputer.transform(X_test[numeric_features])
    print(f"  ‚úì Imputed {len(numeric_features)} numeric features")

# Categorical: most frequent imputation
if len(categorical_features) > 0:
    print(f"  Categorical features: Most frequent imputation")
    categorical_imputer = SimpleImputer(strategy='most_frequent')
    X[categorical_features] = categorical_imputer.fit_transform(X[categorical_features])
    X_test[categorical_features] = categorical_imputer.transform(X_test[categorical_features])
    print(f"  ‚úì Imputed {len(categorical_features)} categorical features")

# Verify no missing values remain
print(f"\n--- Post-Imputation Validation ---")

train_missing = X.isnull().sum().sum()
test_missing = X_test.isnull().sum().sum()

print(f"  Train missing values: {train_missing}")
print(f"  Test missing values: {test_missing}")

if train_missing > 0 or test_missing > 0:
    print(f"  ‚ùå ERROR: Still have missing values after imputation!")
    if train_missing > 0:
        print(f"    Train columns with NaN: {X.columns[X.isnull().any()].tolist()}")
    if test_missing > 0:
        print(f"    Test columns with NaN: {X_test.columns[X_test.isnull().any()].tolist()}")
else:
    print(f"  ‚úì No missing values remain")

print("\n" + "="*80)
print("‚úì SECTION 6 COMPLETE - Features identified and imputed")
print("="*80)


SECTION 6: FEATURE TYPE IDENTIFICATION & IMPUTATION

--- Identifying Feature Types ---
  Numeric features: 36
  Categorical features: 11

--- Sample Features ---
  Numeric (first 10): ['HeartRate_Min', 'HeartRate_Max', 'HeartRate_Mean', 'SysBP_Min', 'SysBP_Max', 'SysBP_Mean', 'DiasBP_Min', 'DiasBP_Max', 'DiasBP_Mean', 'MeanBP_Min']
  Categorical: ['GENDER', 'ADMISSION_TYPE', 'INSURANCE', 'RELIGION', 'MARITAL_STATUS', 'ETHNICITY', 'DIAGNOSIS', 'ICD9_diagnosis', 'FIRST_CAREUNIT', 'primary_diag_cat', 'disease_category']

--- Missing Value Analysis ---
  Numeric features with missing values:
    TempC_Max                        2497 ( 12.0%)
    TempC_Mean                       2497 ( 12.0%)
    TempC_Min                        2497 ( 12.0%)
    DiasBP_Min                       2209 ( 10.6%)
    DiasBP_Mean                      2209 ( 10.6%)
    DiasBP_Max                       2209 ( 10.6%)
    SysBP_Min                        2208 ( 10.6%)
    SysBP_Mean                       2208 ( 10.

In [8]:
# =============================================================================
# SECTION 7: ENCODE CATEGORICAL FEATURES
# =============================================================================

print("\n" + "="*80)
print("SECTION 7: ENCODING CATEGORICAL FEATURES")
print("="*80)

print("\n--- Categorical Feature Analysis ---")

for cat_col in categorical_features:
    n_unique = X[cat_col].nunique()
    print(f"  {cat_col:25s} {n_unique:4d} unique values")
    
    # Show distribution for low-cardinality features
    if n_unique <= 10:
        print(f"    Distribution:")
        for val, count in X[cat_col].value_counts().head(5).items():
            print(f"      {str(val):30s} {count:6d} ({count/len(X)*100:5.1f}%)")

# Strategy for each categorical feature
print("\n" + "="*80)
print("ENCODING STRATEGY")
print("="*80)

# --- 1. ICD9_diagnosis: Target encode (already have primary_diag_cat) ---
print("\n1. ICD9_diagnosis ‚Üí Target encode")

if 'ICD9_diagnosis' in X.columns:
    def extract_icd9_category(code):
        if pd.isna(code):
            return 'UNKNOWN'
        code_str = str(code).strip().replace('.', '')
        if len(code_str) >= 3:
            return code_str[:3]
        elif len(code_str) > 0:
            return code_str
        return 'UNKNOWN'
    
    X['ICD9_cat'] = X['ICD9_diagnosis'].apply(extract_icd9_category)
    X_test['ICD9_cat'] = X_test['ICD9_diagnosis'].apply(extract_icd9_category)
    
    # Target encode
    encoding_map = y.groupby(X['ICD9_cat']).mean().to_dict()
    global_mean = y.mean()
    
    X['ICD9_encoded'] = X['ICD9_cat'].map(encoding_map)
    X_test['ICD9_encoded'] = X_test['ICD9_cat'].map(encoding_map).fillna(global_mean)
    
    print(f"   ‚úì Encoded {X['ICD9_cat'].nunique()} categories")
    print(f"   Mortality range: {X['ICD9_encoded'].min():.3f} - {X['ICD9_encoded'].max():.3f}")
    
    # Drop original columns
    X = X.drop(['ICD9_diagnosis', 'ICD9_cat'], axis=1)
    X_test = X_test.drop(['ICD9_diagnosis', 'ICD9_cat'], axis=1)
    categorical_features.remove('ICD9_diagnosis')

# --- 2. primary_diag_cat: Target encode ---
print("\n2. primary_diag_cat ‚Üí Target encode")

if 'primary_diag_cat' in X.columns:
    encoding_map = y.groupby(X['primary_diag_cat']).mean().to_dict()
    
    X['primary_diag_encoded'] = X['primary_diag_cat'].map(encoding_map)
    X_test['primary_diag_encoded'] = X_test['primary_diag_cat'].map(encoding_map).fillna(global_mean)
    
    print(f"   ‚úì Encoded {X['primary_diag_cat'].nunique()} categories")
    print(f"   Mortality range: {X['primary_diag_encoded'].min():.3f} - {X['primary_diag_encoded'].max():.3f}")
    
    X = X.drop('primary_diag_cat', axis=1)
    X_test = X_test.drop('primary_diag_cat', axis=1)
    categorical_features.remove('primary_diag_cat')

# --- 3. DIAGNOSIS: Drop (free text, too high cardinality) ---
print("\n3. DIAGNOSIS ‚Üí Drop")

if 'DIAGNOSIS' in categorical_features:
    print(f"   Unique values: {X['DIAGNOSIS'].nunique()}")
    print(f"   ‚úì Dropping (free text, already have ICD9 codes)")
    
    X = X.drop('DIAGNOSIS', axis=1)
    X_test = X_test.drop('DIAGNOSIS', axis=1)
    categorical_features.remove('DIAGNOSIS')

# --- 4. Group low-frequency categories ---
print("\n4. Grouping categorical features")

# ETHNICITY
if 'ETHNICITY' in categorical_features:
    def group_ethnicity(ethnicity):
        if pd.isna(ethnicity):
            return 'UNKNOWN'
        ethnicity = str(ethnicity).upper()
        if 'WHITE' in ethnicity:
            return 'WHITE'
        elif 'BLACK' in ethnicity or 'AFRICAN' in ethnicity:
            return 'BLACK'
        elif 'HISPANIC' in ethnicity or 'LATINO' in ethnicity:
            return 'HISPANIC'
        elif 'ASIAN' in ethnicity:
            return 'ASIAN'
        elif 'AMERICAN INDIAN' in ethnicity or 'ALASKA NATIVE' in ethnicity:
            return 'NATIVE'
        elif 'HAWAIIAN' in ethnicity or 'PACIFIC ISLANDER' in ethnicity:
            return 'PACIFIC_ISLANDER'
        elif any(x in ethnicity for x in ['UNKNOWN', 'UNABLE', 'DECLINED', 'NOT SPECIFIED']):
            return 'UNKNOWN'
        else:
            return 'OTHER'
    
    X['ETHNICITY'] = X['ETHNICITY'].apply(group_ethnicity)
    X_test['ETHNICITY'] = X_test['ETHNICITY'].apply(group_ethnicity)
    
    print(f"   ETHNICITY: {X['ETHNICITY'].nunique()} categories")

# RELIGION
if 'RELIGION' in categorical_features:
    def group_religion(religion):
        if pd.isna(religion):
            return 'UNKNOWN'
        religion = str(religion).upper()
        if 'CATHOLIC' in religion:
            return 'CATHOLIC'
        elif any(x in religion for x in ['PROTESTANT', 'EPISCOPALIAN', 'QUAKER']):
            return 'PROTESTANT'
        elif 'JEWISH' in religion or 'HEBREW' in religion:
            return 'JEWISH'
        elif 'MUSLIM' in religion:
            return 'MUSLIM'
        elif 'ORTHODOX' in religion:
            return 'ORTHODOX'
        elif any(x in religion for x in ['BUDDHIST', 'HINDU', 'JEHOVAH', 'CHRISTIAN SCIENTIST']):
            return 'OTHER_RELIGION'
        elif any(x in religion for x in ['UNOBTAINABLE', 'NOT SPECIFIED', 'UNKNOWN']):
            return 'UNKNOWN'
        else:
            return 'OTHER'
    
    X['RELIGION'] = X['RELIGION'].apply(group_religion)
    X_test['RELIGION'] = X_test['RELIGION'].apply(group_religion)
    
    print(f"   RELIGION: {X['RELIGION'].nunique()} categories")

# MARITAL_STATUS
if 'MARITAL_STATUS' in categorical_features:
    def group_marital_status(status):
        if pd.isna(status):
            return 'UNKNOWN'
        status = str(status).upper()
        if 'MARRIED' in status or 'LIFE PARTNER' in status:
            return 'MARRIED'
        elif 'SINGLE' in status:
            return 'SINGLE'
        elif 'WIDOWED' in status:
            return 'WIDOWED'
        elif 'DIVORCED' in status or 'SEPARATED' in status:
            return 'DIVORCED_SEPARATED'
        else:
            return 'UNKNOWN'
    
    X['MARITAL_STATUS'] = X['MARITAL_STATUS'].apply(group_marital_status)
    X_test['MARITAL_STATUS'] = X_test['MARITAL_STATUS'].apply(group_marital_status)
    
    print(f"   MARITAL_STATUS: {X['MARITAL_STATUS'].nunique()} categories")

# --- 5. One-hot encode remaining categoricals ---
print("\n5. One-hot encoding remaining features")

# disease_category is already categorical, include it
remaining_categorical = [col for col in categorical_features if col in X.columns]
if 'disease_category' in X.columns and 'disease_category' not in remaining_categorical:
    remaining_categorical.append('disease_category')

print(f"   Features to one-hot encode: {remaining_categorical}")

if len(remaining_categorical) > 0:
    # Combine train and test to ensure same columns
    X_combined = pd.concat([X, X_test], keys=['train', 'test'])
    
    # One-hot encode
    X_encoded = pd.get_dummies(
        X_combined, 
        columns=remaining_categorical, 
        drop_first=True,
        dtype=int
    )
    
    # Split back
    X = X_encoded.xs('train')
    X_test = X_encoded.xs('test')
    
    n_new_features = len([col for col in X.columns if any(cat in col for cat in remaining_categorical)])
    print(f"   ‚úì Created {n_new_features} binary features")

# --- Validation ---
print("\n" + "="*80)
print("ENCODING VALIDATION")
print("="*80)

# Check for remaining object columns
object_cols_train = X.select_dtypes(include=['object']).columns.tolist()
object_cols_test = X_test.select_dtypes(include=['object']).columns.tolist()

if object_cols_train or object_cols_test:
    print(f"  ‚ö†Ô∏è Warning: Still have object columns!")
    print(f"    Train: {object_cols_train}")
    print(f"    Test: {object_cols_test}")
else:
    print(f"  ‚úì No object columns remain - all categorical features encoded")

# Check train/test consistency
if list(X.columns) != list(X_test.columns):
    print(f"  ‚ùå ERROR: Train/test column mismatch after encoding!")
    train_only = set(X.columns) - set(X_test.columns)
    test_only = set(X_test.columns) - set(X.columns)
    if train_only:
        print(f"    Only in train: {train_only}")
    if test_only:
        print(f"    Only in test: {test_only}")
else:
    print(f"  ‚úì Train and test have identical columns: {X.shape[1]}")

print(f"\n  Final feature count: {X.shape[1]}")

print("\n" + "="*80)
print("‚úì SECTION 7 COMPLETE - Categorical features encoded")
print("="*80)


SECTION 7: ENCODING CATEGORICAL FEATURES

--- Categorical Feature Analysis ---
  GENDER                       2 unique values
    Distribution:
      M                               11759 ( 56.3%)
      F                                9126 ( 43.7%)
  ADMISSION_TYPE               3 unique values
    Distribution:
      EMERGENCY                       17817 ( 85.3%)
      ELECTIVE                         2848 ( 13.6%)
      URGENT                            220 (  1.1%)
  INSURANCE                    5 unique values
    Distribution:
      Medicare                        11718 ( 56.1%)
      Private                          6245 ( 29.9%)
      Medicaid                         2117 ( 10.1%)
      Government                        611 (  2.9%)
      Self Pay                          194 (  0.9%)
  RELIGION                    17 unique values
  MARITAL_STATUS               7 unique values
    Distribution:
      MARRIED                         10386 ( 49.7%)
      SINGLE                  

In [9]:
# =============================================================================
# SECTION 8: MEDICAL FEATURE ENGINEERING
# =============================================================================

print("\n" + "="*80)
print("SECTION 8: MEDICAL FEATURE ENGINEERING")
print("="*80)

original_feature_count = X.shape[1]

print("\n--- Creating Vital Sign Features ---")

# Blood Pressure Features
if all(col in X.columns for col in ['SysBP_Mean', 'DiasBP_Mean']):
    X['PulsePressure'] = X['SysBP_Mean'] - X['DiasBP_Mean']
    X_test['PulsePressure'] = X_test['SysBP_Mean'] - X_test['DiasBP_Mean']
    print("  ‚úì Pulse pressure")

if all(col in X.columns for col in ['SysBP_Min', 'SysBP_Max']):
    X['SysBP_Range'] = X['SysBP_Max'] - X['SysBP_Min']
    X_test['SysBP_Range'] = X_test['SysBP_Max'] - X_test['SysBP_Min']
    print("  ‚úì Systolic BP range")

# Shock Indices (critical for ICU mortality)
if all(col in X.columns for col in ['HeartRate_Mean', 'SysBP_Mean']):
    X['ShockIndex'] = (X['HeartRate_Mean'] / (X['SysBP_Mean'] + 1)).clip(0, 3)
    X_test['ShockIndex'] = (X_test['HeartRate_Mean'] / (X_test['SysBP_Mean'] + 1)).clip(0, 3)
    print("  ‚úì Shock index (clipped 0-3)")

if all(col in X.columns for col in ['HeartRate_Mean', 'MeanBP_Mean']):
    X['ModifiedShockIndex'] = (X['HeartRate_Mean'] / (X['MeanBP_Mean'] + 1)).clip(0, 3)
    X_test['ModifiedShockIndex'] = (X_test['HeartRate_Mean'] / (X_test['MeanBP_Mean'] + 1)).clip(0, 3)
    print("  ‚úì Modified shock index (clipped 0-3)")

# Respiratory Features
if 'SpO2_Min' in X.columns:
    X['Hypoxemia'] = (X['SpO2_Min'] < 90).astype(int)
    X_test['Hypoxemia'] = (X_test['SpO2_Min'] < 90).astype(int)
    print("  ‚úì Hypoxemia indicator")

if 'RespRate_Mean' in X.columns:
    X['RespRate_Abnormal'] = ((X['RespRate_Mean'] < 12) | (X['RespRate_Mean'] > 20)).astype(int)
    X_test['RespRate_Abnormal'] = ((X_test['RespRate_Mean'] < 12) | (X_test['RespRate_Mean'] > 20)).astype(int)
    print("  ‚úì Abnormal respiratory rate")

# Temperature Features
if 'TempC_Max' in X.columns:
    X['Fever'] = (X['TempC_Max'] > 38).astype(int)
    X_test['Fever'] = (X_test['TempC_Max'] > 38).astype(int)
    print("  ‚úì Fever indicator")

if 'TempC_Min' in X.columns:
    X['Hypothermia'] = (X['TempC_Min'] < 36).astype(int)
    X_test['Hypothermia'] = (X_test['TempC_Min'] < 36).astype(int)
    print("  ‚úì Hypothermia indicator")

if all(col in X.columns for col in ['TempC_Min', 'TempC_Max']):
    X['Temp_Range'] = X['TempC_Max'] - X['TempC_Min']
    X_test['Temp_Range'] = X_test['TempC_Max'] - X_test['TempC_Min']
    print("  ‚úì Temperature range")

# Glucose Features  
if 'Glucose_Max' in X.columns:
    X['Hyperglycemia'] = (X['Glucose_Max'] > 180).astype(int)
    X_test['Hyperglycemia'] = (X_test['Glucose_Max'] > 180).astype(int)
    print("  ‚úì Hyperglycemia indicator")

if 'Glucose_Min' in X.columns:
    X['Hypoglycemia'] = (X['Glucose_Min'] < 70).astype(int)
    X_test['Hypoglycemia'] = (X_test['Glucose_Min'] < 70).astype(int)
    print("  ‚úì Hypoglycemia indicator")

if all(col in X.columns for col in ['Glucose_Min', 'Glucose_Max']):
    X['Glucose_Range'] = X['Glucose_Max'] - X['Glucose_Min']
    X_test['Glucose_Range'] = X_test['Glucose_Max'] - X_test['Glucose_Min']
    print("  ‚úì Glucose variability")

# Age Features
print("\n--- Creating Age-Based Features ---")

if 'age' in X.columns:
    # Elderly indicator
    X['Elderly'] = (X['age'] > 65).astype(int)
    X_test['Elderly'] = (X_test['age'] > 65).astype(int)
    print("  ‚úì Elderly indicator (>65)")
    
    # Age squared (non-linear effects)
    X['age_squared'] = X['age'] ** 2
    X_test['age_squared'] = X_test['age'] ** 2
    print("  ‚úì Age squared")
    
    # Age risk groups
    age_bins = [0, 18, 45, 65, 80, 120]
    age_labels = ['pediatric', 'young_adult', 'middle_age', 'elderly', 'very_old']
    
    X['age_group'] = pd.cut(X['age'], bins=age_bins, labels=age_labels)
    X_test['age_group'] = pd.cut(X_test['age'], bins=age_bins, labels=age_labels)
    
    # One-hot encode age groups
    X_combined = pd.concat([X, X_test], keys=['train', 'test'])
    X_encoded = pd.get_dummies(X_combined, columns=['age_group'], drop_first=True, prefix='age', dtype=int)
    X = X_encoded.xs('train')
    X_test = X_encoded.xs('test')
    
    print("  ‚úì Age risk groups (one-hot encoded)")

# Heart Rate Variability
if all(col in X.columns for col in ['HeartRate_Min', 'HeartRate_Max']):
    X['HeartRate_Range'] = X['HeartRate_Max'] - X['HeartRate_Min']
    X_test['HeartRate_Range'] = X_test['HeartRate_Max'] - X_test['HeartRate_Min']
    print("  ‚úì Heart rate range")

# Composite Severity Score
print("\n--- Creating Composite Severity Score ---")

severity_components = []

if 'ShockIndex' in X.columns:
    severity_components.append((X['ShockIndex'] > 0.9).astype(int))
if 'Hypoxemia' in X.columns:
    severity_components.append(X['Hypoxemia'])
if 'RespRate_Abnormal' in X.columns:
    severity_components.append(X['RespRate_Abnormal'])
if 'Fever' in X.columns:
    severity_components.append(X['Fever'])
if 'Hypothermia' in X.columns:
    severity_components.append(X['Hypothermia'])

if severity_components:
    X['Severity_Score'] = sum(severity_components)
    
    # Repeat for test
    severity_components_test = []
    if 'ShockIndex' in X_test.columns:
        severity_components_test.append((X_test['ShockIndex'] > 0.9).astype(int))
    if 'Hypoxemia' in X_test.columns:
        severity_components_test.append(X_test['Hypoxemia'])
    if 'RespRate_Abnormal' in X_test.columns:
        severity_components_test.append(X_test['RespRate_Abnormal'])
    if 'Fever' in X_test.columns:
        severity_components_test.append(X_test['Fever'])
    if 'Hypothermia' in X_test.columns:
        severity_components_test.append(X_test['Hypothermia'])
    
    X_test['Severity_Score'] = sum(severity_components_test)
    
    print(f"  ‚úì Severity score (0-{len(severity_components)})")
    print(f"    Distribution: {X['Severity_Score'].value_counts().sort_index().to_dict()}")

# Summary
new_feature_count = X.shape[1]
added_features = new_feature_count - original_feature_count

print(f"\n--- Feature Engineering Summary ---")
print(f"  Features before: {original_feature_count}")
print(f"  Features after: {new_feature_count}")
print(f"  Features added: {added_features}")

# Validate binary features have variance
print("\n--- Binary Feature Validation ---")

binary_features = [
    'Hypoxemia', 'RespRate_Abnormal', 'Fever', 'Hypothermia',
    'Hyperglycemia', 'Hypoglycemia', 'Elderly'
]

for feat in binary_features:
    if feat in X.columns:
        var = X[feat].var()
        unique = X[feat].nunique()
        if var == 0 or unique == 1:
            print(f"  ‚ö†Ô∏è {feat}: variance={var:.4f}, unique={unique} (CONSTANT!)")
        else:
            print(f"  ‚úì {feat}: variance={var:.4f}, unique={unique}")

print("\n" + "="*80)
print("‚úì SECTION 8 COMPLETE - Medical features engineered")
print("="*80)


SECTION 8: MEDICAL FEATURE ENGINEERING

--- Creating Vital Sign Features ---
  ‚úì Pulse pressure
  ‚úì Systolic BP range
  ‚úì Shock index (clipped 0-3)
  ‚úì Modified shock index (clipped 0-3)
  ‚úì Hypoxemia indicator
  ‚úì Abnormal respiratory rate
  ‚úì Fever indicator
  ‚úì Hypothermia indicator
  ‚úì Temperature range
  ‚úì Hyperglycemia indicator
  ‚úì Hypoglycemia indicator
  ‚úì Glucose variability

--- Creating Age-Based Features ---
  ‚úì Elderly indicator (>65)
  ‚úì Age squared
  ‚úì Age risk groups (one-hot encoded)
  ‚úì Heart rate range

--- Creating Composite Severity Score ---
  ‚úì Severity score (0-5)
    Distribution: {0: 6837, 1: 6834, 2: 4541, 3: 2088, 4: 537, 5: 48}

--- Feature Engineering Summary ---
  Features before: 75
  Features after: 95
  Features added: 20

--- Binary Feature Validation ---
  ‚úì Hypoxemia: variance=0.1580, unique=2
  ‚úì RespRate_Abnormal: variance=0.2120, unique=2
  ‚úì Fever: variance=0.1424, unique=2
  ‚úì Hypothermia: variance=0.

In [10]:
# =============================================================================
# SECTION 9: FEATURE SCALING (CORRECTED)
# =============================================================================

print("\n" + "="*80)
print("SECTION 9: FEATURE SCALING (CORRECTED)")
print("="*80)

print("\n--- Identifying Features to Scale ---")

# Get all numeric columns
all_numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()

print(f"  Total numeric columns: {len(all_numeric_cols)}")

# Features that should NOT be scaled
binary_features_list = [
    # Medical binary indicators
    'Hypoxemia', 'RespRate_Abnormal', 'Fever', 'Hypothermia',
    'Hyperglycemia', 'Hypoglycemia', 'Elderly',
    
    # Hospital history - binary flags
    'is_first_icu_visit', 
    'is_frequent_flyer',
    
    # Condition flags - binary
    'has_sepsis', 'has_heart_failure', 'has_respiratory_failure',
    'has_aki', 'has_diabetes', 'has_copd', 'has_pneumonia'
]

# COUNT FEATURES - CRITICAL: DO NOT SCALE THESE!
count_features = [
    'n_previous_icu_stays',  # Count of previous ICU visits
    'n_diagnoses'            # Count of diagnoses
]

# ORDINAL FEATURES - DO NOT SCALE
ordinal_features = [
    'Severity_Score'  # Ordinal score 0-5
]

# One-hot encoded features (all contain underscore and are binary)
one_hot_features = [col for col in X.columns if '_' in col and X[col].nunique() <= 2]

print(f"  Binary indicator features: {len(binary_features_list)}")
print(f"  Count features (DO NOT SCALE): {len(count_features)}")
print(f"  Ordinal features (DO NOT SCALE): {len(ordinal_features)}")
print(f"  One-hot encoded features: {len(one_hot_features)}")

# Combine ALL features to exclude from scaling
exclude_from_scaling = list(set(
    binary_features_list + 
    count_features + 
    ordinal_features + 
    one_hot_features
))
exclude_from_scaling = [col for col in exclude_from_scaling if col in all_numeric_cols]

# Features to scale = numeric features - excluded features
features_to_scale = [col for col in all_numeric_cols if col not in exclude_from_scaling]

print(f"\n  Features to scale: {len(features_to_scale)}")
print(f"  Features to keep unscaled: {len(exclude_from_scaling)}")

# Show what we're doing with count features
print(f"\n--- Count Features (NOT scaling) ---")
for feat in count_features:
    if feat in X.columns:
        print(f"  {feat}: range [{X[feat].min():.0f}, {X[feat].max():.0f}], mean {X[feat].mean():.1f}")

# Show sample
print(f"\n--- Sample Features ---")
print(f"  Scaling (first 10): {features_to_scale[:10]}")
print(f"  Not scaling (first 10): {exclude_from_scaling[:10]}")

# Scale continuous features only
print(f"\n--- Applying StandardScaler ---")

scaler = StandardScaler()

X[features_to_scale] = scaler.fit_transform(X[features_to_scale])
X_test[features_to_scale] = scaler.transform(X_test[features_to_scale])

print(f"  ‚úì Scaled {len(features_to_scale)} continuous features")
print(f"  ‚úì Left {len(exclude_from_scaling)} features unscaled")

# Validation
print(f"\n--- Scaling Validation ---")

# Check 1: Scaled features should have mean ‚âà 0, std ‚âà 1
print(f"\n  Checking scaled feature statistics (sample):")
sample_features = features_to_scale[:5]
for feat in sample_features:
    mean = X[feat].mean()
    std = X[feat].std()
    print(f"    {feat:30s} mean={mean:7.4f}, std={std:7.4f}")
    
    if abs(mean) > 0.1:
        print(f"      ‚ö†Ô∏è Mean not close to 0!")
    if abs(std - 1.0) > 0.1:
        print(f"      ‚ö†Ô∏è Std not close to 1!")

# Check 2: Count features should still be integers
print(f"\n  Checking count features remain integers:")
for feat in count_features:
    if feat in X.columns:
        min_val = X[feat].min()
        max_val = X[feat].max()
        mean_val = X[feat].mean()
        print(f"    {feat:25s} range=[{min_val:.1f}, {max_val:.1f}], mean={mean_val:.1f}")
        
        if min_val < -1:  # If negative, it was scaled!
            print(f"      ‚ùå ERROR: This was scaled! Values should be positive counts!")
        else:
            print(f"      ‚úì Looks correct")

# Check 3: Binary features should still be 0/1
print(f"\n  Checking binary features remain 0/1:")
binary_check_passed = True

for feat in binary_features_list:
    if feat in X.columns:
        unique_vals = set(X[feat].unique())
        if not unique_vals.issubset({0, 1, 0.0, 1.0}):
            print(f"    ‚ùå {feat}: values are {sorted(unique_vals)[:5]}")
            binary_check_passed = False

if binary_check_passed:
    print(f"    ‚úì All binary features remain 0/1")

# Check 4: No NaN or Inf introduced by scaling
nan_count = X.isnull().sum().sum()
inf_count = np.isinf(X.select_dtypes(include=[np.number])).sum().sum()

print(f"\n  Checking for invalid values after scaling:")
print(f"    NaN values: {nan_count}")
print(f"    Infinite values: {inf_count}")

if nan_count > 0 or inf_count > 0:
    print(f"    ‚ùå ERROR: Scaling introduced invalid values!")
else:
    print(f"    ‚úì No invalid values")

# Check 5: Train and test still have same columns
if list(X.columns) != list(X_test.columns):
    print(f"\n  ‚ùå ERROR: Column mismatch after scaling!")
else:
    print(f"\n  ‚úì Train and test still have identical columns")

print("\n" + "="*80)
print("‚úì SECTION 9 COMPLETE - Features scaled CORRECTLY")
print("="*80)


SECTION 9: FEATURE SCALING (CORRECTED)

--- Identifying Features to Scale ---
  Total numeric columns: 95
  Binary indicator features: 16
  Count features (DO NOT SCALE): 2
  Ordinal features (DO NOT SCALE): 1
  One-hot encoded features: 51

  Features to scale: 35
  Features to keep unscaled: 60

--- Count Features (NOT scaling) ---
  n_previous_icu_stays: range [0, 24], mean 0.4
  n_diagnoses: range [1, 39], mean 14.8

--- Sample Features ---
  Scaling (first 10): ['HeartRate_Min', 'HeartRate_Max', 'HeartRate_Mean', 'SysBP_Min', 'SysBP_Max', 'SysBP_Mean', 'DiasBP_Min', 'DiasBP_Max', 'DiasBP_Mean', 'MeanBP_Min']
  Not scaling (first 10): ['n_previous_icu_stays', 'MARITAL_STATUS_WIDOWED', 'age_elderly', 'Hypoglycemia', 'has_aki', 'ETHNICITY_BLACK', 'Severity_Score', 'FIRST_CAREUNIT_TSICU', 'ETHNICITY_WHITE', 'Elderly']

--- Applying StandardScaler ---
  ‚úì Scaled 35 continuous features
  ‚úì Left 60 features unscaled

--- Scaling Validation ---

  Checking scaled feature statistics (

In [11]:
# =============================================================================
# SECTION 10: FINAL VALIDATION AND SAVE
# =============================================================================

print("\n" + "="*80)
print("SECTION 10: FINAL VALIDATION & SAVE")
print("="*80)

print("\n" + "="*80)
print("COMPREHENSIVE FINAL VALIDATION")
print("="*80)

validation_passed = True
issues = []

# ============================================================================
# Check 1: Shape Consistency
# ============================================================================
print("\n1. Shape Consistency")
print(f"   X_train: {X.shape}")
print(f"   y_train: {y.shape}")
print(f"   X_test: {X_test.shape}")
print(f"   test_ids: {len(test_ids)}")

if X.shape[0] != y.shape[0]:
    issues.append("‚ùå X and y have different number of samples")
    validation_passed = False
else:
    print(f"   ‚úì X and y have matching samples: {X.shape[0]}")

if X.shape[1] != X_test.shape[1]:
    issues.append("‚ùå X_train and X_test have different number of features")
    validation_passed = False
else:
    print(f"   ‚úì Train and test have matching features: {X.shape[1]}")

if X_test.shape[0] != len(test_ids):
    issues.append("‚ùå X_test and test_ids have different lengths")
    validation_passed = False
else:
    print(f"   ‚úì Test set and test_ids match: {len(test_ids)}")

# ============================================================================
# Check 2: No Missing Values
# ============================================================================
print("\n2. Missing Values")

X_missing = X.isnull().sum().sum()
y_missing = y.isnull().sum()
X_test_missing = X_test.isnull().sum().sum()

print(f"   X_train: {X_missing} missing")
print(f"   y_train: {y_missing} missing")
print(f"   X_test: {X_test_missing} missing")

if X_missing > 0 or y_missing > 0 or X_test_missing > 0:
    issues.append("‚ùå Missing values present")
    validation_passed = False
else:
    print(f"   ‚úì No missing values")

# ============================================================================
# Check 3: No Infinite Values
# ============================================================================
print("\n3. Infinite Values")

X_inf = np.isinf(X.select_dtypes(include=[np.number])).sum().sum()
X_test_inf = np.isinf(X_test.select_dtypes(include=[np.number])).sum().sum()

print(f"   X_train: {X_inf} infinite")
print(f"   X_test: {X_test_inf} infinite")

if X_inf > 0 or X_test_inf > 0:
    issues.append("‚ùå Infinite values present")
    validation_passed = False
else:
    print(f"   ‚úì No infinite values")

# ============================================================================
# Check 4: Column Names Match
# ============================================================================
print("\n4. Column Consistency")

if list(X.columns) == list(X_test.columns):
    print(f"   ‚úì Train and test have identical column names")
else:
    issues.append("‚ùå Column names don't match")
    validation_passed = False
    
    train_only = set(X.columns) - set(X_test.columns)
    test_only = set(X_test.columns) - set(X.columns)
    
    if train_only:
        print(f"   Columns only in train: {train_only}")
    if test_only:
        print(f"   Columns only in test: {test_only}")

# ============================================================================
# Check 5: Data Types
# ============================================================================
print("\n5. Data Types")

# All should be numeric
non_numeric_train = X.select_dtypes(exclude=[np.number]).columns.tolist()
non_numeric_test = X_test.select_dtypes(exclude=[np.number]).columns.tolist()

if non_numeric_train or non_numeric_test:
    issues.append("‚ùå Non-numeric columns present")
    validation_passed = False
    print(f"   Non-numeric in train: {non_numeric_train}")
    print(f"   Non-numeric in test: {non_numeric_test}")
else:
    print(f"   ‚úì All features are numeric")

# ============================================================================
# Check 6: Target Distribution
# ============================================================================
print("\n6. Target Distribution")

target_mean = y.mean()
target_count = y.sum()

print(f"   Mortality rate: {target_mean:.3f} ({target_count}/{len(y)})")
print(f"   Class 0: {(y==0).sum()}")
print(f"   Class 1: {(y==1).sum()}")

expected_mortality = 0.112
if abs(target_mean - expected_mortality) > 0.01:
    issues.append(f"‚ö†Ô∏è Target distribution changed: {target_mean:.3f} vs expected {expected_mortality:.3f}")
else:
    print(f"   ‚úì Mortality rate matches expected (~11.2%)")

# ============================================================================
# Check 7: Feature Value Ranges
# ============================================================================
print("\n7. Feature Value Ranges (sample)")

# Check a few critical features
critical_features = {
    'age': (0, 120),
    'ShockIndex': (0, 3),
    'ModifiedShockIndex': (0, 3),
    'Severity_Score': (0, 5)
}

for feat, (expected_min, expected_max) in critical_features.items():
    if feat in X.columns:
        actual_min = X[feat].min()
        actual_max = X[feat].max()
        
        # For scaled features, ranges will be different
        if feat in features_to_scale:
            print(f"   {feat}: [{actual_min:.2f}, {actual_max:.2f}] (scaled)")
        else:
            print(f"   {feat}: [{actual_min:.2f}, {actual_max:.2f}]", end="")
            
            if actual_min < expected_min or actual_max > expected_max:
                print(f" ‚ö†Ô∏è outside expected [{expected_min}, {expected_max}]")
            else:
                print(f" ‚úì")

# ============================================================================
# Check 8: Binary Features
# ============================================================================
print("\n8. Binary Features Have Variance")

for feat in ['Hypoxemia', 'Fever', 'Elderly', 'is_first_icu_visit']:
    if feat in X.columns:
        var = X[feat].var()
        unique = X[feat].nunique()
        
        if var == 0:
            issues.append(f"‚ùå {feat} has zero variance")
            validation_passed = False
            print(f"   ‚ùå {feat}: variance={var:.4f}, unique={unique}")
        else:
            print(f"   ‚úì {feat}: variance={var:.4f}, unique={unique}")

# ============================================================================
# FINAL VERDICT
# ============================================================================
print("\n" + "="*80)

if validation_passed and len(issues) == 0:
    print("‚úÖ ALL VALIDATION CHECKS PASSED!")
    print("="*80)
    print("\nüéâ Data is ready for modeling!")
    
else:
    print("üö® VALIDATION ISSUES FOUND")
    print("="*80)
    
    if issues:
        print("\nIssues:")
        for i, issue in enumerate(issues, 1):
            print(f"  {i}. {issue}")
    
    print("\n‚ö†Ô∏è Review and fix issues before training models!")




SECTION 10: FINAL VALIDATION & SAVE

COMPREHENSIVE FINAL VALIDATION

1. Shape Consistency
   X_train: (20885, 95)
   y_train: (20885,)
   X_test: (5221, 95)
   test_ids: 5221
   ‚úì X and y have matching samples: 20885
   ‚úì Train and test have matching features: 95
   ‚úì Test set and test_ids match: 5221

2. Missing Values
   X_train: 0 missing
   y_train: 0 missing
   X_test: 0 missing
   ‚úì No missing values

3. Infinite Values
   X_train: 0 infinite
   X_test: 0 infinite
   ‚úì No infinite values

4. Column Consistency
   ‚úì Train and test have identical column names

5. Data Types
   ‚úì All features are numeric

6. Target Distribution
   Mortality rate: 0.112 (2345/20885)
   Class 0: 18540
   Class 1: 2345
   ‚úì Mortality rate matches expected (~11.2%)

7. Feature Value Ranges (sample)
   age: [-2.25, 2.15] (scaled)
   ShockIndex: [-3.11, 8.91] (scaled)
   ModifiedShockIndex: [-3.24, 8.34] (scaled)
   Severity_Score: [0.00, 5.00] ‚úì

8. Binary Features Have Variance
   ‚úì

In [12]:
# ============================================================================
# SAVE PROCESSED DATA
# ============================================================================

if validation_passed:
    print("\n" + "="*80)
    print("SAVING PROCESSED DATA")
    print("="*80)
    
    import os
    save_dir = Path("../data/processed_final")
    os.makedirs(save_dir, exist_ok=True)
    
    # Save datasets
    X.to_pickle(save_dir / 'X_train_processed.pkl')
    y.to_pickle(save_dir / 'y_train.pkl')
    X_test.to_pickle(save_dir / 'X_test_processed.pkl')
    test_ids.to_pickle(save_dir / 'test_ids.pkl')
    
    print(f"\n‚úì Saved processed datasets to {save_dir}/")
    
    # Save preprocessing objects
    with open(save_dir / 'numeric_imputer.pkl', 'wb') as f:
        pickle.dump(numeric_imputer, f)
    with open(save_dir / 'categorical_imputer.pkl', 'wb') as f:
        pickle.dump(categorical_imputer, f)
    with open(save_dir / 'scaler.pkl', 'wb') as f:
        pickle.dump(scaler, f)
    
    print(f"‚úì Saved preprocessing objects")
    
    # Save feature lists for reference
    feature_info = {
        'all_features': X.columns.tolist(),
        'features_to_scale': features_to_scale,
        'binary_features': exclude_from_scaling,
        'n_features': X.shape[1]
    }
    
    with open(save_dir / 'feature_info.pkl', 'wb') as f:
        pickle.dump(feature_info, f)
    
    print(f"‚úì Saved feature metadata")
    
    # Create summary report
    summary = f"""
PREPROCESSING SUMMARY
{'='*80}

Dataset Statistics:
  Training samples: {X.shape[0]:,}
  Test samples: {X_test.shape[0]:,}
  Total features: {X.shape[1]}
  
Feature Breakdown:
  Continuous (scaled): {len(features_to_scale)}
  Binary (unscaled): {len(exclude_from_scaling)}
  
Target Distribution:
  Mortality rate: {y.mean():.3f}
  Deaths: {y.sum()} / {len(y)}
  
Feature Groups:
  - Original vitals: ~24
  - Hospital history: 3
  - ICD9 diagnoses: ~7
  - Condition flags: 7
  - Engineered vitals: ~13
  - Age features: ~6
  - One-hot encoded: ~35

Validation Status:  ALL CHECKS PASSED

Files Saved:
  - X_train_processed.pkl
  - y_train.pkl
  - X_test_processed.pkl
  - test_ids.pkl
  - numeric_imputer.pkl
  - categorical_imputer.pkl
  - scaler.pkl
  - feature_info.pkl

Ready for modeling! 
"""
    
    with open(save_dir / 'preprocessing_summary.txt', 'w') as f:
        f.write(summary)
    
    print(f"‚úì Saved preprocessing summary")
    
    print(summary)

else:
    print("\n Data NOT saved due to validation failures")
    print("   Fix the issues above and re-run preprocessing")

print("\n" + "="*80)
print("‚úì SECTION 10 COMPLETE")
print("="*80)

print("\n" + "="*80)
print("PREPROCESSING PIPELINE COMPLETE!")
print("="*80)


SAVING PROCESSED DATA

‚úì Saved processed datasets to ..\data\processed_final/
‚úì Saved preprocessing objects
‚úì Saved feature metadata
‚úì Saved preprocessing summary

PREPROCESSING SUMMARY

Dataset Statistics:
  Training samples: 20,885
  Test samples: 5,221
  Total features: 95

Feature Breakdown:
  Continuous (scaled): 35
  Binary (unscaled): 60

Target Distribution:
  Mortality rate: 0.112
  Deaths: 2345 / 20885

Feature Groups:
  - Original vitals: ~24
  - Hospital history: 3
  - ICD9 diagnoses: ~7
  - Condition flags: 7
  - Engineered vitals: ~13
  - Age features: ~6
  - One-hot encoded: ~35

Validation Status:  ALL CHECKS PASSED

Files Saved:
  - X_train_processed.pkl
  - y_train.pkl
  - X_test_processed.pkl
  - test_ids.pkl
  - numeric_imputer.pkl
  - categorical_imputer.pkl
  - scaler.pkl
  - feature_info.pkl

Ready for modeling! 


‚úì SECTION 10 COMPLETE

PREPROCESSING PIPELINE COMPLETE!
