=============================================================================
COMPUTATIONAL MACHINE LEARNING - FINAL PROJECT 2025
Task: HOSPITAL_EXPIRE_FLAG Classification (Binary Classification)
Student: Corneel Moons


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
import warnings
from datetime import datetime
from pathlib import Path
# =============================================================================
# 1. LOAD DATA
# =============================================================================
data_path = Path("../../data/")   # full path to the HEF folder

train = pd.read_csv(data_path / "mimic_train_HEF.csv")
test = pd.read_csv(data_path / "mimic_test_HEF.csv")

train.shape, test.shape

# =============================================================================
# 2. DROP LEAKAGE AND IRRELEVANT COLUMNS
# =============================================================================
print("\n" + "="*70)
print("DROPPING LEAKAGE/IRRELEVANT COLUMNS")
print("="*70)

# Based on your EDA and metadata analysis
columns_to_drop = [
    'DISCHTIME',      # Discharge time (not available at admission)
    'DEATHTIME',      # Death time (direct leakage of target)
    'DOD',            # Date of death (direct leakage)
    'LOS',            # Length of stay (not available at admission, also target for other task)
    'subject_id',     # Patient ID (not predictive)
    'hadm_id',        # Hospital admission ID (not predictive)
    'icustay_id',     # ICU stay ID (not predictive)
    'ADMITTIME',      # Admission time (temporal info, not useful for prediction)
    'Diff'            # Time difference feature (leakage)
]

# Drop from train
train_clean = train.drop(columns=columns_to_drop, errors='ignore')

# Drop from test (these columns should match, except target won't be in test)
test_clean = test.drop(columns=columns_to_drop, errors='ignore')

print(f"Columns dropped: {columns_to_drop}")
print(f"Train shape after dropping: {train_clean.shape}")
print(f"Test shape after dropping: {test_clean.shape}")

# =============================================================================
# 3. SEPARATE FEATURES AND TARGET
# =============================================================================
print("\n" + "="*70)
print("SEPARATING FEATURES AND TARGET")
print("="*70)

# Target variable
y = train_clean['HOSPITAL_EXPIRE_FLAG']
X = train_clean.drop('HOSPITAL_EXPIRE_FLAG', axis=1)
X_test = test_clean.copy()  # Test set has no target

print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"\nTarget distribution:\n{y.value_counts(normalize=True)}")
# =============================================================================
# 4. IDENTIFY FEATURE TYPES
# =============================================================================
print("\n" + "="*70)
print("IDENTIFYING FEATURE TYPES")
print("="*70)

# Numeric features (continuous or count data)
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Categorical features (object type or few unique values)
categorical_features = X.select_dtypes(include=['object']).columns.tolist()

print(f"Numeric features ({len(numeric_features)}): {numeric_features}")
print(f"Categorical features ({len(categorical_features)}): {categorical_features}")
# =============================================================================
# 5. HANDLE MISSING VALUES
# =============================================================================
print("\n" + "="*70)
print("HANDLING MISSING VALUES")
print("="*70)

# Following course slides: Week 2 - Working with data
# - Numeric: median imputation (robust to outliers)
# - Categorical: mode imputation (most frequent)

# Impute numeric features with median
numeric_imputer = SimpleImputer(strategy='median')
X[numeric_features] = numeric_imputer.fit_transform(X[numeric_features])
X_test[numeric_features] = numeric_imputer.transform(X_test[numeric_features])

# Impute categorical features with most frequent
categorical_imputer = SimpleImputer(strategy='most_frequent')
X[categorical_features] = categorical_imputer.fit_transform(X[categorical_features])
X_test[categorical_features] = categorical_imputer.transform(X_test[categorical_features])

print("Missing values imputed:")
print(f"  - Numeric: median")
print(f"  - Categorical: most frequent")
print(f"\nMissing values remaining in X: {X.isnull().sum().sum()}")
print(f"Missing values remaining in X_test: {X_test.isnull().sum().sum()}")
# =============================================================================
# 6. ENCODE CATEGORICAL VARIABLES
# =============================================================================
print("\n" + "="*70)
print("ENCODING CATEGORICAL VARIABLES")
print("="*70)

# Following course slides: Week 2 - Dealing with categorical data
# Using one-hot encoding for nominal variables

# Combine train and test for consistent encoding
X_combined = pd.concat([X, X_test], keys=['train', 'test'])

# One-hot encode categorical features
X_encoded = pd.get_dummies(X_combined, columns=categorical_features, drop_first=True)

# Split back into train and test
X = X_encoded.xs('train')
X_test = X_encoded.xs('test')

print(f"After encoding:")
print(f"  X shape: {X.shape}")
print(f"  X_test shape: {X_test.shape}")
# =============================================================================
# 7. FEATURE SCALING
# =============================================================================
print("\n" + "="*70)
print("FEATURE SCALING")
print("="*70)

# Following course slides: Week 2 - Normalisation (feature scaling)
# Using StandardScaler (z-score normalization) for numeric features

# Identify numeric columns after encoding (one-hot creates binary columns)
# Only scale original numeric features
numeric_cols_to_scale = [col for col in numeric_features if col in X.columns]

scaler = StandardScaler()
X[numeric_cols_to_scale] = scaler.fit_transform(X[numeric_cols_to_scale])
X_test[numeric_cols_to_scale] = scaler.transform(X_test[numeric_cols_to_scale])

print(f"Scaled {len(numeric_cols_to_scale)} numeric features using StandardScaler")
# =============================================================================
# 8. TRAIN BASELINE MODELS
# =============================================================================
print("\n" + "="*70)
print("TRAINING BASELINE MODELS")
print("="*70)

# Following course structure: Week 3-7 covered these algorithms
# We'll start with Logistic Regression and Random Forest

# Model 1: Logistic Regression (Week 3)
print("\n--- Logistic Regression ---")
logreg = LogisticRegression(
    max_iter=1000, 
    random_state=42,
    class_weight='balanced'  # Handle class imbalance
)
logreg.fit(X, y)

# Model 2: Random Forest (Week 7 - Ensemble methods)
print("--- Random Forest ---")
rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    random_state=42,
    class_weight='balanced',  # Handle class imbalance
    n_jobs=-1  # Use all CPU cores
)
rf.fit(X, y)

print("\nModels trained successfully!")
# =============================================================================
# 9. EVALUATE MODELS (TRAINING SET PERFORMANCE)
# =============================================================================
print("\n" + "="*70)
print("MODEL EVALUATION ON TRAINING SET")
print("="*70)

# Get predictions on training set (for sanity check only)
y_pred_logreg = logreg.predict(X)
y_pred_rf = rf.predict(X)

# Get predicted probabilities (what we'll submit to Kaggle)
y_proba_logreg = logreg.predict_proba(X)[:, 1]
y_proba_rf = rf.predict_proba(X)[:, 1]

# Calculate metrics
print("\n--- Logistic Regression ---")
print(f"Training Accuracy: {logreg.score(X, y):.4f}")
print(f"Training ROC-AUC: {roc_auc_score(y, y_proba_logreg):.4f}")
print("\nClassification Report:")
print(classification_report(y, y_pred_logreg))

print("\n--- Random Forest ---")
print(f"Training Accuracy: {rf.score(X, y):.4f}")
print(f"Training ROC-AUC: {roc_auc_score(y, y_proba_rf):.4f}")
print("\nClassification Report:")
print(classification_report(y, y_pred_rf))
# =============================================================================
# 10. GENERATE PREDICTIONS FOR TEST SET
# =============================================================================
print("\n" + "="*70)
print("GENERATING TEST SET PREDICTIONS")
print("="*70)

# Following assignment instructions: "present your predictions in probabilities"
# Using predict_proba() to get probability of positive class (HOSPITAL_EXPIRE_FLAG = 1)

# Logistic Regression predictions
test_proba_logreg = logreg.predict_proba(X_test)[:, 1]

# Random Forest predictions
test_proba_rf = rf.predict_proba(X_test)[:, 1]

print(f"Generated probabilities for {len(test_proba_logreg)} test samples")
print(f"Probability range - Logistic Regression: [{test_proba_logreg.min():.4f}, {test_proba_logreg.max():.4f}]")
print(f"Probability range - Random Forest: [{test_proba_rf.min():.4f}, {test_proba_rf.max():.4f}]")

# =============================================================================
# 11. CREATE SUBMISSION FILES
# =============================================================================
print("\n" + "="*70)
print("CREATING SUBMISSION FILES")
print("="*70)

# Ensure output folder exists
output_dir = Path("../outputs/predictions")
# Current date as YYYY-MM-DD
date_str = datetime.now().strftime("%Y-%m-%d")

# IMPORTANT: Get the icustay_id from the ORIGINAL test file (before dropping it)
# The predictions are in the same order as the test set rows
test_ids = test['icustay_id'].values

print(f"Number of test IDs: {len(test_ids)}")
print(f"Number of predictions: {len(test_proba_rf)}")
assert len(test_ids) == len(test_proba_rf), "Mismatch between IDs and predictions!"

# Submission 1: Logistic Regression
output_dir = Path("../outputs/predictions/Logistic_Regression/")
output_dir.mkdir(parents=True, exist_ok=True)

submission_logreg = pd.DataFrame({
    'icustay_id': test_ids,
    'prediction': test_proba_logreg
})
submission_logreg.to_csv(output_dir / f'moons_corneel_CML_2025_logreg_{date_str}.csv', index=False)
print(f"✓ Logistic Regression submission saved: {output_dir / f'moons_corneel_CML_2025_logreg_{date_str}.csv'}")
print(f"  Shape: {submission_logreg.shape}")
print(f"  Columns: {list(submission_logreg.columns)}")

# Submission 2: Random Forest
output_dir = Path("../outputs/predictions/Random_Forest/")
output_dir.mkdir(parents=True, exist_ok=True)

submission_rf = pd.DataFrame({
    'icustay_id': test_ids,
    'prediction': test_proba_rf
})
submission_rf.to_csv(output_dir / f'moons_corneel_CML_2025_rf_{date_str}.csv', index=False)
print(f"✓ Random Forest submission saved: {output_dir / f'moons_corneel_CML_2025_rf_{date_str}.csv'}")
print(f"  Shape: {submission_rf.shape}")
print(f"  Columns: {list(submission_rf.columns)}")

# You can also try averaging both models (simple ensemble)
output_dir = Path("../outputs/predictions/Ensemble/")
output_dir.mkdir(parents=True, exist_ok=True)

test_proba_ensemble = (test_proba_logreg + test_proba_rf) / 2
submission_ensemble = pd.DataFrame({
    'icustay_id': test_ids,
    'prediction': test_proba_ensemble
})
submission_ensemble.to_csv(output_dir / f'moons_corneel_CML_2025_ensemble_{date_str}.csv', index=False)
print(f"✓ Ensemble submission saved: {output_dir / f'moons_corneel_CML_2025_ensemble_{date_str}.csv'}")
print(f"  Shape: {submission_ensemble.shape}")
print(f"  Columns: {list(submission_ensemble.columns)}")

print("\n" + "="*70)
print("PIPELINE COMPLETE!")
print("="*70)


DROPPING LEAKAGE/IRRELEVANT COLUMNS
Columns dropped: ['DISCHTIME', 'DEATHTIME', 'DOD', 'LOS', 'subject_id', 'hadm_id', 'icustay_id', 'ADMITTIME', 'Diff']
Train shape after dropping: (20885, 35)
Test shape after dropping: (5221, 34)

SEPARATING FEATURES AND TARGET
X shape: (20885, 34)
y shape: (20885,)
X_test shape: (5221, 34)

Target distribution:
HOSPITAL_EXPIRE_FLAG
0    0.887718
1    0.112282
Name: proportion, dtype: float64

IDENTIFYING FEATURE TYPES
Numeric features (24): ['HeartRate_Min', 'HeartRate_Max', 'HeartRate_Mean', 'SysBP_Min', 'SysBP_Max', 'SysBP_Mean', 'DiasBP_Min', 'DiasBP_Max', 'DiasBP_Mean', 'MeanBP_Min', 'MeanBP_Max', 'MeanBP_Mean', 'RespRate_Min', 'RespRate_Max', 'RespRate_Mean', 'TempC_Min', 'TempC_Max', 'TempC_Mean', 'SpO2_Min', 'SpO2_Max', 'SpO2_Mean', 'Glucose_Min', 'Glucose_Max', 'Glucose_Mean']
Categorical features (10): ['GENDER', 'DOB', 'ADMISSION_TYPE', 'INSURANCE', 'RELIGION', 'MARITAL_STATUS', 'ETHNICITY', 'DIAGNOSIS', 'ICD9_diagnosis', 'FIRST_CAREUNIT'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[numeric_cols_to_scale] = scaler.fit_transform(X[numeric_cols_to_scale])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test[numeric_cols_to_scale] = scaler.transform(X_test[numeric_cols_to_scale])


--- Random Forest ---

Models trained successfully!

MODEL EVALUATION ON TRAINING SET

--- Logistic Regression ---
Training Accuracy: 0.9332
Training ROC-AUC: 0.9898

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.93      0.96     18540
           1       0.63      0.99      0.77      2345

    accuracy                           0.93     20885
   macro avg       0.81      0.96      0.86     20885
weighted avg       0.96      0.93      0.94     20885


--- Random Forest ---
Training Accuracy: 0.7984
Training ROC-AUC: 0.8334

Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.82      0.88     18540
           1       0.31      0.66      0.42      2345

    accuracy                           0.80     20885
   macro avg       0.63      0.74      0.65     20885
weighted avg       0.88      0.80      0.83     20885


GENERATING TEST SET PREDICTIONS
Generated probabilities f