# Mortality Prediction – Imputation & Class‑Weighted Models (No SMOTE)

This notebook builds Logistic Regression, Random Forest, XGBoost, and SVM classifiers.

Key points:
* **Missing values** imputed (median for numeric, mode for categorical)
* **Class imbalance** handled via `class_weight='balanced'` (or `scale_pos_weight` for XGBoost)
* **GridSearchCV** to optimise hyper‑parameters
* Evaluation metric: **ROC AUC**

In [None]:
%pip install xgboost



In [2]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import xgboost as xgb
import joblib


## 1. Load & Clean Data

In [10]:

# Load dataset
df = pd.read_csv('data/HDHI Admission data.csv')
df.columns = df.columns.str.strip().str.upper()

# Binary outcome: 1 = death (EXPIRY)
df['OUTCOME_BINARY'] = df['OUTCOME'].map({'DISCHARGE': 0, 'DAMA': 0, 'EXPIRY': 1})

# Admission‑time features
features = [
    'AGE', 'GENDER', 'RURAL', 'TYPE OF ADMISSION-EMERGENCY/OPD',
    'DM', 'HTN', 'CAD', 'CKD', 'SMOKING', 'ALCOHOL', 'PRIOR CMP',
    'HB', 'TLC', 'GLUCOSE', 'UREA', 'CREATININE', 'BNP',
    'ACS', 'STEMI', 'ATYPICAL CHEST PAIN', 'HEART FAILURE', 'VALVULAR',
    'CHB', 'AKI', 'CVA INFRACT', 'AF', 'SHOCK', 'CHEST INFECTION'
]

X = df[features].copy()
y = df['OUTCOME_BINARY']

# Standardise predictor names
X.columns = X.columns.str.strip().str.replace(' ', '_')

# Identify column types
cat_cols = ['GENDER', 'RURAL', 'TYPE_OF_ADMISSION-EMERGENCY/OPD']
num_cols = [c for c in X.columns if c not in cat_cols]

# Clean invalid strings
X = X.replace(['EMPTY', 'NA', 'NaN', '--', '\\'], np.nan)


## 2. Preprocessing Pipeline with Imputation

In [11]:

# ColumnTransformer with imputation + scaling / one‑hot
numeric_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(drop='first', handle_unknown='ignore'))
])

preprocessor = ColumnTransformer([
    ('num', numeric_pipeline, num_cols),
    ('cat', categorical_pipeline, cat_cols)
])


## 3. Train–Test Split

In [12]:

# After imputation, we can leave remaining NaNs to be handled by transformers
# (SimpleImputer will be applied inside CV folds).

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y,
                                                    test_size=0.2, random_state=42)

# Class ratio for XGBoost scale_pos_weight
pos_weight = (y_train == 0).sum() / (y_train == 1).sum()
print('Scale_pos_weight for XGBoost:', round(pos_weight, 2))


Scale_pos_weight for XGBoost: 13.26


## 4. Model Definitions & Hyper‑parameter Grids

In [13]:

models = {
    'logreg': (
        LogisticRegression(max_iter=1000, class_weight='balanced', solver='liblinear'),
        {'clf__C': [0.1, 1, 10]}
    ),
    'rf': (
        RandomForestClassifier(class_weight='balanced', random_state=42),
        {'clf__n_estimators': [200, 400],
         'clf__max_depth': [None, 10, 20]}
    ),
    'xgb': (
        xgb.XGBClassifier(use_label_encoder=False,
                          eval_metric='logloss',
                          scale_pos_weight=pos_weight,
                          random_state=42),
        {'clf__n_estimators': [200, 400],
         'clf__max_depth': [3, 5],
         'clf__learning_rate': [0.05, 0.1]}
    ),
    'svm': (
        SVC(probability=True, class_weight='balanced'),
        {'clf__C': [0.5, 1], 'clf__kernel': ['linear', 'rbf']}
    )
}


## 5. GridSearchCV & Model Training

In [14]:

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
results = []

for name, (clf, grid) in models.items():
    pipe = Pipeline([
        ('pre', preprocessor),
        ('clf', clf)
    ])
    gs = GridSearchCV(pipe, grid, cv=skf, scoring='roc_auc',
                      n_jobs=-1, verbose=1)
    gs.fit(X_train, y_train)

    best_model = gs.best_estimator_
    y_pred_proba = best_model.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, y_pred_proba)
    results.append({'model': name, 'auc': auc, 'best_params': gs.best_params_})

    joblib.dump(best_model, f'best_model_{name}.pkl')

results_df = pd.DataFrame(results).sort_values(by='auc', ascending=False)
results_df


Fitting 5 folds for each of 3 candidates, totalling 15 fits
Fitting 5 folds for each of 6 candidates, totalling 30 fits
Fitting 5 folds for each of 8 candidates, totalling 40 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Fitting 5 folds for each of 4 candidates, totalling 20 fits


Unnamed: 0,model,auc,best_params
2,xgb,0.952977,"{'clf__learning_rate': 0.1, 'clf__max_depth': ..."
1,rf,0.952543,"{'clf__max_depth': 20, 'clf__n_estimators': 400}"
3,svm,0.91767,"{'clf__C': 0.5, 'clf__kernel': 'rbf'}"
0,logreg,0.911477,{'clf__C': 0.1}
