In [3]:
import pandas as pd
import numpy as np

pd.set_option('display.max_columns', None)


In [5]:
admissions = pd.read_csv('DATASET/ADMISSIONS.csv')
icustays = pd.read_csv('DATASET/ICUSTAYS.csv')
patients = pd.read_csv('DATASET/PATIENTS.csv')
diagnoses = pd.read_csv('DATASET/DIAGNOSES_ICD.csv')
transfers = pd.read_csv('DATASET/TRANSFERS.csv')

print(len(admissions), len(icustays), len(patients))


58976 61532 46520


In [10]:
import numpy as np
import pandas as pd

# Convert timestamps safely
admissions['ADMITTIME'] = pd.to_datetime(admissions['ADMITTIME'], errors='coerce')
patients['DOB'] = pd.to_datetime(patients['DOB'], errors='coerce')

adm_pat = pd.merge(admissions, patients, on='SUBJECT_ID', how='inner')

# Define a safe age computation function
def compute_age(row):
    admit = row['ADMITTIME']
    dob = row['DOB']
    # Skip invalid or missing timestamps
    if pd.isna(admit) or pd.isna(dob):
        return np.nan
    try:
        # Convert to Python datetime to avoid pandas overflow
        admit_dt = admit.to_pydatetime()
        dob_dt = dob.to_pydatetime()
        age = (admit_dt - dob_dt).days / 365.25
        # Exclude absurd values (negative or >120 years)
        if age < 0 or age > 120:
            return np.nan
        return age
    except (OverflowError, ValueError):
        return np.nan

# Apply function row-wise
adm_pat['AGE'] = adm_pat.apply(compute_age, axis=1)

# Replace anonymized or missing ages with 90 (MIMIC convention)
adm_pat['AGE'] = adm_pat['AGE'].fillna(90)


In [11]:
# Load ICU stays data
icustays = pd.read_csv('DATASET/ICUSTAYS.csv')

# Convert datetime columns
icustays['INTIME'] = pd.to_datetime(icustays['INTIME'], errors='coerce')
icustays['OUTTIME'] = pd.to_datetime(icustays['OUTTIME'], errors='coerce')

# Merge with adm_pat
adm_icustay = pd.merge(adm_pat, icustays, on=['SUBJECT_ID', 'HADM_ID'], how='inner')

# Compute ICU length of stay in hours
adm_icustay['LOS_HOURS'] = (adm_icustay['OUTTIME'] - adm_icustay['INTIME']).dt.total_seconds() / 3600

# Keep valid stays only
adm_icustay = adm_icustay[adm_icustay['LOS_HOURS'] > 0]


In [12]:
import numpy as np

# Load lab events (smaller than CHARTEVENTS)
labevents = pd.read_csv('DATASET/LABEVENTS.csv', usecols=['SUBJECT_ID', 'HADM_ID', 'ITEMID', 'CHARTTIME', 'VALUENUM'])

# Convert CHARTTIME
labevents['CHARTTIME'] = pd.to_datetime(labevents['CHARTTIME'], errors='coerce')

# Example: choose a few common lab tests by ITEMID
# (You can adjust ITEMIDs based on your dataset dictionary)
important_labs = {
    50868: 'HEMOGLOBIN',
    50912: 'LACTATE',
    50971: 'POTASSIUM',
    50983: 'SODIUM',
    51006: 'WBC',
    51221: 'GLUCOSE'
}

# Filter only those labs
labevents = labevents[labevents['ITEMID'].isin(important_labs.keys())]
labevents['LABEL'] = labevents['ITEMID'].map(important_labs)

# Compute mean lab value per admission
lab_summary = (
    labevents.groupby(['SUBJECT_ID', 'HADM_ID', 'LABEL'])['VALUENUM']
    .mean()
    .reset_index()
    .pivot(index=['SUBJECT_ID', 'HADM_ID'], columns='LABEL', values='VALUENUM')
    .reset_index()
)

# Merge labs with main cohort
cohort = pd.merge(adm_icustay, lab_summary, on=['SUBJECT_ID', 'HADM_ID'], how='left')

# Fill missing labs with median values
cohort.fillna(cohort.median(numeric_only=True), inplace=True)

print("✅ Cohort ready with vitals and labs.")
print("Shape:", cohort.shape)
cohort.head()


✅ Cohort ready with vitals and labs.
Shape: (283, 44)


Unnamed: 0,ROW_ID_x,SUBJECT_ID,HADM_ID,ADMITTIME,DISCHTIME,DEATHTIME,ADMISSION_TYPE,ADMISSION_LOCATION,DISCHARGE_LOCATION,INSURANCE,LANGUAGE,RELIGION,MARITAL_STATUS,ETHNICITY,EDREGTIME,EDOUTTIME,DIAGNOSIS,HOSPITAL_EXPIRE_FLAG,HAS_CHARTEVENTS_DATA,ROW_ID_y,GENDER,DOB,DOD,DOD_HOSP,DOD_SSN,EXPIRE_FLAG,AGE,ROW_ID,ICUSTAY_ID,DBSOURCE,FIRST_CAREUNIT,LAST_CAREUNIT,FIRST_WARDID,LAST_WARDID,INTIME,OUTTIME,LOS,LOS_HOURS,GLUCOSE,HEMOGLOBIN,LACTATE,POTASSIUM,SODIUM,WBC
0,997,806,149888,2103-01-19 18:19:00,2103-02-03 18:25:00,,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,SNF,Medicare,,NOT SPECIFIED,,WHITE,,,S/P MI;CHEST PAIN;AORTIC STENOSIS\? CATH,0,1,760,M,2019-12-19,2103-04-15 00:00:00,,2103-04-15 00:00:00,1,83.08282,1031,239074,carevue,CCU,CSRU,57,14,2103-01-25 13:38:58,2103-01-29 11:03:13,3.8918,93.404167,30.386364,16.4,2.6,4.468421,139.6,49.529412
1,1464,1182,117380,2107-01-23 22:11:00,2107-02-01 09:50:00,2107-02-01 09:50:00,EMERGENCY,EMERGENCY ROOM ADMIT,DEAD/EXPIRED,Medicare,,NOT SPECIFIED,,WHITE,2107-01-23 17:56:00,2107-01-23 23:24:00,ALTERED MENTAL STATUS,1,1,1122,M,2018-10-18,2107-02-01 00:00:00,2107-02-01 00:00:00,2107-02-01 00:00:00,1,88.262834,1524,278277,carevue,MICU,MICU,23,23,2107-01-23 22:12:16,2107-01-28 02:23:23,4.1744,100.185278,34.685714,14.5,0.8,3.616667,136.833333,24.333333
2,1315,1059,198502,2106-02-03 16:24:00,2106-02-10 07:00:00,2106-02-10 07:00:00,EMERGENCY,EMERGENCY ROOM ADMIT,DEAD/EXPIRED,Medicare,,JEWISH,MARRIED,WHITE,2106-02-03 07:35:00,2106-02-03 17:58:00,COLONIC MASS,1,1,1003,F,2017-08-13,2106-02-10 00:00:00,2106-02-10 00:00:00,2106-02-10 00:00:00,1,88.473648,1374,290225,carevue,CCU,CCU,7,7,2106-02-06 14:37:16,2106-02-08 13:47:29,1.9654,47.170278,30.085714,20.428571,2.2,3.828571,133.857143,40.285714
3,2679,2213,180694,2101-12-18 08:31:00,2102-01-06 13:55:00,,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,SNF,Medicare,,CATHOLIC,MARRIED,WHITE,,,ACUTE MYOCARDIAL INFARCTION,0,1,2095,M,2019-07-26,2103-05-29 00:00:00,2103-05-29 00:00:00,2103-05-29 00:00:00,1,82.395619,2793,224736,carevue,CCU,CCU,57,57,2101-12-18 08:32:25,2101-12-31 09:59:20,13.0604,313.448611,31.454286,13.413793,1.12,3.942424,139.517241,30.5
4,2679,2213,180694,2101-12-18 08:31:00,2102-01-06 13:55:00,,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,SNF,Medicare,,CATHOLIC,MARRIED,WHITE,,,ACUTE MYOCARDIAL INFARCTION,0,1,2095,M,2019-07-26,2103-05-29 00:00:00,2103-05-29 00:00:00,2103-05-29 00:00:00,1,82.395619,2794,219825,carevue,CCU,CCU,57,57,2102-01-01 12:15:50,2102-01-03 20:41:22,2.3511,56.425556,31.454286,13.413793,1.12,3.942424,139.517241,30.5


In [13]:
# Create outcome label (mortality)
cohort['MORTALITY'] = cohort['HOSPITAL_EXPIRE_FLAG']

# Optional: If you want to focus on ICU mortality instead of hospital mortality
cohort['ICU_MORTALITY'] = np.where(
    (cohort['DEATHTIME'].notnull()) &
    (cohort['DEATHTIME'] >= cohort['INTIME']) &
    (cohort['DEATHTIME'] <= cohort['OUTTIME']),
    1, 0
)

# Drop redundant timestamp columns if you want a clean ML-ready dataset
cohort_final = cohort[[
    'SUBJECT_ID', 'HADM_ID', 'ICUSTAY_ID', 'AGE', 'GENDER', 'ETHNICITY',
    'ADMISSION_TYPE', 'FIRST_CAREUNIT', 'LOS_HOURS',
    'GLUCOSE', 'HEMOGLOBIN', 'LACTATE', 'POTASSIUM', 'SODIUM', 'WBC',
    'MORTALITY', 'ICU_MORTALITY'
]]

print("✅ Final cohort ready for ML/DL modeling.")
print("Shape:", cohort_final.shape)
cohort_final.head()


✅ Final cohort ready for ML/DL modeling.
Shape: (283, 17)


Unnamed: 0,SUBJECT_ID,HADM_ID,ICUSTAY_ID,AGE,GENDER,ETHNICITY,ADMISSION_TYPE,FIRST_CAREUNIT,LOS_HOURS,GLUCOSE,HEMOGLOBIN,LACTATE,POTASSIUM,SODIUM,WBC,MORTALITY,ICU_MORTALITY
0,806,149888,239074,83.08282,M,WHITE,EMERGENCY,CCU,93.404167,30.386364,16.4,2.6,4.468421,139.6,49.529412,0,0
1,1182,117380,278277,88.262834,M,WHITE,EMERGENCY,MICU,100.185278,34.685714,14.5,0.8,3.616667,136.833333,24.333333,1,0
2,1059,198502,290225,88.473648,F,WHITE,EMERGENCY,CCU,47.170278,30.085714,20.428571,2.2,3.828571,133.857143,40.285714,1,0
3,2213,180694,224736,82.395619,M,WHITE,EMERGENCY,CCU,313.448611,31.454286,13.413793,1.12,3.942424,139.517241,30.5,0,0
4,2213,180694,219825,82.395619,M,WHITE,EMERGENCY,CCU,56.425556,31.454286,13.413793,1.12,3.942424,139.517241,30.5,0,0


In [14]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# ----- Prepare data -----
data = cohort_final.copy()

# Encode categorical columns
categorical_cols = ['GENDER', 'ETHNICITY', 'ADMISSION_TYPE', 'FIRST_CAREUNIT']
for col in categorical_cols:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])

# Define features (X) and target (y)
X = data.drop(['SUBJECT_ID', 'HADM_ID', 'ICUSTAY_ID', 'MORTALITY', 'ICU_MORTALITY'], axis=1)
y = data['MORTALITY']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Scale numerical features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# ----- Train logistic regression model -----
model = LogisticRegression(max_iter=500)
model.fit(X_train_scaled, y_train)

# ----- Predictions -----
y_pred = model.predict(X_test_scaled)

# ----- Evaluation -----
print("✅ Model trained successfully.\n")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


✅ Model trained successfully.

Accuracy: 0.8245614035087719

Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.98      0.90        45
           1       0.75      0.25      0.38        12

    accuracy                           0.82        57
   macro avg       0.79      0.61      0.64        57
weighted avg       0.81      0.82      0.79        57


Confusion Matrix:
 [[44  1]
 [ 9  3]]


In [15]:
from sklearn.ensemble import RandomForestClassifier

# Train Random Forest with class weights
rf_model = RandomForestClassifier(
    n_estimators=200,
    max_depth=None,
    random_state=42,
    class_weight='balanced'
)

rf_model.fit(X_train_scaled, y_train)
y_pred_rf = rf_model.predict(X_test_scaled)

print("✅ Random Forest Model trained successfully.\n")
print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print("\nClassification Report:\n", classification_report(y_test, y_pred_rf))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))


✅ Random Forest Model trained successfully.

Accuracy: 0.7894736842105263

Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.96      0.88        45
           1       0.50      0.17      0.25        12

    accuracy                           0.79        57
   macro avg       0.66      0.56      0.56        57
weighted avg       0.75      0.79      0.75        57


Confusion Matrix:
 [[43  2]
 [10  2]]
