In [1]:
import datetime
import pandas as pd

## 1. Data Preprocessing

### 1.1. Diagnoses

In [2]:
# Load MIMIC-III diagnoses table
diagnoses = pd.read_csv("mimic/DIAGNOSES_ICD.csv")
diagnoses.columns = diagnoses.columns.str.lower()
diagnoses.head()

Unnamed: 0,row_id,subject_id,hadm_id,seq_num,icd9_code
0,1297,109,172335,1.0,40301
1,1298,109,172335,2.0,486
2,1299,109,172335,3.0,58281
3,1300,109,172335,4.0,5855
4,1301,109,172335,5.0,4254


In [3]:
# Mapping ICD-9 major codes to broader disease categories
ICD9_CATEGORY_MAP = {
    "250": "Diabetes",
    "401": "Hypertension",
    "410": "Myocardial Infarction",
    "414": "Ischemic Heart Disease",
    "493": "Asthma",
    "518": "Respiratory Failure",
    "585": "Chronic Kidney Disease",
    "V58": "Long-term Medication Use",
    "V12": "Personal History of Disease",
}

In [4]:
# Function to extract the ICD-9 category
def get_icd9_category(icd_code):
    prefix = icd_code.split('.')[0]  # Extract the first 3 digits
    return ICD9_CATEGORY_MAP.get(prefix, "Other")  # Default to "Other" if not found

# Apply mapping to create a new column for disease categories
diagnoses["disease_category"] = diagnoses["icd9_code"].astype(str).apply(get_icd9_category)

# Choose columns
diagnoses = diagnoses[["subject_id", "hadm_id", "icd9_code", "disease_category"]]

# Save processed data
diagnoses.to_csv("preprocessed/processed_diagnoses.csv", index=False)

# Display sample output
diagnoses[["subject_id", "icd9_code", "disease_category"]].head()

Unnamed: 0,subject_id,icd9_code,disease_category
0,109,40301,Other
1,109,486,Other
2,109,58281,Other
3,109,5855,Other
4,109,4254,Other


### 1.2. Prescriptions

In [5]:
# Load MIMIC-III prescriptions table
prescriptions = pd.read_csv("mimic/PRESCRIPTIONS.csv")
prescriptions.columns = prescriptions.columns.str.lower()
prescriptions.head()

  prescriptions = pd.read_csv("mimic/PRESCRIPTIONS.csv")


Unnamed: 0,row_id,subject_id,hadm_id,icustay_id,startdate,enddate,drug_type,drug,drug_name_poe,drug_name_generic,formulary_drug_cd,gsn,ndc,prod_strength,dose_val_rx,dose_unit_rx,form_val_disp,form_unit_disp,route
0,2214776,6,107064,,2175-06-11 00:00:00,2175-06-12 00:00:00,MAIN,Tacrolimus,Tacrolimus,Tacrolimus,TACR1,21796.0,469061711.0,1mg Capsule,2,mg,2,CAP,PO
1,2214775,6,107064,,2175-06-11 00:00:00,2175-06-12 00:00:00,MAIN,Warfarin,Warfarin,Warfarin,WARF5,6562.0,56017275.0,5mg Tablet,5,mg,1,TAB,PO
2,2215524,6,107064,,2175-06-11 00:00:00,2175-06-12 00:00:00,MAIN,Heparin Sodium,,,HEPAPREMIX,6522.0,338055002.0,"25,000 unit Premix Bag",25000,UNIT,1,BAG,IV
3,2216265,6,107064,,2175-06-11 00:00:00,2175-06-12 00:00:00,BASE,D5W,,,HEPBASE,,0.0,HEPARIN BASE,250,ml,250,ml,IV
4,2214773,6,107064,,2175-06-11 00:00:00,2175-06-12 00:00:00,MAIN,Furosemide,Furosemide,Furosemide,FURO20,8208.0,54829725.0,20mg Tablet,20,mg,1,TAB,PO


In [6]:
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np

# Load PubMedBERT tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract")
model = AutoModel.from_pretrained("microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract")

def get_pubmedbert_embedding(drug_name):
    """Generate embedding using PubMedBERT for a given drug name."""
    inputs = tokenizer(drug_name, return_tensors="pt", truncation=True, padding=True, max_length=50)
    with torch.no_grad():
        output = model(**inputs)
    return output.last_hidden_state.mean(dim=1).squeeze().numpy()

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [7]:
# Extract unique medication names
medications = prescriptions["drug"].dropna().unique()

# Generate PubMedBERT embeddings for each medication
medication_embeddings = {drug: get_pubmedbert_embedding(drug) for drug in medications}

# Convert embeddings into a DataFrame
medication_df = pd.DataFrame.from_dict(medication_embeddings, orient="index")
medication_df.reset_index(inplace=True)
medication_df.columns = ["drug"] + [f"dim_{i}" for i in range(medication_df.shape[1] - 1)]

# Save medication embeddings
medication_df.to_csv("preprocessed/medication_embeddings.csv", index=False)

print("Generated PubMedBERT embeddings for medications!")

Generated PubMedBERT embeddings for medications!


In [8]:
# Choose columns
prescriptions = prescriptions[['subject_id', 'hadm_id', 'drug', 'startdate', 'enddate']]

# Merge embeddings into the prescriptions dataset
prescriptions = prescriptions.merge(medication_df, on="drug", how="left")

# Save the updated prescriptions data with embeddings
prescriptions.to_csv("preprocessed/processed_prescriptions.csv", index=False)

# Display sample results
prescriptions[["subject_id", "hadm_id", "drug", "startdate", "enddate"] + [f"dim_{i}" for i in range(5)]].head()

KeyboardInterrupt: 

### 1.3. Admissions

In [None]:
# Load MIMIC-III ADMISSIONS table
admissions = pd.read_csv("mimic/ADMISSIONS.csv")
admissions.columns = admissions.columns.str.lower()
admissions.head()

In [None]:
# Select the first ethnicity recorded for each patient
patient_ethnicity = admissions.groupby("subject_id")["ethnicity"].first().reset_index()

# Encode ethnicity as numerical labels
patient_ethnicity["ethnicity"], ethnicity_mapping = pd.factorize(patient_ethnicity["ethnicity"])

# Display sample mapping
print("Ethnicity Mapping:", dict(enumerate(ethnicity_mapping)))
patient_ethnicity.head()

In [None]:
# Convert datetime columns
admissions["admittime"] = pd.to_datetime(admissions["admittime"])
admissions["dischtime"] = pd.to_datetime(admissions["dischtime"])

# Calculate length of hospital stay
admissions["length_of_stay"] = (admissions["dischtime"] - admissions["admittime"]).dt.days

admissions = admissions[['subject_id', 'hadm_id', 'admittime', 'dischtime', 'length_of_stay', 'admission_type']]
admissions.to_csv("preprocessed/processed_admissions.csv", index=False)
# Display updated admissions data
admissions.head()

### 1.4. Lab Events

In [None]:
# Load LABEVENTS table
labevents = pd.read_csv("mimic/LABEVENTS.csv")
labevents.columns = labevents.columns.str.lower()
labevents.head()

#### Fill nan in hadm_id with the patient's first hospital admission

In [None]:
# Get first hospital admission for each patient
first_admission = admissions.groupby("subject_id")["hadm_id"].first().reset_index()

# Merge first admission ID into lab events
labevents = labevents.merge(first_admission, on="subject_id", how="left", suffixes=("", "_first"))

# Fill NaN hadm_id with first known hospital admission
labevents["hadm_id"] = labevents["hadm_id"].fillna(labevents["hadm_id_first"])

# Drop temporary column
labevents = labevents.drop(columns=["hadm_id_first"])

In [None]:
# Convert datetime column
labevents["charttime"] = pd.to_datetime(labevents["charttime"])

# Drop missing values in lab results
labevents = labevents.dropna(subset=["valuenum"])

# Normalize lab values (Min-Max Scaling)
labevents["valuenum_normalized"] = (
    (labevents["valuenum"] - labevents["valuenum"].min()) / 
    (labevents["valuenum"].max() - labevents["valuenum"].min())
)

labevents = labevents[['subject_id', 'hadm_id', 'itemid', 'charttime', 'valuenum_normalized']]
labevents.to_csv("preprocessed/processed_labevents.csv", index=False)

# Display sample normalized lab results
labevents.head()

### 1.5. Procedures

In [None]:
# Load PROCEDURES_ICD table
procedures = pd.read_csv("mimic/PROCEDURES_ICD.csv")
procedures.columns = procedures.columns.str.lower()
procedures.head()

In [None]:
# Define ICD-9 Procedure Category Mapping
ICD9_PROCEDURE_MAP = {
    "36": "Cardiac Procedures",
    "81": "Joint Procedures",
    "96": "Respiratory Support",
    "99": "Other Procedures",
}

# Function to categorize ICD-9 procedure codes
def get_icd9_procedure_category(icd_code):
    prefix = icd_code.split('.')[0]  # Extract first 2 digits
    return ICD9_PROCEDURE_MAP.get(prefix, "Other")

# Apply mapping
procedures["procedure_category"] = procedures["icd9_code"].astype(str).apply(get_icd9_procedure_category)

# Choose columns
procedures = procedures[['subject_id', 'hadm_id', 'icd9_code', 'procedure_category']]
procedures.to_csv("preprocessed/processed_procedures.csv", index=False)

# Display sample output
procedures.head()

### 1.6. Patients

In [None]:
# Load MIMIC-III PATIENTS table
patients = pd.read_csv("mimic/PATIENTS.csv")
patients.columns = patients.columns.str.lower()
patients.head()

In [None]:
# Get first admission year for each patient
first_admission = admissions.groupby("subject_id")["admittime"].min().reset_index()
first_admission["admission_year"] = first_admission["admittime"].dt.year

# Merge admission year with patient data
patients = patients.merge(first_admission[["subject_id", "admission_year"]], on="subject_id", how="left")

# Convert DOB to Age
patients["dob"] = pd.to_datetime(patients["dob"])  # Convert to datetime
patients["age"] = patients["admission_year"] - patients["dob"].dt.year

In [None]:
# Convert gender to binary (M=1, F=0)
patients["gender"] = patients["gender"].apply(lambda x: 1 if x == "M" else 0)

# Merge Ethnicity from Admissions
patients = patients.merge(patient_ethnicity, on="subject_id", how="left")

patients = patients[['subject_id', 'age', 'gender', 'ethnicity']]
patients.to_csv("preprocessed/processed_patients.csv")

# Display sample output
patients.head()