## Training language models on MIMIC-IV dataset for Sepsis classification

### Models to Train (3 BERTs) 

In [2]:
bert_model_names = [
    "emilyalsentzer/Bio_ClinicalBERT",   # Clinical notes (best for MIMIC)
    "dmis-lab/biobert-base-cased-v1.2",  # Biomedical literature
    "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract"
]

### Dataset & Dataloader 

In [3]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer

In [4]:
class SepsisTextDataset(Dataset):
    def __init__(self, csv_path, tokenizer, max_len=256):
        self.df = pd.read_csv(csv_path)
        self.texts = self.df["text"].astype(str).tolist()
        self.labels = self.df["sepsis"].astype(int).tolist()
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        enc = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt"
        )

        return {
            "input_ids": enc["input_ids"].squeeze(0),
            "attention_mask": enc["attention_mask"].squeeze(0),
            "labels": torch.tensor(self.labels[idx], dtype=torch.long)
        }


### Training Function

In [8]:
from transformers import AutoModelForSequenceClassification
from torch.optim import AdamW
from tqdm import tqdm

def train_bert_model(
    model_name,
    train_csv,
    val_csv,
    device,
    epochs=3,
    batch_size=8,
    lr=2e-5
):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name, num_labels=2
    ).to(device)

    train_ds = SepsisTextDataset(train_csv, tokenizer)
    val_ds   = SepsisTextDataset(val_csv, tokenizer)

    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
    val_loader   = DataLoader(val_ds, batch_size=batch_size)

    optimizer = AdamW(model.parameters(), lr=lr)

    for epoch in range(epochs):
        model.train()
        total_loss = 0

        for batch in tqdm(train_loader, desc=f"{model_name} | Epoch {epoch+1}"):
            optimizer.zero_grad()

            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss

            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        print(f"Epoch {epoch+1} | Train Loss: {total_loss / len(train_loader):.4f}")

    return model, tokenizer


In [11]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

trained_text_models = []   # for DES
trained_tokenizers = []

for model_name in bert_model_names:
    model, tokenizer = train_bert_model(
        model_name,
        train_csv="data/train.csv",
        val_csv="data/dsel.csv",
        device=device
    )

    trained_text_models.append(model)
    trained_tokenizers.append(tokenizer)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
emilyalsentzer/Bio_ClinicalBERT | Epoch 1: 100%|████████████| 714/714 [01:15<00:00,  9.50it/s]


Epoch 1 | Train Loss: 0.6232


emilyalsentzer/Bio_ClinicalBERT | Epoch 2: 100%|████████████| 714/714 [01:15<00:00,  9.46it/s]


Epoch 2 | Train Loss: 0.5364


emilyalsentzer/Bio_ClinicalBERT | Epoch 3: 100%|████████████| 714/714 [01:15<00:00,  9.45it/s]


Epoch 3 | Train Loss: 0.4663


config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dmis-lab/biobert-base-cased-v1.2 | Epoch 1:   2%|▏           | 12/714 [00:01<01:14,  9.45it/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

dmis-lab/biobert-base-cased-v1.2 | Epoch 1: 100%|███████████| 714/714 [01:15<00:00,  9.46it/s]


Epoch 1 | Train Loss: 0.6674


dmis-lab/biobert-base-cased-v1.2 | Epoch 2: 100%|███████████| 714/714 [01:15<00:00,  9.46it/s]


Epoch 2 | Train Loss: 0.5884


dmis-lab/biobert-base-cased-v1.2 | Epoch 3: 100%|███████████| 714/714 [01:15<00:00,  9.44it/s]


Epoch 3 | Train Loss: 0.5421


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract | Epoch 1: 100%|█| 714/714 [01:14<00:00, 


Epoch 1 | Train Loss: 0.6284


microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract | Epoch 2: 100%|█| 714/714 [01:15<00:00, 


Epoch 2 | Train Loss: 0.5367


microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract | Epoch 3: 100%|█| 714/714 [01:15<00:00, 

Epoch 3 | Train Loss: 0.4834





### Evaluate

In [20]:
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, balanced_accuracy_score
import numpy as np

In [21]:
def get_test_loader(csv_path, tokenizer, batch_size=16):
    test_ds = SepsisTextDataset(csv_path, tokenizer)
    return DataLoader(test_ds, batch_size=batch_size, shuffle=False)


In [22]:
@torch.no_grad()
def evaluate_bert(model, test_loader, device):
    model.eval()

    all_preds = []
    all_probs = []
    all_labels = []

    for batch in test_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)

        logits = outputs.logits
        probs = torch.softmax(logits, dim=1)[:, 1]  # sepsis prob
        preds = torch.argmax(logits, dim=1)

        all_preds.extend(preds.cpu().numpy())
        all_probs.extend(probs.cpu().numpy())
        all_labels.extend(batch["labels"].cpu().numpy())

    return {
        "Accuracy": accuracy_score(all_labels, all_preds),
        "Balanced Accuracy": balanced_accuracy_score(all_labels, all_preds),
        "F1": f1_score(all_labels, all_preds),
        "AUROC": roc_auc_score(all_labels, all_probs)
    }


In [23]:
test_results = {}

for model, tokenizer, model_name in zip(
    trained_text_models,
    trained_tokenizers,
    ["ClinicalBERT", "BioBERT", "PubMedBERT"]
):
    test_loader = get_test_loader("data/test.csv", tokenizer)
    test_results[model_name] = evaluate_bert(model, test_loader, device)

In [24]:
df_results = pd.DataFrame(test_results).T
df_results

Unnamed: 0,Accuracy,Balanced Accuracy,F1,AUROC
ClinicalBERT,0.711601,0.711601,0.737546,0.786129
BioBERT,0.68219,0.68219,0.714181,0.765507
PubMedBERT,0.699346,0.699346,0.687606,0.779476


### Save the models 

In [28]:
import os
from transformers import AutoTokenizer

save_root = "saved_text_models"

model_save_map = {
    "clinicalbert_sepsis": "emilyalsentzer/Bio_ClinicalBERT",
    "biobert_sepsis": "dmis-lab/biobert-base-cased-v1.2",
    "pubmedbert_sepsis": "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract"
}

os.makedirs(save_root, exist_ok=True)

for (save_name, hf_name), model in zip(model_save_map.items(), trained_text_models):
    save_path = os.path.join(save_root, save_name)
    os.makedirs(save_path, exist_ok=True)

    # Load tokenizer from original HF model
    tokenizer = AutoTokenizer.from_pretrained(hf_name)

    # Save BOTH model and tokenizer
    model.save_pretrained(save_path)
    tokenizer.save_pretrained(save_path)

    print(f"Saved model + tokenizer → {save_path}")


Saved model + tokenizer → saved_text_models/clinicalbert_sepsis
Saved model + tokenizer → saved_text_models/biobert_sepsis
Saved model + tokenizer → saved_text_models/pubmedbert_sepsis


In [26]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    "emilyalsentzer/Bio_ClinicalBERT",
    use_fast=False
)

In [27]:
tokenizer.save_pretrained("saved_text_models/clinicalbert_sepsis")


('saved_text_models/clinicalbert_sepsis/tokenizer_config.json',
 'saved_text_models/clinicalbert_sepsis/special_tokens_map.json',
 'saved_text_models/clinicalbert_sepsis/vocab.txt',
 'saved_text_models/clinicalbert_sepsis/added_tokens.json')