In [1]:
import torch
import pandas as pd
import numpy as np
import re
print("CUDA available:", torch.cuda.is_available())
print("Device name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU")

CUDA available: True
Device name: Tesla T4


In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
# Report labels for the CheXpert dataset
report_label = [
    'Atelectasis',
    'Cardiomegaly',
    'Consolidation',
    'Edema',
    'Enlarged Cardiomediastinum',
    'Fracture',
    'Lung Lesion',
    'Lung Opacity',
    'Pleural Effusion',
    'Pneumonia',
    'Pneumothorax',
    'Pleural Other',
    'Support Devices',
    'No Finding'
    ]


# use processed/clean text training data

# model training

Task 2: convert free-text radiology reports into 14 structured binary labels (presence/absence of specific conditions)<br>

Workflow:<br>

1. train a text-based classifier on text_train_data.h5<br>
2. validate its performance on text_val_data.h5<br>
3. run inference on text_test_data.h5 to produce label predictions<br>

#### h5py datasets

### use .csv datasets

In [4]:
# load .csv dataset
train_df = pd.read_csv("text_train_data.csv")
val_df = pd.read_csv("text_val_data.csv")
test_df = pd.read_csv("text_test_data.csv")

## processing

Cased model: BioBERT learns upper and lower cases differently --> keep original casing<br>
To avoid truncation and potential loss of meaning in BERT-based models, apply sentence splitting<br>

In [7]:
# define preprocessing functions
def clean_report(report):
    report = re.sub(r'_{2,}', '<UNK>', report)
    report = re.sub(r'\s+', ' ', report).strip()
    return report

def simple_sentence_split(text):
    return [s.strip() for s in re.split(r'(?<=[.?!])\s+(?=[A-Z0-9])', text) if s.strip()]

def join_sentences(sentences, max_sentences=6):
    return ' '.join(sentences[:max_sentences])

def preprocess(df):
    cleaned = df["text"].apply(clean_report)
    split = cleaned.apply(simple_sentence_split)
    joined = split.apply(join_sentences)
    return joined.tolist(), df["file_name"].tolist(), df.iloc[:, 2:].values.astype('float32')  # labels are all columns from 3rd onward

# Assign to original variable names
texts, patient_ids, labels = preprocess(train_df)
texts_val, patient_ids_val, labels_val = preprocess(val_df)
texts_test, patient_ids_test, labels_test = preprocess(test_df)

## sentence splitting
From what I understand, tokenization has size limit<br>
To avoid truncation and potential loss of meaning in BERT-based models, I want to try sentence splitting<br>

## define & train models

Define tokeniser

### updated for .csv datasets

define tokenizer for t

In [9]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-base-cased-v1.1")

tokenized_train = tokenizer(
    texts,
    padding='max_length',
    truncation=True,
    max_length=256,
    return_tensors='pt'
)

tokenized_val = tokenizer(
    texts_val,
    padding='max_length',
    truncation=True,
    max_length=256,
    return_tensors='pt'
)

tokenized_test = tokenizer(
    texts_test,
    padding='max_length',
    truncation=True,
    max_length=256,
    return_tensors='pt'
)

# Extract tensors
input_ids = tokenized_train['input_ids']
attention_mask = tokenized_train['attention_mask']

In [10]:
# use pre-trained BioBERT model

from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    "dmis-lab/biobert-base-cased-v1.1",
    num_labels=13,
    problem_type="multi_label_classification"
)

pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


define Dataset functions

In [11]:
from torch.utils.data import Dataset
import torch

class RadiologyReportDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        encoding = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(label, dtype=torch.float32)
        }

In [12]:
from torch.utils.data import DataLoader
# set batch size
batch_size = 8

# create training dataset
# (the same as the original dataset, as test/train was already split)
train_dataset = RadiologyReportDataset(texts, labels, tokenizer)

# create data loader
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)



In [13]:
# sanity test

for batch in train_loader:
    print(batch['input_ids'].shape)   # should be [batch_size, 512]
    print(batch['labels'].shape)      # should be [batch_size, 13]
    break

torch.Size([8, 512])
torch.Size([8, 13])


In [15]:
# GPU sanity check
import torch

print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("Using GPU:", torch.cuda.get_device_name(0))
    print("GPU memory usage:")
    print("Allocated:", round(torch.cuda.memory_allocated(0)/1024**2, 1), "MB")
    print("Cached:   ", round(torch.cuda.memory_reserved(0)/1024**2, 1), "MB")
else:
    print("Running on CPU")

CUDA available: True
Using GPU: Tesla T4
GPU memory usage:
Allocated: 0.0 MB
Cached:    0.0 MB


### establish baseline
batch_size = 8 <br>
epoch = 1 <br>
max_length = 300 <br>
mask NaN --> label -1

In [None]:

import torch
from torch.optim import AdamW
from sklearn.metrics import f1_score

# mask NaN labels (the missing values in medical reports may contain uncertainty)
labels = np.nan_to_num(labels, nan=-1.0)

model.to(device)
model.train()
total_loss = 0

learning_rate = 2e-4
optimizer = AdamW(model.parameters(), lr=learning_rate)
loss_fn = torch.nn.BCEWithLogitsLoss(reduction='none')

for batch in train_loader:
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)

    optimizer.zero_grad()
    outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    logits = outputs.logits

    # Masked BCEWithLogitsLoss
    loss = loss_fn(logits, labels)
    mask = (labels != -1).float()
    masked_loss = (loss * mask).sum() / mask.sum()

    masked_loss.backward()
    optimizer.step()

    total_loss += masked_loss.item()

avg_train_loss = total_loss / len(train_loader)
print(f"epoch 1 complete — average train loss: {avg_train_loss:.4f}")

In [None]:
# check if max_length can be set to 300 without information loss
# token lengths for each sample
token_lengths = [len(tokenizer.encode(text, truncation=False)) for text in joined_texts]

# summary stats
print("Max token length:", max(token_lengths))
print("95th percentile:", np.percentile(token_lengths, 95))
print("Average token length:", np.mean(token_lengths))

In [None]:
# save model result
torch.save(model.state_dict(), "biobert_epoch1.pt")

In [None]:
from google.colab import files
files.download("biobert_epoch1.pt")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# evaluate model performance using val_data (after 1 epoch)

from sklearn.metrics import f1_score, classification_report, accuracy_score
import torch

model.eval()
all_preds, all_targets = [], []

with torch.no_grad():
    for batch in val_loader:

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        probs = torch.sigmoid(logits)
        preds = (probs > 0.5).float()

        mask = (labels != -1)
        masked_preds = preds[mask]
        masked_labels = labels[mask]

        # collect batch predictions for NaN but move them to CPU
        all_preds.append(masked_preds.cpu())
        all_targets.append(masked_labels.cpu())

# combine predictions and labels
y_pred = torch.cat(all_preds).numpy()
y_true = torch.cat(all_targets).numpy()

# compute macro F1 score, accuracy
val_f1 = f1_score(y_true, y_pred, average='macro')
val_acc = accuracy_score(y_true, y_pred)

print(f"validation macro F1 Score (after 1 epoch): {val_f1:.4f}")
print(f"validation accuracy (after 1 epoch): {val_acc:.4f}")

validation macro F1 Score (after 1 epoch): 0.4846
validation accuracy (after 1 epoch): 0.9403


### BioBERT (unmasked)

In [16]:
# sanity check again
print(f"texts: {len(texts)}, labels: {labels.shape}")

texts: 36484, labels: (36484, 13)


In [None]:
import torch
from torch.utils.data import DataLoader
from torch.optim import AdamW
from sklearn.metrics import f1_score
import numpy as np
from transformers import AutoModelForSequenceClassification

# 1: convert NaNs to soft targets (0.5)
labels = np.nan_to_num(labels, nan=0.5)

# 2: define dataset and dataloader
train_dataset = RadiologyReportDataset(texts, labels, tokenizer, max_length=256)
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)

# 3: define model
model = AutoModelForSequenceClassification.from_pretrained(
    "dmis-lab/biobert-base-cased-v1.1",
    num_labels=13,
    problem_type="multi_label_classification"
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# 4: training parameter setting
optimizer = AdamW(model.parameters(), lr=2e-5)
loss_fn = torch.nn.BCEWithLogitsLoss()

# 5: train for multiple epochs
num_epochs = 3  # fine-tune

for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    print(f"\nEpoch {epoch+1}/{num_epochs}")

    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        label_batch = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        loss = loss_fn(logits, label_batch)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_train_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1} complete — avg Loss: {avg_train_loss:.4f}")

    # save checkpoint
    checkpoint_path = f"biobert_soft_epoch{epoch+1}.pt"
    torch.save(model.state_dict(), checkpoint_path)

    # download last checkpoint (optional in Colab)
    if epoch + 1 == num_epochs:
        from google.colab import files
        files.download(checkpoint_path)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 1/3
Epoch 1 complete — avg Loss: 0.1207

Epoch 2/3
Epoch 2 complete — avg Loss: 0.0945

Epoch 3/3
Epoch 3 complete — avg Loss: 0.0858


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

### validation & test on BioBERT (unmasked)

In [17]:
# define validate function

from sklearn.metrics import f1_score, accuracy_score, roc_auc_score, classification_report

def evaluate_model(model, data_loader, device):
    model.eval()
    all_preds, all_probs, all_targets = [], [], []

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            probs = torch.sigmoid(logits)
            preds = (probs > 0.5).float()

            all_preds.append(preds.cpu())
            all_probs.append(probs.cpu())
            all_targets.append(labels.cpu())

    y_pred = torch.cat(all_preds).cpu().numpy().astype(int)
    y_prob = torch.cat(all_probs).cpu().numpy()
    y_true = torch.cat(all_targets).cpu().numpy().astype(int)

    # classification Report
    print("Classification Report: Validation")
    print(classification_report(y_true, y_pred, digits=4))

    # F1
    f1_macro = f1_score(y_true, y_pred, average='macro')
    f1_micro = f1_score(y_true, y_pred, average='micro')

    # AUROC
    roc_macro = roc_auc_score(y_true, y_prob, average='macro')
    roc_micro = roc_auc_score(y_true, y_prob, average='micro')

    print(f"F1 Macro: {f1_macro:.4f}")
    print(f"F1 Micro: {f1_micro:.4f}")
    print(f"AUROC Macro: {roc_macro:.4f}")
    print(f"AUROC Micro: {roc_micro:.4f}")

In [19]:
# save upload of saved weight
from google.colab import files
files.upload()

Saving biobert_soft_epoch3.pt to biobert_soft_epoch3.pt


In [21]:
# load saved models into model for evaluation
model.load_state_dict(torch.load("biobert_soft_epoch3.pt", map_location=device))
model.to(device)
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

create validation & test loaders

In [20]:
# Create validation and test datasets directly from raw texts and labels
val_dataset = RadiologyReportDataset(texts_val, labels_val, tokenizer, max_length=256)
test_dataset = RadiologyReportDataset(texts_test, labels_test, tokenizer, max_length=256)

# Wrap in DataLoaders
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=8, shuffle=False)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=8, shuffle=False)

In [22]:
evaluate_model(model, val_loader, device)
evaluate_model(model, test_loader, device)

Classification Report: Validation
              precision    recall  f1-score   support

           0     0.6588    0.4108    0.5060       409
           1     0.7835    0.4222    0.5487       360
           2     0.7174    0.3626    0.4818        91
           3     0.7286    0.4811    0.5795       106
           4     0.5000    0.0145    0.0282        69
           5     0.7143    0.2941    0.4167       102
           6     0.6437    0.3709    0.4706       151
           7     0.7064    0.5441    0.6147       544
           8     0.7275    0.7880    0.7565       349
           9     0.6364    0.1892    0.2917        37
          10     0.7129    0.3512    0.4706       205
          11     0.4828    0.5714    0.5234        49
          12     0.6333    0.3858    0.4795       197

   micro avg     0.6983    0.4665    0.5593      2669
   macro avg     0.6650    0.3989    0.4745      2669
weighted avg     0.6950    0.4665    0.5437      2669
 samples avg     0.1799    0.1371    0.1484   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Classification Report: Validation
              precision    recall  f1-score   support

           0     0.7034    0.4148    0.5219       446
           1     0.7080    0.4000    0.5112       400
           2     0.7297    0.3600    0.4821        75
           3     0.7527    0.5691    0.6481       123
           4     0.3333    0.0143    0.0274        70
           5     0.7021    0.4024    0.5116        82
           6     0.5769    0.3750    0.4545       120
           7     0.7040    0.5243    0.6010       576
           8     0.6886    0.7710    0.7275       393
           9     0.7000    0.1591    0.2593        44
          10     0.7016    0.3850    0.4971       226
          11     0.5397    0.6182    0.5763        55
          12     0.5000    0.3134    0.3853       201

   micro avg     0.6792    0.4685    0.5545      2811
   macro avg     0.6415    0.4082    0.4772      2811
weighted avg     0.6724    0.4685    0.5395      2811
 samples avg     0.1899    0.1488    0.1591   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
