In [2]:
import concurrent.futures as futures
import multiprocessing
import os
import re

import pickle
import numpy as np
from pytorch_lightning import (
    LightningDataModule,
    LightningModule,
    Trainer,
    seed_everything,
)
from pytorch_lightning.callbacks import ModelCheckpoint

import torch
import datasets

from torch.utils.data import DataLoader
from transformers import (
    # AdamW,  # this does not work
    AutoConfig,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    get_linear_schedule_with_warmup,
)

from torchmetrics.classification import BinaryAccuracy, BinaryF1Score

In [3]:
SEED = 0
rng = np.random.default_rng(SEED)
GEN_SEED = torch.Generator().manual_seed(SEED)
seed_everything(SEED, workers=True)
MODEL_NAME = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext"

os.environ["TOKENIZERS_PARALLELISM"] = "true"

Global seed set to 0


In [5]:
CLEANED = True
if CLEANED:
    with open("../../dataset_clean.pkl", "rb") as f:
        tuple_dataset = pickle.load(f)
else:
    with open("../../dataset.pkl", "rb") as f:
        tuple_dataset = pickle.load(f)

print("loaded dataset")
print("dataset: ", len(tuple_dataset))
print(tuple_dataset[:1])

loaded dataset
dataset:  77998
[('patientregistry idcenter 100 sex M yeardiagnosisdiabetes 1988-01-01 levelofeducation [UNK] maritalstatus [UNK] profession [UNK] yearofbirth 1936-01-01 yearfirstaccess 1991-01-01 yearofdeath [UNK] diagnosis date 2013-03-05 amdcode AMD130 meaning Non diabetic retinopathy value S date 2011-02-07 amdcode AMD130 meaning Non diabetic retinopathy value S date 2009-03-20 amdcode AMD044 meaning Ischemic heart disease value 414 date 2009-03-20 amdcode AMD247 meaning Other comorbidities value 414.9 date 2009-03-20 amdcode AMD247 meaning Other comorbidities value 36.10 date 2009-03-20 amdcode AMD049 meaning Coronary bypass value S date 2008-04-03 amdcode AMD049 meaning Coronary bypass value S date 2008-04-03 amdcode AMD044 meaning Ischemic heart disease value 414 date 2008-04-03 amdcode AMD247 meaning Other comorbidities value 414.9 date 2008-04-03 amdcode AMD247 meaning Other comorbidities value 36.10 date 2008-04-03 amdcode AMD130 meaning Non diabetic retinopath

In [None]:
# class PubMedBERTDataset(Dataset):
#     def __init__(self, data):
#         # here data is a list of tuples,
#         # each containing the patient history string and their label
#         self.data = data

#     def __len__(self):
#         return len(self.data)

#     def __getitem__(self, idx):
#         patient_history = self.data[idx][0]
#         label = self.data[idx][1]
#         return patient_history, label

In [4]:
def convert_to_huggingfaceDataset(tuple_dataset):
    # here data is a list of tuples,
    # each containing the patient history string and their label
    # we need to convert it to a hugginface dataset
    dict_list = [{"label": data[1], "text": data[0]} for data in tuple_dataset]
    dataset = datasets.Dataset.from_list(dict_list)
    return dataset

In [15]:
max_len = [0] * 100
index = [0] * 100

# Save the 100 most length sentence and their indexes
for i, (ph, l) in enumerate(tuple_dataset):
    # Count the number of words in the ph variable
    if CLEANED:
        ph_len = len(ph.split(" "))
    else:
        ph_len = len(ph.replace(" ", "=").split("="))
    for j, x in enumerate(max_len):
        if ph_len > x:
            max_len.insert(j, ph_len)
            index.insert(j, i)
            max_len.pop()
            index.pop()
            break

print(max_len)
print(index)

[22305, 22079, 20909, 20501, 20112, 20020, 19881, 19713, 19338, 19145, 18848, 18822, 18814, 18793, 18717, 18361, 18123, 18063, 17986, 17984, 17982, 17948, 17883, 17720, 17708, 17642, 17545, 17536, 17521, 17516, 17488, 17482, 17438, 17379, 17353, 17239, 17215, 17189, 17186, 17174, 16998, 16988, 16890, 16662, 16632, 16596, 16554, 16462, 16427, 16301, 16172, 16141, 16130, 16060, 16048, 16035, 15983, 15979, 15963, 15950, 15894, 15887, 15856, 15824, 15811, 15807, 15805, 15737, 15696, 15659, 15616, 15560, 15558, 15550, 15542, 15453, 15425, 15391, 15374, 15366, 15359, 15354, 15339, 15329, 15300, 15288, 15271, 15265, 15210, 15207, 15199, 15177, 15164, 15163, 15136, 15133, 15118, 15087, 15070, 15065]
[29386, 11013, 33943, 14886, 21091, 27236, 43122, 45283, 28635, 38514, 19574, 21713, 12253, 6307, 9228, 44448, 4151, 25433, 20428, 23246, 4328, 5232, 13095, 40099, 24116, 8833, 16724, 14597, 21850, 3343, 35516, 9494, 30163, 7109, 30630, 26372, 24934, 1773, 18178, 5568, 46660, 25141, 18157, 35225, 3

In [8]:
class PubMedBERTDataModule(LightningDataModule):
    def __init__(
        self,
        tuple_dataset,
        model_name_with_path: str,
        max_seq_length: int = 512,  # 512 is the max length of BERT and PubMedBERT but I need 32768 for not CLEANED and 22305 for CLEANED
        train_batch_size: int = 8,
        eval_batch_size: int = 8,
        **kwargs,
    ):
        super().__init__()
        self.model_name_with_path = model_name_with_path
        self.max_seq_length = max_seq_length
        self.train_batch_size = train_batch_size
        self.eval_batch_size = eval_batch_size
        self.tokenizer = AutoTokenizer.from_pretrained(
            self.model_name_with_path, use_fast=True
        )

    def setup(self, stage=None):
        dataset = convert_to_huggingfaceDataset(tuple_dataset)
        tokenized_dataset = dataset.map(
            self.convert_to_features,
            batched=True,
            remove_columns=["text", "label"],
        )
        tokenized_dataset.set_format(type="torch")

        # split dataset into train and validation sampling randomly
        # use 20% of training data for validation
        train_set_size = int(len(tokenized_dataset) * 0.8)
        valid_set_size = len(tokenized_dataset) - train_set_size

        # split the dataset randomly into two
        self.train_data, self.valid_data = torch.utils.data.random_split(
            tokenized_dataset, [train_set_size, valid_set_size], generator=GEN_SEED
        )

    def prepare_data(self):
        AutoTokenizer.from_pretrained(
            self.model_name_with_path,
            use_fast=True,
        )

    def train_dataloader(self):
        return DataLoader(
            self.train_data,
            batch_size=self.train_batch_size,
            shuffle=True,
            num_workers=8,
        )

    def val_dataloader(self):
        return DataLoader(
            self.valid_data,
            batch_size=self.eval_batch_size,
            shuffle=False,
            num_workers=4,
        )

    def test_dataloader(self):
        # placeholder
        return DataLoader(
            self.valid_data,
            batch_size=self.eval_batch_size,
            shuffle=False,
            num_workers=4,
        )

    def convert_to_features(self, example_batch, indices=None):
        # Tokenize the patient history
        features = self.tokenizer(
            text=example_batch["text"],
            max_length=self.max_seq_length,
            padding="longest",
            truncation=True,
            return_tensors="pt",
        )
        # Rename label to labels to make it easier to pass to model forward
        features["labels"] = example_batch["label"]

        return features

In [7]:
# dm = PubMedBERTDataModule(tuple_dataset, MODEL_NAME)
# dm.prepare_data()
# dm.setup("fit")
# next(iter(dm.train_dataloader()))

In [9]:
class PubMedBERTTransformer(LightningModule):
    def __init__(
        self,
        model_name_or_path: str,
        num_labels: int = 2,  # It will create a PubMedBERT model (in our case) instance with encoder weights copied from the PubMedBERT model and a randomly initialized sequence classification head on top of the encoder with an output size of 2
        learning_rate: float = 2e-5,
        adam_epsilon: float = 1e-8,
        warmup_steps: int = 0,
        weight_decay: float = 0.0,
        **kwargs,
    ):
        super().__init__()

        self.save_hyperparameters()

        self.config = AutoConfig.from_pretrained(
            model_name_or_path, num_labels=num_labels
        )
        self.model = AutoModelForSequenceClassification.from_pretrained(
            model_name_or_path, config=self.config
        )
        self.train_acc_metric = BinaryAccuracy()
        self.val_acc_metric = BinaryAccuracy()
        self.train_f1_metric = BinaryF1Score()
        self.val_f1_metric = BinaryF1Score()

    def forward(self, **inputs):
        return self.model(**inputs)

    def step(self, batch):
        outputs = self(**batch)
        loss, logits = outputs[:2]
        if self.hparams.num_labels > 1:
            preds = logits.argmax(axis=1)
        elif self.hparams.num_labels == 1:
            preds = logits.squeeze()
        labels = batch["labels"]
        return {"loss": loss, "logits": logits, "preds": preds, "labels": labels}

    def training_step(self, batch, batch_idx):
        outputs = self.step(batch)
        self.train_acc_metric(outputs["preds"], outputs["labels"])
        self.train_f1_metric(outputs["preds"], outputs["labels"])
        self.log(
            "train_acc",
            self.train_acc_metric,
            on_step=True,
            on_epoch=True,
            prog_bar=False,
        )
        self.log(
            "train_f1", self.train_f1_metric, on_step=True, on_epoch=True, prog_bar=True
        )
        self.log(
            "train_loss", outputs["loss"], on_step=True, on_epoch=True, prog_bar=True
        )
        return outputs["loss"]

    def validation_step(self, batch, batch_idx):
        outputs = self.step(batch)
        self.val_acc_metric(outputs["preds"], outputs["labels"])
        self.val_f1_metric(outputs["preds"], outputs["labels"])
        self.log("val_acc", self.val_acc_metric, on_epoch=True, prog_bar=True)
        self.log("val_f1", self.val_f1_metric, on_epoch=True, prog_bar=True)
        self.log("val_loss", outputs["loss"], on_epoch=True, prog_bar=True)
        return {
            "loss": outputs["loss"],
            "preds": outputs["preds"],
            "labels": outputs["labels"],
        }

    def configure_optimizers(self):
        """Prepare optimizer and schedule (linear warmup and decay)"""
        model = self.model
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [
                    p
                    for n, p in model.named_parameters()
                    if not any(nd in n for nd in no_decay)
                ],
                "weight_decay": self.hparams.weight_decay,
            },
            {
                "params": [
                    p
                    for n, p in model.named_parameters()
                    if any(nd in n for nd in no_decay)
                ],
                "weight_decay": 0.0,
            },
        ]
        optimizer = torch.optim.AdamW(
            optimizer_grouped_parameters,
            lr=self.hparams.learning_rate,
            eps=self.hparams.adam_epsilon,
        )

        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=self.hparams.warmup_steps,
            num_training_steps=self.trainer.estimated_stepping_batches,
        )
        scheduler = {"scheduler": scheduler, "interval": "step", "frequency": 1}
        return [optimizer], [scheduler]

In [9]:
dm = PubMedBERTDataModule(tuple_dataset, MODEL_NAME)
dm.setup("fit")
# print(next(iter(dm.train_dataloader())))

model = PubMedBERTTransformer(
    model_name_or_path=MODEL_NAME,
)

trainer = Trainer(
    max_epochs=2,
    accelerator="auto",
    devices="auto",
)
trainer.fit(model=model, datamodule=dm)

Map: 100%|██████████| 150/150 [00:00<00:00, 187.21 examples/s]
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Map: 100%|██████████| 150/150 [00:00<00:00, 223.64 examples/s]
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.
  rank_zero_warn(
  rank_zero_warn(

  | Name   | Type                          | Params
---------------------------------------------------------
0 | model  | BertForSequenceClassification | 109 M 
1 | metric | BinaryAccuracy                | 0     
-------------------------

Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]

  rank_zero_warn(


Epoch 1: 100%|██████████| 30/30 [00:23<00:00,  1.29it/s, v_num=0, train_loss=0.0398]

`Trainer.fit` stopped: `max_epochs=2` reached.


Epoch 1: 100%|██████████| 30/30 [00:25<00:00,  1.18it/s, v_num=0, train_loss=0.0398]


In [29]:
patient_index = 40000

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
print(tuple_dataset[patient_index][0])
inputs = tokenizer(
    tuple_dataset[patient_index][0],
    max_length=512,
    truncation=True,
    return_tensors="pt",
).to("cuda" if torch.cuda.is_available() else "cpu")
print(tuple_dataset[patient_index][1])

patientregistry idcenter 164 sex M yeardiagnosisdiabetes 1996-01-01 levelofeducation [UNK] maritalstatus 2.0 profession 9.0 yearofbirth 1941-01-01 yearfirstaccess 2003-01-01 yearofdeath 2014-01-01 diagnosis date 2013-04-12 amdcode AMD097 meaning Cigarette smoke value EX date 2013-04-12 amdcode AMD247 meaning Other comorbidities value 585 date 2013-04-12 amdcode AMD067 meaning Chronic renal failure value S date 2012-12-21 amdcode AMD247 meaning Other comorbidities value 585 date 2012-12-21 amdcode AMD097 meaning Cigarette smoke value EX date 2012-12-21 amdcode AMD067 meaning Chronic renal failure value S date 2011-12-05 amdcode AMD247 meaning Other comorbidities value 736.70 date 2011-12-05 amdcode AMD038 meaning Polyneuropathy value 357.2 date 2011-12-05 amdcode AMD247 meaning Other comorbidities value 585 date 2011-12-05 amdcode AMD247 meaning Other comorbidities value 440.21 date 2011-12-05 amdcode AMD247 meaning Other comorbidities value 440.2 date 2011-12-05 amdcode AMD247 meaning 

In [30]:
config = AutoConfig.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(
    "../../lightning_logs/version_6/checkpoints/epoch=2-step=23400.ckpt", config=config
).to("cuda" if torch.cuda.is_available() else "cpu")
with torch.no_grad():
    logits = model(**inputs).logits

predicted_class = logits.argmax(axis=1).item()
print("Predicted class:", predicted_class)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ../../lightning_logs/version_6/checkpoints/epoch=2-step=23400.ckpt and are newly initialized: ['encoder.layer.8.attention.output.LayerNorm.weight', 'encoder.layer.2.output.dense.bias', 'embeddings.word_embeddings.weight', 'encoder.layer.9.intermediate.dense.weight', 'encoder.layer.1.attention.self.value.weight', 'encoder.layer.11.attention.self.key.bias', 'encoder.layer.4.output.LayerNorm.bias', 'encoder.layer.7.attention.output.dense.weight', 'encoder.layer.4.attention.output.LayerNorm.bias', 'encoder.layer.10.output.dense.bias', 'encoder.layer.7.attention.self.key.weight', 'encoder.layer.11.attention.output.dense.weight', 'encoder.layer.10.attention.self.value.bias', 'encoder.layer.4.attention.self.key.bias', 'encoder.layer.2.intermediate.dense.weight', 'encoder.layer.3.intermediate.dense.weight', 'encoder.layer.1.attention.output.dense.bias', 'encoder.layer.10.output.LayerNorm.bias', 'enc

Predicted class: 1


In [31]:
model = PubMedBERTTransformer(
    model_name_or_path=MODEL_NAME,
).load_from_checkpoint(
    "../../lightning_logs/version_6/checkpoints/epoch=2-step=23400.ckpt"
)

# disable randomness, dropout, etc...
model.eval()

# predict with the model
y_hat = model(**inputs).logits.argmax(axis=1).item()
print("Predicted class:", y_hat)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


1
