In [2]:
import pickle
import numpy as np
from pytorch_lightning import (
    LightningDataModule,
    LightningModule,
    Trainer,
    seed_everything,
)
import torch

from torch.utils.data import DataLoader
from transformers import (
    AdamW,
    AutoConfig,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    get_linear_schedule_with_warmup,
)
import datasets
from typing import Optional
from datetime import datetime
import evaluate

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
SEED = 0
rng = np.random.default_rng(SEED)
GEN_SEED = torch.Generator().manual_seed(SEED)

In [4]:
with open("../../dataset.pkl", "rb") as f:
    tuple_dataset = pickle.load(f)

print("loaded dataset")
print("dataset: ", len(tuple_dataset))
print(tuple_dataset[:1])

loaded dataset
dataset:  77998
[('patientregistry: idcenter=100, idpatient=3080, sex=M, yeardiagnosisdiabetes=1988-01-01, levelofeducation=[UNK], maritalstatus=[UNK], profession=[UNK], yearofbirth=1936-01-01 00:00:00, yearfirstaccess=1991-01-01 00:00:00, yearofdeath=[UNK], diagnosis: idcenter=100, idpatient=3080, date=1991-10-29 00:00:00, amdcode=AMD097, meaning=Cigarette smoke, value=N, idcenter=100, idpatient=3080, date=2004-02-04 00:00:00, amdcode=AMD044, meaning=Ischemic heart disease, value=414, idcenter=100, idpatient=3080, date=2004-02-04 00:00:00, amdcode=AMD247, meaning=Other comorbidities, value=414.9, idcenter=100, idpatient=3080, date=2004-07-21 00:00:00, amdcode=AMD130, meaning=Non diabetic retinopathy, value=[UNK], idcenter=100, idpatient=3080, date=2005-02-21 00:00:00, amdcode=AMD049, meaning=Coronary bypass, value=S, idcenter=100, idpatient=3080, date=2005-02-21 00:00:00, amdcode=AMD247, meaning=Other comorbidities, value=36.10, idcenter=100, idpatient=3080, date=2005-0

In [5]:
def convert_to_huggingfaceDataset(tuple_dataset):
    # here data is a list of tuples,
    # each containing the patient history string and their label
    # we need to convert it to a hugginface dataset
    dict_list = [{"label": data[1], "text": data[0]} for data in tuple_dataset]
    dataset = datasets.Dataset.from_list(dict_list)
    return dataset

In [6]:
# max_len = [0] * 100
# index = [0] * 100

# # Save the 100 most length sentence and their indexes
# for i, (ph, l) in enumerate(dataset):
#     # Count the number of words in the ph variable
#     ph_len = len(ph.replace(' ', '=').split('='))
#     for j, x in enumerate(max_len):
#         if ph_len > x:
#             max_len.insert(j, ph_len)
#             index.insert(j, i)
#             max_len.pop()
#             index.pop()
#             break

# print(max_len)
# print(index)

In [7]:
class PubMedBERTDataModule(LightningDataModule):
    def __init__(
        self,
        tuple_dataset,
        model_name_with_path: str,
        max_seq_length: int = 32768,
        train_batch_size: int = 16,
        eval_batch_size: int = 32,
        **kwargs,
    ):
        super().__init__()
        self.model_name_with_path = model_name_with_path
        self.max_seq_length = max_seq_length
        self.train_batch_size = train_batch_size
        self.eval_batch_size = eval_batch_size
        self.tokenizer = AutoTokenizer.from_pretrained(
            self.model_name_with_path, use_fast=True
        )

    def setup(self, stage=None):
        dataset = convert_to_huggingfaceDataset(tuple_dataset[:50])
        tokenized_dataset = dataset.map(
            self.convert_to_features,
            batched=True,
        )

        # split dataset into train and validation sampling randomly
        # use 20% of training data for validation
        train_set_size = int(len(tokenized_dataset) * 0.8)
        valid_set_size = len(tokenized_dataset) - train_set_size

        # split the dataset randomly into two
        self.train_data, self.valid_data = torch.utils.data.random_split(
            tokenized_dataset, [train_set_size, valid_set_size], generator=GEN_SEED
        )

    def prepare_data(self):
        AutoTokenizer.from_pretrained(
            self.model_name_with_path,
            use_fast=True,
        )

    def train_dataloader(self):
        return DataLoader(
            self.train_data, batch_size=self.train_batch_size, shuffle=True
        )

    def val_dataloader(self):
        return DataLoader(
            self.valid_data, batch_size=self.eval_batch_size, shuffle=False
        )

    def test_dataloader(self):
        # placeholder
        return DataLoader(
            self.valid_data, batch_size=self.eval_batch_size, shuffle=False
        )

    def convert_to_features(self, example_batch, indices=None):
        # Tokenize the patient history
        features = self.tokenizer.batch_encode_plus(
            example_batch["text"],
            max_length=self.max_seq_length,
            padding="longest",
            truncation=True,
            return_tensors="pt",
        )
        # Rename label to labels to make it easier to pass to model forward
        features["labels"] = example_batch["label"]

        return features

In [8]:
class PubMedBERTTransformer(LightningModule):
    def __init__(
        self,
        model_name_or_path: str,
        num_labels: int = 2,
        learning_rate: float = 2e-5,
        adam_epsilon: float = 1e-8,
        warmup_steps: int = 0,
        weight_decay: float = 0.0,
        train_batch_size: int = 32,
        eval_batch_size: int = 32,
        eval_splits: Optional[list] = None,
        **kwargs,
    ):
        super().__init__()

        self.save_hyperparameters()

        self.config = AutoConfig.from_pretrained(
            model_name_or_path, num_labels=num_labels
        )
        self.model = AutoModelForSequenceClassification.from_pretrained(
            model_name_or_path, config=self.config
        )
        self.metric = evaluate.load(
            "accuracy", experiment_id=datetime.now().strftime("%d-%m-%Y_%H-%M-%S")
        )

    def forward(self, **inputs):
        del inputs["label"]
        del inputs["text"]
        return self.model(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            labels=inputs["labels"],
        )

    def training_step(self, batch, batch_idx):
        outputs = self(**batch)
        loss = outputs[0]
        return loss

    def validation_step(self, batch, batch_idx, dataloader_idx=0):
        outputs = self(**batch)
        val_loss, logits = outputs[:2]

        if self.hparams.num_labels > 1:
            preds = torch.argmax(logits, axis=1)
        elif self.hparams.num_labels == 1:
            preds = logits.squeeze()

        labels = batch["labels"]

        return {"loss": val_loss, "preds": preds, "labels": labels}

    def on_validation_epoch_end(self, outputs):
        preds = torch.cat([x["preds"] for x in outputs]).detach().cpu().numpy()
        labels = torch.cat([x["labels"] for x in outputs]).detach().cpu().numpy()
        loss = torch.stack([x["loss"] for x in outputs]).mean()
        self.log("val_loss", loss, prog_bar=True)
        self.log_dict(
            self.metric.compute(predictions=preds, references=labels), prog_bar=True
        )

    def configure_optimizers(self):
        """Prepare optimizer and schedule (linear warmup and decay)"""
        model = self.model
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [
                    p
                    for n, p in model.named_parameters()
                    if not any(nd in n for nd in no_decay)
                ],
                "weight_decay": self.hparams.weight_decay,
            },
            {
                "params": [
                    p
                    for n, p in model.named_parameters()
                    if any(nd in n for nd in no_decay)
                ],
                "weight_decay": 0.0,
            },
        ]
        optimizer = AdamW(
            optimizer_grouped_parameters,
            lr=self.hparams.learning_rate,
            eps=self.hparams.adam_epsilon,
        )

        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=self.hparams.warmup_steps,
            num_training_steps=self.trainer.estimated_stepping_batches,
        )
        scheduler = {"scheduler": scheduler, "interval": "step", "frequency": 1}
        return [optimizer], [scheduler]

In [9]:
seed_everything(42)
model_name = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext"

dm = PubMedBERTDataModule(tuple_dataset, model_name)
dm.setup("fit")
model = PubMedBERTTransformer(
    model_name_or_path=model_name,
)

trainer = Trainer(
    max_epochs=1,
    accelerator="auto",
    devices=1 if torch.cuda.is_available() else 1, 
)
trainer.fit(model, datamodule=dm)

Global seed set to 42
Map: 100%|██████████| 50/50 [00:02<00:00, 19.67 examples/s]
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Map: 100%|██████████| 50/50 [00:02<00:00, 20.98 examples/s]
Loading `train_dataloader` to estimate number of stepping batches.
  rank_zero_warn(
  rank_zero_warn(

  | Name  | Type                          | Params
--------------------------------------------------------
0 | model | BertForSequenceClassification | 109 M 
--------------------------------------------------------
109 M     Trainable params
0         Non-trainable par

Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(


Sanity Checking DataLoader 0:   0%|          | 0/1 [00:00<?, ?it/s][tensor([2, 2, 2, 2, 2, 2, 2, 2, 2, 2]), tensor([2774, 2774, 2774, 2774, 2774, 2774, 2774, 2774, 2774, 2774]), tensor([3127, 3127, 3127, 3127, 3127, 3127, 3127, 3127, 3127, 3127]), tensor([4837, 4837, 4837, 4837, 4837, 4837, 4837, 4837, 4837, 4837]), tensor([30, 30, 30, 30, 30, 30, 30, 30, 30, 30]), tensor([29464, 29464, 29464, 29464, 29464, 29464, 29464, 29464, 29464, 29464]), tensor([9583, 9583, 9583, 9583, 9583, 9583, 9583, 9583, 9583, 9583]), tensor([33, 33, 33, 33, 33, 33, 33, 33, 33, 33]), tensor([ 2641,  9608,  4353, 15820, 16710,  5117,  2282, 11491, 16560, 16056]), tensor([16, 16, 16, 16, 16, 16, 16, 16, 16, 16]), tensor([2313, 2313, 2313, 2313, 2313, 2313, 2313, 2313, 2313, 2313]), tensor([7791, 7791, 7791, 7791, 7791, 7791, 7791, 7791, 7791, 7791]), tensor([33, 33, 33, 33, 33, 33, 33, 33, 33, 33]), tensor([23076, 29467,  5749, 16877,  5255, 13680, 19307,  3826, 28242, 17810]), tensor([1015, 1026,   16, 1008, 

AttributeError: 'list' object has no attribute 'size'