In [None]:
# Mount Google Drive at /content/drive so later cells can read files from it.
# NOTE(review): google.colab is presumably only importable inside Colab — confirm before running elsewhere.

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install -U wandb
!pip install -U pymetis
!pip install -U transformers[torch]
!pip install -U adapters
!pip install -U datasets

Collecting wandb
  Downloading wandb-0.17.0-py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.7/6.7 MB[0m [31m23.6 MB/s[0m eta [36m0:00:00[0m
Collecting docker-pycreds>=0.4.0 (from wandb)
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)
Collecting gitpython!=3.1.29,>=1.0.0 (from wandb)
  Downloading GitPython-3.1.43-py3-none-any.whl (207 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.3/207.3 kB[0m [31m28.5 MB/s[0m eta [36m0:00:00[0m
Collecting sentry-sdk>=1.0.0 (from wandb)
  Downloading sentry_sdk-2.4.0-py2.py3-none-any.whl (289 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m289.2/289.2 kB[0m [31m35.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting setproctitle (from wandb)
  Downloading setproctitle-1.3.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86

In [None]:
import numpy as np
import wandb
import torch
import json
import pandas as pd
from os import listdir
import os

import adapters
from adapters import AdapterTrainer
from adapters import AutoAdapterModel, AdapterFusionConfig, AdapterConfig
from adapters.composition import Fuse
from transformers import AutoConfig, AutoTokenizer

from transformers import get_linear_schedule_with_warmup, AdamW
from transformers import TrainingArguments, EvalPrediction, Trainer, default_data_collator
import datasets


# This notebook can be used to fuse pretrained adapters for PubMedQA and to train their fusion on the PubMedQA downstream task.

In [None]:
# Folder on the mounted Drive that holds the PubMedQA files read by load_pubmedqa
# (pqal_fold<N>/train_set.json, pqal_fold<N>/dev_set.json and test_set.json).
data_dir = "/content/drive/MyDrive/Data/pubmedqa"

### Load PubMedQA from preprocessed folder

In [None]:
# complete path to the data directory

def load_pubmedqa(data_dir, fold_num=0):
    """Load one PubMedQA-L fold (train/dev) together with the shared test set.

    Reads ``{data_dir}/pqal_fold{fold_num}/train_set.json``,
    ``{data_dir}/pqal_fold{fold_num}/dev_set.json`` and
    ``{data_dir}/test_set.json``. Every file must contain a JSON object that
    maps an example id to a record with the keys ``QUESTION``, ``CONTEXTS``
    and ``final_decision``.

    Args:
        data_dir: Directory containing the fold sub-folders and ``test_set.json``.
        fold_num: Index of the fold whose train/dev split is loaded.

    Returns:
        Tuple ``(train_df, dev_df, test_df)`` of DataFrames with the columns
        ``id``, ``question``, ``context`` and ``label``. Values are copied
        unchanged from the JSON records and rows keep the file order.

    Raises:
        FileNotFoundError: If one of the three files does not exist.
        KeyError: If a record lacks one of the expected keys.
    """

    def _read_split(path):
        # One shared reader replaces three copy-pasted parsing loops.
        # The context manager closes the file deterministically; the previous
        # json.load(open(...)) form left the handle open until garbage collection.
        with open(path, encoding="utf-8") as handle:
            records = json.load(handle)
        return pd.DataFrame(
            {
                "id": list(records.keys()),
                "question": [record["QUESTION"] for record in records.values()],
                "context": [record["CONTEXTS"] for record in records.values()],
                "label": [record["final_decision"] for record in records.values()],
            }
        )

    fold_dir = f"{data_dir}/pqal_fold{fold_num}"
    train_df = _read_split(f"{fold_dir}/train_set.json")
    dev_df = _read_split(f"{fold_dir}/dev_set.json")
    # The test set is shared by all folds and lives directly in data_dir.
    test_df = _read_split(f"{data_dir}/test_set.json")

    print(
        f"Load pubmed_qa_l datasets train_df({len(train_df.index)}),dev_df({len(dev_df.index)}),test_df({len(test_df.index)})"
    )
    return train_df, dev_df, test_df

# Load fold 0 of the train/dev split together with the test set.
train_df, dev_df, test_df = load_pubmedqa(data_dir, 0)

Load pubmed_qa_l datasets train_df(450),dev_df(50),test_df(500)


### Convert from Pandas to Hugging Face dataset

In [None]:
# Force the text columns to plain strings so the tokenizer always receives str.
# NOTE(review): if CONTEXTS is a list of passages, astype(str) yields the list's
# Python repr (brackets and quotes included) — confirm this is the intended input.
for split_df in (train_df, dev_df, test_df):
    for text_column in ("question", "context"):
        split_df[text_column] = split_df[text_column].astype(str)

# The training split defines the label vocabulary (label string -> class id).
train = datasets.Dataset.from_pandas(train_df)
train = train.class_encode_column("label")

# Encode the dev labels with the ClassLabel learned on the training split.
# Calling class_encode_column separately on each split derives the ids from the
# label strings present in that split only, so a class missing from the small
# dev split would silently shift the ids and corrupt the evaluation metric.
eval = datasets.Dataset.from_pandas(dev_df)
eval = eval.cast_column("label", train.features["label"])

Casting to class labels:   0%|          | 0/450 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/50 [00:00<?, ? examples/s]

#### Convert to usable train and eval sets with corresponding Tokenizer

In [None]:
from transformers import BertTokenizer

# Alternative kept for reference (generic BERT vocabulary):
#tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
# Tokenizer is loaded from the same checkpoint name that the model cells pass to
# AutoAdapterModel.from_pretrained.
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

def encode_batch(batch):
  """Tokenize the question/context pairs of a batch with the module-level tokenizer.

  Each pair is truncated or padded to a fixed length of 180 tokens.
  """
  questions, contexts = batch["question"], batch["context"]
  tokenizer_options = dict(max_length=180, truncation=True, padding="max_length")
  return tokenizer(questions, contexts, **tokenizer_options)

def _tokenize_split(split):
  """Tokenize one split, rename its label column and expose torch tensors."""
  encoded = split.map(encode_batch, batched=True)
  # The column is renamed to "labels" before being handed to the model.
  encoded = encoded.rename_column("label", "labels")
  encoded.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
  return encoded

train = _tokenize_split(train)
eval = _tokenize_split(eval)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Map:   0%|          | 0/450 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

## Pretrained Adapters and Base Model

#### Load one Adapter and directly use its Head

In [None]:
from transformers import BertConfig

# Class index -> label string, in the order stored on the encoded "labels" feature.
label_names = train.features["labels"].names
id2label = dict(enumerate(label_names))

# Model configuration of the base checkpoint, extended with the label mapping.
config = BertConfig.from_pretrained("emilyalsentzer/Bio_ClinicalBERT", id2label=id2label)

# Adapter-capable model initialised from the pretrained base weights.
model = AutoAdapterModel.from_pretrained("emilyalsentzer/Bio_ClinicalBERT", config=config)

# Load the pretrained adapter from the Hugging Face hub, activate it and
# switch the model to adapter training for it.
umls_adapter = model.load_adapter(
    "reginaboateng/umls_RE_adapter_clinical_bert",
    source="hf",
    set_active=True,
)
model.train_adapter(umls_adapter)



Fetching 6 files:   0%|          | 0/6 [00:00<?, ?it/s]

#### Load one Adapter + One Adapter Head

In [None]:
from transformers import BertConfig

# Class index -> label string, in the order stored on the encoded "labels" feature.
id2label = dict(enumerate(train.features["labels"].names))

# Load the configuration from the pretrained model and update it with the id2label mapping
config = BertConfig.from_pretrained(
    "emilyalsentzer/Bio_ClinicalBERT",
    id2label=id2label,
)

# Adapter-capable BERT model initialised with the pretrained weights of the base checkpoint.
model = AutoAdapterModel.from_pretrained("emilyalsentzer/Bio_ClinicalBERT", config=config)

umls_adapter = model.load_adapter("reginaboateng/umls_RE_adapter_clinical_bert", source="hf", set_active=True)

# The former `config = AdapterConfig.load("pfeiffer")` line was removed: its result
# was never used and it overwrote `config` (the model configuration) with an
# unrelated adapter configuration object.

# Add a classification head for the downstream task
model.add_classification_head("classification_head", num_labels=len(id2label))

# Activate the adapter
# NOTE(review): unlike the single-adapter cell, nothing here calls
# model.train_adapter(...) — confirm whether the base model is meant to stay trainable.
model.active_adapters = umls_adapter

pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

Fetching 6 files:   0%|          | 0/6 [00:00<?, ?it/s]

adapter_config.json:   0%|          | 0.00/1.07k [00:00<?, ?B/s]

head_config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

.gitattributes:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

pytorch_adapter.bin:   0%|          | 0.00/7.19M [00:00<?, ?B/s]

pytorch_model_head.bin:   0%|          | 0.00/143k [00:00<?, ?B/s]

#### Load Two Adapters and Train Fusion

In [None]:
from transformers import BertConfig
from adapters.composition import Fuse

# Class index -> label string, in the order stored on the encoded "labels" feature.
label_names = train.features["labels"].names
id2label = dict(enumerate(label_names))

# Model configuration of the base checkpoint, extended with the label mapping.
config = BertConfig.from_pretrained("emilyalsentzer/Bio_ClinicalBERT", id2label=id2label)
model = AutoAdapterModel.from_pretrained("emilyalsentzer/Bio_ClinicalBERT", config=config)

# Load the hub adapter under two names, each time without its prediction head.
umls_adapter1, umls_adapter2 = (
    model.load_adapter(
        "reginaboateng/umls_RE_adapter_clinical_bert",
        source="hf",
        load_as=alias,
        with_head=False,
    )
    for alias in ("umls1", "umls2")
)

# A single Fuse setup is registered, activated and later unfrozen for training.
adapter_setup = Fuse("umls1", "umls2")
model.add_adapter_fusion(adapter_setup)
model.set_active_adapters(adapter_setup)

# Classification head for the target task.
model.add_classification_head("cb", num_labels=len(id2label))

# Unfreeze and activate the fusion setup.
model.train_adapter_fusion(adapter_setup)

Fetching 6 files: 100%|██████████| 6/6 [00:00<?, ?it/s]
Fetching 6 files: 100%|██████████| 6/6 [00:00<?, ?it/s]


In [None]:
# Use the GPU when torch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model.to(device)
print(f"device:{device}")

device:cuda


### Set all training and evaluation arguments - currently as close as possible to MOP

In [None]:
# Args that are used by MOP (pasted run log, kept as a bare string for reference only).
"""!ARGS:  Namespace(add_rel_pred=False, add_sapbert=False, amp=False, base_model='lighteternal/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext-finetuned-mnli', batch_size=4, best_model_dir='./temp/model_20240528_065000/', cuda=True, data_dir='../../../../../../Data/pubmedqa/', dev_file='dev.tsv', device=device(type='cuda'), epochs=3, gradient_accumulation_steps=1, groups=None, is_multilabel=False, lr=5e-06, max_seq_length=512, model='PubMedBERT-base_pure_S20Rel', model_dir='../../../../model_dir/', n_gpu=1, num_labels=3, patience=2, pretrain_epoch='0', reduction_factor=8, repeat_runs=2, seed=None, temperature=1.0, test_file='test.tsv', tokenizer='lighteternal/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext-finetuned-mnli', train_file='train.tsv', train_ratio=1, warmup_proportion=0.1)
Get 450 examples of PubMedQA datasets for train set"""

"!ARGS:  Namespace(add_rel_pred=False, add_sapbert=False, amp=False, base_model='lighteternal/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext-finetuned-mnli', batch_size=4, best_model_dir='./temp/model_20240528_065000/', cuda=True, data_dir='../../../../../../Data/pubmedqa/', dev_file='dev.tsv', device=device(type='cuda'), epochs=3, gradient_accumulation_steps=1, groups=None, is_multilabel=False, lr=5e-06, max_seq_length=512, model='PubMedBERT-base_pure_S20Rel', model_dir='../../../../model_dir/', n_gpu=1, num_labels=3, patience=2, pretrain_epoch='0', reduction_factor=8, repeat_runs=2, seed=None, temperature=1.0, test_file='test.tsv', tokenizer='lighteternal/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext-finetuned-mnli', train_file='train.tsv', train_ratio=1, warmup_proportion=0.1)\nGet 450 examples of PubMedQA datasets for train set"

In [None]:
# Hyperparameters shared by the optimizer, the scheduler and the Trainer.
args = {"lr": 5e-06, "batch_size": 8, "epochs": 3, "warmup_proportion": 0.1, "gradient_accumulation_steps": 1}  # lr: 5e-5

param_optimizer = list(model.named_parameters())
# Parameters whose name contains one of these substrings get no weight decay.
no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]

# Number of optimizer updates the schedule has to cover.
# The last, partially filled batch of an epoch still triggers an update, so the
# batch count must be rounded UP. Flooring it (450 / 8 -> 56) scheduled 168 steps
# while training actually runs 171, leaving the final updates at learning rate 0.
# NOTE(review): assumes a single device, i.e. the effective batch size equals
# args["batch_size"] — confirm for multi-GPU runs.
batches_per_epoch = (len(train) + args["batch_size"] - 1) // args["batch_size"]
updates_per_epoch = max(batches_per_epoch // args["gradient_accumulation_steps"], 1)
num_train_optimization_steps = updates_per_epoch * args["epochs"]

# Two parameter groups: weight decay everywhere except biases and LayerNorm weights.
optimizer_grouped_parameters = [
    {
        "params": [
            p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
        ],
        "weight_decay": 0.01,
    },
    {
        "params": [
            p for n, p in param_optimizer if any(nd in n for nd in no_decay)
        ],
        "weight_decay": 0.0,
    },
]

optimizer = AdamW(
    optimizer_grouped_parameters,
    lr=args["lr"],
    weight_decay=0.01,
    correct_bias=False,
)
# Linear warmup over the first warmup_proportion of the updates, then linear decay to 0.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_training_steps=num_train_optimization_steps,
    # Step counts are integers; truncate the fractional product.
    num_warmup_steps=int(args["warmup_proportion"] * num_train_optimization_steps),
)




In [None]:
def compute_accuracy(p: EvalPrediction):
  """Return {"acc": share of rows whose arg-max prediction equals the label id}."""
  predicted_ids = np.argmax(p.predictions, axis=1)
  is_correct = predicted_ids == p.label_ids
  return {"acc": is_correct.mean()}

# Training configuration, driven by the shared `args` dictionary.
training_args = TrainingArguments(
    learning_rate=args["lr"],
    num_train_epochs=args["epochs"],
    per_device_train_batch_size=args["batch_size"],
    per_device_eval_batch_size=args["batch_size"],
    # Forwarded so the Trainer uses the same value the scheduler length was computed with.
    gradient_accumulation_steps=args["gradient_accumulation_steps"],
    # NOTE(review): the recorded run has 171 steps in total, so with 200 no
    # training loss is ever logged — consider lowering this value.
    logging_steps=200,
    output_dir="./training_output",
    overwrite_output_dir=True,
    # The next line is important to ensure the dataset labels are properly passed to the model
    remove_unused_columns=False,
)

# The explicitly built (optimizer, scheduler) pair is handed to the trainer.
trainer = AdapterTrainer(
    model=model,
    args=training_args,
    train_dataset=train,
    eval_dataset=eval,
    compute_metrics=compute_accuracy,
    optimizers=(optimizer, scheduler),
)

# The trailing `dataloader_config = DataLoaderConfiguration(...)` statement was
# removed: DataLoaderConfiguration is never imported in this notebook, so the line
# raised NameError, and its result was not used anywhere.


In [None]:
# Run training with the trainer built in the previous cell; the returned
# TrainOutput is displayed as the cell result.
trainer.train()

Step,Training Loss


TrainOutput(global_step=171, training_loss=2.2166596351311223, metrics={'train_runtime': 38.3613, 'train_samples_per_second': 35.192, 'train_steps_per_second': 4.458, 'total_flos': 127532750076000.0, 'train_loss': 2.2166596351311223, 'epoch': 3.0})

In [None]:
# Evaluate on the eval_dataset passed to the trainer; the metrics dict
# (including the "acc" value from compute_accuracy) is displayed as the cell result.
trainer.evaluate()

{'eval_loss': 1.8502269983291626,
 'eval_acc': 0.46,
 'eval_runtime': 0.6961,
 'eval_samples_per_second': 71.834,
 'eval_steps_per_second': 10.057,
 'epoch': 3.0}

### Testing

In [None]:
# Take the test example at position 100 and unpack its question, context and gold label.
q, c, l = test_df.iloc[100][["question", "context", "label"]]
l

'yes'

In [None]:
def predict(question, context):
  """Predict the label string for a single question/context pair.

  Args:
    question: Question text.
    context: Context text paired with the question.

  Returns:
    The entry of ``id2label`` for the class with the highest logit.
  """
  # Same truncation limit as encode_batch, so long contexts are cut exactly as
  # during training instead of being fed to the model at full length.
  encoded = tokenizer(
      question,
      context,
      max_length=180,
      truncation=True,
      return_tensors="pt",
  )
  # Follow the model's actual device instead of assuming "cuda" whenever a GPU exists.
  encoded = encoded.to(model.device)
  # Inference only: disable dropout and skip gradient tracking.
  model.eval()
  with torch.no_grad():
    logits = model(**encoded)[0]
  pred_class = torch.argmax(logits).item()
  return id2label[pred_class]

predict(q, c)

'yes'