In [1]:
# mount your drive

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# This notebook can be used to fuse pretrained adapters to train their fusion on the MedQA downstream task

In [2]:
!pip install -U wandb
!pip install -U pymetis
!pip install -U transformers[torch]==4.40.2
!pip install -U adapters
!pip install -U datasets
!pip install -U evaluate

Collecting wandb
  Downloading wandb-0.17.4-py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
Collecting docker-pycreds>=0.4.0 (from wandb)
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)
Collecting gitpython!=3.1.29,>=1.0.0 (from wandb)
  Downloading GitPython-3.1.43-py3-none-any.whl (207 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.3/207.3 kB[0m [31m21.8 MB/s[0m eta [36m0:00:00[0m
Collecting sentry-sdk>=1.0.0 (from wandb)
  Downloading sentry_sdk-2.7.1-py2.py3-none-any.whl (300 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m300.2/300.2 kB[0m [31m25.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting setproctitle (from wandb)
  Downloading setproctitle-1.3.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86

In [3]:
import numpy as np
import wandb
import torch
import json
import pandas as pd
from os import listdir
import os

import adapters
from adapters import AdapterTrainer
from adapters import AutoAdapterModel, AdapterFusionConfig, AdapterConfig
from adapters.composition import Fuse
from transformers import AutoConfig, AutoTokenizer, AutoModelForSequenceClassification

from transformers import get_linear_schedule_with_warmup, AdamW
from transformers import TrainingArguments, EvalPrediction, Trainer, default_data_collator
import datasets

### Load MedQA from prerpocessed folder

In [None]:
medqa = datasets.load_dataset("GBaker/MedQA-USMLE-4-options")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/654 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/16.2M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.08M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/10178 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1273 [00:00<?, ? examples/s]

#### Convert to usable train and eval sets with corresponding Tokenizer

In [None]:
from transformers import BertTokenizer

tokenizer = AutoTokenizer.from_pretrained("microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext")

option_letters = ["A", "B", "C", "D"]
option2label = {o: i for i, o in enumerate(option_letters)}
def preprocess_function(examples):
    num_answers = len(option_letters)
    question = [[ques] * num_answers for ques in examples["question"]]
    options = [[ex[opt] for opt in option_letters] for ex in examples["options"]]
    label = np.vectorize(lambda x: option2label[x])(examples["answer_idx"])

    q = sum(question, [])
    o = sum(options, [])

    tokenized_examples = tokenizer(q, o, truncation=True)
    result = {k: [v[i : i + num_answers] for i in range(0, len(v), num_answers)] for k, v in tokenized_examples.items()}
    result["label"] = label
    return result

medqa = medqa.map(preprocess_function, batched=True)

train, test = medqa["train"], medqa["test"]



tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

Map:   0%|          | 0/10178 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/1273 [00:00<?, ? examples/s]

In [None]:
from dataclasses import dataclass
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from typing import Optional, Union
import torch

rel_features = [
    "input_ids",
    "attention_mask",
    "token_type_ids",
    "label",
]

@dataclass
class DataCollatorForMultipleChoice:
    """
    Data collator that will dynamically pad the inputs for multiple choice received.
    """

    tokenizer: PreTrainedTokenizerBase
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        label_name = "label" if "label" in features[0].keys() else "labels"
        labels = [feature.pop(label_name) for feature in features]
        batch_size = len(features)
        num_choices = len(features[0]["input_ids"])

        flattened_features = [
            [{k: v[i] for k, v in feature.items() if k in rel_features}
             for i in range(num_choices)]
            for feature in features
        ]
        flattened_features = sum(flattened_features, [])

        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        batch["labels"] = torch.tensor(labels, dtype=torch.int64)
        return batch

## Pretrained Adapters and Base Model

#### Unpack adapter .zip *file*

In [4]:
import zipfile

zip_file_name = "/content/drive/MyDrive/Adapters/adapters.zip"
folder_to_extract = "adapters/PubMedBERT/S20Rel_EP/"
destination_directory = "/content/"

with zipfile.ZipFile(zip_file_name, 'r') as zip_ref:
    zip_contents = zip_ref.namelist()
    print("Contents of the zip file:", zip_contents)

with zipfile.ZipFile(zip_file_name, 'r') as zip_ref:
    for member in zip_contents:
        if member.startswith(folder_to_extract):
            # Construct the full path for extraction
            member_destination_path = os.path.join(destination_directory, member)
            # Check if the member is a directory
            if member.endswith('/'):
                # Create the directory if it does not exist
                os.makedirs(member_destination_path, exist_ok=True)
            else:
                # Create the necessary directories in the destination path
                os.makedirs(os.path.dirname(member_destination_path), exist_ok=True)
                # Extract the file
                with zip_ref.open(member) as source_file:
                    with open(member_destination_path, 'wb') as target_file:
                        target_file.write(source_file.read())
print(f'Extracted {folder_to_extract} to "{destination_directory}"')

Contents of the zip file: ['adapters/', 'adapters/BioBERT/', 'adapters/BioBERT/S20Rel_EP/', 'adapters/BioBERT/S20Rel_EP/biobert_S20Rel_EP_partition_0_epoch_2_2024-07-02 23:35:56/', 'adapters/BioBERT/S20Rel_EP/biobert_S20Rel_EP_partition_0_epoch_2_2024-07-02 23:35:56/adapter_config.json', 'adapters/BioBERT/S20Rel_EP/biobert_S20Rel_EP_partition_0_epoch_2_2024-07-02 23:35:56/head_config.json', 'adapters/BioBERT/S20Rel_EP/biobert_S20Rel_EP_partition_0_epoch_2_2024-07-02 23:35:56/pytorch_adapter.bin', 'adapters/BioBERT/S20Rel_EP/biobert_S20Rel_EP_partition_0_epoch_2_2024-07-02 23:35:56/pytorch_model_head.bin', 'adapters/BioBERT/S20Rel_EP/biobert_S20Rel_EP_partition_10_epoch_2_2024-07-03 00:51:36/', 'adapters/BioBERT/S20Rel_EP/biobert_S20Rel_EP_partition_10_epoch_2_2024-07-03 00:51:36/adapter_config.json', 'adapters/BioBERT/S20Rel_EP/biobert_S20Rel_EP_partition_10_epoch_2_2024-07-03 00:51:36/head_config.json', 'adapters/BioBERT/S20Rel_EP/biobert_S20Rel_EP_partition_10_epoch_2_2024-07-03 00:5

#### Load pre-trained Adapters and Train Fusion


In [5]:
from transformers import BertConfig, AutoModelForMultipleChoice
from adapters.composition import Fuse

# AutoAdapterModel
model = AutoModelForMultipleChoice.from_pretrained("microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext")
adapters.init(model)

adapters_dir = "/content/adapters/PubMedBERT"
final_adapters = []

for adapter_dir in os.listdir(adapters_dir):
    adapter_path = os.path.join(adapters_dir, adapter_dir)

    if os.path.isdir(adapter_path):
        for sub_dir in os.listdir(adapter_path):
            sub_dir_path = os.path.join(adapter_path, sub_dir)
            config_path = os.path.join(sub_dir_path, "adapter_config.json")

            if os.path.isdir(sub_dir_path) and os.path.isfile(config_path):
                try:
                    # Extract partition number
                    partition_info = sub_dir.split("partition_")[1].split("_")[0]
                    adapter_name = f"partition_{partition_info}"
                    # Load adapter
                    model.load_adapter(sub_dir_path, config=config_path, load_as=adapter_name, with_head=False)
                    final_adapters.append(adapter_name)
                    print(f"Loaded adapter: {adapter_name}")
                except Exception as e:
                    print(f"Error loading adapter from {sub_dir_path}: {e}")

fusion = Fuse(*[f"partition_{x}" for x in range(20)])

model.add_adapter_fusion(fusion)
model.set_active_adapters(fusion)

# Unfreeze and activate fusion setup (only fusion layer and head will be trained)
model.train_adapter_fusion(fusion)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForMultipleChoice were not initialized from the model checkpoint at microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loaded adapter: partition_16
Loaded adapter: partition_8
Loaded adapter: partition_13
Loaded adapter: partition_1
Loaded adapter: partition_18
Loaded adapter: partition_3
Loaded adapter: partition_7
Loaded adapter: partition_2
Loaded adapter: partition_10
Loaded adapter: partition_12
Loaded adapter: partition_9
Loaded adapter: partition_14
Loaded adapter: partition_6
Loaded adapter: partition_19
Loaded adapter: partition_4
Loaded adapter: partition_15
Loaded adapter: partition_17
Loaded adapter: partition_11
Loaded adapter: partition_5
Loaded adapter: partition_0


In [None]:
model.adapter_summary()



In [None]:
device = torch.device(
        "cuda" if torch.cuda.is_available() else "cpu"
    )
model.to(device)
print(f"device:{device}")

device:cuda


### Set all training en Evaluation Arguments - currently as close as possible to MOP

In [None]:
args= {"lr": 5e-06, "batch_size": 2, "epochs": 3, "warmup_proportion": 0.1, "gradient_accumulation_steps": 1} # lr: 5e-5

param_optimizer = list(model.named_parameters())
no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]

num_train_optimization_steps = None
num_train_optimization_steps = (
            int(
                len(train) / args["batch_size"] / args["gradient_accumulation_steps"]
            )
            * args["epochs"]
        )


optimizer_grouped_parameters = [
            {
                "params": [
                    p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
                ],
                "weight_decay": 0.01,
            },
            {
                "params": [
                    p for n, p in param_optimizer if any(nd in n for nd in no_decay)
                ],
                "weight_decay": 0.0,
            },
        ]

optimizer = AdamW(
            optimizer_grouped_parameters,
            lr=args["lr"],
            weight_decay=0.01,
            correct_bias=False,
        )
scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_training_steps=num_train_optimization_steps,
            num_warmup_steps=args["warmup_proportion"] * num_train_optimization_steps,
        )




In [None]:
from transformers import TrainingArguments, EvalPrediction, Trainer
import evaluate

metric = evaluate.load("accuracy")
# metric = evaluate.combine(["accuracy", "f1", "precision", "recall"])

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

training_args = TrainingArguments(
    learning_rate=args["lr"],
    num_train_epochs=args["epochs"],
    per_device_train_batch_size=args["batch_size"],
    per_device_eval_batch_size=args["batch_size"],
    logging_steps=200,
    output_dir="./training_output",
    overwrite_output_dir=True,
    # The next line is important to ensure the dataset labels are properly passed to the model
    remove_unused_columns=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train,
    eval_dataset=eval,
    data_collator=DataCollatorForMultipleChoice(tokenizer),
    compute_metrics=compute_metrics,
    optimizers=(optimizer, scheduler),
)

In [None]:
wandb.init(anonymous="allow", mode="offline") #"dryrun")

In [None]:
trainer.train()

Step,Training Loss


KeyboardInterrupt: 

In [None]:
trainer.evaluate()

{'eval_loss': 0.4828279912471771,
 'eval_accuracy': 0.8243243243243243,
 'eval_f1': 0.9037037037037037,
 'eval_precision': 0.8243243243243243,
 'eval_recall': 1.0,
 'eval_runtime': 4.153,
 'eval_samples_per_second': 17.818,
 'eval_steps_per_second': 2.408,
 'epoch': 3.0}

In [None]:
import gc
# set to True if needed
if True:
  gc.collect()
  torch.cuda.empty_cache()