In [None]:
# Mount Google Drive at /content/drive; later cells read the MedNLI data and the
# adapter archive from paths under /content/drive/MyDrive.

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# This notebook fuses pretrained adapters and trains their fusion on a downstream task (MedNLI)

In [None]:
# Install/upgrade the packages used below. Only transformers carries a version pin.
# NOTE(review): the remaining installs are unpinned `-U` upgrades, so re-running this
# cell later may resolve different versions — consider pinning them for reproducibility.
!pip install -U wandb
!pip install -U pymetis
!pip install -U transformers[torch]==4.40.2
!pip install -U adapters
!pip install -U datasets
!pip install -U evaluate

Collecting wandb
  Downloading wandb-0.17.4-py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m20.4 MB/s[0m eta [36m0:00:00[0m
Collecting docker-pycreds>=0.4.0 (from wandb)
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)
Collecting gitpython!=3.1.29,>=1.0.0 (from wandb)
  Downloading GitPython-3.1.43-py3-none-any.whl (207 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.3/207.3 kB[0m [31m17.1 MB/s[0m eta [36m0:00:00[0m
Collecting sentry-sdk>=1.0.0 (from wandb)
  Downloading sentry_sdk-2.7.1-py2.py3-none-any.whl (300 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m300.2/300.2 kB[0m [31m34.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting setproctitle (from wandb)
  Downloading setproctitle-1.3.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86

In [None]:
import numpy as np
import wandb
import torch
import json
import pandas as pd
from os import listdir
import os

import adapters
from adapters import AdapterTrainer
from adapters import AutoAdapterModel, AdapterFusionConfig, AdapterConfig
from adapters.composition import Fuse
from transformers import AutoConfig, AutoTokenizer, AutoModelForSequenceClassification

from transformers import get_linear_schedule_with_warmup, AdamW
from transformers import TrainingArguments, EvalPrediction, Trainer, default_data_collator
import datasets

In [None]:
# Directory on the mounted drive that contains the MedNLI JSONL splits
# (mli_train_v1.jsonl, mli_dev_v1.jsonl, mli_test_v1.jsonl).
data_dir = "/content/drive/MyDrive/Data/mednli/"

### Load MedNLI from preprocessed folder

In [None]:
def load_mednli(file_path):
    """Load the MedNLI train, dev and test splits from a directory of JSONL files.

    Args:
        file_path: Directory containing ``mli_train_v1.jsonl``,
            ``mli_dev_v1.jsonl`` and ``mli_test_v1.jsonl``. A trailing path
            separator is optional.

    Returns:
        A ``(train_df, dev_df, test_df)`` tuple of DataFrames. Each frame keeps
        only three columns, renamed from the raw files:
        ``sentence1`` -> ``text_a``, ``sentence2`` -> ``text_b`` and
        ``gold_label`` -> ``label``.

    Raises:
        KeyError: If a split is missing one of the three source columns.
    """
    renames = {'sentence1': 'text_a', 'sentence2': 'text_b', 'gold_label': 'label'}

    def read_split(split):
        # os.path.join accepts the directory with or without a trailing slash,
        # whereas plain string concatenation silently built a wrong path.
        path = os.path.join(file_path, f"mli_{split}_v1.jsonl")
        frame = pd.read_json(path, lines=True)
        # Select the source columns in a fixed order, then rename them.
        return frame[list(renames)].rename(columns=renames)

    train_df = read_split("train")
    dev_df = read_split("dev")
    test_df = read_split("test")

    return train_df, dev_df, test_df

# Read the three MedNLI splits from data_dir into pandas DataFrames.
train_df, dev_df, test_df = load_mednli(data_dir)

In [None]:
# Force both sentence columns of every split to plain Python strings.
for split_df in (train_df, dev_df, test_df):
    for text_col in ('text_a', 'text_b'):
        split_df[text_col] = split_df[text_col].astype(str)

# Wrap the train/dev frames as datasets and encode the string labels as class labels.
# NOTE: `eval` shadows the Python builtin, but later cells refer to this name.
train = datasets.Dataset.from_pandas(train_df).class_encode_column("label")
eval = datasets.Dataset.from_pandas(dev_df).class_encode_column("label")

Casting to class labels:   0%|          | 0/11232 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/1395 [00:00<?, ? examples/s]

#### Convert to usable train and eval sets with corresponding Tokenizer

In [None]:
# NOTE(review): BertTokenizer is not referenced in the visible cells; the tokenizer is built with AutoTokenizer.
from transformers import BertTokenizer

# Tokenizer for the same BiomedBERT checkpoint that is loaded as the base model further down.
tokenizer = AutoTokenizer.from_pretrained("microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext")

def encode_batch(batch):
  """Tokenize a batch of sentence pairs with the notebook-level tokenizer.

  ``batch["text_a"]`` and ``batch["text_b"]`` are passed as the first and
  second sequence; the tokenizer is called with ``max_length=180``,
  truncation enabled and ``padding="max_length"``.
  """
  first_sentences = batch["text_a"]
  second_sentences = batch["text_b"]
  tokenizer_options = {
      "max_length": 180,
      "truncation": True,
      "padding": "max_length",
  }
  return tokenizer(first_sentences, second_sentences, **tokenizer_options)

# Tokenize both splits and rename the class-label column to "labels".
train = train.map(encode_batch, batched=True)
train = train.rename_column("label", "labels")
eval = eval.map(encode_batch, batched=True)
eval = eval.rename_column("label", "labels")

# Expose the model inputs as torch tensors. token_type_ids is kept (when the
# tokenizer produces it) because predict() further down forwards every tokenizer
# output to the model; dropping it here meant training/evaluation ran without
# segment ids while inference ran with them.
model_columns = ["input_ids", "token_type_ids", "attention_mask", "labels"]
train.set_format(type="torch", columns=[c for c in model_columns if c in train.column_names])
eval.set_format(type="torch", columns=[c for c in model_columns if c in eval.column_names])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

Map:   0%|          | 0/11232 [00:00<?, ? examples/s]

Map:   0%|          | 0/1395 [00:00<?, ? examples/s]

## Pretrained Adapters and Base Model

In [None]:
import zipfile

zip_file_name = "/content/drive/MyDrive/Adapters/adapters.zip"
folder_to_extract = "adapters/PubMedBERT/S20Rel_EP/"
destination_directory = "/content/"

# Open the archive once: list its contents, then extract only the requested folder.
with zipfile.ZipFile(zip_file_name, 'r') as zip_ref:
    zip_contents = zip_ref.namelist()
    print("Contents of the zip file:", zip_contents)

    members_to_extract = [
        member for member in zip_contents if member.startswith(folder_to_extract)
    ]
    # Fail loudly instead of reporting success when the folder is not in the archive.
    if not members_to_extract:
        raise FileNotFoundError(
            f'No entries under "{folder_to_extract}" found in "{zip_file_name}"'
        )

    # extractall creates the intermediate directories and writes each member
    # below destination_directory, replacing the manual makedirs/read/write loop.
    zip_ref.extractall(destination_directory, members=members_to_extract)

print(f'Extracted {folder_to_extract} to "{destination_directory}"')

Contents of the zip file: ['adapters/', 'adapters/BioBERT/', 'adapters/BioBERT/S20Rel_EP/', 'adapters/BioBERT/S20Rel_EP/biobert_S20Rel_EP_partition_0_epoch_2_2024-07-02 23:35:56/', 'adapters/BioBERT/S20Rel_EP/biobert_S20Rel_EP_partition_0_epoch_2_2024-07-02 23:35:56/adapter_config.json', 'adapters/BioBERT/S20Rel_EP/biobert_S20Rel_EP_partition_0_epoch_2_2024-07-02 23:35:56/head_config.json', 'adapters/BioBERT/S20Rel_EP/biobert_S20Rel_EP_partition_0_epoch_2_2024-07-02 23:35:56/pytorch_adapter.bin', 'adapters/BioBERT/S20Rel_EP/biobert_S20Rel_EP_partition_0_epoch_2_2024-07-02 23:35:56/pytorch_model_head.bin', 'adapters/BioBERT/S20Rel_EP/biobert_S20Rel_EP_partition_10_epoch_2_2024-07-03 00:51:36/', 'adapters/BioBERT/S20Rel_EP/biobert_S20Rel_EP_partition_10_epoch_2_2024-07-03 00:51:36/adapter_config.json', 'adapters/BioBERT/S20Rel_EP/biobert_S20Rel_EP_partition_10_epoch_2_2024-07-03 00:51:36/head_config.json', 'adapters/BioBERT/S20Rel_EP/biobert_S20Rel_EP_partition_10_epoch_2_2024-07-03 00:5

#### Load pre-trained Adapters and Train Fusion


In [None]:
from transformers import BertConfig
from adapters.composition import Fuse

# Map class index -> label name, taken from the encoded training labels.
id2label = dict(enumerate(train.features["labels"].names))

# Load the configuration from the pretrained model and update it with your id2label mapping
config = BertConfig.from_pretrained(
    "microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext",
    id2label=id2label
)

# Plain transformers classification model; adapters.init() is then applied so
# load_adapter / add_adapter_fusion can be called on it below.
model = AutoModelForSequenceClassification.from_pretrained("microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext", config=config)
adapters.init(model)

adapters_dir = "/content/adapters/PubMedBERT"
final_adapters = []

# Layout walked here: <adapters_dir>/<group>/<adapter folder>/adapter_config.json
for adapter_dir in os.listdir(adapters_dir):
    adapter_path = os.path.join(adapters_dir, adapter_dir)
    if not os.path.isdir(adapter_path):
        continue

    for sub_dir in os.listdir(adapter_path):
        sub_dir_path = os.path.join(adapter_path, sub_dir)
        config_path = os.path.join(sub_dir_path, "adapter_config.json")

        if not (os.path.isdir(sub_dir_path) and os.path.isfile(config_path)):
            continue

        try:
            # Extract partition number from a folder name containing "partition_<n>_"
            partition_info = sub_dir.split("partition_")[1].split("_")[0]
            adapter_name = f"partition_{partition_info}"
            # Load adapter (without its prediction head)
            model.load_adapter(sub_dir_path, config=config_path, load_as=adapter_name, with_head=False)
            final_adapters.append(adapter_name)
            print(f"Loaded adapter: {adapter_name}")
        except Exception as e:
            # Best effort: report and continue; the fusion below only uses adapters that loaded.
            print(f"Error loading adapter from {sub_dir_path}: {e}")

if not final_adapters:
    raise RuntimeError(f"No adapters could be loaded from {adapters_dir}")


def _partition_sort_key(name):
    """Order adapter names by numeric partition index (non-numeric suffixes last)."""
    suffix = name.split("partition_", 1)[1]
    return (0, int(suffix), "") if suffix.isdigit() else (1, 0, suffix)


# Fuse exactly the adapters that were loaded, rather than assuming partitions 0..19
# all exist. os.listdir order is arbitrary, so sort for a deterministic fusion;
# with all twenty partitions present this equals partition_0 ... partition_19.
final_adapters = sorted(set(final_adapters), key=_partition_sort_key)
fusion = Fuse(*final_adapters)

model.add_adapter_fusion(fusion)
model.set_active_adapters(fusion)

# Unfreeze and activate fusion setup (only fusion layer and head will be trained)
model.train_adapter_fusion(fusion)

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loaded adapter: partition_2
Loaded adapter: partition_7
Loaded adapter: partition_0
Loaded adapter: partition_6
Loaded adapter: partition_10
Loaded adapter: partition_16
Loaded adapter: partition_17
Loaded adapter: partition_8
Loaded adapter: partition_11
Loaded adapter: partition_9
Loaded adapter: partition_13
Loaded adapter: partition_1
Loaded adapter: partition_18
Loaded adapter: partition_14
Loaded adapter: partition_15
Loaded adapter: partition_12
Loaded adapter: partition_4
Loaded adapter: partition_3
Loaded adapter: partition_5
Loaded adapter: partition_19


In [None]:
# adapter_summary() returns the summary table as a string; print it so the line
# breaks render instead of showing the escaped repr of the string.
print(model.adapter_summary())



In [None]:
# Use the GPU when torch can see one, otherwise stay on the CPU, and move the model there.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
model.to(device)
print(f"device:{device}")

device:cuda


### Set all Training and Evaluation Arguments - currently as close as possible to MOP

In [None]:
import math

# Hyperparameters shared by the optimizer, the scheduler and the Trainer.
# NOTE(review): the trailing note suggests 5e-5 was also considered; the logged run
# ends at a training loss of ~1.08 on three classes — confirm which value is intended.
args= {"lr": 5e-06, "batch_size": 12, "epochs": 1, "warmup_proportion": 0.1, "gradient_accumulation_steps": 1} # lr: 5e-5

# Only parameters left trainable by train_adapter_fusion need optimizer state;
# frozen weights never receive gradients.
param_optimizer = [(n, p) for n, p in model.named_parameters() if p.requires_grad]
no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]

# Number of optimizer updates the scheduler spans. A trailing partial batch still
# counts as a batch (ceil), so the schedule does not hit zero before training ends.
# Assumes a single device — TODO confirm if run on multiple GPUs.
batches_per_epoch = math.ceil(len(train) / args["batch_size"])
updates_per_epoch = max(batches_per_epoch // args["gradient_accumulation_steps"], 1)
num_train_optimization_steps = updates_per_epoch * args["epochs"]


# Weight decay is applied to everything except biases and LayerNorm parameters.
optimizer_grouped_parameters = [
            {
                "params": [
                    p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
                ],
                "weight_decay": 0.01,
            },
            {
                "params": [
                    p for n, p in param_optimizer if any(nd in n for nd in no_decay)
                ],
                "weight_decay": 0.0,
            },
        ]

optimizer = AdamW(
            optimizer_grouped_parameters,
            lr=args["lr"],
            weight_decay=0.01,
            correct_bias=False,
        )
# Linear warmup over the first warmup_proportion of the updates, then linear decay.
scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_training_steps=num_train_optimization_steps,
            num_warmup_steps=args["warmup_proportion"] * num_train_optimization_steps,
        )




In [None]:
from transformers import TrainingArguments, EvalPrediction, Trainer
import evaluate

# Accuracy metric used by compute_metrics below.
metric = evaluate.load("accuracy")
# Alternative kept for reference: accuracy, F1, precision and recall combined.
# metric = evaluate.combine(["accuracy", "f1", "precision", "recall"])

def compute_metrics(eval_pred):
    """Score predictions with the notebook-level ``metric``.

    ``eval_pred`` unpacks into ``(logits, labels)``; the predicted class is the
    argmax over the last logits axis.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=-1)
    return metric.compute(references=references, predictions=predicted_ids)

training_args = TrainingArguments(
    learning_rate=args["lr"],
    num_train_epochs=args["epochs"],
    per_device_train_batch_size=args["batch_size"],
    per_device_eval_batch_size=args["batch_size"],
    # Keep the Trainer in step with the value used to size the LR schedule;
    # previously this setting only affected the scheduler length.
    gradient_accumulation_steps=args["gradient_accumulation_steps"],
    logging_steps=200,
    output_dir="./training_output",
    overwrite_output_dir=True,
    # The next line is important to ensure the dataset labels are properly passed to the model
    remove_unused_columns=False,
)

# The optimizer/scheduler pair built earlier is passed in explicitly via `optimizers`.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train,
    eval_dataset=eval,
    compute_metrics=compute_metrics,
    optimizers=(optimizer, scheduler),
)

In [None]:
# Start a Weights & Biases run in offline mode.
# NOTE(review): presumably this keeps logs local instead of uploading them — confirm.
wandb.init(anonymous="allow", mode="offline") #"dryrun")

In [None]:
# Run training with the Trainer configured above.
trainer.train()

Step,Training Loss
200,1.107
400,1.0944
600,1.0738
800,1.06


TrainOutput(global_step=936, training_loss=1.0796246976933928, metrics={'train_runtime': 1263.6664, 'train_samples_per_second': 8.888, 'train_steps_per_second': 0.741, 'total_flos': 1728578052115200.0, 'train_loss': 1.0796246976933928, 'epoch': 1.0})

In [None]:
# Evaluate on the dataset passed to the Trainer as eval_dataset (the MedNLI dev split).
trainer.evaluate()

{'eval_loss': 1.0220168828964233,
 'eval_accuracy': 0.5268817204301075,
 'eval_runtime': 66.3318,
 'eval_samples_per_second': 21.031,
 'eval_steps_per_second': 1.764}

### Testing

In [None]:
# Take ten consecutive test rows (positions 100-109) as a quick sanity sample.
index_range = range(100, 110)
sample_rows = test_df.iloc[index_range]
a = sample_rows["text_a"].to_list()
b = sample_rows["text_b"].to_list()
l = sample_rows["label"]
# Display the gold labels of the sample.
l

100    contradiction
101          neutral
102       entailment
103    contradiction
104          neutral
105       entailment
106    contradiction
107          neutral
108       entailment
109    contradiction
Name: label, dtype: object

In [None]:
def predict(text_a, text_b):
  """Predict label names for one or more sentence pairs with the notebook-level model.

  Args:
    text_a: First sentence(s) of each pair (a string or a list of strings).
    text_b: Second sentence(s) of each pair, aligned with ``text_a``.

  Returns:
    A list with one label name (looked up in ``id2label``) per input pair.
  """
  encoded = tokenizer(text_a, text_b, return_tensors="pt", max_length=180, truncation=True, padding="max_length")
  # Follow the model's actual device instead of assuming it is on CUDA whenever a GPU exists.
  encoded = encoded.to(model.device)

  # Inference only: disable dropout and gradient tracking, then restore the previous mode.
  was_training = model.training
  model.eval()
  try:
    with torch.no_grad():
      logits = model(**encoded)[0]
  finally:
    model.train(was_training)

  pred_class = torch.argmax(logits, dim=1).tolist()
  return [id2label[idx] for idx in pred_class]

predict(a, b)

['entailment',
 'entailment',
 'entailment',
 'entailment',
 'entailment',
 'entailment',
 'entailment',
 'contradiction',
 'entailment',
 'entailment']

In [None]:
import gc
# Run a garbage-collection pass first, then empty torch's CUDA cache.
gc.collect()
torch.cuda.empty_cache()