In [1]:
# mount your drive

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# This notebook can be used to fuse pretrained adapters to train their fusion on a downstream task with BioASQ

In [2]:
!pip install -U wandb
!pip install -U pymetis
!pip install -U transformers[torch]==4.40.2
!pip install -U adapters
!pip install -U datasets
!pip install -U evaluate

Collecting wandb
  Downloading wandb-0.17.4-py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m19.9 MB/s[0m eta [36m0:00:00[0m
Collecting docker-pycreds>=0.4.0 (from wandb)
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)
Collecting gitpython!=3.1.29,>=1.0.0 (from wandb)
  Downloading GitPython-3.1.43-py3-none-any.whl (207 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.3/207.3 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
Collecting sentry-sdk>=1.0.0 (from wandb)
  Downloading sentry_sdk-2.7.1-py2.py3-none-any.whl (300 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m300.2/300.2 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting setproctitle (from wandb)
  Downloading setproctitle-1.3.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86

In [3]:
import numpy as np
import wandb
import torch
import json
import pandas as pd
from os import listdir
import os

import adapters
from adapters import AdapterTrainer
from adapters import AutoAdapterModel, AdapterFusionConfig, AdapterConfig
from adapters.composition import Fuse
from transformers import AutoConfig, AutoTokenizer, AutoModelForSequenceClassification

from transformers import get_linear_schedule_with_warmup, AdamW
from transformers import TrainingArguments, EvalPrediction, Trainer, default_data_collator
import datasets

### Load BioASQ from prerpocessed folder

In [4]:
data_dir = "/content/drive/MyDrive/Data/HoC/"

df_train = pd.read_csv(os.path.join(data_dir, "train.tsv"), sep="\t")
df_dev = pd.read_csv(os.path.join(data_dir, "dev.tsv"), sep="\t")
df_test = pd.read_csv(os.path.join(data_dir, "test.tsv"), sep="\t")

In [5]:
def extract_true_labels(label_str):
    true_labels = []
    pairs = label_str.split(',')
    for pair in pairs:
        label, value = pair.split('_')
        true_labels.append(float(value))
    return true_labels

In [6]:
df_train['labels'] = df_train['labels'].apply(extract_true_labels)
df_dev['labels'] = df_dev['labels'].apply(extract_true_labels)
df_test['labels'] = df_test['labels'].apply(extract_true_labels)
df_train

Unnamed: 0,labels,sentence,index
0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",Hypoxic events frequently occur in the aquatic...,22239943_s0
1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",Only a few studies are however available on th...,22239943_s1
2,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","To elucidate the phenomenon , mirror carp Cypr...",22239943_s2
3,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",( 16.13-16.22 g ) were exposed chronically to ...,22239943_s3
4,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",Level of oxidative DNA damage ( as determined ...,22239943_s4
...,...,...,...
12114,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","Within three-dimensional fibrin gels , specifi...",12549857_s4
12115,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",Tube formation by primary endothelial cells an...,12549857_s5
12116,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...",Both cell types produced FGF-2 and VEGF cytoki...,12549857_s6
12117,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...",Increasing doses of suramin significantly decr...,12549857_s7


#### Convert to usable train and eval sets with corresponding Tokenizer

In [68]:
train = datasets.Dataset.from_pandas(df_train)
eval = datasets.Dataset.from_pandas(df_dev)

In [69]:
from transformers import BertTokenizer

labels = [x for x in range(10)]

tokenizer = AutoTokenizer.from_pretrained("microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext")

def encode_batch(batch):
  """Encodes a batch of input data using the model tokenizer."""
  return tokenizer(
      batch["sentence"],
      max_length=180,
      truncation=True,
      padding="max_length"
  )

train = train.map(encode_batch, batched=True)
eval = eval.map(encode_batch, batched=True)

train.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
eval.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])



Map:   0%|          | 0/12119 [00:00<?, ? examples/s]

Map:   0%|          | 0/1798 [00:00<?, ? examples/s]

Map:   0%|          | 0/12 [00:00<?, ? examples/s]

In [49]:
train[:1]

{'labels': tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]]),
 'input_ids': tensor([[    2, 11277,  3916,  5467,  4704,  1922,  1920, 15849,  3515,  1922,
           3279,  1956,  2703, 16826,    16,  2710,  7881, 11570,    18,     3,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,

## Pretrained Adapters and Base Model

In [10]:
import zipfile

zip_file_name = "/content/drive/MyDrive/Adapters/adapters.zip"
folder_to_extract = "adapters/PubMedBERT/S20Rel_EP/"
destination_directory = "/content/"

with zipfile.ZipFile(zip_file_name, 'r') as zip_ref:
    zip_contents = zip_ref.namelist()
    print("Contents of the zip file:", zip_contents)

with zipfile.ZipFile(zip_file_name, 'r') as zip_ref:
    for member in zip_contents:
        if member.startswith(folder_to_extract):
            # Construct the full path for extraction
            member_destination_path = os.path.join(destination_directory, member)
            # Check if the member is a directory
            if member.endswith('/'):
                # Create the directory if it does not exist
                os.makedirs(member_destination_path, exist_ok=True)
            else:
                # Create the necessary directories in the destination path
                os.makedirs(os.path.dirname(member_destination_path), exist_ok=True)
                # Extract the file
                with zip_ref.open(member) as source_file:
                    with open(member_destination_path, 'wb') as target_file:
                        target_file.write(source_file.read())
print(f'Extracted {folder_to_extract} to "{destination_directory}"')

Contents of the zip file: ['adapters/', 'adapters/BioBERT/', 'adapters/BioBERT/S20Rel_EP/', 'adapters/BioBERT/S20Rel_EP/biobert_S20Rel_EP_partition_0_epoch_2_2024-07-02 23:35:56/', 'adapters/BioBERT/S20Rel_EP/biobert_S20Rel_EP_partition_0_epoch_2_2024-07-02 23:35:56/adapter_config.json', 'adapters/BioBERT/S20Rel_EP/biobert_S20Rel_EP_partition_0_epoch_2_2024-07-02 23:35:56/head_config.json', 'adapters/BioBERT/S20Rel_EP/biobert_S20Rel_EP_partition_0_epoch_2_2024-07-02 23:35:56/pytorch_adapter.bin', 'adapters/BioBERT/S20Rel_EP/biobert_S20Rel_EP_partition_0_epoch_2_2024-07-02 23:35:56/pytorch_model_head.bin', 'adapters/BioBERT/S20Rel_EP/biobert_S20Rel_EP_partition_10_epoch_2_2024-07-03 00:51:36/', 'adapters/BioBERT/S20Rel_EP/biobert_S20Rel_EP_partition_10_epoch_2_2024-07-03 00:51:36/adapter_config.json', 'adapters/BioBERT/S20Rel_EP/biobert_S20Rel_EP_partition_10_epoch_2_2024-07-03 00:51:36/head_config.json', 'adapters/BioBERT/S20Rel_EP/biobert_S20Rel_EP_partition_10_epoch_2_2024-07-03 00:5

#### Load pre-trained Adapters and Train Fusion


In [31]:
from transformers import BertConfig
from adapters.composition import Fuse

#id2label = {id: label for (id, label) in enumerate(train.features["labels"].names)}

# Load the configuration from the pretrained model and update it with your id2label mapping
config = BertConfig.from_pretrained(
    "microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext",
    #id2label=id2label,
    problem_type = "multi_label_classification",
    num_labels=10
)

# AutoAdapterModel
model = AutoModelForSequenceClassification.from_pretrained("microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext", config=config)
adapters.init(model)

adapters_dir = "/content/adapters/PubMedBERT"
final_adapters = []
for adapter_dir in os.listdir(adapters_dir):
    adapter_path = os.path.join(adapters_dir, adapter_dir)
    config_path = os.path.join(adapter_path, "adapter_config.json")

    if os.path.isdir(adapter_path):
        for sub_dir in os.listdir(adapter_path):
            sub_dir_path = os.path.join(adapter_path, sub_dir)
            config_path = os.path.join(sub_dir_path, "adapter_config.json")

            if os.path.isdir(sub_dir_path) and os.path.isfile(config_path):
                try:
                    # Extract partition number
                    partition_info = sub_dir.split("partition_")[1].split("_")[0]
                    adapter_name = f"partition_{partition_info}"
                    # Load adapter
                    model.load_adapter(sub_dir_path, config=config_path, load_as=adapter_name, with_head=False)
                    final_adapters.append(adapter_name)
                    print(f"Loaded adapter: {adapter_name}")
                except Exception as e:
                    print(f"Error loading adapter from {sub_dir_path}: {e}")

fusion = Fuse(*[f"partition_{x}" for x in range(20)])

model.add_adapter_fusion(fusion)
model.set_active_adapters(fusion)

# Unfreeze and activate fusion setup (only fusion layer and head will be trained)
model.train_adapter_fusion(fusion)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loaded adapter: partition_2
Loaded adapter: partition_7
Loaded adapter: partition_0
Loaded adapter: partition_6
Loaded adapter: partition_10
Loaded adapter: partition_16
Loaded adapter: partition_17
Loaded adapter: partition_8
Loaded adapter: partition_11
Loaded adapter: partition_9
Loaded adapter: partition_13
Loaded adapter: partition_1
Loaded adapter: partition_18
Loaded adapter: partition_14
Loaded adapter: partition_15
Loaded adapter: partition_12
Loaded adapter: partition_4
Loaded adapter: partition_3
Loaded adapter: partition_5
Loaded adapter: partition_19


In [12]:
model.adapter_summary()



In [32]:
device = torch.device(
        "cuda" if torch.cuda.is_available() else "cpu"
    )
model.to(device)
print(f"device:{device}")

device:cuda


### Set all training en Evaluation Arguments - currently as close as possible to MOP

In [49]:
args= {"lr": 5e-06, "batch_size": 12, "epochs": 1, "warmup_proportion": 0.1, "gradient_accumulation_steps": 1} # lr: 5e-5

param_optimizer = list(model.named_parameters())
no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]

num_train_optimization_steps = None
num_train_optimization_steps = (
            int(
                len(train) / args["batch_size"] / args["gradient_accumulation_steps"]
            )
            * args["epochs"]
        )


optimizer_grouped_parameters = [
            {
                "params": [
                    p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
                ],
                "weight_decay": 0.01,
            },
            {
                "params": [
                    p for n, p in param_optimizer if any(nd in n for nd in no_decay)
                ],
                "weight_decay": 0.0,
            },
        ]

optimizer = AdamW(
            optimizer_grouped_parameters,
            lr=args["lr"],
            weight_decay=0.01,
            correct_bias=False,
        )
scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_training_steps=num_train_optimization_steps,
            num_warmup_steps=args["warmup_proportion"] * num_train_optimization_steps,
        )




In [76]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from transformers import TrainingArguments, EvalPrediction, Trainer

def compute_metrics(eval_pred, threshold=0.5):
    logits, labels = eval_pred

    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(logits))

    predictions = np.zeros(probs.shape)
    predictions[np.where(probs >= threshold)] = 1
    print("predictions", predictions)

    accuracy = accuracy_score(labels, predictions)
    f1 = f1_score(y_true=labels, y_pred=predictions, average="macro")
    precision = precision_score(y_true=labels, y_pred=predictions, average="macro")
    recall = recall_score(y_true=labels, y_pred=predictions, average="macro")

    return {"accuracy": accuracy, "f1": f1, "precision": precision, "recall": recall}

training_args = TrainingArguments(
    learning_rate=args["lr"],
    num_train_epochs=args["epochs"],
    per_device_train_batch_size=args["batch_size"],
    per_device_eval_batch_size=args["batch_size"],
    logging_steps=200,
    output_dir="./training_output",
    overwrite_output_dir=True,
    # The next line is important to ensure the dataset labels are properly passed to the model
    remove_unused_columns=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train,
    eval_dataset=eval,
    compute_metrics=compute_metrics,
    optimizers=(optimizer, scheduler),
)

In [57]:
wandb.init(anonymous="allow", mode="offline") #"dryrun")

In [70]:
trainer.train()

Step,Training Loss
200,0.4541
400,0.1828
600,0.1567
800,0.1493
1000,0.1475


TrainOutput(global_step=1010, training_loss=0.21752162428185493, metrics={'train_runtime': 1400.1387, 'train_samples_per_second': 8.656, 'train_steps_per_second': 0.721, 'total_flos': 1865155695316560.0, 'train_loss': 0.21752162428185493, 'epoch': 1.0})

In [72]:
#trainer.evaluate(eval_small)
trainer.evaluate()

  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.5431147217750549,
 'eval_accuracy': 0.4166666666666667,
 'eval_f1': 0.0,
 'eval_precision': 0.0,
 'eval_recall': 0.0,
 'eval_runtime': 21.4604,
 'eval_samples_per_second': 0.559,
 'eval_steps_per_second': 0.047}

In [67]:
import gc
gc.collect()
torch.cuda.empty_cache()

In [61]:
!nvidia-smi

/bin/bash: line 1: nvidia-smi: command not found
