# Summary

Adapter model training and evaluation modeled after Don't Stop Pretraining paper

# Setup

In [1]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [2]:
!pip install datasets
!pip install huggingface_hub
!pip install scikit-learn
!pip install transformers[torch]

Collecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m16.6 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m16.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xxhash, dill, multiprocess, datasets
Successfully installed dataset

In [3]:
!pip install -qq adapters

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m256.0/256.0 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.2/8.2 MB[0m [31m54.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [4]:
from transformers import (
    AutoConfig,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
)
from sklearn.metrics import accuracy_score,  f1_score

from datasets import load_dataset

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


Create function for metrics for evaluation

In [5]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)

    # Calculate accuracy
    accuracy = accuracy_score(labels, preds)

   # Calculate precision, recall, and F1-score
    f1 = f1_score(labels, preds, average='macro')

    return {
        'accuracy': accuracy,
        'f1_macro': f1
    }

Load the tokenizer - Roberta Base

In [6]:
tokenizer = AutoTokenizer.from_pretrained("roberta-base")#, truncation=True, padding=True)

from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm_probability=0.15
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [7]:
tokenizer.model_max_length

512

In [8]:
from transformers import AutoConfig, AutoModelForMaskedLM

config = AutoConfig.from_pretrained("roberta-base")

# Training

## DAPT

Continue pretraining ROBERTA on a large corpus of unlabeled
domain-specific text (Reviews dataset).

In [23]:
from datasets import load_dataset, DatasetDict

# split = 'train[0:20%]' # reduce the working size to speed up iteration

# dataset_train = load_dataset("BigTMiami/amazon_25M_reviews_condensed", split='train[0:20%]')
# dataset_valid = load_dataset("BigTMiami/amazon_25M_reviews_condensed", split='dev[0:20%]')
# dataset_test = load_dataset("BigTMiami/amazon_25M_reviews_condensed", split='test[0:20%]')

# domain_dataset = load_dataset("BigTMiami/amazon_split_25M_reviews_20_percent_condensed")

domain_dataset = load_dataset("BigTMiami/amazon_split_25M_reviews_20_percent_condensed")
# # 90% train, 10% test + validation
# train_testvalid = full_dataset.train_test_split(test_size=0.1)

# # Split the 10% test + valid in half test, half valid
# test_valid = train_testvalid['test'].train_test_split(test_size=0.5)

# # gather everyone if you want to have a single DatasetDict
# domain_dataset = DatasetDict({'train': train_testvalid['train'], 'test': test_valid['test'], 'validation': test_valid['train']})

In [24]:
# Process domain dataset to be batch size
domain_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 862683
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 8360
    })
})

In [26]:

tokenizer = AutoTokenizer.from_pretrained("roberta-base", use_fast=True)#, truncation=True, padding=True)

from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm_probability=0.15
)

In [22]:
# def encode_batch(batch):
#   """Encodes a batch of input data using the model tokenizer."""
#   return tokenizer(batch, max_length=32, truncation=True, padding="max_length")

# # Encode the input data
# dataset = dataset.map(encode_batch, batched=True)

Map:   0%|          | 0/862683 [00:00<?, ? examples/s]

ValueError: text input must of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples).

In [45]:
from adapters import AutoAdapterModel
# Model
dapt_model = AutoAdapterModel.from_pretrained(
    "roberta-base",
    config=config,
)

# Add a new adapter to model - use seq bn architecture
dapt_model.add_adapter("review_adapter", config="seq_bn")

# # Add a matching classification head
# dapt_model.add_classification_head(
#     "review_adapter",
#     num_labels=2,
#     id2label={ 0: "👎", 1: "👍"}
#   )

# Activate the adapter
dapt_model.train_adapter("review_adapter")
dapt_model.set_active_adapters("review_adapter") # this is unnecessary if call train

Some weights of RobertaAdapterModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'heads.default.3.bias', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [46]:
from transformers import DataCollatorForLanguageModeling
from torch.utils.data import DataLoader

tokenizer = AutoTokenizer.from_pretrained("roberta-base")#, padding="max_length")
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm_probability=0.15
)

# lm_datasets = domain_dataset.map(
#     group_texts,
#     batched=True,
#     batch_size=32,
#     num_proc=4,
# )

# #tokenize dataset with padding and truncation
# dataset_tokenized = domain_dataset.map(tokenizer, batched=True)

# #Instantiate Pytorch DataLoader
# dl = DataLoader(dataset_tokenized, shuffle=True, collate_fn=data_collator, batch_size=32)


In [49]:
# Training
import numpy as np
from transformers import TrainingArguments, EvalPrediction
from adapters import AdapterTrainer

training_args = TrainingArguments(
    output_dir="./adapter_dapt_reviews_small",
    learning_rate=0.0005,
    num_train_epochs=1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=11, # 16*128*6
    weight_decay=0.01,
    warmup_ratio=0.06, # Paper: warmup proportion of 0.06
    adam_epsilon=1e-6, # Paper 1e-6 (huggingface default 1e-08)
    adam_beta1=0.9, # Paper: Adam weights 0.9
    adam_beta2=0.98, # Paper: Adam weights 0.98 (huggingface default  0.999)
    lr_scheduler_type="linear",
    evaluation_strategy="steps",
    eval_steps=500, # Evaluates every 50 steps
    save_strategy="steps",
    save_steps=500, # Checkpoint saves every 500 steps / every 1,000,000 reviews ( 500 * 10 * 34 * 6)
    save_total_limit=2, # Saves latest 2 checkpoints
    # torch_compile=True,  # Much Faster
    push_to_hub=True,
    hub_strategy="checkpoint", # Only pushes at end with save_model()
    logging_strategy="steps", # Is default
    logging_steps=100, # Logs training progres
)


# def compute_accuracy(p: EvalPrediction):
#   preds = np.argmax(p.predictions, axis=1)
#   return {"acc": (preds == p.label_ids).mean()}

dapt_trainer = AdapterTrainer(
    model=dapt_model,
    args=training_args,
    # tokenizer=tokenizer,
    train_dataset=domain_dataset["train"],
    eval_dataset=domain_dataset["validation"],
    data_collator=data_collator,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


In [50]:
eval_results = dapt_trainer.evaluate()
non_trained_eval_loss = eval_results["eval_loss"]
# print_gpu_utilization()
print(f"Non Trained Eval Loss: {non_trained_eval_loss:.2f}")
print(eval_results)

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Non Trained Eval Loss: 1.90
{'eval_loss': 1.8973259925842285, 'eval_runtime': 90.9474, 'eval_samples_per_second': 91.921, 'eval_steps_per_second': 2.881}


In [None]:
# TRAINING
results = dapt_trainer.train()
print(results)

TypeError: 'dict' object is not callable

In [None]:
dapt_trainer.evaluate()

In [None]:
dapt_trainer.push_to_hub("RobertaAdapter_reviews_DAPT_v1")

TAPT

In [None]:
task_dataset = load_dataset("BigTMiami/amazon_helpfulness")

Downloading readme:   0%|          | 0.00/613 [00:00<?, ?B/s]



Downloading data:   0%|          | 0.00/40.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/8.82M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.78M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating dev split: 0 examples [00:00, ? examples/s]

In [None]:
task_dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 115251
    })
    test: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 25000
    })
    dev: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 5000
    })
})

In [None]:
task_model = AutoAdapterModel.from_pretrained(
    "roberta-base",
    config=classification_config,
)

# Add a new adapter to model
task_model.add_adapter("task_review_helpfulness_adapter", config="seq_bn")

# Add a matching classification head
task_model.add_classification_head(
    "reviews_helpfulness_adapter",
    num_labels=2,
    id2label={ 0: "👎", 1: "👍"}
  )

# Activate the adapter
task_model.train_adapter("task_review_helpfulness_adapter")

Some weights of RobertaAdapterModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'heads.default.3.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Use Adapter Training Class
import numpy as np
from transformers import TrainingArguments, EvalPrediction
from adapters import AdapterTrainer

training_args = TrainingArguments(
    learning_rate=0.0001,
    adam_epsilon=1e-6,
    num_train_epochs=100,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=128, # notes on pretraining
    #logging_steps=200,
    adam_beta1 = 0.9,
    adam_beta2 = 0.98,
    weight_decay = 0.01,
    warmup_ratio = 0.06,
    # masking_probability=0.15,
    lr_scheduler_type="linear",
    output_dir="./adapter_tapt_helpfulness",
    overwrite_output_dir=True,
    # The next line is important to ensure the dataset labels are properly passed to the model
    remove_unused_columns=False,
)

def compute_accuracy(p: EvalPrediction):
  preds = np.argmax(p.predictions, axis=1)
  return {"acc": (preds == p.label_ids).mean()}

task_trainer = AdapterTrainer(
    model=task_model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=task_dataset["train"],
    eval_dataset=task_dataset["dev"],
    compute_metrics=compute_accuracy,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


In [None]:
# TRAINING
task_trainer.train()

ValueError: Expected input batch_size (16) to match target batch_size (8192).

In [None]:
# Evaluate
task_trainer.evaluate()

In [None]:
task_trainer.push_to_hub("RobertaAdapter_helpfulness_TAPT")

In [None]:
# Save adapter to hugging face
task_trainer.save_model("RobertaAdapter_helpfulnessModel")

# Evaluation

In [None]:
from transformers import TextClassificationPipeline

classifier = TextClassificationPipeline(model=task_model, tokenizer=tokenizer, device=training_args.device.index)

classifier("This is awesome!")

In [None]:
# Save trained adapter
task_model.save_adapter("./test_adapter1", "rotten_tomatoes")

!ls -lh final_adapter