In [1]:
import torch
from torch.optim import AdamW
from torch.utils.data import DataLoader
from peft import (
    get_peft_model,
    VeraConfig,
    PeftType,
)

import evaluate
from datasets import load_dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer, get_linear_schedule_with_warmup, set_seed, AutoConfig
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Run configuration: everything a reader might want to tune lives in this cell.
seed = 0
# Seed the random number generators up front so that the classification-head
# initialisation and the training shuffle are repeatable across "Restart & Run All".
set_seed(seed)

batch_size = 256
model_name_or_path = "roberta-base"
task = "mrpc"  # GLUE subset; used for both the dataset and the metric
peft_type = PeftType.VERA
# Prefer the GPU, but fall back to the CPU so the notebook still runs without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"
num_epochs = 30
max_length = 128  # truncation length handed to the tokenizer

In [3]:
# Learning rates for the two optimizer parameter groups:
# the classification head and the VeRA scaling vectors.
head_lr = 2e-3
vera_lr = 2e-3

# VeRA adapter configuration (see the PEFT VeraConfig documentation for the
# exact meaning of each field).
peft_config = VeraConfig(
    r=512,
    d_initial=0.1,
    projection_prng_key=0xABC,  # fixed key so the random projections are reproducible
    save_projection=True,
    target_modules=["query", "value"],  # adapt the attention query/value linears only
    task_type="SEQ_CLS",
    inference_mode=False,
)

In [4]:
# Checkpoints whose name contains "gpt", "opt" or "bloom" are padded on the left;
# every other checkpoint is padded on the right.
padding_side = "left" if any(k in model_name_or_path for k in ("gpt", "opt", "bloom")) else "right"

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, padding_side=padding_side)
# Fall back to the end-of-sequence token when the tokenizer defines no pad token.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

# The same (benchmark, task) pair selects both the data and its matching metric.
datasets = load_dataset("glue", task)
metric = evaluate.load("glue", task)


def tokenize_function(examples):
    """Tokenize a batch of sentence pairs.

    Reads the "sentence1" and "sentence2" columns of ``examples`` and encodes
    them as pairs with the module-level ``tokenizer``. Truncation is enabled and
    bounded by the module-level ``max_length``; no padding argument is passed here.

    Returns whatever the tokenizer call returns.
    """
    return tokenizer(
        examples["sentence1"],
        examples["sentence2"],
        truncation=True,
        max_length=max_length,
    )


# Tokenize every split in batches and drop the raw text/index columns, then rename
# the 'label' column to 'labels', which is the name the transformers models expect
# for their targets.
tokenized_datasets = datasets.map(
    tokenize_function,
    batched=True,
    remove_columns=["idx", "sentence1", "sentence2"],
).rename_column("label", "labels")


def collate_fn(examples):
    """Collate tokenized examples into one batch.

    Delegates to ``tokenizer.pad`` with ``padding="longest"`` and
    ``return_tensors="pt"`` and returns its result unchanged.
    """
    padded_batch = tokenizer.pad(examples, padding="longest", return_tensors="pt")
    return padded_batch


# Build the dataloaders. Only the training split is shuffled; both splits are
# batched through collate_fn.
train_dataloader = DataLoader(
    tokenized_datasets["train"],
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collate_fn,
)
eval_dataloader = DataLoader(
    tokenized_datasets["validation"],
    batch_size=batch_size,
    shuffle=False,
    collate_fn=collate_fn,
)

Map: 100%|██████████| 1725/1725 [00:00<00:00, 27854.91 examples/s]


In [6]:
# Load the plain base model here so this inspection cell also works on a fresh
# kernel (previously it relied on `model` already existing from an out-of-order
# run). The adapter-wrapped model is built in the following cell.
model = AutoModelForSequenceClassification.from_pretrained(model_name_or_path, return_dict=True, max_length=None)
model

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [5]:
# Load the base checkpoint with a sequence-classification head.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name_or_path,
    return_dict=True,
    max_length=None,
)
# Wrap it with the adapters described by peft_config, then report how many
# parameters remain trainable.
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()
# Last expression: display the wrapped module tree.
model

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 622,850 || all params: 125,270,020 || trainable%: 0.49720595558298786


PeftModelForSequenceClassification(
  (base_model): VeraModel(
    (model): RobertaForSequenceClassification(
      (roberta): RobertaModel(
        (embeddings): RobertaEmbeddings(
          (word_embeddings): Embedding(50265, 768, padding_idx=1)
          (position_embeddings): Embedding(514, 768, padding_idx=1)
          (token_type_embeddings): Embedding(1, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): RobertaEncoder(
          (layer): ModuleList(
            (0-11): 12 x RobertaLayer(
              (attention): RobertaAttention(
                (self): RobertaSelfAttention(
                  (query): Linear(
                    in_features=768, out_features=768, bias=True
                    (vera_dropout): ModuleDict(
                      (default): Identity()
                    )
                    (vera_lambda_b): ParameterDict(  (default): Parameter containin

In [6]:
# Two parameter groups with their own learning rates: the VeRA scaling vectors
# and the classification head.
# The requires_grad filter keeps frozen tensors out of the optimizer. The name
# match on "classifier" alone can also pick up non-trainable parameters
# (NOTE(review): the parameter counts printed above suggest PEFT keeps a frozen
# copy of the original head next to the trainable one — confirm).
optimizer = AdamW(
    [
        {
            "params": [p for n, p in model.named_parameters() if p.requires_grad and "vera_lambda_" in n],
            "lr": vera_lr,
        },
        {
            "params": [p for n, p in model.named_parameters() if p.requires_grad and "classifier" in n],
            "lr": head_lr,
        },
    ]
)

# Instantiate scheduler: warm up over the first 6% of the optimizer steps.
# One optimizer step is taken per training batch, so the total is batches * epochs.
num_training_steps = len(train_dataloader) * num_epochs
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    # The warmup length is a step count, so pass a whole number rather than a float.
    num_warmup_steps=round(0.06 * num_training_steps),
    num_training_steps=num_training_steps,
)

In [7]:
model.to(device)
for epoch in range(num_epochs):
    # ---- Training pass: one optimizer and scheduler step per batch ----
    model.train()
    for batch in tqdm(train_dataloader):
        batch = batch.to(device)
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

    # ---- Evaluation pass on the validation split ----
    model.eval()
    for batch in tqdm(eval_dataloader):
        batch = batch.to(device)
        with torch.no_grad():  # no gradients are needed for scoring
            outputs = model(**batch)
        # Predicted class = index of the largest logit.
        predictions = outputs.logits.argmax(dim=-1)
        metric.add_batch(
            predictions=predictions,
            references=batch["labels"],
        )

    # compute() aggregates everything added via add_batch during this epoch.
    eval_metric = metric.compute()
    print(f"epoch {epoch}:", eval_metric)

  0%|          | 0/15 [00:00<?, ?it/s]You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
100%|██████████| 15/15 [00:10<00:00,  1.38it/s]
100%|██████████| 2/2 [00:00<00:00,  2.77it/s]


epoch 0: {'accuracy': 0.6838235294117647, 'f1': 0.8122270742358079}


100%|██████████| 15/15 [00:10<00:00,  1.43it/s]
100%|██████████| 2/2 [00:00<00:00,  2.83it/s]


epoch 1: {'accuracy': 0.6838235294117647, 'f1': 0.8122270742358079}


100%|██████████| 15/15 [00:10<00:00,  1.39it/s]
100%|██████████| 2/2 [00:00<00:00,  2.78it/s]


epoch 2: {'accuracy': 0.7058823529411765, 'f1': 0.8219584569732937}


100%|██████████| 15/15 [00:11<00:00,  1.36it/s]
100%|██████████| 2/2 [00:00<00:00,  2.79it/s]


epoch 3: {'accuracy': 0.7058823529411765, 'f1': 0.8203592814371258}


100%|██████████| 15/15 [00:10<00:00,  1.39it/s]
100%|██████████| 2/2 [00:00<00:00,  2.57it/s]


epoch 4: {'accuracy': 0.7254901960784313, 'f1': 0.8271604938271604}


100%|██████████| 15/15 [00:10<00:00,  1.37it/s]
100%|██████████| 2/2 [00:00<00:00,  2.63it/s]


epoch 5: {'accuracy': 0.7377450980392157, 'f1': 0.8298887122416534}


100%|██████████| 15/15 [00:10<00:00,  1.37it/s]
100%|██████████| 2/2 [00:00<00:00,  2.91it/s]


epoch 6: {'accuracy': 0.7377450980392157, 'f1': 0.8325508607198748}


100%|██████████| 15/15 [00:10<00:00,  1.40it/s]
100%|██████████| 2/2 [00:00<00:00,  2.66it/s]


epoch 7: {'accuracy': 0.7647058823529411, 'f1': 0.8451612903225807}


100%|██████████| 15/15 [00:11<00:00,  1.33it/s]
100%|██████████| 2/2 [00:00<00:00,  2.62it/s]


epoch 8: {'accuracy': 0.7867647058823529, 'f1': 0.8566721581548599}


100%|██████████| 15/15 [00:11<00:00,  1.36it/s]
100%|██████████| 2/2 [00:00<00:00,  2.58it/s]


epoch 9: {'accuracy': 0.7475490196078431, 'f1': 0.8408037094281298}


100%|██████████| 15/15 [00:11<00:00,  1.34it/s]
100%|██████████| 2/2 [00:00<00:00,  2.81it/s]


epoch 10: {'accuracy': 0.8014705882352942, 'f1': 0.8581436077057792}


100%|██████████| 15/15 [00:11<00:00,  1.34it/s]
100%|██████████| 2/2 [00:00<00:00,  2.72it/s]


epoch 11: {'accuracy': 0.7720588235294118, 'f1': 0.8516746411483255}


100%|██████████| 15/15 [00:11<00:00,  1.36it/s]
100%|██████████| 2/2 [00:00<00:00,  2.62it/s]


epoch 12: {'accuracy': 0.8014705882352942, 'f1': 0.8629441624365483}


100%|██████████| 15/15 [00:11<00:00,  1.34it/s]
100%|██████████| 2/2 [00:00<00:00,  2.73it/s]


epoch 13: {'accuracy': 0.8088235294117647, 'f1': 0.8686868686868686}


100%|██████████| 15/15 [00:11<00:00,  1.34it/s]
100%|██████████| 2/2 [00:00<00:00,  2.68it/s]


epoch 14: {'accuracy': 0.7990196078431373, 'f1': 0.8646864686468646}


100%|██████████| 15/15 [00:11<00:00,  1.34it/s]
100%|██████████| 2/2 [00:00<00:00,  2.60it/s]


epoch 15: {'accuracy': 0.7965686274509803, 'f1': 0.8641571194762684}


100%|██████████| 15/15 [00:11<00:00,  1.36it/s]
100%|██████████| 2/2 [00:00<00:00,  2.59it/s]


epoch 16: {'accuracy': 0.8186274509803921, 'f1': 0.8732876712328768}


100%|██████████| 15/15 [00:11<00:00,  1.32it/s]
100%|██████████| 2/2 [00:00<00:00,  2.74it/s]


epoch 17: {'accuracy': 0.8088235294117647, 'f1': 0.8691275167785235}


100%|██████████| 15/15 [00:11<00:00,  1.34it/s]
100%|██████████| 2/2 [00:00<00:00,  2.77it/s]


epoch 18: {'accuracy': 0.8431372549019608, 'f1': 0.8853046594982079}


100%|██████████| 15/15 [00:11<00:00,  1.35it/s]
100%|██████████| 2/2 [00:00<00:00,  2.52it/s]


epoch 19: {'accuracy': 0.8431372549019608, 'f1': 0.8869257950530036}


100%|██████████| 15/15 [00:11<00:00,  1.33it/s]
100%|██████████| 2/2 [00:00<00:00,  2.56it/s]


epoch 20: {'accuracy': 0.8333333333333334, 'f1': 0.8831615120274915}


100%|██████████| 15/15 [00:11<00:00,  1.32it/s]
100%|██████████| 2/2 [00:00<00:00,  2.62it/s]


epoch 21: {'accuracy': 0.8284313725490197, 'f1': 0.8801369863013697}


100%|██████████| 15/15 [00:11<00:00,  1.34it/s]
100%|██████████| 2/2 [00:00<00:00,  2.64it/s]


epoch 22: {'accuracy': 0.8504901960784313, 'f1': 0.8924162257495591}


100%|██████████| 15/15 [00:11<00:00,  1.33it/s]
100%|██████████| 2/2 [00:00<00:00,  2.69it/s]


epoch 23: {'accuracy': 0.8431372549019608, 'f1': 0.8892733564013842}


100%|██████████| 15/15 [00:11<00:00,  1.31it/s]
100%|██████████| 2/2 [00:00<00:00,  2.69it/s]


epoch 24: {'accuracy': 0.8431372549019608, 'f1': 0.8892733564013842}


100%|██████████| 15/15 [00:11<00:00,  1.32it/s]
100%|██████████| 2/2 [00:00<00:00,  2.73it/s]


epoch 25: {'accuracy': 0.8308823529411765, 'f1': 0.882051282051282}


100%|██████████| 15/15 [00:10<00:00,  1.39it/s]
100%|██████████| 2/2 [00:00<00:00,  2.58it/s]


epoch 26: {'accuracy': 0.8504901960784313, 'f1': 0.893169877408056}


100%|██████████| 15/15 [00:10<00:00,  1.36it/s]
100%|██████████| 2/2 [00:00<00:00,  2.75it/s]


epoch 27: {'accuracy': 0.8431372549019608, 'f1': 0.8892733564013842}


100%|██████████| 15/15 [00:10<00:00,  1.39it/s]
100%|██████████| 2/2 [00:00<00:00,  2.81it/s]


epoch 28: {'accuracy': 0.8455882352941176, 'f1': 0.8908145580589255}


100%|██████████| 15/15 [00:10<00:00,  1.39it/s]
100%|██████████| 2/2 [00:00<00:00,  2.70it/s]

epoch 29: {'accuracy': 0.8529411764705882, 'f1': 0.8951048951048952}





## Share adapters on the 🤗 Hub

In [8]:
# Upload the trained adapter to the Hub.
# NOTE(review): the repo id says "roberta-large", but this notebook trains on
# model_name_or_path = "roberta-base" — confirm the intended repository name.
model.push_to_hub(
    "afmck/roberta-large-peft-vera",
    use_auth_token=True,
)

adapter_model.bin: 100%|██████████| 5.66M/5.66M [00:02<00:00, 2.71MB/s]


CommitInfo(commit_url='https://huggingface.co/afmck/roberta-large-peft-vera/commit/56f759ecc228c310dc25e2647e996a2b49dfe5b1', commit_message='Upload model', commit_description='', oid='56f759ecc228c310dc25e2647e996a2b49dfe5b1', pr_url=None, pr_revision=None, pr_num=None)

## Load adapters from the Hub

You can also directly load adapters from the Hub using the commands below:

In [9]:
import torch
from peft import PeftModel, PeftConfig
from transformers import AutoTokenizer

peft_model_id = "afmck/roberta-large-peft-vera"
# The adapter config records which base checkpoint the adapter was trained on.
config = PeftConfig.from_pretrained(peft_model_id)

# Note: this rebinds the module-level `tokenizer` that collate_fn reads.
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

# Load the Vera model: instantiate the base classifier, then attach the adapter
# weights stored under peft_model_id.
inference_model = PeftModel.from_pretrained(
    AutoModelForSequenceClassification.from_pretrained(config.base_model_name_or_path),
    peft_model_id,
)

# Score the reloaded adapter on the validation split.
inference_model.to(device)
inference_model.eval()
for batch in tqdm(eval_dataloader):
    batch = batch.to(device)
    with torch.no_grad():  # no gradients are needed for scoring
        outputs = inference_model(**batch)
    # Predicted class = index of the largest logit.
    predictions = outputs.logits.argmax(dim=-1)
    metric.add_batch(
        predictions=predictions,
        references=batch["labels"],
    )

eval_metric = metric.compute()
print(eval_metric)

Downloading adapter_config.json: 100%|██████████| 534/534 [00:00<00:00, 3.83MB/s]
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Downloading adapter_model.bin: 100%|██████████| 5.66M/5.66M [00:00<00:00, 21.5MB/s]
  0%|          | 0/2 [00:00<?, ?it/s]You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
100%|██████████| 2/2 [00:00<00:00,  3.81it/s]

{'accuracy': 0.8529411764705882, 'f1': 0.8951048951048952}



