In [48]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import torch
from tqdm import tqdm 
from datasets import load_dataset
from sklearn.metrics import accuracy_score
import logging
from transformers import (
    RobertaTokenizerFast,
    RobertaForSequenceClassification,
    TrainingArguments,
    Trainer,
    AutoModel,
    AutoConfig,
)
from peft import get_peft_config, PeftModel, PeftConfig, get_peft_model, LoraConfig, TaskType
import evaluate

print("total GPU memory: %f GB" % (torch.cuda.mem_get_info()[1] / 1024 ** 3))
print("available GPU memory %f GB" % (torch.cuda.mem_get_info()[0] / 1024 ** 3))
device = "cuda" if torch.cuda.is_available() else "cpu"
if device == "cpu":
    logging.warning("No GPU available, using CPU will be super slow")

total GPU memory: 47.544312 GB
available GPU memory 2.184448 GB


### Load dataset, model, and tokenizer

In [49]:
dataset_id = "ag_news"
model_id = "roberta-base"
save_dir = "./ckpt/lora_roberta_agnews"

dataset = load_dataset(dataset_id)
train_dataset = dataset['train']
test_dataset = dataset["test"].shard(num_shards=2, index=0)
val_dataset = dataset["test"].shard(num_shards=2, index=1)

In [50]:
tokenizer = RobertaTokenizerFast.from_pretrained(model_id)

def tokenize(batch):
    return tokenizer(batch["text"], padding=True, truncation=True, max_length=256, return_tensors="pt")

train_dataset = train_dataset.map(tokenize, batched=True, batch_size=len(train_dataset))
val_dataset = val_dataset.map(tokenize, batched=True, batch_size=len(val_dataset))
test_dataset = test_dataset.map(tokenize, batched=True, batch_size=len(test_dataset))

train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])
val_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])
test_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])

In [51]:
num_labels = dataset['train'].features['label'].num_classes
class_names = dataset["train"].features["label"].names

print(f"number of labels: {num_labels}")
print(f"the labels: {class_names}")

number of labels: 4
the labels: ['World', 'Sports', 'Business', 'Sci/Tech']


In [52]:
id2label = {i: label for i, label in enumerate(class_names)}
config = AutoConfig.from_pretrained(model_id)
config.update({"id2label": id2label})
config

RobertaConfig {
  "_name_or_path": "roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "World",
    "1": "Sports",
    "2": "Business",
    "3": "Sci/Tech"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.33.2",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

In [53]:
roberta_model = RobertaForSequenceClassification.from_pretrained(model_id, config=config).to(device)
peft_config = LoraConfig(
    task_type=TaskType.TOKEN_CLS, inference_mode=False, r=2, lora_alpha=2, lora_dropout=0.1, bias="all",
)
roberta_model = get_peft_model(roberta_model, peft_config)
roberta_model.print_trainable_parameters()
peft_config.target_modules

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 1,363,208 || all params: 125,316,104 || trainable%: 1.0878154973601797


['query', 'value']

In [47]:
# check param count and mem usage 
# too small for LoRA to save training time
mem_params = sum([param.nelement()*param.element_size() for param in roberta_model.parameters()])
mem_bufs = sum([buf.nelement()*buf.element_size() for buf in roberta_model.buffers()])
mem = mem_params + mem_bufs # in bytes
param_count = sum([param.nelement() for param in roberta_model.parameters()])
print(f"Model size = {mem/1024**2:.2f} MB")
print(f"Number of parameters = {param_count/1e6:.2f} M")

Model size = 475.50 MB
Number of parameters = 124.65 M


### Evaluate performance before fine-tuning

In [35]:
with torch.autocast(device):
    roberta_predictions = []
    for i in tqdm(range(len(test_dataset["text"])), total=len(test_dataset["text"])):
        test_input = tokenizer(test_dataset["text"][i], return_tensors="pt").to(device)
        with torch.no_grad():
            logits = roberta_model(**test_input).logits
        predicted_class_id = logits.argmax().item()
        roberta_predictions.append(predicted_class_id)

100%|██████████| 3800/3800 [00:46<00:00, 81.44it/s]


In [21]:
print("raw roberta accuracy: ", round(accuracy_score(test_dataset["label"], roberta_predictions), 3))

raw roberta accuracy:  0.249


### Fine-tune Roberta

In [31]:
training_args = TrainingArguments(
    output_dir=save_dir,
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=180,
    per_device_eval_batch_size=380,
    learning_rate=0.005,
    weight_decay=0.01,
    logging_strategy="steps",
    warmup_steps=50,
    eval_steps=100,
    logging_steps=100,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True
)

In [32]:
trainer = Trainer(
    model=roberta_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

In [33]:
with torch.autocast(device, cache_enabled=False):
    trainer.train()

Epoch,Training Loss,Validation Loss
1,0.2508,0.223473


In [34]:
trainer.evaluate()

{'eval_loss': 0.223463237285614,
 'eval_runtime': 12.8218,
 'eval_samples_per_second': 296.37,
 'eval_steps_per_second': 0.78,
 'epoch': 1.0}

In [37]:
finetuned_roberta_outputs = trainer.predict(test_dataset)

In [38]:
finetuned_roberta_predictions = finetuned_roberta_outputs[1]

In [39]:
print("fine-tuned roberta accuracy: ", round(accuracy_score(test_dataset["label"], finetuned_roberta_predictions), 3))

fine-tuned roberta accuracy:  1.0


### Load fine-tuned Roberta

In [46]:
finetuned_model = RobertaForSequenceClassification.from_pretrained(model_id, config=config).to(device)
lora_weights = save_dir + "/checkpoint-667" # 667 training steps
finetuned_model = PeftModel.from_pretrained(
    finetuned_model, 
    lora_weights, 
    device_map="auto",
    offload_folder="offload", 
)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
