In [1]:
# Install runtime dependencies into the *kernel's* environment.
# `%pip` (unlike `!pip`) always targets the interpreter this notebook runs on.
# NOTE(review): versions are unpinned because the versions used for the
# recorded run are not known from this notebook; pin them once confirmed.
%pip install -q torch safetensors



In [2]:
from lora_transformer import LoraConfig, LoraModel
from typing import Literal
import torch
from transformers import AutoTokenizer,AutoModelForSequenceClassification, get_linear_schedule_with_warmup, DataCollatorWithPadding
from datasets import load_dataset
from torch.utils.data import DataLoader
from torch.optim import AdamW
from tqdm import tqdm
from sklearn.metrics import accuracy_score

In [3]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU so
# the notebook still runs (slowly) instead of failing later at `model.to(device)`.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
# Show which device was selected before any model or tensor is moved to it.
print(str(device))

cuda


In [5]:
# --- Reproducibility -------------------------------------------------------
# Seed PyTorch's global RNG before anything stochastic happens in later cells
# (shuffled DataLoader, newly initialised classifier head).
SEED = 42
torch.manual_seed(SEED)

# --- Model / data ----------------------------------------------------------
MODEL_NAME = "roberta-base"   # checkpoint name used for both tokenizer and model
TASK = "sst2"                 # GLUE configuration passed to `load_dataset`
MAX_LEN = 512                 # `max_length` used when truncating tokenized sentences

# --- Optimisation ----------------------------------------------------------
BATCH_SIZE = 16               # used for both the train and validation DataLoaders
LR = 5e-4                     # AdamW learning rate (peak value of the warmup/linear schedule)
EPOCHS = 60                   # full passes over the training split

# --- LoRA ------------------------------------------------------------------
# These values are forwarded unchanged to `LoraConfig`; their exact semantics
# are defined by the project-local `lora_transformer` module.
RANK = 8                      # passed as `rank`
ALPHA = 16                    # passed as `alpha`
BIAS = "none"                 # passed as `bias`
DROPOUT = 0.0                 # passed as `dropout`
TARGET_MODULES = ["query", "value"]   # passed as `target_modules`
# NOTE(review): passed as `exclude_modules`; presumably keeps the classifier
# head out of LoRA wrapping/freezing -- confirm against `lora_transformer`.
EXCLUDE_MODULES = ["classifier"]

In [6]:
# Fetch the GLUE benchmark restricted to the configured task; the result holds
# the train / validation / test splits used by the cells below.
dataset = load_dataset(path="glue", name=TASK)

# Tokenizer belonging to the same checkpoint as the model loaded later.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

sst2/train-00000-of-00001.parquet:   0%|          | 0.00/3.11M [00:00<?, ?B/s]

sst2/validation-00000-of-00001.parquet:   0%|          | 0.00/72.8k [00:00<?, ?B/s]

sst2/test-00000-of-00001.parquet:   0%|          | 0.00/148k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/67349 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1821 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [7]:
def preprocess_function(examples):
    """Tokenize the ``"sentence"`` field of a batch of examples.

    Args:
        examples: Mapping with a ``"sentence"`` entry holding the raw text(s),
            as supplied by ``dataset.map(..., batched=True)``.

    Returns:
        Whatever the module-level ``tokenizer`` returns for those sentences,
        with truncation enabled and ``max_length`` set to ``MAX_LEN``.
        No padding is requested here.
    """
    sentences = examples["sentence"]
    encoded = tokenizer(sentences, truncation=True, max_length=MAX_LEN)
    return encoded

In [8]:
# Build the model-ready dataset in one chain:
#   1. tokenize every split in batches,
#   2. drop the raw text and index columns that the model does not consume,
#   3. rename "label" -> "labels", the keyword the training loop passes to the model.
tokenized_datasets = (
    dataset
    .map(preprocess_function, batched=True)
    .remove_columns(["sentence", "idx"])
    .rename_column("label", "labels")
)
# Make the remaining columns come back as torch tensors.
tokenized_datasets.set_format("torch")

Map:   0%|          | 0/67349 [00:00<?, ? examples/s]

Map:   0%|          | 0/872 [00:00<?, ? examples/s]

Map:   0%|          | 0/1821 [00:00<?, ? examples/s]

In [9]:
# Collate function built from the tokenizer; padding is applied here at batch
# time because `preprocess_function` does not pad.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Settings shared by both loaders.
loader_kwargs = dict(batch_size=BATCH_SIZE, collate_fn=data_collator)

# Training batches are reshuffled; validation keeps the dataset order.
train_dataloader = DataLoader(tokenized_datasets["train"], shuffle=True, **loader_kwargs)
eval_dataloader = DataLoader(tokenized_datasets["validation"], **loader_kwargs)

In [10]:
# Load the backbone with a 2-way sequence-classification head.
# The head weights are not part of the checkpoint and are freshly initialised
# (see the warning printed by this cell), so they must be trained.
pretrained_model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2,
)

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
# Bundle the LoRA hyper-parameters from the configuration cell into the
# config object consumed by `LoraModel`.
lora_config = LoraConfig(
    rank=RANK,
    bias=BIAS,
    alpha=ALPHA,
    dropout=DROPOUT,
    target_modules=TARGET_MODULES,
    exclude_modules=EXCLUDE_MODULES,
)


In [12]:
# Wrap the pretrained classifier according to `lora_config`. In the repr shown
# below, the attention `query` and `value` projections appear as
# `LoRALinearLayer` while `key` remains a plain `Linear`.
model = LoraModel(pretrained_model, lora_config)
# Move the wrapped model to the selected device. This is deliberately the last
# expression of the cell so the notebook displays the resulting module tree.
model.to(device)

LoraModel(
  (model): RobertaForSequenceClassification(
    (roberta): RobertaModel(
      (embeddings): RobertaEmbeddings(
        (word_embeddings): Embedding(50265, 768, padding_idx=1)
        (position_embeddings): Embedding(514, 768, padding_idx=1)
        (token_type_embeddings): Embedding(1, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): RobertaEncoder(
        (layer): ModuleList(
          (0-11): 12 x RobertaLayer(
            (attention): RobertaAttention(
              (self): RobertaSdpaSelfAttention(
                (query): LoRALinearLayer(
                  in_features=768, out_features=768, bias=True
                  (dropout): Identity()
                )
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): LoRALinearLayer(
                  in_features=768, out_features=768, bias=True
                  (dropout): 

In [13]:
def count_parameters(model):
    """Print how many parameters are trainable versus the wrapped model's total.

    Args:
        model: Object exposing ``get_n_trainable()`` (the trainable-parameter
            count) and a ``model`` attribute whose ``parameters()`` yields
            objects with ``numel()``.

    Returns:
        None. A single summary line is written to stdout.
    """
    n_trainable = model.get_n_trainable()

    # Denominator: every parameter of the wrapped (inner) model.
    n_total = 0
    for param in model.model.parameters():
        n_total += param.numel()

    pct = 100 * n_trainable / n_total
    print(f"Trainable Params: {n_trainable:,} || Total Params: {n_total:,} || %: {pct:.2f}%")

In [14]:
# Report the trainable-vs-total parameter counts of the LoRA-wrapped model
# before training starts.
count_parameters(model)

Trainable Params: 887,042 || Total Params: 124,942,082 || %: 0.71%


In [15]:
# Hand only the parameters that still require gradients to the optimizer.
trainable_params = [param for param in model.parameters() if param.requires_grad]
optimizer = AdamW(trainable_params, lr=LR)

# One scheduler step is taken per batch, so the schedule spans every batch of
# every epoch; the first 6% of those steps are warmup.
num_training_steps = EPOCHS * len(train_dataloader)
num_warmup_steps = int(0.06 * num_training_steps)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps,
)

In [16]:
def _train_one_epoch(model, dataloader, optimizer, scheduler, device, desc):
    """Run one optimisation pass over ``dataloader`` and return the mean batch loss.

    Args:
        model: Model called as ``model(**batch)``; its output must expose ``.loss``.
        dataloader: Iterable of dict batches whose values are tensors.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch, after the optimizer.
        device: Device every batch tensor is moved to.
        desc: Label shown on the progress bar.

    Returns:
        Sum of the per-batch losses divided by ``len(dataloader)``.
    """
    model.train()
    running_loss = 0.0
    progress_bar = tqdm(dataloader, desc=desc)

    for batch in progress_bar:
        batch = {k: v.to(device) for k, v in batch.items()}

        loss = model(**batch).loss
        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

        # Read the scalar once: each `.item()` call forces a host-device sync.
        batch_loss = loss.item()
        running_loss += batch_loss
        progress_bar.set_postfix({"loss": batch_loss})

    return running_loss / len(dataloader)


def _evaluate(model, dataloader, device):
    """Compute classification accuracy of ``model`` over ``dataloader``.

    Args:
        model: Model called as ``model(**batch)``; its output must expose ``.logits``.
        dataloader: Iterable of dict batches containing a ``"labels"`` tensor.
        device: Device every batch tensor is moved to.

    Returns:
        Tuple ``(accuracy, preds, labels)`` where ``preds`` and ``labels`` are
        flat lists with one entry per evaluated example.
    """
    model.eval()
    preds = []
    labels = []

    # Gradients are never needed during evaluation.
    with torch.no_grad():
        for batch in dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            logits = model(**batch).logits
            # Predicted class = index of the largest logit.
            predictions = torch.argmax(logits, dim=-1)

            preds.extend(predictions.cpu().numpy())
            labels.extend(batch["labels"].cpu().numpy())

    return accuracy_score(labels, preds), preds, labels


# Alternate one training pass and one validation pass per epoch.
# `avg_train_loss`, `acc`, `preds` and `labels` stay available as notebook
# globals holding the values of the most recent epoch.
for epoch in range(EPOCHS):
    avg_train_loss = _train_one_epoch(
        model,
        train_dataloader,
        optimizer,
        scheduler,
        device,
        desc=f"Epoch {epoch + 1}/{EPOCHS}",
    )
    acc, preds, labels = _evaluate(model, eval_dataloader, device)
    print(f"\nEpoch {epoch + 1}. Train Loss: {avg_train_loss:.4f} | Val Accuracy: {acc:.4f}\n")

# Persist the model as it stands after the final epoch (not the best epoch).
# NOTE(review): `merge_weights=False` presumably stores the LoRA weights
# without folding them into the base layers -- confirm in `lora_transformer`.
model.save_model("roberta_lora_sst2.safetensors", merge_weights=False)
print("Model Saved.")

Epoch 1/60: 100%|██████████| 4210/4210 [02:30<00:00, 27.97it/s, loss=0.307]



Epoch 1. Train Loss: 0.3288 | Val Accuracy: 0.9312



Epoch 2/60: 100%|██████████| 4210/4210 [02:29<00:00, 28.17it/s, loss=0.011]



Epoch 2. Train Loss: 0.2229 | Val Accuracy: 0.9312



Epoch 3/60: 100%|██████████| 4210/4210 [02:28<00:00, 28.28it/s, loss=0.0369]



Epoch 3. Train Loss: 0.1995 | Val Accuracy: 0.9438



Epoch 4/60: 100%|██████████| 4210/4210 [02:28<00:00, 28.41it/s, loss=0.159]



Epoch 4. Train Loss: 0.1856 | Val Accuracy: 0.9392



Epoch 5/60: 100%|██████████| 4210/4210 [02:28<00:00, 28.38it/s, loss=0.00576]



Epoch 5. Train Loss: 0.1721 | Val Accuracy: 0.9369



Epoch 6/60: 100%|██████████| 4210/4210 [02:28<00:00, 28.26it/s, loss=0.0384]



Epoch 6. Train Loss: 0.1639 | Val Accuracy: 0.9312



Epoch 7/60: 100%|██████████| 4210/4210 [02:28<00:00, 28.26it/s, loss=0.0999]



Epoch 7. Train Loss: 0.1535 | Val Accuracy: 0.9323



Epoch 8/60: 100%|██████████| 4210/4210 [02:28<00:00, 28.29it/s, loss=0.135]



Epoch 8. Train Loss: 0.1481 | Val Accuracy: 0.9415



Epoch 9/60: 100%|██████████| 4210/4210 [02:29<00:00, 28.25it/s, loss=0.0618]



Epoch 9. Train Loss: 0.1416 | Val Accuracy: 0.9300



Epoch 10/60: 100%|██████████| 4210/4210 [02:28<00:00, 28.28it/s, loss=0.417]



Epoch 10. Train Loss: 0.1360 | Val Accuracy: 0.9323



Epoch 11/60: 100%|██████████| 4210/4210 [02:28<00:00, 28.27it/s, loss=1.09]



Epoch 11. Train Loss: 0.1328 | Val Accuracy: 0.9289



Epoch 12/60: 100%|██████████| 4210/4210 [02:28<00:00, 28.30it/s, loss=0.057]



Epoch 12. Train Loss: 0.1293 | Val Accuracy: 0.9323



Epoch 13/60: 100%|██████████| 4210/4210 [02:28<00:00, 28.36it/s, loss=0.0868]



Epoch 13. Train Loss: 0.1246 | Val Accuracy: 0.9438



Epoch 14/60: 100%|██████████| 4210/4210 [02:29<00:00, 28.24it/s, loss=1.28]



Epoch 14. Train Loss: 0.1219 | Val Accuracy: 0.9335



Epoch 15/60: 100%|██████████| 4210/4210 [02:30<00:00, 27.92it/s, loss=0.146]



Epoch 15. Train Loss: 0.1186 | Val Accuracy: 0.9323



Epoch 16/60: 100%|██████████| 4210/4210 [02:29<00:00, 28.11it/s, loss=0.247]



Epoch 16. Train Loss: 0.1139 | Val Accuracy: 0.9289



Epoch 17/60: 100%|██████████| 4210/4210 [02:29<00:00, 28.24it/s, loss=0.36]



Epoch 17. Train Loss: 0.1123 | Val Accuracy: 0.9300



Epoch 18/60: 100%|██████████| 4210/4210 [02:29<00:00, 28.19it/s, loss=0.00186]



Epoch 18. Train Loss: 0.1081 | Val Accuracy: 0.9415



Epoch 19/60: 100%|██████████| 4210/4210 [02:29<00:00, 28.19it/s, loss=0.0485]



Epoch 19. Train Loss: 0.1054 | Val Accuracy: 0.9381



Epoch 20/60: 100%|██████████| 4210/4210 [02:28<00:00, 28.27it/s, loss=0.0867]



Epoch 20. Train Loss: 0.1018 | Val Accuracy: 0.9369



Epoch 21/60: 100%|██████████| 4210/4210 [02:29<00:00, 28.16it/s, loss=0.00101]



Epoch 21. Train Loss: 0.0996 | Val Accuracy: 0.9427



Epoch 22/60: 100%|██████████| 4210/4210 [02:29<00:00, 28.19it/s, loss=0.0891]



Epoch 22. Train Loss: 0.0963 | Val Accuracy: 0.9312



Epoch 23/60: 100%|██████████| 4210/4210 [02:29<00:00, 28.15it/s, loss=0.000983]



Epoch 23. Train Loss: 0.0918 | Val Accuracy: 0.9369



Epoch 24/60: 100%|██████████| 4210/4210 [02:29<00:00, 28.22it/s, loss=0.00264]



Epoch 24. Train Loss: 0.0916 | Val Accuracy: 0.9415



Epoch 25/60: 100%|██████████| 4210/4210 [02:29<00:00, 28.23it/s, loss=0.0587]



Epoch 25. Train Loss: 0.0888 | Val Accuracy: 0.9323



Epoch 26/60: 100%|██████████| 4210/4210 [02:28<00:00, 28.32it/s, loss=0.0188]



Epoch 26. Train Loss: 0.0839 | Val Accuracy: 0.9404



Epoch 27/60: 100%|██████████| 4210/4210 [02:28<00:00, 28.29it/s, loss=0.000868]



Epoch 27. Train Loss: 0.0846 | Val Accuracy: 0.9335



Epoch 28/60: 100%|██████████| 4210/4210 [02:28<00:00, 28.37it/s, loss=0.0478]



Epoch 28. Train Loss: 0.0815 | Val Accuracy: 0.9392



Epoch 29/60: 100%|██████████| 4210/4210 [02:29<00:00, 28.25it/s, loss=0.21]



Epoch 29. Train Loss: 0.0774 | Val Accuracy: 0.9427



Epoch 30/60: 100%|██████████| 4210/4210 [02:28<00:00, 28.31it/s, loss=0.0177]



Epoch 30. Train Loss: 0.0748 | Val Accuracy: 0.9415



Epoch 31/60: 100%|██████████| 4210/4210 [02:28<00:00, 28.29it/s, loss=0.00637]



Epoch 31. Train Loss: 0.0727 | Val Accuracy: 0.9255



Epoch 32/60: 100%|██████████| 4210/4210 [02:28<00:00, 28.34it/s, loss=0.00116]



Epoch 32. Train Loss: 0.0692 | Val Accuracy: 0.9381



Epoch 33/60: 100%|██████████| 4210/4210 [02:29<00:00, 28.20it/s, loss=0.0279]



Epoch 33. Train Loss: 0.0679 | Val Accuracy: 0.9427



Epoch 34/60: 100%|██████████| 4210/4210 [02:31<00:00, 27.86it/s, loss=0.000399]



Epoch 34. Train Loss: 0.0652 | Val Accuracy: 0.9438



Epoch 35/60: 100%|██████████| 4210/4210 [02:29<00:00, 28.15it/s, loss=0.000681]



Epoch 35. Train Loss: 0.0617 | Val Accuracy: 0.9404



Epoch 36/60: 100%|██████████| 4210/4210 [02:28<00:00, 28.33it/s, loss=0.23]



Epoch 36. Train Loss: 0.0625 | Val Accuracy: 0.9392



Epoch 37/60: 100%|██████████| 4210/4210 [02:28<00:00, 28.34it/s, loss=0.398]



Epoch 37. Train Loss: 0.0599 | Val Accuracy: 0.9427



Epoch 38/60: 100%|██████████| 4210/4210 [02:28<00:00, 28.26it/s, loss=0.000345]



Epoch 38. Train Loss: 0.0571 | Val Accuracy: 0.9369



Epoch 39/60: 100%|██████████| 4210/4210 [02:28<00:00, 28.31it/s, loss=0.024]



Epoch 39. Train Loss: 0.0554 | Val Accuracy: 0.9427



Epoch 40/60: 100%|██████████| 4210/4210 [02:28<00:00, 28.30it/s, loss=0.00895]



Epoch 40. Train Loss: 0.0525 | Val Accuracy: 0.9392



Epoch 41/60: 100%|██████████| 4210/4210 [02:28<00:00, 28.35it/s, loss=0.000506]



Epoch 41. Train Loss: 0.0497 | Val Accuracy: 0.9461



Epoch 42/60: 100%|██████████| 4210/4210 [02:29<00:00, 28.25it/s, loss=0.00973]



Epoch 42. Train Loss: 0.0488 | Val Accuracy: 0.9450



Epoch 43/60: 100%|██████████| 4210/4210 [02:29<00:00, 28.18it/s, loss=0.000211]



Epoch 43. Train Loss: 0.0462 | Val Accuracy: 0.9358



Epoch 44/60: 100%|██████████| 4210/4210 [02:28<00:00, 28.26it/s, loss=0.000294]



Epoch 44. Train Loss: 0.0428 | Val Accuracy: 0.9472



Epoch 45/60: 100%|██████████| 4210/4210 [02:28<00:00, 28.29it/s, loss=0.00036]



Epoch 45. Train Loss: 0.0409 | Val Accuracy: 0.9507



Epoch 46/60: 100%|██████████| 4210/4210 [02:29<00:00, 28.12it/s, loss=0.000225]



Epoch 46. Train Loss: 0.0401 | Val Accuracy: 0.9495



Epoch 47/60: 100%|██████████| 4210/4210 [02:29<00:00, 28.21it/s, loss=0.0625]



Epoch 47. Train Loss: 0.0386 | Val Accuracy: 0.9484



Epoch 48/60: 100%|██████████| 4210/4210 [02:30<00:00, 28.06it/s, loss=0.0275]



Epoch 48. Train Loss: 0.0364 | Val Accuracy: 0.9507



Epoch 49/60: 100%|██████████| 4210/4210 [02:29<00:00, 28.20it/s, loss=0.000343]



Epoch 49. Train Loss: 0.0350 | Val Accuracy: 0.9507



Epoch 50/60: 100%|██████████| 4210/4210 [02:29<00:00, 28.19it/s, loss=0.045]



Epoch 50. Train Loss: 0.0320 | Val Accuracy: 0.9484



Epoch 51/60: 100%|██████████| 4210/4210 [02:30<00:00, 28.05it/s, loss=0.0152]



Epoch 51. Train Loss: 0.0315 | Val Accuracy: 0.9461



Epoch 52/60: 100%|██████████| 4210/4210 [02:31<00:00, 27.85it/s, loss=0.000195]



Epoch 52. Train Loss: 0.0284 | Val Accuracy: 0.9484



Epoch 53/60: 100%|██████████| 4210/4210 [02:32<00:00, 27.55it/s, loss=0.00067]



Epoch 53. Train Loss: 0.0287 | Val Accuracy: 0.9450



Epoch 54/60: 100%|██████████| 4210/4210 [02:36<00:00, 26.88it/s, loss=0.000398]



Epoch 54. Train Loss: 0.0261 | Val Accuracy: 0.9438



Epoch 55/60: 100%|██████████| 4210/4210 [02:39<00:00, 26.46it/s, loss=0.0116]



Epoch 55. Train Loss: 0.0250 | Val Accuracy: 0.9461



Epoch 56/60: 100%|██████████| 4210/4210 [02:38<00:00, 26.56it/s, loss=0.000257]



Epoch 56. Train Loss: 0.0250 | Val Accuracy: 0.9472



Epoch 57/60: 100%|██████████| 4210/4210 [02:35<00:00, 27.02it/s, loss=0.00026]



Epoch 57. Train Loss: 0.0242 | Val Accuracy: 0.9461



Epoch 58/60: 100%|██████████| 4210/4210 [02:39<00:00, 26.43it/s, loss=0.00145]



Epoch 58. Train Loss: 0.0219 | Val Accuracy: 0.9472



Epoch 59/60: 100%|██████████| 4210/4210 [02:39<00:00, 26.41it/s, loss=0.000102]



Epoch 59. Train Loss: 0.0214 | Val Accuracy: 0.9472



Epoch 60/60: 100%|██████████| 4210/4210 [02:36<00:00, 26.95it/s, loss=0.00806]



Epoch 60. Train Loss: 0.0198 | Val Accuracy: 0.9472

Model Saved.
