**Setup: Install Dependencies**

In [1]:
%pip install transformers datasets accelerate
%pip install evaluate
%pip install -U datasets peft
%pip install scikit-learn


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[3

**Load the dataset**

In [None]:
from datasets import load_dataset

dataset = load_dataset("mteb/amazon_massive_intent", "en")
train_ds = dataset["train"]
val_ds   = dataset["validation"]
test_ds  = dataset["test"]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

Resolving data files:   0%|          | 0/51 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/51 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/51 [00:00<?, ?it/s]

train/en.json.gz:   0%|          | 0.00/187k [00:00<?, ?B/s]

test/en.json.gz:   0%|          | 0.00/54.1k [00:00<?, ?B/s]

validation/en.json.gz:   0%|          | 0.00/38.3k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

**Inspect labels & build mappings**

In [None]:
# labels are strings; build stable id maps
labels = sorted(set(train_ds["label"]))
label2id = {l:i for i,l in enumerate(labels)}
id2label = {i:l for l,i in label2id.items()}

num_labels = len(labels)
print("num_labels =", num_labels)

num_labels = 60


**Tokenizer + preprocessing**

In [None]:
from transformers import AutoTokenizer

model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

MAX_LEN = 128

def preprocess(batch):
    enc = tokenizer(batch["text"], truncation=True, padding="max_length", max_length=MAX_LEN)
    enc["labels"] = [label2id[l] for l in batch["label"]]
    return enc

train_ds = train_ds.map(preprocess, batched=True, remove_columns=train_ds.column_names)
val_ds   = val_ds.map(preprocess,   batched=True, remove_columns=val_ds.column_names)
test_ds  = test_ds.map(preprocess,  batched=True, remove_columns=test_ds.column_names)

train_ds.set_format(type="torch")
val_ds.set_format(type="torch")
test_ds.set_format(type="torch")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/11514 [00:00<?, ? examples/s]

Map:   0%|          | 0/2033 [00:00<?, ? examples/s]

Map:   0%|          | 0/2974 [00:00<?, ? examples/s]

**Load base model + attach LoRA**

In [None]:
from transformers import AutoModelForSequenceClassification
from peft import LoraConfig, get_peft_model

base_model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=num_labels, id2label=id2label, label2id=label2id
)

# LoRA config (works well for DistilBERT)
lora_cfg = LoraConfig(
    r=8,                 # rank
    lora_alpha=16,       # scaling
    lora_dropout=0.05,   # regularization
    bias="none",
    target_modules=["q_lin","v_lin"],  # LoRA in attention proj layers
    task_type="SEQ_CLS"
)

model = get_peft_model(base_model, lora_cfg)
model.print_trainable_parameters()   # sanity check: <1% should be trainable


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 784,188 || all params: 67,783,800 || trainable%: 1.1569


**Training setup (Trainer)**

In [None]:
from transformers import TrainingArguments, Trainer, DataCollatorWithPadding
import evaluate
import numpy as np

acc = evaluate.load("accuracy")
f1  = evaluate.load("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {
        "accuracy": acc.compute(predictions=preds, references=labels)["accuracy"],
        "f1_macro": f1.compute(predictions=preds, references=labels, average="macro")["f1"]
    }

args = TrainingArguments(
    output_dir="./results_intent_lora",
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    learning_rate=2e-4,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=10,
    weight_decay=0.01,
    lr_scheduler_type="linear",
    warmup_ratio=0.05,
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    report_to="none"
)

collator = DataCollatorWithPadding(tokenizer)

**Early stopping the model training**

In [None]:
from transformers import EarlyStoppingCallback
callbacks = [EarlyStoppingCallback(early_stopping_patience=2)]

In [12]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,
    data_collator=collator,
    compute_metrics=compute_metrics,
    callbacks=callbacks
)

trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro
1,1.7586,0.798567,0.78062,0.660268
2,0.7016,0.572111,0.843089,0.793373
3,0.5423,0.517619,0.851451,0.801339
4,0.4569,0.505355,0.861289,0.832531
5,0.402,0.473558,0.865716,0.840738
6,0.3647,0.455503,0.879488,0.857683
7,0.3318,0.460855,0.878013,0.857689
8,0.3099,0.457702,0.875061,0.858056


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro
1,1.7586,0.798567,0.78062,0.660268
2,0.7016,0.572111,0.843089,0.793373
3,0.5423,0.517619,0.851451,0.801339
4,0.4569,0.505355,0.861289,0.832531
5,0.402,0.473558,0.865716,0.840738
6,0.3647,0.455503,0.879488,0.857683
7,0.3318,0.460855,0.878013,0.857689
8,0.3099,0.457702,0.875061,0.858056
9,0.2927,0.449548,0.880472,0.860475
10,0.278,0.44863,0.88244,0.865051


TrainOutput(global_step=3600, training_loss=0.5438570467631022, metrics={'train_runtime': 910.9177, 'train_samples_per_second': 126.4, 'train_steps_per_second': 3.952, 'total_flos': 3886361914613760.0, 'train_loss': 0.5438570467631022, 'epoch': 10.0})

**Evaluate on test split**

In [13]:
metrics = trainer.evaluate(test_ds)
print(metrics)

{'eval_loss': 0.45501065254211426, 'eval_accuracy': 0.879287155346335, 'eval_f1_macro': 0.8626208937008324, 'eval_runtime': 11.8876, 'eval_samples_per_second': 250.177, 'eval_steps_per_second': 7.823, 'epoch': 10.0}


**Save the LoRA adapter**

In [15]:
save_path = f"./{model_name}-finetuned-intent"
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

('./distilbert-base-uncased-finetuned-intent/tokenizer_config.json',
 './distilbert-base-uncased-finetuned-intent/special_tokens_map.json',
 './distilbert-base-uncased-finetuned-intent/vocab.txt',
 './distilbert-base-uncased-finetuned-intent/added_tokens.json',
 './distilbert-base-uncased-finetuned-intent/tokenizer.json')

**Reload for inference**

In [17]:
from peft import PeftModel
from transformers import pipeline, AutoModelForSequenceClassification

base = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=num_labels, id2label=id2label, label2id=label2id
)
lora_loaded = PeftModel.from_pretrained(base, save_path)

clf = pipeline("text-classification", model=lora_loaded, tokenizer=tokenizer, top_k=1)

print(clf("please cancel my alarm"))

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cuda:0


[[{'label': 'alarm_remove', 'score': 0.9880728125572205}]]
