In [2]:
import warnings
warnings.filterwarnings("ignore")
import torch
import transformers
import evaluate
import numpy as np
print(torch.backends.mps.is_available())  # Should return True
print(torch.backends.mps.is_built())  # Should return True

True
True


In [3]:
print("Torch version:", torch.__version__)
print("Transformers version:", transformers.__version__)

Torch version: 2.0.1
Transformers version: 4.24.0


In [4]:
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)
from peft import get_peft_model, LoraConfig

In [5]:
# Step 1: Load a pre-trained model and tokenizer
model_checkpoint = "distilbert-base-uncased"  # You can choose any model you'd like
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [6]:
# Step 2: Load a dataset (e.g., IMDB sentiment analysis dataset)
dataset = load_dataset("imdb")
# Take smaller subsets from train and test for faster training
dataset["train"] = dataset["train"].shuffle(seed=42).select(range(3000))
dataset["test"] = dataset["test"].shuffle(seed=42).select(range(1000))

In [7]:
dataset["test"]

Dataset({
    features: ['text', 'label'],
    num_rows: 1000
})

In [8]:
# Step 3: Tokenize the data
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

In [9]:
train_dataset = dataset["train"].map(tokenize_function, batched=True)
test_dataset = dataset["test"].map(tokenize_function, batched=True)

Map: 100%|██████████| 1000/1000 [00:00<00:00, 3786.27 examples/s]


In [10]:
test_dataset

Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 1000
})

In [11]:
# Step 4: Define LoRA configuration for efficient fine-tuning
lora_config = LoraConfig(
    task_type="SEQ_CLS",
    r=8,  # Low rank dimension for LoRA
    lora_alpha=32,  # Scaling factor for LoRA
    lora_dropout=0.1,  # Dropout for LoRA layers
    target_modules=['q_lin'],  # LoRA will be applied to query layers
)


In [12]:
# Step 5: Prepare the model with LoRA
# define label maps
id2label = {0: "Negative", 1: "Positive"}
label2id = {"Negative":0, "Positive":1}
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint, num_labels=2
)


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias', 'pre_classifier.bias', 'pre_classifier.we

In [13]:
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 1257988 || all params: 67620868 || trainable%: 1.8603547058875376


In [14]:
# 6. Define Trainer args
training_args = TrainingArguments(
    output_dir="./lora-distilbert-imdb",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=20,
    per_device_eval_batch_size=20,
    num_train_epochs=2,
    learning_rate=2e-4,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    report_to="none",
    load_best_model_at_end=True,
)

In [15]:
# 7. Evaluation
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    return accuracy.compute(predictions=preds, references=labels)

In [16]:
# create data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# 8. Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)


In [17]:
#9. Train the model
trainer.train()

The following columns in the training set don't have a corresponding argument in `PeftModelForSequenceClassification.forward` and have been ignored: text. If text are not expected by `PeftModelForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 3000
  Num Epochs = 2
  Instantaneous batch size per device = 20
  Total train batch size (w. parallel, distributed & accumulation) = 20
  Gradient Accumulation steps = 1
  Total optimization steps = 300
  Number of trainable parameters = 1257988
  0%|          | 0/300 [00:00<?, ?it/s]You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  3%|▎         | 10/300 [01:27<41:48,  8.65s/it]

{'loss': 0.6884, 'learning_rate': 0.00019333333333333333, 'epoch': 0.07}


  7%|▋         | 20/300 [02:54<40:37,  8.71s/it]

{'loss': 0.6634, 'learning_rate': 0.0001866666666666667, 'epoch': 0.13}


 10%|█         | 30/300 [04:21<38:58,  8.66s/it]

{'loss': 0.632, 'learning_rate': 0.00018, 'epoch': 0.2}


 13%|█▎        | 40/300 [05:48<37:52,  8.74s/it]

{'loss': 0.5841, 'learning_rate': 0.00017333333333333334, 'epoch': 0.27}


 17%|█▋        | 50/300 [07:15<35:53,  8.62s/it]

{'loss': 0.5348, 'learning_rate': 0.0001666666666666667, 'epoch': 0.33}


 20%|██        | 60/300 [08:43<35:22,  8.84s/it]

{'loss': 0.4567, 'learning_rate': 0.00016, 'epoch': 0.4}


 23%|██▎       | 70/300 [10:10<33:31,  8.74s/it]

{'loss': 0.3981, 'learning_rate': 0.00015333333333333334, 'epoch': 0.47}


 27%|██▋       | 80/300 [11:38<31:40,  8.64s/it]

{'loss': 0.3668, 'learning_rate': 0.00014666666666666666, 'epoch': 0.53}


 30%|███       | 90/300 [13:04<30:14,  8.64s/it]

{'loss': 0.3293, 'learning_rate': 0.00014, 'epoch': 0.6}


 33%|███▎      | 100/300 [14:30<28:35,  8.58s/it]

{'loss': 0.3672, 'learning_rate': 0.00013333333333333334, 'epoch': 0.67}


 37%|███▋      | 110/300 [15:58<27:42,  8.75s/it]

{'loss': 0.3168, 'learning_rate': 0.00012666666666666666, 'epoch': 0.73}


 40%|████      | 120/300 [17:24<25:45,  8.58s/it]

{'loss': 0.3015, 'learning_rate': 0.00012, 'epoch': 0.8}


 43%|████▎     | 130/300 [18:50<24:29,  8.65s/it]

{'loss': 0.305, 'learning_rate': 0.00011333333333333334, 'epoch': 0.87}


 47%|████▋     | 140/300 [20:16<22:54,  8.59s/it]

{'loss': 0.2747, 'learning_rate': 0.00010666666666666667, 'epoch': 0.93}


 50%|█████     | 150/300 [42:47<5:48:13, 139.29s/it]The following columns in the evaluation set don't have a corresponding argument in `PeftModelForSequenceClassification.forward` and have been ignored: text. If text are not expected by `PeftModelForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 20


{'loss': 0.2519, 'learning_rate': 0.0001, 'epoch': 1.0}


                                                    
 50%|█████     | 150/300 [1:05:53<5:48:13, 139.29s/it]Saving model checkpoint to ./lora-distilbert-imdb/checkpoint-150
Trainer.model is not a `PreTrainedModel`, only saving its state dict.


{'eval_loss': 0.32188260555267334, 'eval_accuracy': 0.871, 'eval_runtime': 1386.3781, 'eval_samples_per_second': 0.721, 'eval_steps_per_second': 0.036, 'epoch': 1.0}


tokenizer config file saved in ./lora-distilbert-imdb/checkpoint-150/tokenizer_config.json
Special tokens file saved in ./lora-distilbert-imdb/checkpoint-150/special_tokens_map.json
 53%|█████▎    | 160/300 [1:07:24<1:07:50, 29.07s/it]  

{'loss': 0.2993, 'learning_rate': 9.333333333333334e-05, 'epoch': 1.07}


 57%|█████▋    | 170/300 [1:24:44<10:40:32, 295.63s/it]

{'loss': 0.2978, 'learning_rate': 8.666666666666667e-05, 'epoch': 1.13}


 60%|██████    | 180/300 [1:26:10<32:52, 16.44s/it]    

{'loss': 0.2894, 'learning_rate': 8e-05, 'epoch': 1.2}


 63%|██████▎   | 190/300 [1:27:41<17:26,  9.51s/it]

{'loss': 0.2785, 'learning_rate': 7.333333333333333e-05, 'epoch': 1.27}


 67%|██████▋   | 200/300 [1:29:05<14:03,  8.43s/it]

{'loss': 0.1917, 'learning_rate': 6.666666666666667e-05, 'epoch': 1.33}


 70%|███████   | 210/300 [1:30:29<12:35,  8.39s/it]

{'loss': 0.2523, 'learning_rate': 6e-05, 'epoch': 1.4}


 73%|███████▎  | 220/300 [1:31:55<11:24,  8.56s/it]

{'loss': 0.2942, 'learning_rate': 5.333333333333333e-05, 'epoch': 1.47}


 77%|███████▋  | 230/300 [1:33:18<09:44,  8.35s/it]

{'loss': 0.2462, 'learning_rate': 4.666666666666667e-05, 'epoch': 1.53}


 80%|████████  | 240/300 [1:34:43<08:30,  8.51s/it]

{'loss': 0.245, 'learning_rate': 4e-05, 'epoch': 1.6}


 83%|████████▎ | 250/300 [1:36:10<07:03,  8.47s/it]

{'loss': 0.3227, 'learning_rate': 3.3333333333333335e-05, 'epoch': 1.67}


 87%|████████▋ | 260/300 [1:37:36<05:42,  8.56s/it]

{'loss': 0.2634, 'learning_rate': 2.6666666666666667e-05, 'epoch': 1.73}


 90%|█████████ | 270/300 [1:39:02<04:17,  8.57s/it]

{'loss': 0.2817, 'learning_rate': 2e-05, 'epoch': 1.8}


 93%|█████████▎| 280/300 [1:40:28<02:52,  8.62s/it]

{'loss': 0.2858, 'learning_rate': 1.3333333333333333e-05, 'epoch': 1.87}


 97%|█████████▋| 290/300 [1:41:55<01:26,  8.63s/it]

{'loss': 0.2556, 'learning_rate': 6.666666666666667e-06, 'epoch': 1.93}


100%|██████████| 300/300 [1:43:21<00:00,  8.56s/it]The following columns in the evaluation set don't have a corresponding argument in `PeftModelForSequenceClassification.forward` and have been ignored: text. If text are not expected by `PeftModelForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 20


{'loss': 0.2772, 'learning_rate': 0.0, 'epoch': 2.0}


                                                   
100%|██████████| 300/300 [1:45:39<00:00,  8.56s/it]Saving model checkpoint to ./lora-distilbert-imdb/checkpoint-300
Trainer.model is not a `PreTrainedModel`, only saving its state dict.


{'eval_loss': 0.30451661348342896, 'eval_accuracy': 0.877, 'eval_runtime': 138.2998, 'eval_samples_per_second': 7.231, 'eval_steps_per_second': 0.362, 'epoch': 2.0}


tokenizer config file saved in ./lora-distilbert-imdb/checkpoint-300/tokenizer_config.json
Special tokens file saved in ./lora-distilbert-imdb/checkpoint-300/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from ./lora-distilbert-imdb/checkpoint-300 (score: 0.30451661348342896).
100%|██████████| 300/300 [1:45:39<00:00, 21.13s/it]

{'train_runtime': 6339.9275, 'train_samples_per_second': 0.946, 'train_steps_per_second': 0.047, 'train_loss': 0.3517199206352234, 'epoch': 2.0}





TrainOutput(global_step=300, training_loss=0.3517199206352234, metrics={'train_runtime': 6339.9275, 'train_samples_per_second': 0.946, 'train_steps_per_second': 0.047, 'train_loss': 0.3517199206352234, 'epoch': 2.0})

In [18]:
metrics = trainer.evaluate()
print(metrics)

The following columns in the evaluation set don't have a corresponding argument in `PeftModelForSequenceClassification.forward` and have been ignored: text. If text are not expected by `PeftModelForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 20
100%|██████████| 50/50 [02:16<00:00,  2.74s/it]

{'eval_loss': 0.30451661348342896, 'eval_accuracy': 0.877, 'eval_runtime': 139.7126, 'eval_samples_per_second': 7.158, 'eval_steps_per_second': 0.358, 'epoch': 2.0}



