In [1]:
# Shell escape: print the NVIDIA driver/CUDA versions and the current memory use
# and utilisation of each visible GPU before any training starts.
!nvidia-smi

Sun Dec 10 06:15:20 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.98                 Driver Version: 535.98       CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 2080 Ti     Off | 00000000:1B:00.0 Off |                  N/A |
| 30%   22C    P8               3W / 250W |      0MiB / 11264MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [2]:
from datasets import load_dataset

# Read the IMDB reviews CSV as a single "train" split.
dataset = load_dataset("csv", data_files="../data/imdb-dataset.csv", split="train")

# Sentiment <-> class-id lookup tables. Ids are assigned in alphabetical order
# of the sentiment names, so the mapping does not depend on the list order below.
sentiments = ["positive", "negative"]
ID2SENT = dict(enumerate(sorted(sentiments)))
SENT2ID = {name: class_id for class_id, name in ID2SENT.items()}
NUM_LABELS = len(sentiments)
print(f"ID to sentiment: {ID2SENT}")
print(f"Sentiment to ID: {SENT2ID}")

# Column names used by every later cell.
LABEL_COLUMN = "labels"
TEXT_COLUMN = "text"

# Turn the string sentiment column into class ids, align those ids with
# SENT2ID, then rename both columns to the names defined above.
dataset = (
    dataset
    .class_encode_column("sentiment")
    .align_labels_with_mapping(SENT2ID, "sentiment")
    .rename_column("review", TEXT_COLUMN)
    .rename_column("sentiment", LABEL_COLUMN)
)

ID to sentiment: {0: 'negative', 1: 'positive'}
Sentiment to ID: {'negative': 0, 'positive': 1}


In [3]:
# Generate train, val, test split
# `train_test_split` shuffles before splitting; a fixed seed keeps the membership
# of each split identical across kernel restarts so metrics are comparable.
SPLIT_SEED = 42

# First hold out 10% of all reviews as the test set (stratified by label).
split = dataset.train_test_split(
    test_size=0.1,
    stratify_by_column=LABEL_COLUMN,
    seed=SPLIT_SEED,
)
# Then take 11% of the remaining 90% as validation (about 10% of the total).
train_val = split["train"].train_test_split(
    test_size=0.11,
    stratify_by_column=LABEL_COLUMN,
    seed=SPLIT_SEED,
)
train_dataset = train_val["train"]
val_dataset = train_val["test"]
test_dataset = split["test"]

# Sanity check: the three splits must partition the full dataset.
assert len(train_dataset) + len(val_dataset) + len(test_dataset) == len(dataset)

print(f"Training dataset size: {len(train_dataset)}")
print(f"Validation dataset size: {len(val_dataset)}")
print(f"Test dataset size: {len(test_dataset)}")

Training dataset size: 40050
Validation dataset size: 4950
Test dataset size: 5000


In [4]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from peft import get_peft_model, LoraConfig, TaskType

# Checkpoint to fine-tune and the device to train on (GPU when available).
base_model = "cardiffnlp/twitter-roberta-base-sentiment-latest"
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

tokenizer = AutoTokenizer.from_pretrained(base_model, add_prefix_space=True)

# The checkpoint ships a 3-class head; we replace it with a NUM_LABELS-class head
# (hence ignore_mismatched_sizes). Passing id2label/label2id stores the
# human-readable sentiment names in the model config, so the saved/pushed model
# reports "negative"/"positive" rather than generic LABEL_0/LABEL_1.
model = AutoModelForSequenceClassification.from_pretrained(
    base_model,
    num_labels=NUM_LABELS,
    id2label=ID2SENT,
    label2id=SENT2ID,
    ignore_mismatched_sizes=True,
)
model = model.to(device)

# LoRA adapter configuration for sequence classification.
peft_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
)
# Wrap the base model with the adapter and report how many parameters will train.
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpo

trainable params: 887,042 || all params: 125,534,212 || trainable%: 0.7066137476531099


In [5]:
# Tokenize and prepare dataset
# Maximum number of tokens kept per review; longer inputs are truncated.
CUTOFF_LENGTH = 256


def tokenize_dataset(examples):
    """Tokenize the text column of a batch of examples.

    Args:
        examples: batch mapping that contains the ``TEXT_COLUMN`` entry.

    Returns:
        Whatever the module-level ``tokenizer`` returns for those texts, with
        truncation enabled at ``CUTOFF_LENGTH`` tokens.
    """
    texts = examples[TEXT_COLUMN]
    encoded = tokenizer(texts, truncation=True, max_length=CUTOFF_LENGTH)
    return encoded

# Tokenize each split in batches (train, then val, then test) and drop the raw
# text column afterwards.
train_dataset, val_dataset, test_dataset = (
    raw_split.map(tokenize_dataset, batched=True, remove_columns=TEXT_COLUMN)
    for raw_split in (train_dataset, val_dataset, test_dataset)
)

Map:   0%|          | 0/40050 [00:00<?, ? examples/s]

Map:   0%|          | 0/4950 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [6]:
from transformers import DataCollatorWithPadding
# Collator that pads each batch using the tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Switch every split to the "torch" output format (in place).
for _tokenized_split in (train_dataset, val_dataset, test_dataset):
    _tokenized_split.set_format("torch")

In [7]:
import evaluate
import numpy as np
import torch

# Load the four evaluation metrics used by compute_metrics, in this order.
accuracy, precision, recall, f1 = (
    evaluate.load(metric_name)
    for metric_name in ("accuracy", "precision", "recall", "f1")
)


def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged precision, recall and F1.

    Args:
        eval_pred: pair ``(logits, labels)``; the predicted class is the argmax
            over the last axis of ``logits``.

    Returns:
        dict with the keys ``"accuracy"``, ``"precision"``, ``"recall"`` and
        ``"f1"``.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    # Arguments shared by every metric call.
    pair = {"predictions": predictions, "references": labels}
    macro = {"average": "macro"}

    return {
        "accuracy": accuracy.compute(**pair)["accuracy"],
        # Only precision is given an explicit zero_division value.
        "precision": precision.compute(**pair, **macro, zero_division=1)["precision"],
        "recall": recall.compute(**pair, **macro)["recall"],
        "f1": f1.compute(**pair, **macro)["f1"],
    }

In [8]:
from transformers import TrainingArguments, Trainer

# Optimisation hyperparameters.
lr = 5e-4
batch_size = 128
num_epochs = 4

training_args = TrainingArguments(
    # Where checkpoints go; external experiment reporting is disabled.
    output_dir="../models/lora",
    report_to="none",

    # Optimisation schedule: 10% warm-up, then cosine with restarts.
    # Scheduler names: https://huggingface.co/transformers/v4.7.0/_modules/transformers/trainer_utils.html#:~:text=class-,SchedulerType,-(ExplicitEnum)%3A
    learning_rate=lr,
    lr_scheduler_type="cosine_with_restarts",
    warmup_ratio=0.1,
    num_train_epochs=num_epochs,
    # weight_decay=0.001,

    # Same batch size for training and evaluation.
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,

    # Log and evaluate every 200 steps; checkpoint every 400 steps.
    evaluation_strategy="steps",
    logging_steps=200,
    eval_steps=200,
    save_steps=400,
    save_total_limit=1,

    # Reload the checkpoint with the best validation F1 once training ends.
    load_best_model_at_end=True,
    metric_for_best_model="f1",

    # Full precision, with non-reentrant gradient checkpointing enabled.
    fp16=False,
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
)

In [9]:
# Assemble the Trainer: training configuration first, then the model, the
# train/validation data, the padding collator and the metric function.
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Run fine-tuning; the returned training summary is shown as the cell output.
trainer.train()

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
200,0.2945,0.212768,0.916162,0.916467,0.916162,0.916146
400,0.2084,0.205232,0.923434,0.923529,0.923434,0.92343
600,0.1893,0.186536,0.922424,0.922514,0.922424,0.92242
800,0.1745,0.192401,0.927879,0.92793,0.927879,0.927877
1000,0.1583,0.190073,0.924848,0.925338,0.924848,0.924827
1200,0.1552,0.188929,0.927071,0.92733,0.927071,0.92706


TrainOutput(global_step=1252, training_loss=0.1952643638220839, metrics={'train_runtime': 2934.7055, 'train_samples_per_second': 54.588, 'train_steps_per_second': 0.427, 'total_flos': 2.12934674755584e+16, 'train_loss': 0.1952643638220839, 'epoch': 4.0})

In [10]:
# Destinations for the published artifacts (values unchanged, named once so the
# Hub repo and the local directory cannot drift apart).
LORA_REPO_ID = "00BER/imbd-roberta-base-sentiment-lora-latest"
MERGED_REPO_ID = "00BER/imbd-roberta-base-sentiment-merged-latest"
MERGED_LOCAL_DIR = "../models/imbd-roberta-base-sentiment-merged-latest"

# 1) Publish the LoRA adapter on its own, before the adapter is merged away.
model.push_to_hub(LORA_REPO_ID)

# 2) Merge the adapter into the base model and drop the PEFT wrapper.
merged_model = model.merge_and_unload()

# 3) Publish and save the merged model together with its tokenizer, so that
#    both the Hub repo and the local directory can be loaded standalone
#    (the model files alone do not include the tokenizer).
merged_model.push_to_hub(MERGED_REPO_ID)
tokenizer.push_to_hub(MERGED_REPO_ID)
merged_model.save_pretrained(MERGED_LOCAL_DIR)
tokenizer.save_pretrained(MERGED_LOCAL_DIR)

adapter_model.safetensors:   0%|          | 0.00/3.56M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]