In [17]:
from datasets import load_dataset
from transformers import (
    AutoModelForSequenceClassification,
    BertTokenizer,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
)
from peft import get_peft_config, PeftModel, LoraConfig, get_peft_model, LoraConfig, TaskType
import evaluate
import torch
import numpy as np
import seqeval

print("Setup complete!")

Setup complete!


In [18]:
# constants
model_name = "bert-base-uncased"
batch_size = 16
epochs = 1

In [19]:
# Loading dataset and basic stats
emotion_dataset = load_dataset("dair-ai/emotion")
print(emotion_dataset)

No config specified, defaulting to: emotion/split
Found cached dataset emotion (/home/dejang/.cache/huggingface/datasets/dair-ai___emotion/split/1.0.0/cca5efe2dfeb58c1d098e0f9eeb200e9927d889b5a03c67097275dfb5fe463bd)
100%|██████████| 3/3 [00:00<00:00, 1041.37it/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})





In [20]:
# show one train example
print(emotion_dataset["train"][0])

{'text': 'i didnt feel humiliated', 'label': 0}


In [21]:
emotion_dataset["train"].features

{'text': Value(dtype='string', id=None),
 'label': ClassLabel(names=['sadness', 'joy', 'love', 'anger', 'fear', 'surprise'], id=None)}

In [22]:
# Decrease the size of the dataset for faster training (local on cpu)
emotion_dataset["train"] = emotion_dataset["train"].select(range(100))
emotion_dataset["validation"] = emotion_dataset["validation"].select(range(100))
emotion_dataset["test"] = emotion_dataset["test"].select(range(100))

In [23]:
label2id = {text: num for num, text in enumerate(emotion_dataset["train"].features["label"].names)}
id2label = {num: text for num, text in enumerate(emotion_dataset["train"].features["label"].names)}
print(label2id)
print(id2label)

{'sadness': 0, 'joy': 1, 'love': 2, 'anger': 3, 'fear': 4, 'surprise': 5}
{0: 'sadness', 1: 'joy', 2: 'love', 3: 'anger', 4: 'fear', 5: 'surprise'}


In [24]:
# load tokenizer
tokenizer = BertTokenizer.from_pretrained(model_name)

# take one example from train dataset and tokenize it
example = emotion_dataset["train"][0]
print(example)
print(tokenizer(example["text"]))

# take bath of examples and tokenize them
batch = tokenizer(emotion_dataset["train"]["text"][:5], padding="max_length", truncation=True)
for ids in batch["input_ids"]:
    print(len(ids))

{'text': 'i didnt feel humiliated', 'label': 0}
{'input_ids': [101, 1045, 2134, 2102, 2514, 26608, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}
512
512
512
512
512


In [25]:
# method for tokenizing examples
def tokenize(batch): 
    return tokenizer(batch["text"], padding="max_length", truncation=True, return_tensors="pt")

# tokenize dataset
tokenized_emotions = emotion_dataset.map(tokenize, batched=True, batch_size=batch_size)
tokenized_emotions = tokenized_emotions.remove_columns(["text"])
tokenized_emotions = tokenized_emotions.rename_column("label", "labels")
print(tokenized_emotions)

                                                   

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 100
    })
    validation: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 100
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 100
    })
})




In [26]:
# initialize data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")

# define metrics
def compute_metrics(eval_preds):
    metrics = evaluate.combine([
        evaluate.load("accuracy"),
        evaluate.load("precision", average="weighted"),
        evaluate.load("recall", average="weighted"),
        evaluate.load("f1", average="weighted")
    ])
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metrics.compute(predictions=predictions, references=labels)


# create model
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(label2id))
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=1,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_emotions["train"],
    eval_dataset=tokenized_emotions["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [27]:
trainer.train()

100%|██████████| 7/7 [04:54<00:00, 36.28s/it]
Downloading builder script: 100%|██████████| 5.75k/5.75k [00:00<00:00, 3.99MB/s]


ValueError: Target is multiclass but average='binary'. Please choose another average setting, one of [None, 'micro', 'macro', 'weighted'].

In [14]:
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS, inference_mode=False, r=8, lora_alpha=16, lora_dropout=0.1, bias="all"
)
# Train lora model
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 397836 || all params: 109786380 || trainable%: 0.36237281892344025


In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_emotions["train"],
    eval_dataset=tokenized_emotions["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)
trainer.train()