Note: this notebook adapts (with modifications) code from the following article:
https://achimoraites.medium.com/lightweight-roberta-sequence-classification-fine-tuning-with-lora-using-the-hugging-face-peft-8dd9edf99d19


Install the necessary libraries

In [2]:
!pip install transformers datasets evaluate accelerate peft

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m15.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m7

Import required libraries

In [3]:
import torch
from torch.nn.functional import softmax
from torch.utils.data import DataLoader, Dataset
import pandas as pd
from transformers import RobertaModel, RobertaTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding, RobertaForSequenceClassification
from peft import LoraConfig, get_peft_model
from datasets import Dataset
from sklearn.metrics import f1_score, classification_report
import numpy as np
import os

# Preprocessing


Define model and tokenizer names

In [None]:
# Checkpoint identifiers: the pretrained base plus two names derived from it.
base_model = 'roberta-base'
modified_base = f'{base_model}-modified'
peft_model_name = f'{base_model}-peft'

Load the tokenizer

In [None]:
# Instantiate the tokenizer belonging to the checkpoint named by `base_model`.
tokenizer = RobertaTokenizer.from_pretrained(
    pretrained_model_name_or_path=base_model,
)


Encode sentences for input to the model

In [None]:
def encode(examples):
    """Tokenize the ``'sentence'`` entry of ``examples`` with the shared tokenizer.

    Truncation and padding are both enabled, so every sequence in one call is
    padded to the longest sequence of that call.

    NOTE(review): a padding data collator is also created later in this
    notebook, so the padding done here may be redundant -- confirm before
    removing it.
    """
    sentences = examples['sentence']
    return tokenizer(sentences, truncation=True, padding=True)

Load data and convert labels to numeric format

In [4]:
def _load_split(path):
    """Read one tab-separated split and encode its labels as integers.

    The ``label`` column is mapped to 1 where it equals ``'SUBJ'`` and to 0
    for every other value.

    Args:
        path: location of the ``.tsv`` file to read.

    Returns:
        The loaded DataFrame with an integer ``label`` column.
    """
    frame = pd.read_csv(path, sep='\t')
    frame['label'] = frame['label'].apply(lambda x: 1 if x == 'SUBJ' else 0)
    return frame


# Each split derives its numeric labels from its OWN label column. Building the
# dev labels from the (already numeric) training column would set every dev
# label to 0.
train_data = _load_split('train_en.tsv')
dev_data = _load_split('dev_test_en.tsv')
test_data = _load_split('test_en_gold.tsv')

In [5]:
# Class distribution
# Inspect class balance of the training labels: absolute counts first,
# then the same breakdown expressed in percent.
label_column = train_data['label']

label_distribution = label_column.value_counts()
print(label_distribution)

label_percentage = label_column.value_counts(normalize=True).mul(100)
print(label_percentage)


label
0    532
1    298
Name: count, dtype: int64
label
0    64.096386
1    35.903614
Name: proportion, dtype: float64


Convert dataframes to HuggingFace datasets

In [None]:
# Wrap each DataFrame in a Hugging Face Dataset and tokenize it in batches.
# Bookkeeping columns are dropped during the map; the training split
# additionally drops "solved_conflict".
train_dataset = Dataset.from_pandas(train_data).map(
    encode,
    batched=True,
    remove_columns=["solved_conflict", "sentence_id"],
)
dev_dataset = Dataset.from_pandas(dev_data).map(
    encode,
    batched=True,
    remove_columns=["sentence_id"],
)
test_dataset = Dataset.from_pandas(test_data).map(
    encode,
    batched=True,
    remove_columns=["sentence_id"],
)

Map:   0%|          | 0/830 [00:00<?, ? examples/s]

Map:   0%|          | 0/243 [00:00<?, ? examples/s]

Map:   0%|          | 0/484 [00:00<?, ? examples/s]

Create label mappings

In [None]:
# Create an id2label mapping
# We will need this for our classifier.
# Class-index <-> class-name lookup tables handed to the classifier config.
id2label = {0: "OBJ", 1: "SUBJ"}
label2id = {name: index for index, name in id2label.items()}

# Collator that pads each batch with the tokenizer and returns PyTorch tensors.
data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
    return_tensors="pt",
)

# Model 0: Baseline RoBERTa Model

Load pre-trained RoBERTa model for classification

In [None]:
# Load the pretrained checkpoint with a two-class sequence-classification head.
# The checkpoint name comes from `base_model` so the tokenizer and the model
# are always loaded from the same checkpoint.
model = RobertaForSequenceClassification.from_pretrained(
    base_model,
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


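Evaluate the baseline model (its classification head is newly initialised and untrained, so this is only a reference point)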

In [None]:
from tqdm import tqdm

def evaluate_model(inference_model, dataset, data_collator, batch_size=8):
    """Run inference over ``dataset`` and print a classification report.

    The model is moved to CUDA when available (CPU otherwise) and switched to
    eval mode. Predictions are the argmax over the model's logits; references
    are read from the ``"labels"`` entry of every collated batch.

    Args:
        inference_model: model that accepts a collated batch as keyword
            arguments and returns an object exposing ``.logits``.
        dataset: dataset passed to the ``DataLoader``.
        data_collator: collate function; must produce tensors and a
            ``"labels"`` entry for each batch.
        batch_size: number of examples per evaluation batch (default 8).

    Returns:
        None. The report is printed.
    """
    eval_dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        collate_fn=data_collator,  # Handles padding
    )

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    inference_model.to(device)
    inference_model.eval()

    all_predictions = []
    all_references = []

    for batch in tqdm(eval_dataloader):
        batch = {key: value.to(device) for key, value in batch.items()}
        with torch.no_grad():
            outputs = inference_model(**batch)

        all_predictions.extend(outputs.logits.argmax(dim=-1).tolist())
        all_references.extend(batch["labels"].tolist())

    # zero_division=0 keeps the reported numbers unchanged (undefined metrics
    # are shown as 0) but silences the warning raised when a class is never
    # predicted.
    report = classification_report(
        all_references,
        all_predictions,
        digits=4,
        zero_division=0,
    )
    print("Classification Report:\n")
    print(report)

# Evaluate the non fine-tuned model.
# `with_format` returns a new, torch-formatted dataset and leaves `test_dataset`
# itself untouched. Plain assignment followed by `set_format` would only alias
# the dataset and change its format in place for every later cell.
test_dataset_torch = test_dataset.with_format(
    type='torch',
    columns=['input_ids', 'attention_mask', 'label'],
)
evaluate_model(model, test_dataset_torch, data_collator)

100%|██████████| 61/61 [00:03<00:00, 15.98it/s]

Classification Report:

              precision    recall  f1-score   support

           0     0.7479    1.0000    0.8558       362
           1     0.0000    0.0000    0.0000       122

    accuracy                         0.7479       484
   macro avg     0.3740    0.5000    0.4279       484
weighted avg     0.5594    0.7479    0.6401       484




  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# Model 1: Fine-tuned RoBERTa


In [None]:
# Reload the pretrained checkpoint for full fine-tuning. The checkpoint name
# comes from `base_model` so it stays in sync with the tokenizer.
model = RobertaForSequenceClassification.from_pretrained(
    base_model,
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
)

Training Model

In [None]:
# Turn off Weights & Biases logging for this run.
os.environ["WANDB_MODE"] = "disabled"

# Set training hyperparameters
learning_rate = 3e-5
epochs = 25
warmup_steps = 500

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    logging_dir='./logs',
    learning_rate=learning_rate,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=epochs,
    warmup_steps=warmup_steps,
    weight_decay=0.01
)

# Initialize Trainer with model, training arguments, and datasets.
# The tokenizer is passed as `processing_class`: the `tokenizer=` keyword of
# Trainer is deprecated and triggers a FutureWarning on recent transformers.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    processing_class=tokenizer,
    data_collator=data_collator,
)

# Start the training process
trainer.train()

  trainer = Trainer(


Step,Training Loss
500,0.0458
1000,0.0258


TrainOutput(global_step=1300, training_loss=0.027536203616227096, metrics={'train_runtime': 571.537, 'train_samples_per_second': 36.306, 'train_steps_per_second': 2.275, 'total_flos': 1535499674640000.0, 'train_loss': 0.027536203616227096, 'epoch': 25.0})

Evaluate Model on Test Dataset

In [None]:
# Evaluate the model on the test dataset
test_results = trainer.predict(test_dataset)
logits = test_results.predictions

# Convert logits to predicted class indices
predicted_labels = np.argmax(logits, axis=-1)

# predicted_labels = [id2label[idx] for idx in predicted_class_indices]

actual_labels = test_dataset["label"]

f1 = f1_score(actual_labels, predicted_labels, average="weighted")

# Print results
print("F1-score:", f1)
print("Classification Report:")
print(classification_report(actual_labels, predicted_labels, target_names=["Objective", "Subjective"]))

F1-score: 0.8096142953185782
Classification Report:
              precision    recall  f1-score   support

   Objective       0.83      0.96      0.89       362
  Subjective       0.79      0.43      0.56       122

    accuracy                           0.83       484
   macro avg       0.81      0.70      0.73       484
weighted avg       0.82      0.83      0.81       484



# Model 2: LoRA Fine-Tuning leveraging the PEFT (Parameter-Efficient Fine-Tuning) library

In [None]:
# Configure the LoRA settings for parameter-efficient fine-tuning
peft_config = LoraConfig(task_type="SEQ_CLS", inference_mode=False, r=8, lora_alpha=16, lora_dropout=0.1)

# Create a PEFT model using LoRA configuration
peft_model = get_peft_model(model, peft_config)

In [None]:
# Report how many parameters of the PEFT-wrapped model are trainable
# compared with its total parameter count.
peft_model.print_trainable_parameters()

trainable params: 887,042 || all params: 125,534,212 || trainable%: 0.7066


Train the Model

In [None]:
# Set hyperparameters for training
# Set hyperparameters for training
learning_rate = 3e-5
epochs = 25
warmup_steps = 500

# Define training arguments for LoRA
training_args = TrainingArguments(
    output_dir='./lora_results',
    logging_dir='./lora_logs',
    learning_rate=learning_rate,  # You can experiment with slightly higher learning rates for LoRA (e.g., 2e-4)
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=epochs,
    warmup_steps=warmup_steps,
    weight_decay=0.01,
)

# Initialize the Trainer with the PEFT model, datasets, and training arguments.
# The tokenizer is passed as `processing_class`: the `tokenizer=` keyword of
# Trainer is deprecated and triggers a FutureWarning on recent transformers.
trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    processing_class=tokenizer,
    data_collator=data_collator,
)

# Start the training process
trainer.train()

  trainer = Trainer(


Step,Training Loss
500,0.6522
1000,0.3921


TrainOutput(global_step=1300, training_loss=0.47892373891977164, metrics={'train_runtime': 371.8962, 'train_samples_per_second': 55.795, 'train_steps_per_second': 3.496, 'total_flos': 1551402563616000.0, 'train_loss': 0.47892373891977164, 'epoch': 25.0})

Evaluate the model on the test dataset

In [None]:
# Score the LoRA-adapted model on the held-out test split.
test_results = trainer.predict(test_dataset)
logits = test_results.predictions

# The highest-scoring logit per row is the predicted class index.
predicted_labels = np.argmax(logits, axis=-1)
actual_labels = test_dataset["label"]

# Support-weighted F1 plus the per-class breakdown.
f1 = f1_score(actual_labels, predicted_labels, average="weighted")
class_names = ["Objective", "Subjective"]
report = classification_report(
    actual_labels,
    predicted_labels,
    target_names=class_names,
)

print("F1-score:", f1)
print("Classification Report:")
print(report)

F1-score: 0.7830971413087657
Classification Report:
              precision    recall  f1-score   support

   Objective       0.81      0.97      0.89       362
  Subjective       0.79      0.34      0.48       122

    accuracy                           0.81       484
   macro avg       0.80      0.66      0.68       484
weighted avg       0.81      0.81      0.78       484

