# **Fine-tuning a RoBERTa model for binary sentiment classification on the IMDb dataset using LoRA**

In [1]:
!pip install transformers torch evaluate peft

Collecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl.metadata (9.3 kB)
Collecting peft
  Downloading peft-0.12.0-py3-none-any.whl.metadata (13 kB)
Collecting datasets>=2.0.0 (from evaluate)
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from evaluate)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting pyarrow>=15.0.0 (from datasets>=2.0.0->evaluate)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading peft-0.12.0-py3-none-any.whl (296 kB)
[2K   [90m━━━━━━━━━━

In [13]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import Trainer, TrainingArguments
from datasets import load_dataset
import pandas as pd
from peft import get_peft_model
from peft import LoraConfig , TaskType
import numpy as np
import evaluate

In [3]:
# Load the IMDb reviews dataset. The bare `dataset` expression below displays
# the resulting DatasetDict so the available splits and row counts are visible.
dataset = load_dataset('Imdb')
dataset

Downloading readme:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [8]:
# The tokenizer must come from the same checkpoint as the model that is
# fine-tuned later in this notebook ('FacebookAI/roberta-base'). Each checkpoint
# has its own vocabulary, so ids produced by a different tokenizer (e.g. BERT's)
# would look up unrelated rows of the pretrained RoBERTa embedding table.
tokenizer=AutoTokenizer.from_pretrained('FacebookAI/roberta-base')

def tokenizer_function(dataset):
  """Tokenize the ``'text'`` field of a batch of examples.

  Args:
    dataset: a batch (mapping) with a ``'text'`` entry holding the raw reviews.

  Returns:
    The tokenizer output for the batch, padded with ``padding='max_length'``
    and truncated so that every example has the same length.
  """
  return tokenizer(dataset['text'], padding='max_length', truncation=True)

# Apply the tokenizer to every split of the DatasetDict in batches.
tokenized_data=dataset.map(tokenizer_function, batched=True)

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [17]:
# Reduce dataset size: shuffle each split with a fixed seed, then keep only the
# first 5000 training and 1000 test examples.
shuffled_train = tokenized_data['train'].shuffle(seed=42)
shuffled_test = tokenized_data['test'].shuffle(seed=42)

train_dataset = shuffled_train.select(range(5000))
test_dataset = shuffled_test.select(range(1000))

### Injecting LoRA adapters into the model with PEFT

In [11]:
# LoRA settings for a sequence-classification task:
# rank r=1, lora_alpha=1, and a dropout of 0.1 on the LoRA layers.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=1,
    lora_alpha=1,
    lora_dropout=0.1,
)

In [None]:
# Load RoBERTa with a two-label classification head, then wrap it with the
# LoRA configuration so that training goes through the PEFT model.
base_model = AutoModelForSequenceClassification.from_pretrained(
    'FacebookAI/roberta-base',
    num_labels=2,
)
model = get_peft_model(base_model, lora_config)

# Accuracy metric used by compute_metrics during evaluation.
metric = evaluate.load('accuracy')

def compute_metrics(eval_pred):
  """Compute accuracy for one evaluation pass.

  Args:
    eval_pred: a pair ``(logits, labels)``. The logits are assumed to have one
      row per example and one column per class (TODO confirm for other models).

  Returns:
    The result of ``metric.compute`` for the predicted vs. reference labels.
  """
  logits,labels=eval_pred
  # Take the argmax over the last (class) axis to get one predicted label per
  # example. Without an explicit axis, np.argmax flattens the array and
  # returns a single index, which cannot be compared against `labels`.
  predictions=np.argmax(logits, axis=-1)
  return metric.compute(predictions=predictions, references=labels)

# Training configuration: checkpoints/logs go to '_trainer_test', evaluation
# runs once per epoch, and training lasts three epochs.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version before upgrading.
training_args = TrainingArguments(
    output_dir='_trainer_test',
    num_train_epochs=3,
    evaluation_strategy='epoch',
)

# Collect the Trainer inputs in one place: the LoRA-wrapped model, the reduced
# train/test subsets, and the accuracy callback.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_kwargs)

trainer.train()

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
