In [None]:
!pip install datasets
!pip install transformers
!pip install peft
!pip install evaluate

In [None]:
from datasets import load_dataset, DatasetDict,Dataset
from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer
)
from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import evaluate
import torch
import numpy as np

In [None]:
# The Stanford Sentiment Treebank (the "sst2" configuration of "glue").
# The task is to predict the sentiment of a given sentence.
dataset = load_dataset("glue", "sst2")
dataset

In [None]:
# Checkpoint name shared by the model and the tokenizer.
model_checkpoint = 'roberta-base'

# Label maps: class id -> label name, and its inverse label name -> class id.
id_label = {0: "negative", 1: "positive"}
label_id = {name: idx for idx, name in id_label.items()}

# Build a sequence-classification model from the checkpoint, with one output
# per entry in the label map.
Robertamodel = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(id_label),
    id2label=id_label,
    label2id=label_id,
)

# Display the model architecture as the cell output.
Robertamodel

In [None]:
# Create the tokenizer from the same checkpoint as the model.

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)
# If the tokenizer has no pad token, register '[PAD]' as one and resize the
# model's token embeddings to the new vocabulary size.
if tokenizer.pad_token is None:
  tokenizer.add_special_tokens({'pad_token':'[PAD]'})
  Robertamodel.resize_token_embeddings(len(tokenizer))

In [None]:
# create tokenizer function
def tokenize_function(examples):
  """Tokenize the ``sentence`` field of a batch of examples.

  Args:
    examples: Mapping with a ``'sentence'`` entry holding the text(s) to
      tokenize.

  Returns:
    The output of the module-level ``tokenizer`` for that text, requested as
    NumPy arrays and truncated to at most 512 tokens.
  """
  # extract text
  text = examples['sentence']
  # Truncate over-long inputs from the left. The attribute name must be spelled
  # exactly `truncation_side`; a misspelled name is silently stored as an
  # unrelated attribute and has no effect on truncation.
  tokenizer.truncation_side = 'left'
  tokenized_inputs = tokenizer(
      text,
      return_tensors='np',
      truncation=True,
      max_length=512
  )
  return tokenized_inputs

In [None]:
# Apply tokenize_function across the dataset in batches (batched=True), then
# display the tokenized result as the cell output.
tokenized_dataset= dataset.map(tokenize_function, batched=True)
tokenized_dataset

In [None]:
# Create the data collator, which uses the tokenizer to pad the examples of a
# batch; it is passed to the Trainer further down.
data_collator = DataCollatorWithPadding(tokenizer= tokenizer)


In [None]:
# Load the 'accuracy' metric from the `evaluate` library; compute_metrics uses it.
accuracy = evaluate.load('accuracy')

# define an evaluation function to pass into trainer later

def compute_metrics(p):
  """Compute classification accuracy from evaluation predictions.

  Args:
    p: A ``(predictions, labels)`` pair. ``predictions`` holds one row of
      per-class scores per example; ``labels`` holds the reference class ids.

  Returns:
    The metrics dict produced by ``accuracy.compute`` for the arg-max class
    of each row, returned as-is (not nested under another key).
  """
  predictions, labels = p
  # Pick the highest-scoring class for every example.
  predicted_ids = np.argmax(predictions, axis=1)
  # `accuracy.compute` already returns a dict keyed by metric name; wrapping it
  # in another {"accuracy": ...} would nest a dict where a number is expected.
  return accuracy.compute(predictions=predicted_ids, references=labels)


In [None]:
# List of example sentences used to spot-check the model before and after training.
text_list = ["The movie was a breathtaking masterpiece that kept me hooked from start to finish.",
             "The plot was predictable and painfully slow.",
             "I regret wasting two hours on this boring film.",
             "The performances were outstanding, especially the lead actor’s emotional depth.",
             "The visuals and soundtrack made the film an unforgettable experience."]
print("Untrained model predictions")
print("-------------------")
# Inference only: disable autograd so no computation graph is built for these
# forward passes.
with torch.no_grad():
  for text in text_list:
    # Encode one sentence as a batch of size 1.
    inputs = tokenizer.encode(text, return_tensors='pt')
    logits = Robertamodel(inputs).logits
    # Take the arg-max over the class dimension explicitly, then convert the
    # single-element result to a plain int for the label lookup.
    predictions = torch.argmax(logits, dim=-1)
    print(text + "-" + id_label[predictions.item()])

In [None]:
# Robertamodel.unload()
# LoRA configuration for the sequence-classification task: rank r = 1,
# lora_alpha = 32, LoRA dropout 0.01, applied to the modules named 'query'.
# NOTE(review): r = 1 with lora_alpha = 32 is an unusual pairing — presumably
# the adapter update is scaled by lora_alpha / r (= 32 here); confirm intended.
peft_config = LoraConfig(task_type="SEQ_CLS",
                         r = 1,
                         lora_alpha = 32,
                         lora_dropout = 0.01,
                         target_modules = ['query'])

In [None]:
# Wrap the base model with the adapters described by `peft_config`, then print
# how many of its parameters are trainable.
model = get_peft_model(Robertamodel, peft_config)
model.print_trainable_parameters()

In [None]:
# Training hyperparameters, consumed by the TrainingArguments cell.
# Learning rate.
lr = 1e-3
# Per-device batch size (used for both training and evaluation).
batch_size = 16
# Number of training epochs.
num_epochs = 1

In [None]:
# Training configuration for the LoRA fine-tuning run. Outputs are written to a
# directory named after the base checkpoint.
training_args = TrainingArguments(
    output_dir=f"{model_checkpoint}-lora-text-classification",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    # Optional settings, currently disabled:
    # save_strategy='epoch',
    # evaluation_strategy='epoch',
    # load_best_model_at_end=True,
)

In [None]:
# Trainer for the LoRA fine-tuning run. It receives the PEFT-wrapped `model`
# returned by get_peft_model rather than the bare `Robertamodel`, so the
# Trainer works with the adapter model that was built for training.
# NOTE(review): recent transformers releases presumably prefer
# `processing_class=` over `tokenizer=` — confirm against the installed version.
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = tokenized_dataset['train'],
    eval_dataset = tokenized_dataset['validation'],
    tokenizer = tokenizer,
    data_collator = data_collator,
    compute_metrics = compute_metrics
)

In [None]:
# Run the fine-tuning loop with the Trainer configured in the previous cell.
trainer.train()

In [None]:
# Trained model predictions on the same example sentences.
Robertamodel.to("cpu")
# The model may still be in training mode after trainer.train(); switch to eval
# mode so dropout is disabled and predictions are deterministic.
Robertamodel.eval()
# Inference only: disable autograd so no computation graph is built.
with torch.no_grad():
  for text in text_list:
    # Encode one sentence as a batch of size 1, on the same device as the model.
    inputs = tokenizer.encode(text, return_tensors = 'pt').to('cpu')
    logits = Robertamodel(inputs).logits
    # Take the arg-max over the class dimension explicitly, then convert the
    # single-element result to a plain int for the label lookup.
    predictions = torch.argmax(logits, dim=-1)
    print(text + "-" + id_label[predictions.item()])