# Guide to use RoBERTa model using Transformers

In [1]:
# Install the transformers library
!pip install datasets transformers accelerate

Collecting datasets
  Downloading datasets-2.14.4-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.3/519.3 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers
  Downloading transformers-4.32.0-py3-none-any.whl (7.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.5/7.5 MB[0m [31m94.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.22.0-py3-none-any.whl (251 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m251.2/251.2 kB[0m [31m35.1 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m18.8 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━

## Import Required modules

In [9]:
# Import required packages
import torch
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer
from transformers import pipeline

In [3]:
seed = 42
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

## Approach-1 Using pipeline

In [10]:
sentiment_task = pipeline("sentiment-analysis")

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


Downloading (…)lve/main/config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

In [11]:
sentiment = sentiment_task("I watched Oppenheimer movie, one the finest and incredible movie")
sentiment

[{'label': 'POSITIVE', 'score': 0.9998806715011597}]

In [12]:
sentiment_neg = sentiment_task("This movie is worst and terrible")
sentiment_neg

[{'label': 'NEGATIVE', 'score': 0.9998193383216858}]

## Tokenization and Model initialization

In [13]:
model_name = "siebert/sentiment-roberta-large-english"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

Downloading (…)okenizer_config.json:   0%|          | 0.00/256 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/687 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

## Dataset preparation

- Text to Tokens
- Tokens to Unique IDS
- Map Tokens with its unique IDs in a dictionary

In [14]:
class SimpleDataset:
    def __init__(self, tokenized_texts):
        self.tokenized_texts = tokenized_texts

    def __len__(self):
        return len(self.tokenized_texts["input_ids"])

    def __getitem__(self, idx):
        return {k: v[idx] for k, v in self.tokenized_texts.items()}

## Approach -2 Sample User Text- Using pre-trained models

In [36]:
tweets = [
    "Just had the best meal ever at my favorite restaurant! The food was amazing, and the service was top-notch. Feeling so satisfied and happy right now! 😃🍔🥗 #Foodie #HappyCustomer",
    "Feeling really frustrated after sitting in traffic for hours today. Missed an important meeting and wasted so much time. Not a great start to the day. 😤🚗 #TrafficWoes #Stressed"
]

## Tokenize the user input text

In [37]:
tokenized_texts = tokenizer(tweets,truncation=True,padding=True)
pred_dataset = SimpleDataset(tokenized_texts)

## Create Model Trainer

In [38]:
trainer = Trainer(model=model)

In [39]:
predictions = trainer.predict(pred_dataset) #run predictions

## Model Evaluation- Predictions

In [40]:
preds = predictions.predictions.argmax(-1) #get prediction labels
labels = pd.Series(preds).map(model.config.id2label)  #convert prediction into POSITIVE or NEGATIVE
scores = (np.exp(predictions[0])/np.exp(predictions[0]).sum(-1,keepdims=True)).max(1) # ACCURACY SCORE

In [41]:
scores

array([0.9989399 , 0.99949604], dtype=float32)

In [42]:
labels

0    POSITIVE
1    NEGATIVE
dtype: object

## RoBERTa Sentiment results on user input

In [44]:
df = pd.DataFrame(list(zip(tweets,preds,labels,scores)), columns=['text','pred','label','score'])
df.head()

Unnamed: 0,text,pred,label,score
0,Just had the best meal ever at my favorite res...,1,POSITIVE,0.99894
1,Feeling really frustrated after sitting in tra...,0,NEGATIVE,0.999496


## Approach-3 On Custom Dataset

Download the dataset and upload the dataset in the Google Colab to proceed further

Dataset: [Twitter Sentiment Analysis Dataset](https://www.kaggle.com/datasets/yasserh/twitter-tweets-sentiment-dataset)

In [49]:
data = pd.read_csv("Tweets.csv")
data.tail()

Unnamed: 0,textID,text,selected_text,sentiment
27476,4eac33d1c0,wish we could come see u on Denver husband l...,d lost,negative
27477,4f4c4fc327,I`ve wondered about rake to. The client has ...,", don`t force",negative
27478,f67aae2310,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,positive
27479,ed167662a5,But it was worth it ****.,But it was worth it ****.,positive
27480,6f7127d9d7,All this flirting going on - The ATG smiles...,All this flirting going on - The ATG smiles. Y...,neutral


In [50]:
data_tweets = data['text'].dropna().astype('str').tolist()

In [46]:
tokenized_data = tokenizer(data_tweets,truncation=True,padding=True)
pred_data = SimpleDataset(tokenized_data)

In [52]:
evaluate = trainer.predict(pred_data) #will take some time

In [53]:
new_preds = evaluate.predictions.argmax(-1) #get prediction labels
new_labels = pd.Series(new_preds).map(model.config.id2label) #convert prediction into POSITIVE or NEGATIVE
new_scores = (np.exp(evaluate[0])/np.exp(evaluate[0]).sum(-1,keepdims=True)).max(1) # ACCURACY SCORE

In [55]:
final_df = pd.DataFrame(list(zip(data_tweets,new_preds,new_labels,new_scores)), columns=['tweets','target','sentiment','accuracy'])
final_df.head(10)

Unnamed: 0,tweets,target,sentiment,accuracy
0,"I`d have responded, if I were going",0,NEGATIVE,0.994793
1,Sooo SAD I will miss you here in San Diego!!!,0,NEGATIVE,0.995364
2,my boss is bullying me...,0,NEGATIVE,0.999434
3,what interview! leave me alone,0,NEGATIVE,0.997404
4,"Sons of ****, why couldn`t they put them on t...",0,NEGATIVE,0.998303
5,http://www.dothebouncy.com/smf - some shameles...,1,POSITIVE,0.994551
6,2am feedings for the baby are fun when he is a...,1,POSITIVE,0.997768
7,Soooo high,1,POSITIVE,0.997417
8,Both of you,0,NEGATIVE,0.829015
9,Journey!? Wow... u just became cooler. hehe....,1,POSITIVE,0.99725
