In [1]:
! pip install datasets transformers accelerate

Collecting datasets
  Downloading datasets-2.14.5-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.6/519.6 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers
  Downloading transformers-4.33.2-py3-none-any.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m81.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.23.0-py3-none-any.whl (258 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m258.1/258.1 kB[0m [31m24.4 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
#importing packages
import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer
from transformers import pipeline

In [3]:
# Seed every RNG the notebook has imported so Restart & Run All is repeatable.
seed = 42
np.random.seed(seed)              # NumPy's global (legacy) RNG
torch.manual_seed(seed)           # PyTorch default generator
torch.cuda.manual_seed_all(seed)  # PyTorch generators on all CUDA devices

## 1 - Using Pipeline

In [4]:
# Name the checkpoint and revision explicitly instead of relying on the task default,
# so the pipeline keeps loading the same model even if the library's default changes.
sentiment_task = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
    revision="af0f99b",
)

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


Downloading (…)lve/main/config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

In [5]:
# Score a single sentence with the pipeline and display the returned label/score.
sentiment = sentiment_task("I watched the oppeningham movie. it was very cool and fun")
sentiment

[{'label': 'POSITIVE', 'score': 0.9998703002929688}]

In [6]:
# Score a second sentence with the same pipeline and display the returned label/score.
sent_neg = sentiment_task("That was the worst game played by chelsea")
sent_neg

[{'label': 'NEGATIVE', 'score': 0.9997918009757996}]

### Tokenization and model initialization

In [7]:
# Load the tokenizer and the sequence-classification model from the same checkpoint name
# so the token ids produced by one match what the other expects.
model_name = "siebert/sentiment-roberta-large-english"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

Downloading (…)okenizer_config.json:   0%|          | 0.00/256 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/687 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

### Dataset preparation
Convert each text into tokens.

Convert each token into its unique integer id.

The token-to-id mapping is looked up in the tokenizer's vocabulary (its dictionary).

In [8]:
class SimpleDataset:
    """Index-able view over a tokenizer encoding.

    Wraps a mapping of field name -> per-example sequence (it must contain an
    "input_ids" entry) and exposes it one example at a time.
    """

    def __init__(self, tokenized_texts):
        """Store the encoding mapping as-is; nothing is copied."""
        self.tokenized_texts = tokenized_texts

    def __len__(self):
        """Return the number of examples, taken from the "input_ids" field."""
        input_ids = self.tokenized_texts["input_ids"]
        return len(input_ids)

    def __getitem__(self, idx):
        """Return a dict holding every field's entry at position ``idx``."""
        example = {}
        for field, values in self.tokenized_texts.items():
            example[field] = values[idx]
        return example

## 2 - Sample user text with a pretrained model

In [9]:
# Two hand-written sample tweets that are tokenized and classified in the cells below.
tweets = [
    "Just had the best meal ever at my favorite restaurant! The food was amazing, and the service was top-notch. Feeling so satisfied and happy right now! 😃🍔🥗 #Foodie #HappyCustomer",
    "Feeling really frustrated after sitting in traffic for hours today. Missed an important meeting and wasted so much time. Not a great start to the day. 😤🚗 #TrafficWoes #Stressed"
]

### Tokenize

In [10]:
# Encode the sample tweets as one batch (padded and truncated), then wrap the
# encoding so it can be indexed one example at a time.
tokenized_texts = tokenizer(tweets, padding=True, truncation=True)
pred_dataset = SimpleDataset(tokenized_texts)

## Model trainer

In [11]:
# Wrap the model in a Trainer with default arguments; the visible cells only call predict() on it.
trainer = Trainer(model=model)

In [12]:
# Run inference over the sample dataset; the model outputs are read from `.predictions` in the next cell.
predictions = trainer.predict(pred_dataset)

## Model Evaluation - Predictions

In [13]:
logits = predictions.predictions
preds = logits.argmax(-1)  # index of the highest-scoring class for each text
labels = pd.Series(preds).map(model.config.id2label)  # class index -> label name

# Softmax confidence of the predicted class. This is the model's own probability
# for its prediction, not an accuracy: nothing here is compared against true labels.
# The row-wise maximum is subtracted before exponentiating so np.exp cannot overflow;
# the resulting probabilities are mathematically unchanged.
exp_shifted = np.exp(logits - logits.max(-1, keepdims=True))
probs = exp_shifted / exp_shifted.sum(-1, keepdims=True)
scores = probs.max(-1)

In [14]:
# Display the predicted label name for each sample tweet.
labels

0    POSITIVE
1    NEGATIVE
dtype: object

In [15]:
# One row per sample tweet: the text, the predicted class index, its label name,
# and the score computed for that prediction.
df = pd.DataFrame(
    list(zip(tweets, preds, labels, scores)),
    columns=['text', 'pred', 'label', 'score'],
)
df.head()

Unnamed: 0,text,pred,label,score
0,Just had the best meal ever at my favorite res...,1,POSITIVE,0.99894
1,Feeling really frustrated after sitting in tra...,0,NEGATIVE,0.999496


## 3 - Custom Dataset

In [18]:
# Load the tweets CSV and preview the first rows.
# NOTE(review): the absolute path assumes the file sits under /content — confirm for your environment.
data = pd.read_csv("/content/Tweets.csv")
data.head()

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


In [19]:
# Take the "text" column, drop missing entries, cast to str, and convert to a plain
# Python list for the tokenizer.
data_tweets = data["text"].dropna().astype("str").tolist()

In [20]:
# Encode every dataset tweet as one batch (padded and truncated), then wrap the
# encoding so it can be indexed one example at a time.
tokenized_data = tokenizer(data_tweets, padding=True, truncation=True)
pred_data = SimpleDataset(tokenized_data)

In [21]:
# Run inference over the full tweet dataset; the model outputs are read from `.predictions` in the next cell.
evaluate = trainer.predict(pred_data)

In [22]:
new_logits = evaluate.predictions
new_preds = new_logits.argmax(-1)  # index of the highest-scoring class for each tweet
new_labels = pd.Series(new_preds).map(model.config.id2label)  # class index -> label name

# Softmax confidence of the predicted class. This is the model's own probability
# for its prediction, not an accuracy: nothing here is compared against true labels.
# The row-wise maximum is subtracted before exponentiating so np.exp cannot overflow;
# the resulting probabilities are mathematically unchanged.
new_exp_shifted = np.exp(new_logits - new_logits.max(-1, keepdims=True))
new_probs = new_exp_shifted / new_exp_shifted.sum(-1, keepdims=True)
new_scores = new_probs.max(-1)

In [23]:
# One row per dataset tweet. NOTE: 'target' holds the model's predicted class index
# and 'accuracy' holds the score computed for that prediction in the previous cell;
# neither column is derived from the dataset's own sentiment labels.
final_df = pd.DataFrame(
    list(zip(data_tweets, new_preds, new_labels, new_scores)),
    columns=['tweets', 'target', 'sentiment', 'accuracy'],
)
final_df.head(10)

Unnamed: 0,tweets,target,sentiment,accuracy
0,"I`d have responded, if I were going",0,NEGATIVE,0.994793
1,Sooo SAD I will miss you here in San Diego!!!,0,NEGATIVE,0.995364
2,my boss is bullying me...,0,NEGATIVE,0.999434
3,what interview! leave me alone,0,NEGATIVE,0.997404
4,"Sons of ****, why couldn`t they put them on t...",0,NEGATIVE,0.998303
5,http://www.dothebouncy.com/smf - some shameles...,1,POSITIVE,0.994551
6,2am feedings for the baby are fun when he is a...,1,POSITIVE,0.997768
7,Soooo high,1,POSITIVE,0.997417
8,Both of you,0,NEGATIVE,0.829015
9,Journey!? Wow... u just became cooler. hehe....,1,POSITIVE,0.99725
