In [1]:
from transformers import pipeline

In [2]:
pipeline_sa = pipeline("sentiment-analysis")

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [3]:
input_data = ["I don't know how you can be so stupid", "You mean a lot to me", "We are not friends anymore."]

In [4]:
pipeline_sa(input_data)

[{'label': 'NEGATIVE', 'score': 0.9943351149559021},
 {'label': 'POSITIVE', 'score': 0.8074020147323608},
 {'label': 'NEGATIVE', 'score': 0.998809814453125}]

Sentiment analysis using roBERTa

In [5]:
from transformers import (AutoModelForSequenceClassification, 
                          TFAutoModelForSequenceClassification,
                          AutoTokenizer)
import numpy as np
from scipy.special import softmax
import csv
import urllib.request

In [6]:
def preprocess_text(text):
    new = []
    
    for chunk in text.split(" "):
        chunk = '@user' if chunk.startswith('@') and len(chunk) > 1 else chunk
        chunk = 'http' if chunk.startswith('http') else chunk
        
        new.append(chunk)
    return " ".join(new)

In [7]:
task = 'sentiment'
MODEL = f"cardiffnlp/twitter-roberta-base-{task}"

In [8]:
tokenizer = AutoTokenizer.from_pretrained(MODEL)

In [9]:
labels = []
mapping = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"

with urllib.request.urlopen(mapping) as m:
    html = m.read().decode("utf-8").split("\n")
    csv_reader = csv.reader(html, delimiter="\t")

labels = [row[1] for row in csv_reader if len(row) > 1]

In [10]:
labels

['negative', 'neutral', 'positive']

Pretraining:

In [11]:
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
model.save_pretrained(MODEL)

Downloading pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

In [12]:
example_text = "Have a nice day! :D"
example_text_2 = "Don't raise your voice."

In [15]:
pp_example_text = preprocess_text(example_text)
pp_example_text_2 = preprocess_text(example_text_2)

encoded_text = tokenizer(pp_example_text, return_tensors='pt')
encoded_text_2 = tokenizer(pp_example_text_2, return_tensors='pt')

In [16]:
sa_output = model(**encoded_text)
sa_output_2 = model(**encoded_text_2)

In [24]:
scores = sa_output[0][0].detach().numpy()
scores = softmax(scores)

scores_2 = sa_output_2[0][0].detach().numpy()
scores_2 = softmax(scores_2)

In [25]:
scores

array([0.00145376, 0.01081943, 0.98772687], dtype=float32)

In [26]:
scores_2

array([0.51257557, 0.4641963 , 0.02322812], dtype=float32)

In [29]:
rank = np.argsort(scores)
rank = rank[::-1]

rank_2 = np.argsort(scores_2)
rank_2 = rank_2[::-1]

In [38]:
print(example_text)
for i in range(scores.shape[0]):
    label = labels[rank[i]]
    score = scores[rank[i]]
    
    print(f"{i+1} {label} {np.round(float(score), 4)}")

Have a nice day! :D
1 positive 0.9877
2 neutral 0.0108
3 negative 0.0015


In [39]:
print(example_text_2)
for i in range(scores_2.shape[0]):
    label = labels[rank[i]]
    score = scores_2[rank[i]]
    
    print(f"{i+1} {label} {np.round(float(score), 4)}")

Don't raise your voice.
1 positive 0.0232
2 neutral 0.4642
3 negative 0.5126
