# Sentiment analysis of Rotten Tomatoes movie reviews using the RoBERTa transformer model

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

plt.style.use('ggplot')

import nltk

In [38]:
# Source CSV: Kaggle "Rotten Tomatoes movies and critic reviews" dataset
# (https://www.kaggle.com/datasets/stefanoleone992/rotten-tomatoes-movies-and-critic-reviews-dataset?resource=download)
REVIEWS_CSV = 'rotten_tomatoes_critic_reviews.csv'
N_REVIEWS = 78  # number of leading rows to keep (presumably a small sample so scoring stays quick)
UNUSED_COLUMNS = ['top_critic', 'publisher_name', 'review_type', 'review_score', 'review_date']

# Read only the rows that are needed instead of loading the whole file and
# slicing afterwards, and build `df` without in-place mutation so re-running
# this cell always yields the same frame.
df = (
    pd.read_csv(REVIEWS_CSV, nrows=N_REVIEWS)
    .drop(columns=UNUSED_COLUMNS)
)

# Sequential 0-based id per review; used as the join key when the model
# scores are merged back onto the reviews.
df['id'] = range(len(df))

## RoBERTa

This is a RoBERTa-base model trained on ~58M tweets and fine-tuned for sentiment analysis with the TweetEval benchmark. It is suitable for English text; for a similar multilingual model, see XLM-T.

In [25]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from scipy.special import softmax
import tqdm as notebook_tqdm
from tqdm import tqdm

In [15]:
# Hugging Face Hub identifier of the pretrained sentiment checkpoint.
# A plain string is enough: there is nothing to interpolate.
MODEL = "cardiffnlp/twitter-roberta-base-sentiment"

# Load the tokenizer and the sequence-classification model for that
# checkpoint. NOTE: this triggers downloads (~500 MB of weights, per the
# progress bars in this cell's output) when the files are not already
# available locally.
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

Downloading: 100%|██████████████████████████████| 747/747 [00:00<00:00, 281kB/s]
Downloading: 100%|███████████████████████████| 899k/899k [00:00<00:00, 1.05MB/s]
Downloading: 100%|████████████████████████████| 456k/456k [00:00<00:00, 625kB/s]
Downloading: 100%|█████████████████████████████| 150/150 [00:00<00:00, 40.9kB/s]
Downloading: 100%|███████████████████████████| 499M/499M [02:14<00:00, 3.70MB/s]


In [16]:
def roberta_scores(example, max_length=None):
    """Score a single text with the loaded RoBERTa sentiment model.

    Uses the notebook-level ``tokenizer`` and ``model`` objects.

    Parameters
    ----------
    example : str
        Text to classify.
    max_length : int, optional
        When given, the tokenized input is truncated to at most this many
        tokens before it is passed to the model. The default (``None``)
        applies no truncation, so an input the model cannot handle still
        raises, exactly as before.

    Returns
    -------
    dict
        Keys ``'roberta_neg'``, ``'roberta_neu'`` and ``'roberta_pos'``
        mapped to the softmax of the model's three output scores.
    """
    import torch  # local import: torch is not imported at notebook level

    tokenizer_kwargs = {'return_tensors': 'pt'}
    if max_length is not None:
        tokenizer_kwargs.update(truncation=True, max_length=max_length)
    encoded_text = tokenizer(example, **tokenizer_kwargs)

    # Inference only: no gradients are needed, so skip autograd bookkeeping.
    with torch.no_grad():
        output = model(**encoded_text)

    # First element of the model output, first (only) item of the batch;
    # softmax turns those raw scores into values that sum to 1.
    scores = softmax(output[0][0].detach().numpy())
    return {
        'roberta_neg': scores[0],
        'roberta_neu': scores[1],
        'roberta_pos': scores[2],
    }

In [39]:
# Score every review; `res` maps review id -> dict of RoBERTa scores.
res = {}
for myid, text in tqdm(zip(df['id'], df['review_content']), total=len(df)):
    # Missing reviews load from CSV as NaN, which is a *truthy* float, so a
    # plain truthiness check would pass them on to the tokenizer. Require a
    # non-empty string instead and skip everything else.
    if not isinstance(text, str) or not text:
        continue
    try:
        res[myid] = roberta_scores(text)
    except RuntimeError:
        # Best effort: report the review that could not be scored and go on.
        print(f'Broke for id {myid}')

100%|███████████████████████████████████████████| 78/78 [00:30<00:00,  2.53it/s]


In [44]:
# `res` maps id -> {score name: value}; transposing gives one row per scored
# review with one column per score. The id index is turned into a regular
# column and the review data is joined back on it. The join key is spelled
# out so that any other column name shared by both frames cannot silently
# become part of the key.
results_df = (
    pd.DataFrame(res).T
    .rename_axis('id')
    .reset_index()
    .merge(df, how='left', on='id')
)

In [45]:
# Preview the first five rows of the merged scores + review table.
results_df.head(n=5)

Unnamed: 0,id,roberta_neg,roberta_neu,roberta_pos,rotten_tomatoes_link,critic_name,review_content
0,0,0.004961,0.241265,0.753774,m/0814255,Andrew L. Urban,A fantasy adventure that fuses Greek mythology...
1,1,0.08727,0.548667,0.364063,m/0814255,Louise Keller,"Uma Thurman as Medusa, the gorgon with a coiff..."
2,2,0.001744,0.062746,0.93551,m/0814255,,With a top-notch cast and dazzling special eff...
3,3,0.048001,0.28599,0.666008,m/0814255,Ben McEachen,Whether audiences will get behind The Lightnin...
4,4,0.578026,0.355937,0.066037,m/0814255,Ethan Alter,What's really lacking in The Lightning Thief i...
