In [1]:
## Read Data
import csv
import os
import re

import numpy as np
import pandas as pd
import torch


In [2]:
# Location of the IMDb reviews CSV. Set the IMDB_CSV_PATH environment variable to
# run the notebook on another machine; when it is unset, the original local path
# is used so existing behavior is unchanged.
IMDB_CSV_PATH = os.environ.get(
    'IMDB_CSV_PATH',
    '/Users/andrefuentes/Desktop/Python_Project/imdb.csv',
)
data = pd.read_csv(IMDB_CSV_PATH)

In [3]:
len(data)

261391

In [4]:
data.columns

Index(['review_id', 'review', 'movie_title', 'tconst', 'rating'], dtype='object')

In [5]:
data.head()

Unnamed: 0,review_id,review,movie_title,tconst,rating
0,1,"""Heres a perfect example of the pitfalls of wr...",Carmencita (Short 1894),tt0000001,
1,2,"""This film is part of the series of short Edis...",Carmencita (Short 1894),tt0000001,
2,3,"""Objectively, theres nothing really WRONG with...",Carmencita (Short 1894),tt0000001,
3,4,"""This is the first movie in what quickly becam...",Carmencita (Short 1894),tt0000001,
4,5,"""Watching a film like this, it becomes fairly ...",Carmencita (Short 1894),tt0000001,


In [6]:
# Split movie_title into three new columns: title, kind_of_movie and year.
# Example: 'Carmencita (Short 1894)' -> 'Carmencita', 'Short', '1894'.
# Rows whose movie_title does not match the pattern get NaN in all three columns.
# NOTE(review): the pattern requires a kind followed by a space before the
# four-digit year, so a title written as 'Name (1999)' would not match -- confirm
# whether such titles occur in the data and are meant to be excluded.
title_parts = data['movie_title'].str.extract(r'^(.*?) \((.*?) (\d{4})\)$')
data[['title', 'kind_of_movie', 'year']] = title_parts


In [7]:
# Expose the tconst column (values such as 'tt0000001') under the name movie_id.
column_renames = {'tconst': 'movie_id'}
data = data.rename(columns=column_renames)

In [8]:
# movie_title has already been split into title / kind_of_movie / year, and
# rating is not used by the rest of the notebook, so both columns are removed.
obsolete_columns = ['movie_title', 'rating']
data = data.drop(columns=obsolete_columns)

In [9]:
data.columns

Index(['review_id', 'review', 'movie_id', 'title', 'kind_of_movie', 'year'], dtype='object')

In [10]:
# Put the descriptive movie columns first, followed by the review text and the ids.
column_order = ['title', 'kind_of_movie', 'year', 'review', 'movie_id', 'review_id']
data = data[column_order]

In [11]:
# Distribution of reviews per title: take the title attached to each review_id,
# then count how many times each title occurs.
title_per_review = data.groupby('review_id')['title'].first()
review_distribution = title_per_review.value_counts()

review_distribution

Alice in Wonderland        67
A Christmas Carol          58
The Wind in the Willows    48
Rebecca                    43
Oliver Twist               42
                           ..
A Small Town Princess       1
Retik the Moon Menace       1
Satan's Storybook           1
Roe vs. Wade                1
The 3,000 Mile Chase        1
Name: title, Length: 7130, dtype: int64

In [12]:
# Keep only the rows belonging to titles that have at least 5 reviews.
reviews_by_title = data.groupby('title')
filtered_data = reviews_by_title.filter(lambda title_reviews: len(title_reviews) >= 5)

In [13]:
len(filtered_data)

30202

# Roberta model exploration
- https://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment
- https://huggingface.co/cardiffnlp/twitter-xlm-roberta-base-sentiment
- https://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment

# Comparison: Pretrained Language Model (RoBERTa) vs. TextBlob (NLTK)


### Pretrained Language Model:
The `twitter-xlm-roberta-base-sentiment` model is a transformer-based model fine-tuned for sentiment analysis on a multilingual dataset (specifically, tweets). It can capture more nuanced meanings, slang, and contextual variations, especially in different languages or on social media.

In contrast, **TextBlob** is a lexicon-based approach. It uses predefined word sentiment scores to determine sentiment, which can sometimes fail to capture context, irony, or more complex sentence structures.

### Deep Learning vs. Lexicon-Based:
Transformer models like **XLM-RoBERTa** are pretrained on vast amounts of text and then fine-tuned on specific tasks, allowing them to generalize better across different types of text. This leads to more accurate sentiment predictions, especially in real-world use cases.

TextBlob, based on **nltk**, is more rule-based and not as flexible. Its sentiment analysis works well for simpler sentences, but it may struggle with sarcasm, negation, or idiomatic expressions.

### Multilingual Capability:
The `twitter-xlm-roberta-base-sentiment` model is multilingual, designed to understand and analyze sentiment in many languages.

TextBlob primarily works in English, limiting its usefulness for non-English data.


In [None]:
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
from scipy.special import softmax

# Load tokenizer and model
# NOTE(review): the markdown above discusses the cardiffnlp (XLM-)RoBERTa sentiment
# models, but the checkpoint loaded here is a DistilBERT model. Its name indicates
# SST-2 fine-tuning; SST-2 is presumably a two-class (negative/positive) task, so
# the model likely emits two logits -- confirm via model.config.id2label.


MODEL = "distilbert-base-uncased-finetuned-sst-2-english"

# Both objects are loaded by checkpoint name and are read as globals by
# predict_sentiment_batch below.
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

# Batch size for processing
# Number of reviews handed to predict_sentiment_batch per call in the loop below.
batch_size = 16

# Function to predict sentiment for a batch of feedback
def predict_sentiment_batch(feedbacks):
    """Predict a sentiment label for every text in a batch.

    Uses the module-level ``tokenizer`` and ``model``. The label set is chosen
    from the number of logits the model returns, so a two-class model is no
    longer read through a three-entry label list (which reported the second
    class as 'neutral').

    Parameters
    ----------
    feedbacks : list of str
        Texts to classify. An empty list returns an empty list.

    Returns
    -------
    list of str
        One label per input text, in input order: 'rotten' or 'ripe' for a
        two-class model, plus 'neutral' for a three-class model.

    Raises
    ------
    ValueError
        If the model returns a number of classes other than 2 or 3.
    """
    # Nothing to classify; skip the tokenizer and model entirely.
    if len(feedbacks) == 0:
        return []

    # Tokenize the input batch; texts longer than 512 tokens are truncated.
    encoded_feedbacks = tokenizer(feedbacks, return_tensors='pt', truncation=True, padding=True, max_length=512)

    # Predict sentiment with no gradient calculation (for inference)
    with torch.no_grad():
        outputs = model(**encoded_feedbacks)

    # Convert the logits of the whole batch to per-row probabilities at once.
    probabilities = softmax(outputs.logits.detach().cpu().numpy(), axis=-1)

    # Assumes class index 0 is the most negative class and the last index the
    # most positive one -- TODO confirm against model.config.id2label.
    labels_by_class_count = {
        2: ['rotten', 'ripe'],
        3: ['rotten', 'neutral', 'ripe'],
    }
    num_classes = probabilities.shape[-1]
    if num_classes not in labels_by_class_count:
        raise ValueError(
            f"Cannot map {num_classes} model classes to sentiment labels; expected 2 or 3."
        )
    sentiment_labels = labels_by_class_count[num_classes]

    return [sentiment_labels[class_index] for class_index in probabilities.argmax(axis=-1)]

# Predict sentiments batch by batch, collecting the labels in review order.
# filtered_data keeps the row labels of the unfiltered frame, so its index is not
# 0..n-1; slicing with .loc[i:i+batch_size-1] would select by label rather than by
# position and not line up with the batch. Collecting the predictions in a list
# and assigning them once keeps every label on the row it was predicted for.
reviews = filtered_data['review'].tolist()
predicted_sentiments = []
for start in range(0, len(reviews), batch_size):
    feedback_batch = reviews[start:start + batch_size]
    predicted_sentiments.extend(predict_sentiment_batch(feedback_batch))

# A plain list is assigned by position; its length equals the number of rows.
filtered_data = filtered_data.assign(sentiment=predicted_sentiments)

# Check the results
print(filtered_data[['review', 'sentiment']])
