In [4]:
import torch
import pandas as pd

from pathlib import Path
from transformers import AutoTokenizer
from transformers import pipeline
from transformers import AutoModelForSequenceClassification
from scipy.special import softmax


# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Input location: the newline-delimited JSON reviews file under data/raw.
DATA_DIR_RAW = Path("data") / "raw"
REVIEWS_CLEAN = DATA_DIR_RAW.joinpath("review_clean.ndjson")

In [2]:
# Read the newline-delimited JSON reviews (one record per line) into a
# DataFrame whose columns use pyarrow-backed dtypes.
df = pd.read_json(
    REVIEWS_CLEAN,
    lines=True,
    dtype_backend="pyarrow",
)

In [None]:
# Hugging Face checkpoint used for sentiment classification.
MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"

# Tokenizer and classifier are both loaded from the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(MODEL)

# Load the classifier from safetensors weights and move it onto `device`
# in a single expression.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL, use_safetensors=True
).to(device)

# Wrap model + tokenizer in a pipeline so raw strings can be scored directly;
# each result is a dict with "label" and "score" keys.
sentiment = pipeline(
    "sentiment-analysis",
    model=model,
    tokenizer=tokenizer,
    device=device,
)


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda


In [21]:
# Smoke-test the pipeline on the first ten reviews; the copy keeps `sample`
# independent of `df`.
sample = df.head(10).copy()

texts = sample["text"].tolist()

# Truncate at tokenization time. The RoBERTa-base checkpoint accepts at most
# 512 tokens, and without truncation a single long review makes the whole call
# fail with a tensor size mismatch. `max_length` is passed explicitly rather
# than relying on the tokenizer's configured limit, which may be unset for
# this checkpoint (in which case `truncation=True` alone would not truncate).
# Reviews that already fit are scored exactly as before.
results = sentiment(texts, truncation=True, max_length=512)

In [22]:
# Print each sampled review next to its star rating and the predicted
# sentiment label/score, separated by a horizontal rule.
separator = "-" * 80
for row_pos, (review_text, prediction) in enumerate(zip(texts, results)):
    stars = sample.iloc[row_pos]["stars"]
    # Flatten newlines and cap at 400 characters so each review stays compact.
    snippet = review_text[:400].replace("\n", " ")
    label = prediction["label"]
    score = prediction["score"]
    print(separator)
    print(f"Stars: {stars}")
    print(snippet)
    print(f"\n→ sentiment: {label}, score: {score:.3f}")


--------------------------------------------------------------------------------
Stars: 3
If you decide to eat here, just be aware it is going to take about 2 hours from beginning to end. We have tried it multiple times, because I want to like it! I have been to it's other locations in NJ and never had a bad experience.   The food is good, but it takes a very long time to come out. The waitstaff is very young, but usually pleasant. We have just had too many experiences where we spent w

→ sentiment: positive, score: 0.571
--------------------------------------------------------------------------------
Stars: 5
I've taken a lot of spin classes over the years, and nothing compares to the classes at Body Cycle. From the nice, clean space and amazing bikes, to the welcoming and motivating instructors, every class is a top notch work out.  For anyone who struggles to fit workouts in, the online scheduling system makes it easy to plan ahead (and there's no need to line up way in advanced lik