Sentiment Analysis by PyABSA

In [5]:
# Import libraries
import pandas as pd  # Data manipulation and saving results to CSV
import sys
sys.path.append('../src')
# Import feature engineering functions
from preprocess import clean_text

In [6]:
# load the dataset
df = pd.read_csv('../data/reddit_raw.csv')
df.head()

Unnamed: 0,subreddit,title,text,url,created,keyword,score
0,ukraine,Volunteering in civilian roles,"Hi,\n\nI’m an American. When the war broke out...",https://www.reddit.com/r/ukraine/comments/1m3v...,2025-07-19 14:54:55,refugee,62
1,ukraine,<3,As a Polish person I just came here to tell yo...,https://www.reddit.com/r/ukraine/comments/1lxf...,2025-07-11 21:03:10,refugee,544
2,ukraine,The Angry Ukrainian Syndrome: Injustice and St...,I found this useful for understanding my own b...,https://www.reddit.com/r/ukraine/comments/1lio...,2025-06-23 20:38:06,refugee,230
3,ukraine,I wrote this letter to my representatives in c...,**Find your representatives here:** [**https:/...,https://www.reddit.com/r/ukraine/comments/1lfq...,2025-06-20 02:45:54,refugee,125
4,ukraine,Looking forward - would love to hear ppl's tho...,"Sorry for the vague question, but I only have ...",https://www.reddit.com/r/ukraine/comments/1kv8...,2025-05-25 19:56:51,refugee,13


In [7]:
# Preprocess the text data
df['text'] = df['text'].apply(clean_text)
df['title'] = df['title'].apply(clean_text)
df.head()

Unnamed: 0,subreddit,title,text,url,created,keyword,score
0,ukraine,Volunteering in civilian roles,"Hi, I’m an American. When the war broke out, I...",https://www.reddit.com/r/ukraine/comments/1m3v...,2025-07-19 14:54:55,refugee,62
1,ukraine,<3,As a Polish person I just came here to tell yo...,https://www.reddit.com/r/ukraine/comments/1lxf...,2025-07-11 21:03:10,refugee,544
2,ukraine,The Angry Ukrainian Syndrome: Injustice and St...,I found this useful for understanding my own b...,https://www.reddit.com/r/ukraine/comments/1lio...,2025-06-23 20:38:06,refugee,230
3,ukraine,I wrote this letter to my representatives in c...,Find your representatives here: Subject: Urgen...,https://www.reddit.com/r/ukraine/comments/1lfq...,2025-06-20 02:45:54,refugee,125
4,ukraine,Looking forward - would love to hear ppl's tho...,"Sorry for the vague question, but I only have ...",https://www.reddit.com/r/ukraine/comments/1kv8...,2025-05-25 19:56:51,refugee,13


Supervised Sentiment Classifier with PyABSA

Optimizations for Apple M1

    ✅ no_cuda=True: Ensures the script uses CPU/MPS, not expecting NVIDIA CUDA.

    ✅ batch_size=8: Prevents memory overload.

    ✅ select(range(n)): Reduces the dataset size to allow quick experiments.

    ✅ num_train_epochs=1: Fast testing, can be increased later.

In [9]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import pandas as pd

model = "cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(model)
model = AutoModelForSequenceClassification.from_pretrained(model)
absa_pipe = pipeline("text-classification", model=model, tokenizer=tokenizer)

# Mapping to your desired output labels
sentiment_mapping = {
    'Positive': 'Positive',
    'Negative': 'Negative',
    'Neutral': 'Neutral',
    None: None
}

def get_text_sentiment_score(text: str):
    """
    Predict sentiment on the whole text (no aspect).
    
    Returns:
    --------
    sentiment_label: str (mapped to POSITIVE/CRITICAL/NEUTRAL)
    score: float
    """
    if pd.notnull(text) and isinstance(text, str) and len(text.strip()) > 0:
        result = absa_pipe(text)[0]  # just pass text alone, no aspect
        sentiment_label = result['label']
        score = result['score']
        return sentiment_mapping.get(sentiment_label, None), score
    return None, None

def add_sentiment_columns(df: pd.DataFrame, text_col='text', save=False, path=None, truncate_text_max_length=2500):
    """
    Add whole-text sentiment columns to dataframe.
    
    Parameters:
    -----------
    df : pandas.DataFrame
        Your dataframe with a column of texts to analyze.
    text_col : str
        Column name for text.
    save : bool
        If True, save to CSV file.
    path : str or None
        File path to save if save=True.
    truncate_text_max_length : int
        Max characters to analyze per text (helps long text).
    
    Returns:
    --------
    pandas.DataFrame with new columns:
      - 'absa_sentiment'
      - 'absa_confidence'
    """
    df_copy = df.copy()
    df_copy[text_col] = df_copy[text_col].apply(lambda x: x[:truncate_text_max_length] if isinstance(x, str) else x)

    sentiments = []
    confidences = []
    for text in df_copy[text_col]:
        sentiment, confidence = get_text_sentiment_score(text)
        sentiments.append(sentiment)
        confidences.append(confidence)
        
    df_copy['absa_sentiment'] = sentiments
    df_copy['absa_confidence'] = confidences

    if save:
        if path is None:
            raise ValueError("Please specify a path to save the dataframe.")
        df_copy.to_csv(path, index=False)

    return df_copy

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use mps:0


In [None]:
sentiment_pipe = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
sentiment_pipe(["I found this Reddit post fantastically discouraging!"])

Device set to use mps:0
  return forward_call(*args, **kwargs)


[{'label': 'negative', 'score': 0.8633114099502563}]

In [None]:
#df = add_sentiment_columns(df, text_col='text', save=True, path='../data/reddit_predicted_sentiment.csv')
#df[['text', 'absa_sentiment', 'absa_confidence']].head()

In [None]:
# Example: sentiment prediction on your Reddit titles or texts
#df['predicted_sentiment'] = df['text'].apply(lambda x: sentiment_pipe(x)[0]['label'])
#df['text','predicted_sentiment'].head()
# Save the results to a CSV file
#df[['title','text','subreddit', 'created', 'keyword', 'score', 'sentiment_pyabsa']].to_csv('../data/reddit_sentiment_predicted.csv', index=False)  