In [1]:
# Data handling
import pandas as pd

# Hugging Face Transformers
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Utilities
import numpy as np
import torch.nn.functional as F

In [3]:
# Load the dataset
df = pd.read_csv("stocks_tweets_merge.csv")

# Display shape and a sample of the tweet text
print("Shape of dataset:", df.shape)
df[["tweet_body"]].sample(5)

Shape of dataset: (4709, 14)


Unnamed: 0,tweet_body
3270,He’s not wrong
3455,Great meme review hosted by Will Smith
1111,Concerning
1409,"The problem is not just Google Gemini, it’s Go..."
3410,Reading the Grok newsfeed in fun mode is aweso...


In [4]:
# Load the tokenizer and model from Hugging Face
MODEL_NAME = "cardiffnlp/twitter-roberta-base-sentiment"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [5]:
# Define sentiment labels in the order used by the model
labels = ['negative', 'neutral', 'positive']

def get_sentiment(text):
    # Preprocessing (Roberta expects lowercase text)
    encoded_input = tokenizer(text.lower(), return_tensors='pt', truncation=True, padding=True).to(device)

    # Run model inference
    with torch.no_grad():
        output = model(**encoded_input)
        scores = F.softmax(output.logits, dim=1)
        scores = scores.cpu().numpy()[0]  # Move back to CPU and get first row

    # Get label with highest score
    label = labels[np.argmax(scores)]

    return label, scores


In [6]:
# Test on a small sample (first 10 tweets)
sample_df = df.head(10).copy()

# Apply sentiment function
results = sample_df['tweet_body'].apply(get_sentiment)

# Extract label and scores
sample_df['roberta_sentiment'] = results.apply(lambda x: x[0])
sample_df['roberta_neg_score'] = results.apply(lambda x: x[1][0])
sample_df['roberta_neu_score'] = results.apply(lambda x: x[1][1])
sample_df['roberta_pos_score'] = results.apply(lambda x: x[1][2])

# Show the results
sample_df[['tweet_body', 'roberta_sentiment', 'roberta_neg_score', 'roberta_neu_score', 'roberta_pos_score']]


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Unnamed: 0,tweet_body,roberta_sentiment,roberta_neg_score,roberta_neu_score,roberta_pos_score
0,Yup,neutral,0.215288,0.4943,0.290412
1,Massive public manipulation,negative,0.72385,0.266601,0.009549
2,🤣🤣,neutral,0.235308,0.454288,0.310404
3,Prescient,neutral,0.126566,0.668711,0.204723
4,Congratulations Tesla team on a great year!!,positive,0.001338,0.007122,0.99154
5,Improved longform posts,positive,0.010238,0.394901,0.594861
6,Prelude to Mars,neutral,0.123626,0.793278,0.083096
7,Key milestone completed for flight 2,positive,0.002365,0.445713,0.551922
8,“Ads” like this where you learn or are enterta...,positive,0.00249,0.027561,0.969949
9,Great conversation with @NarendraModi,positive,0.002627,0.107091,0.890282


In [7]:
from tqdm import tqdm
tqdm.pandas()  # Enable progress_apply

# Run sentiment analysis across the full dataset
results = df['tweet_body'].progress_apply(get_sentiment)

# Store results
df['roberta_sentiment'] = results.apply(lambda x: x[0])
df['roberta_neg_score'] = results.apply(lambda x: x[1][0])
df['roberta_neu_score'] = results.apply(lambda x: x[1][1])
df['roberta_pos_score'] = results.apply(lambda x: x[1][2])


100%|██████████████████████████████████████████████████████████████████████████████| 4709/4709 [03:37<00:00, 21.66it/s]


In [8]:
df[['tweet_body', 'roberta_sentiment', 'roberta_neg_score', 'roberta_neu_score', 'roberta_pos_score']].sample(10)


Unnamed: 0,tweet_body,roberta_sentiment,roberta_neg_score,roberta_neu_score,roberta_pos_score
2276,This essentially means that X Premium (fka Twi...,neutral,0.151266,0.647852,0.200882
2903,141 MPGe!,neutral,0.034418,0.633666,0.331916
147,That would be great,positive,0.00335,0.028875,0.967774
3747,"Um, @satyanadella, this is illegal …",negative,0.730242,0.253893,0.015865
2322,This is you with federal taxpayer money https:...,neutral,0.312723,0.627984,0.059293
1681,Congrats Falcon &amp; Starlink teams!,positive,0.00094,0.02427,0.97479
3888,Each rocket engine produces twice as much thru...,neutral,0.142995,0.753092,0.103913
1340,I tried using both 𝕏 and legacy media this wee...,negative,0.77703,0.194122,0.028848
3069,Why is corporate journalism rushing to defend ...,negative,0.751615,0.238433,0.009953
4400,You cell phone in New Zealand now works anywhere!,positive,0.003502,0.116861,0.879636


In [9]:
df['roberta_sentiment'].value_counts()


roberta_sentiment
neutral     2029
positive    1377
negative    1303
Name: count, dtype: int64

In [10]:
# For example, show a few negative tweets
df[df['roberta_sentiment'] == 'negative'][['tweet_body', 'roberta_neg_score']].head(10)


Unnamed: 0,tweet_body,roberta_neg_score
1,Massive public manipulation,0.72385
11,Why ESG is the devil …,0.686758
24,Was the Internet Archive manipulated for nepot...,0.528427
33,WhatsApp cannot be trusted,0.891644
37,Anyone making materially false statements on t...,0.602115
54,"This is a life leader launch, so more risk tha...",0.575328
67,Insane,0.650554
90,Major and immediate action is needed to preven...,0.497616
93,It’s harder than it looks,0.711045
94,A 12-year-old script kiddie could hack into Ye...,0.86046


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4709 entries, 0 to 4708
Data columns (total 18 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   tweet_id           4709 non-null   int64  
 1   tweet_body         4709 non-null   object 
 2   retweet_count      4709 non-null   int64  
 3   reply_count        4709 non-null   int64  
 4   like_count         4709 non-null   int64  
 5   quote_count        4709 non-null   float64
 6   view_count         4356 non-null   float64
 7   bookmark_count     4709 non-null   int64  
 8   date               4709 non-null   object 
 9   clean_text         4688 non-null   object 
 10  Open               4709 non-null   float64
 11  Close              4709 non-null   float64
 12  Volume             4709 non-null   int64  
 13  pct_change         4709 non-null   float64
 14  roberta_sentiment  4709 non-null   object 
 15  roberta_neg_score  4709 non-null   float32
 16  roberta_neu_score  4709 

In [12]:
df.to_csv("tweets_with_roberta_sentiment.csv", index=False)
