In [2]:
# Data handling
import pandas as pd

# Hugging Face Transformers
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Utilities
import numpy as np
import torch.nn.functional as F

In [3]:
# Load the dataset
df = pd.read_csv("stocks_tweets_merge.csv")

# Display shape and a sample of the tweet text
print("Shape of dataset:", df.shape)
df[["tweet_body"]].sample(5)

Shape of dataset: (4709, 14)


Unnamed: 0,tweet_body
1508,The propaganda level in legacy media has becom...
1045,So true 😂
2234,Artificial intelligence discussion with PM Net...
831,True
2501,Those are just the explicit DEI grants found. ...


In [4]:
# Load the tokenizer and model from Hugging Face
MODEL_NAME = "cardiffnlp/twitter-roberta-base-sentiment"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [5]:
# Define sentiment labels in the order used by the model
labels = ['negative', 'neutral', 'positive']

def get_sentiment(text):
    # Preprocessing (Roberta expects lowercase text)
    encoded_input = tokenizer(text.lower(), return_tensors='pt', truncation=True, padding=True).to(device)

    # Run model inference
    with torch.no_grad():
        output = model(**encoded_input)
        scores = F.softmax(output.logits, dim=1)
        scores = scores.cpu().numpy()[0]  # Move back to CPU and get first row

    # Get label with highest score
    label = labels[np.argmax(scores)]

    return label, scores


In [6]:
# Test on a small sample (first 10 tweets)
sample_df = df.head(10).copy()

# Apply sentiment function
results = sample_df['tweet_body'].apply(get_sentiment)

# Extract label and scores
sample_df['roberta_sentiment'] = results.apply(lambda x: x[0])
sample_df['roberta_neg_score'] = results.apply(lambda x: x[1][0])
sample_df['roberta_neu_score'] = results.apply(lambda x: x[1][1])
sample_df['roberta_pos_score'] = results.apply(lambda x: x[1][2])

# Show the results
sample_df[['tweet_body', 'roberta_sentiment', 'roberta_neg_score', 'roberta_neu_score', 'roberta_pos_score']]


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Unnamed: 0,tweet_body,roberta_sentiment,roberta_neg_score,roberta_neu_score,roberta_pos_score
0,Yup,neutral,0.215288,0.4943,0.290412
1,Massive public manipulation,negative,0.72385,0.266601,0.009549
2,🤣🤣,neutral,0.235308,0.454288,0.310404
3,Prescient,neutral,0.126566,0.668711,0.204723
4,Congratulations Tesla team on a great year!!,positive,0.001338,0.007122,0.99154
5,Improved longform posts,positive,0.010238,0.394901,0.594861
6,Prelude to Mars,neutral,0.123626,0.793278,0.083096
7,Key milestone completed for flight 2,positive,0.002365,0.445713,0.551922
8,“Ads” like this where you learn or are enterta...,positive,0.00249,0.027561,0.969949
9,Great conversation with @NarendraModi,positive,0.002627,0.107091,0.890282


In [8]:
from tqdm import tqdm
tqdm.pandas()  # Enable progress_apply

# Run sentiment analysis across the full dataset
results = df['tweet_body'].progress_apply(get_sentiment)

# Store results
df['roberta_sentiment'] = results.apply(lambda x: x[0])
df['roberta_neg_score'] = results.apply(lambda x: x[1][0])
df['roberta_neu_score'] = results.apply(lambda x: x[1][1])
df['roberta_pos_score'] = results.apply(lambda x: x[1][2])


100%|██████████████████████████████████████████████████████████████████████████████| 4709/4709 [03:48<00:00, 20.57it/s]


In [9]:
df[['tweet_body', 'roberta_sentiment', 'roberta_neg_score', 'roberta_neu_score', 'roberta_pos_score']].sample(10)


Unnamed: 0,tweet_body,roberta_sentiment,roberta_neg_score,roberta_neu_score,roberta_pos_score
1059,@cleantechnica Oh and umm …,neutral,0.097294,0.84474,0.057966
2534,They are importing voters,negative,0.677026,0.309321,0.013653
827,That is how the scam works,negative,0.755479,0.233269,0.011252
452,"While many other countries are worse, America ...",negative,0.932152,0.062728,0.00512
439,You can now play videos on most TVs from your ...,neutral,0.022685,0.574556,0.402759
2740,Important,neutral,0.214522,0.541002,0.244477
933,Nothing is more dangerous than getting between...,negative,0.82044,0.162026,0.017534
1014,Exactly,neutral,0.216276,0.534824,0.248901
24,Was the Internet Archive manipulated for nepot...,negative,0.528427,0.460051,0.011522
3079,Second launch today,neutral,0.01499,0.896385,0.088625


In [10]:
df['roberta_sentiment'].value_counts()


roberta_sentiment
neutral     2029
positive    1377
negative    1303
Name: count, dtype: int64

In [11]:
# For example, show a few negative tweets
df[df['roberta_sentiment'] == 'negative'][['tweet_body', 'roberta_neg_score']].head(10)


Unnamed: 0,tweet_body,roberta_neg_score
1,Massive public manipulation,0.72385
11,Why ESG is the devil …,0.686758
24,Was the Internet Archive manipulated for nepot...,0.528427
33,WhatsApp cannot be trusted,0.891644
37,Anyone making materially false statements on t...,0.602115
54,"This is a life leader launch, so more risk tha...",0.575328
67,Insane,0.650554
90,Major and immediate action is needed to preven...,0.497616
93,It’s harder than it looks,0.711045
94,A 12-year-old script kiddie could hack into Ye...,0.86046


In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4709 entries, 0 to 4708
Data columns (total 18 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   tweet_id           4709 non-null   int64  
 1   tweet_body         4709 non-null   object 
 2   retweet_count      4709 non-null   int64  
 3   reply_count        4709 non-null   int64  
 4   like_count         4709 non-null   int64  
 5   quote_count        4709 non-null   float64
 6   view_count         4356 non-null   float64
 7   bookmark_count     4709 non-null   int64  
 8   date               4709 non-null   object 
 9   clean_text         4688 non-null   object 
 10  Open               4709 non-null   float64
 11  Close              4709 non-null   float64
 12  Volume             4709 non-null   int64  
 13  pct_change         4709 non-null   float64
 14  roberta_sentiment  4709 non-null   object 
 15  roberta_neg_score  4709 non-null   float32
 16  roberta_neu_score  4709 

In [14]:
# Calculate Sentiment Polarity: This subtracts the negative sentiment score from the positive one.
df['sentiment_polarity'] = df['roberta_pos_score'] - df['roberta_neg_score']
df.head()

Unnamed: 0,tweet_id,tweet_body,retweet_count,reply_count,like_count,quote_count,view_count,bookmark_count,date,clean_text,Open,Close,Volume,pct_change,roberta_sentiment,roberta_neg_score,roberta_neu_score,roberta_pos_score,sentiment_polarity
0,1655978502187778073,Yup,3255,3747,39533,225.0,11392206.0,281,2023-05-09,yup,168.949997,169.149994,88965000,0.118376,neutral,0.215288,0.4943,0.290412,0.075124
1,1655968899903418373,Massive public manipulation,9811,2694,49528,534.0,14404853.0,1241,2023-05-09,massive public manipulation,168.949997,169.149994,88965000,0.118376,negative,0.72385,0.266601,0.009549,-0.714301
2,1646228474628280326,🤣🤣,10198,5076,108462,430.0,16690340.0,340,2023-04-12,,190.740005,180.539993,150256300,-5.3476,neutral,0.235308,0.454288,0.310404,0.075095
3,1640171198091866114,Prescient,9193,5118,56272,572.0,25169601.0,3792,2023-03-27,prescient,194.419998,191.809998,120851600,-1.342455,neutral,0.126566,0.668711,0.204723,0.078156
4,1742235895166652609,Congratulations Tesla team on a great year!!,5222,4154,67751,248.0,17889197.0,283,2024-01-02,congratulations tesla team on a great year,250.080002,248.419998,104654200,-0.663789,positive,0.001338,0.007122,0.99154,0.990203


In [15]:
# Calculate Engagement Score: This adds up likes and retweets to quantify how much attention a tweet got.
df['engagement_score'] = df['like_count'] + df['retweet_count']
df.head()

Unnamed: 0,tweet_id,tweet_body,retweet_count,reply_count,like_count,quote_count,view_count,bookmark_count,date,clean_text,Open,Close,Volume,pct_change,roberta_sentiment,roberta_neg_score,roberta_neu_score,roberta_pos_score,sentiment_polarity,engagement_score
0,1655978502187778073,Yup,3255,3747,39533,225.0,11392206.0,281,2023-05-09,yup,168.949997,169.149994,88965000,0.118376,neutral,0.215288,0.4943,0.290412,0.075124,42788
1,1655968899903418373,Massive public manipulation,9811,2694,49528,534.0,14404853.0,1241,2023-05-09,massive public manipulation,168.949997,169.149994,88965000,0.118376,negative,0.72385,0.266601,0.009549,-0.714301,59339
2,1646228474628280326,🤣🤣,10198,5076,108462,430.0,16690340.0,340,2023-04-12,,190.740005,180.539993,150256300,-5.3476,neutral,0.235308,0.454288,0.310404,0.075095,118660
3,1640171198091866114,Prescient,9193,5118,56272,572.0,25169601.0,3792,2023-03-27,prescient,194.419998,191.809998,120851600,-1.342455,neutral,0.126566,0.668711,0.204723,0.078156,65465
4,1742235895166652609,Congratulations Tesla team on a great year!!,5222,4154,67751,248.0,17889197.0,283,2024-01-02,congratulations tesla team on a great year,250.080002,248.419998,104654200,-0.663789,positive,0.001338,0.007122,0.99154,0.990203,72973


In [16]:
# Select and Reorder Columns for Tableau: This step builds a cleaner DataFrame with just the fields you need for a Tableau dashboard about sentiment & price volatility.
tableau_df = df[[
    'tweet_id', 'date', 'tweet_body',
    'roberta_sentiment', 'roberta_pos_score', 'roberta_neg_score', 'roberta_neu_score', 'sentiment_polarity',
    'like_count', 'retweet_count', 'engagement_score',
    'Open', 'Close', 'pct_change'
]]

tableau_df.head()

Unnamed: 0,tweet_id,date,tweet_body,roberta_sentiment,roberta_pos_score,roberta_neg_score,roberta_neu_score,sentiment_polarity,like_count,retweet_count,engagement_score,Open,Close,pct_change
0,1655978502187778073,2023-05-09,Yup,neutral,0.290412,0.215288,0.4943,0.075124,39533,3255,42788,168.949997,169.149994,0.118376
1,1655968899903418373,2023-05-09,Massive public manipulation,negative,0.009549,0.72385,0.266601,-0.714301,49528,9811,59339,168.949997,169.149994,0.118376
2,1646228474628280326,2023-04-12,🤣🤣,neutral,0.310404,0.235308,0.454288,0.075095,108462,10198,118660,190.740005,180.539993,-5.3476
3,1640171198091866114,2023-03-27,Prescient,neutral,0.204723,0.126566,0.668711,0.078156,56272,9193,65465,194.419998,191.809998,-1.342455
4,1742235895166652609,2024-01-02,Congratulations Tesla team on a great year!!,positive,0.99154,0.001338,0.007122,0.990203,67751,5222,72973,250.080002,248.419998,-0.663789


In [17]:
tableau_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4709 entries, 0 to 4708
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   tweet_id            4709 non-null   int64  
 1   date                4709 non-null   object 
 2   tweet_body          4709 non-null   object 
 3   roberta_sentiment   4709 non-null   object 
 4   roberta_pos_score   4709 non-null   float32
 5   roberta_neg_score   4709 non-null   float32
 6   roberta_neu_score   4709 non-null   float32
 7   sentiment_polarity  4709 non-null   float32
 8   like_count          4709 non-null   int64  
 9   retweet_count       4709 non-null   int64  
 10  engagement_score    4709 non-null   int64  
 11  Open                4709 non-null   float64
 12  Close               4709 non-null   float64
 13  pct_change          4709 non-null   float64
dtypes: float32(4), float64(3), int64(4), object(3)
memory usage: 441.6+ KB


In [18]:
tableau_df.to_csv("tableau_ready_sentiment_data.csv", index=False)