## 1 - Data Preprocessing

In [1]:
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from datetime import datetime
import nltk

In [3]:
data = pd.read_csv("/Users/marco/Library/CloudStorage/OneDrive-ErasmusUniversityRotterdam/University - OD/EUR/MSc Thesis/stocktwits_data.csv")
data

Unnamed: 0,ticker,id,created_at,user,body
0,MMM,574948072,2024-05-31 22:07:52,TheClockworkGroup,"$MMM #NYSE | The markets are unpredictable, bu..."
1,MMM,574928917,2024-05-31 19:37:27,fishinNbanjo,"$MMM wow, that went to hell"
2,MMM,574915317,2024-05-31 18:13:34,RustyToo,$MMM Hear directly from the 3M scientist who w...
3,MMM,574908409,2024-05-31 17:30:20,LiveTradePro,$QQQ Setup confirmed All about follow through...
4,MMM,574908274,2024-05-31 17:29:31,DeepFknValue,@cajunman I was in $MMM preCovid and it just h...
...,...,...,...,...,...
1086042,NVDA,559107987,2024-01-25 00:13:30,Pianonymous,@dracorx What $NVDA bulls are feeling now is w...
1086043,NVDA,559107525,2024-01-25 00:09:20,CharlesXBrown,$LAES $NVDA Is it 12 or 15? Im good either way
1086044,NVDA,559107201,2024-01-25 00:06:21,SonGoku,Nothing is too oversold $TSLA \n\nNothing is t...
1086045,NVDA,559106977,2024-01-25 00:04:15,AthleticBeing,That’s how we trust our conviction and go agai...


In [4]:
data = data.dropna(subset=['created_at', 'ticker', 'body', 'user'])
print(data.isnull().sum())  

ticker        0
id            0
created_at    0
user          0
body          0
dtype: int64


In [5]:
data['created_at'] = pd.to_datetime(data['created_at'])
data = data.sort_values(by=['ticker', 'created_at'])
data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['created_at'] = pd.to_datetime(data['created_at'])


Unnamed: 0,ticker,id,created_at,user,body
9557,A,556346410,2024-01-01 14:05:07,tickeron,How to make the right investment? Pick the bes...
9556,A,556637599,2024-01-04 00:31:07,ChiefOptionsTrader,$A
9555,A,556834165,2024-01-05 15:18:43,DoubleBank,$A IV tramadol worth billions instead of add...
9554,A,556866500,2024-01-05 17:57:28,Serhii,"$A Momentum Indicator for A turns negative, in..."
9553,A,556898971,2024-01-05 21:06:33,risenhoover,$A / Agilent Technologies files form 8-K - UNI...
...,...,...,...,...,...
441926,ZTS,574408980,2024-05-28 13:26:10,KabraxisX,$ZTS
441925,ZTS,574618123,2024-05-29 19:04:06,OLIVER815399,$ZTS who has brought all the shares in the las...
441924,ZTS,574620374,2024-05-29 19:17:11,WitzKash,$ZTS!!!
441923,ZTS,574742557,2024-05-30 16:45:11,KabraxisX,$ZTS May head lower again.


### 1.2 - Textual data cleaning

In [26]:
test = data[data['ticker']=='MMM']['body']
test = test[1]
test

'$MMM wow, that went to hell'

### 1.3 - VADER Sentiment Analysis 

In [7]:
nltk.download('vader_lexicon')
from tqdm.notebook import tqdm
sia = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/marco/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [9]:
res = {}
for i, row in tqdm(data.iterrows(), total=len(data)):
    text = row['body']
    com_id = row['id']
    res[com_id] = sia.polarity_scores(text)

  0%|          | 0/1084175 [00:00<?, ?it/s]

In [10]:
vader = pd.DataFrame(res).T
vader = vader.reset_index().rename(columns={'index': 'id'})
vader = vader.merge(data, how='left')

In [11]:
vader.head()

Unnamed: 0,id,neg,neu,pos,compound,ticker,created_at,user,body
0,556346410,0.0,0.781,0.219,0.6369,A,2024-01-01 14:05:07,tickeron,How to make the right investment? Pick the bes...
1,556637599,0.0,1.0,0.0,0.0,A,2024-01-04 00:31:07,ChiefOptionsTrader,$A
2,556834165,0.0,0.84,0.16,0.2263,A,2024-01-05 15:18:43,DoubleBank,$A IV tramadol worth billions instead of add...
3,556866500,0.27,0.73,0.0,-0.5719,A,2024-01-05 17:57:28,Serhii,"$A Momentum Indicator for A turns negative, in..."
4,556898971,0.0,0.832,0.168,0.8256,A,2024-01-05 21:06:33,risenhoover,$A / Agilent Technologies files form 8-K - UNI...


### 1.4 - RoBERTa Sentiment Analysis

In [13]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from scipy.special import softmax

In [15]:
MODEL = f'cardiffnlp/twitter-roberta-base-sentiment'
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

pytorch_model.bin:  74%|#######3  | 367M/499M [00:00<?, ?B/s]

In [27]:
# VADER polarity score example
sia.polarity_scores(test)

{'neg': 0.371, 'neu': 0.323, 'pos': 0.306, 'compound': -0.2023}

In [28]:
# Run for Roberta Model
encoded_text = tokenizer(test, return_tensors='pt')
output = model(**encoded_text)
scores = output[0][0].detach().numpy()
scores = softmax(scores)
scores_dict = {
    'roberta_neg': scores[0],
    'roberta_neu': scores[1],
    'roberta_pos': scores[2]
}

print(scores_dict)

{'roberta_neg': 0.9415615, 'roberta_neu': 0.051394533, 'roberta_pos': 0.007043897}


In [33]:
def polarity_scores_roberta(test):
    encoded_text = tokenizer(test, return_tensors='pt')
    output = model(**encoded_text)
    scores = output[0][0].detach().numpy()
    scores = softmax(scores)
    scores_dict = {
        'roberta_neg': scores[0],
        'roberta_neu': scores[1],
        'roberta_pos': scores[2]
    }
    return scores_dict

In [36]:
res = {}
for i, row in tqdm(data.iterrows(), total=len(data)):
    try:
        text = row['body']
        com_id = row['id']
        vader_result = sia.polarity_scores(text)
        vader_result_rename = {}
        for key, value in vader_result.items():
            vader_result_rename[f'vader_{key}'] = value
        roberta_result = polarity_scores_roberta(text)
        both = {**vader_result, **roberta_result}
        res[com_id] = both
    except RuntimeError:
        print(f'Broke for id {com_id}')

#  It will take approximately 16/17 hours to run the model for all the comments

  0%|          | 0/1084175 [00:00<?, ?it/s]

Broke for id 569376457


KeyboardInterrupt: 

In [35]:
results_df = pd.DataFrame(res).T
results_df = results_df.reset_index().rename(columns={'index': 'id'})
results_df = results_df.merge(data, how='left')

{'neg': 0.0,
 'neu': 0.781,
 'pos': 0.219,
 'compound': 0.6369,
 'roberta_neg': 0.019772543,
 'roberta_neu': 0.5628837,
 'roberta_pos': 0.41734377}

In [None]:
results_df.head()

### 1.[] - Variables Definition

### 1.[] - Importing Financial Data

## 2 - Linear Models and Factors