Now we have labels indicating whether the price goes up over several different time horizons. In this notebook we will use various pre-trained models to estimate sentiment scores for the Reddit posts.

In [91]:
import pandas as pd
import os

# Load the filtered Reddit posts; the CSV's first column becomes the index.
posts_csv_path = os.path.join(os.getcwd(), 'data', 'filtered_posts.csv')
reddit_df = pd.read_csv(posts_csv_path, index_col=0)
reddit_df.head()

Unnamed: 0,created,title,selftext,upvote_ratio,score,gilded,total_awards_received,num_comments,stock,subreddit
145,2021-01-22,Hold the line. Stocks have no time limit. Shor...,We have plenty of solid DD that has a fair val...,0.92,9,0,0,0,TWTR,gme
239,2021-01-23,Is it too late to buy GME??,I’m a 19yr student with around 500 dollars spa...,0.96,35,0,0,22,GME,gme
437,2021-01-25,BB,"Need some reinforcements at BB stock, trying t...",0.43,0,0,0,1,BB,gme
753,2021-01-26,gme,im nervous,1.0,4,0,0,6,GME,gme
1060,2021-01-27,10k members of the r/GME community,I know I may have joined late but fuck it I’m ...,1.0,4,0,0,1,GME,gme


In [5]:
import nltk
# Fetch the 'vader_lexicon' NLTK package; the log below shows where it is stored locally.
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\cerik\AppData\Roaming\nltk_data...


True

In [44]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Single shared VADER analyzer, reused by vader_analyze_sentiment for every post.
sid = SentimentIntensityAnalyzer()
def vader_analyze_sentiment(x):
    """Score a piece of text with the module-level VADER analyzer ``sid``.

    Parameters
    ----------
    x : str
        Text to score (a post title or a post body).

    Returns
    -------
    list of float
        ``[neg, neu, pos, compound]``, always in this order, which is the
        order the ``nltk_*_neg / _neu / _pos / _comb`` columns expect.
    """
    scores = sid.polarity_scores(x)
    # Select the values by key rather than relying on the dict's iteration
    # order, so a score can never silently land in the wrong column.
    return [scores[key] for key in ('neg', 'neu', 'pos', 'compound')]


# Quick sanity check of the scorer on a single hand-written sentence.
vader_analyze_sentiment('I expect GME to drop soon, might be a good opportunity to buy the dip')

[0.118, 0.562, 0.32, 0.5574]

In [47]:
from tqdm.notebook import tqdm

# Score every post twice with VADER: once for its title and once for its body.
title_sentiments, text_sentiments = [], []
post_pairs = zip(reddit_df['title'], reddit_df['selftext'])
for post_title, post_body in tqdm(post_pairs, total=len(reddit_df)):
    title_sentiments.append(vader_analyze_sentiment(post_title))
    text_sentiments.append(vader_analyze_sentiment(post_body))



  0%|          | 0/41272 [00:00<?, ?it/s]

In [49]:
import numpy as np
# Turn both lists of per-post score lists into 2-D arrays and show their shapes.
title_sentiments, text_sentiments = map(np.array, (title_sentiments, text_sentiments))
title_sentiments.shape, text_sentiments.shape

((41272, 4), (41272, 4))

In [79]:
# Sanity check: placing the title and text scores side by side gives eight columns.
stacked_scores = np.concatenate((title_sentiments, text_sentiments), axis=1)
stacked_scores.shape

(41272, 8)

In [80]:
# Four VADER scores for the title, followed by the same four for the body text.
nltk_columns = [
    'nltk_title_neg', 'nltk_title_neu', 'nltk_title_pos', 'nltk_title_comb',
    'nltk_text_neg', 'nltk_text_neu', 'nltk_text_pos', 'nltk_text_comb',
]
combined_scores = np.concatenate([title_sentiments, text_sentiments], axis=1)
sentiment_df = pd.DataFrame(combined_scores, columns=nltk_columns)
sentiment_df.head(20)

Unnamed: 0,nltk_title_neg,nltk_title_neu,nltk_title_pos,nltk_title_comb,nltk_text_neg,nltk_text_neu,nltk_text_pos,nltk_text_comb
0,0.196,0.804,0.0,-0.296,0.071,0.778,0.151,0.9149
1,0.0,1.0,0.0,0.0,0.0,0.845,0.155,0.8689
2,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.677,0.323,0.0,-0.2732
4,0.0,1.0,0.0,0.0,0.302,0.698,0.0,-0.6956
5,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
6,0.227,0.664,0.109,-0.8244,0.0,0.602,0.398,0.9813
7,0.0,0.714,0.286,0.5859,0.101,0.749,0.15,0.8258
8,0.0,1.0,0.0,0.0,0.041,0.845,0.115,0.5267
9,0.109,0.792,0.099,-0.1449,0.183,0.722,0.095,-0.999


In [92]:
# Attach the VADER scores to the posts column-wise. Rows are matched purely by
# position, so both frames must have the same length.
assert len(reddit_df) == len(sentiment_df), "posts and VADER scores must align one-to-one"
reddit_df = pd.concat(
    [
        # Dropping any nltk_* columns already present keeps this cell idempotent
        # when it is re-run; reset_index(drop=True) discards the old CSV index
        # without creating (and then having to drop) a helper 'index' column.
        reddit_df.drop(columns=sentiment_df.columns, errors='ignore').reset_index(drop=True),
        sentiment_df.reset_index(drop=True),
    ],
    axis=1,
)
reddit_df.head()

Unnamed: 0,created,title,selftext,upvote_ratio,score,gilded,total_awards_received,num_comments,stock,subreddit,nltk_title_neg,nltk_title_neu,nltk_title_pos,nltk_title_comb,nltk_text_neg,nltk_text_neu,nltk_text_pos,nltk_text_comb
0,2021-01-22,Hold the line. Stocks have no time limit. Shor...,We have plenty of solid DD that has a fair val...,0.92,9,0,0,0,TWTR,gme,0.196,0.804,0.0,-0.296,0.071,0.778,0.151,0.9149
1,2021-01-23,Is it too late to buy GME??,I’m a 19yr student with around 500 dollars spa...,0.96,35,0,0,22,GME,gme,0.0,1.0,0.0,0.0,0.0,0.845,0.155,0.8689
2,2021-01-25,BB,"Need some reinforcements at BB stock, trying t...",0.43,0,0,0,1,BB,gme,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
3,2021-01-26,gme,im nervous,1.0,4,0,0,6,GME,gme,0.0,1.0,0.0,0.0,0.677,0.323,0.0,-0.2732
4,2021-01-27,10k members of the r/GME community,I know I may have joined late but fuck it I’m ...,1.0,4,0,0,1,GME,gme,0.0,1.0,0.0,0.0,0.302,0.698,0.0,-0.6956


In [93]:
# Persist the posts together with their VADER scores to data/with_sentiments.csv.
reddit_df.to_csv(os.path.join(os.getcwd(), 'data', 'with_sentiments.csv'))

I also want to try FinBERT, which is a BERT model trained specifically on financial texts. The following cells download FinBERT, which is available through Hugging Face.

In [205]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Both the tokenizer and the classifier are loaded from this FinBERT checkpoint.
finbert_checkpoint = "ProsusAI/finbert"
# BERT models have a maximum input length of 512 tokens.
tokenizer = AutoTokenizer.from_pretrained(finbert_checkpoint, model_max_length=512)
model = AutoModelForSequenceClassification.from_pretrained(finbert_checkpoint)

In [206]:
from transformers import pipeline

# Wrap the FinBERT model and its tokenizer in a sentiment-analysis pipeline.
nlp = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

# A few example sentences for a quick sanity check of the model.
sentences = ['The company said that sales in the three months to the end of March slid to EUR86 .4 m US$ 113.4 m from EUR91 .2 m last year', 
'I have bad news for our annual profits',
'I think the price will rise a lot',
'The price will likely increase from 150 to 300 soon, you should buy before its too late']
# return_all_scores=True yields the score of every label for each sentence
# (see the output below) instead of only the top label.
results = nlp(sentences, return_all_scores=True)
results

[[{'label': 'positive', 'score': 0.010205263271927834},
  {'label': 'negative', 'score': 0.974563717842102},
  {'label': 'neutral', 'score': 0.015231101773679256}],
 [{'label': 'positive', 'score': 0.016686420887708664},
  {'label': 'negative', 'score': 0.8888905048370361},
  {'label': 'neutral', 'score': 0.09442301839590073}],
 [{'label': 'positive', 'score': 0.7563812732696533},
  {'label': 'negative', 'score': 0.017832791432738304},
  {'label': 'neutral', 'score': 0.22578589618206024}],
 [{'label': 'positive', 'score': 0.20621450245380402},
  {'label': 'negative', 'score': 0.018482137471437454},
  {'label': 'neutral', 'score': 0.7753033638000488}]]

In [209]:

def finbert_sentiment(sentences, keys=('finbert_pos', 'finbert_neg', 'finbert_neu')):
    """Score texts with the module-level FinBERT pipeline ``nlp``.

    Parameters
    ----------
    sentences : list of str
        Texts to classify.
    keys : sequence of three str
        Output names for the positive, negative and neutral scores, in that
        order.

    Returns
    -------
    list of dict
        One dict per input text mapping ``keys[0]``/``keys[1]``/``keys[2]`` to
        the 'positive'/'negative'/'neutral' score. An empty input yields ``[]``.

    Raises
    ------
    KeyError
        If the pipeline output lacks one of the three expected labels.
    """
    if len(sentences) == 0:
        # Nothing to score; skip the pipeline call entirely.
        return []
    # truncation=True cuts inputs that exceed the tokenizer's maximum length
    # (model_max_length=512 where the tokenizer is created) instead of failing.
    predictions = nlp(sentences, return_all_scores=True, truncation=True)
    scores = []
    for pred in predictions:
        # Look scores up by label rather than by position, so a change in the
        # pipeline's label ordering cannot silently swap columns.
        by_label = {entry['label']: entry['score'] for entry in pred}
        scores.append({
            keys[0]: by_label['positive'],
            keys[1]: by_label['negative'],
            keys[2]: by_label['neutral'],
        })
    return scores

# Sanity check: run the wrapper on the example sentences from the previous cell.
finbert_sentiment(sentences)

[{'finbert_pos': 0.010205263271927834,
  'finbert_neg': 0.974563717842102,
  'finbert_neu': 0.015231101773679256},
 {'finbert_pos': 0.016686420887708664,
  'finbert_neg': 0.8888905048370361,
  'finbert_neu': 0.09442301839590073},
 {'finbert_pos': 0.7563812732696533,
  'finbert_neg': 0.017832791432738304,
  'finbert_neu': 0.22578589618206024},
 {'finbert_pos': 0.20621450245380402,
  'finbert_neg': 0.018482137471437454,
  'finbert_neu': 0.7753033638000488}]

FinBERT differs from VADER in that it returns a separate probability for each of three classes (positive, negative, neutral) rather than a single overall polarity score, so the two models' overall polarity cannot be compared directly. Nonetheless, we can add the FinBERT scores to our DataFrame.

In [182]:
# Score every post title with FinBERT, save the scores, and preview them.
title_list = reddit_df['title'].values.tolist()
title_score_records = finbert_sentiment(
    title_list,
    keys=['finbert_title_pos', 'finbert_title_neg', 'finbert_title_neu'],
)
finbert_df_titles = pd.DataFrame.from_records(title_score_records)
finbert_df_titles.to_csv(os.path.join(os.getcwd(), 'data', 'finbert_titles.csv'))
finbert_df_titles.head()

  0%|          | 0/41272 [00:00<?, ?it/s]

In [None]:
# Score every post body with FinBERT, save the scores, and preview them.
text_list = reddit_df['selftext'].values.tolist()
text_score_records = finbert_sentiment(
    text_list,
    keys=['finbert_text_pos', 'finbert_text_neg', 'finbert_text_neu'],
)
finbert_df_texts = pd.DataFrame.from_records(text_score_records)
finbert_df_texts.to_csv(os.path.join(os.getcwd(), 'data', 'finbert_texts.csv'))
finbert_df_texts.head()

In [2]:
import os
import pandas as pd

# Reload the posts (with VADER scores) and both saved FinBERT score tables.
reddit_df, finbert_df_titles, finbert_df_texts = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        'data/with_sentiments.csv',
        'data/finbert_titles.csv',
        'data/finbert_texts.csv',
    )
)

In [3]:
# Append the FinBERT title/text scores to the posts. Rows are matched purely by
# position, so every frame must have the same length as the posts.
finbert_frames = [finbert_df_titles, finbert_df_texts]
for frame in finbert_frames:
    assert len(frame) == len(reddit_df), "FinBERT scores and posts must align one-to-one"

# with_sentiments.csv is both the input and the output of this part of the
# notebook, so on a re-run it may already contain the FinBERT columns; drop
# them first so they are replaced rather than duplicated.
finbert_columns = [column for frame in finbert_frames for column in frame.columns]
reddit_df = pd.concat(
    # reset_index(drop=True) discards the old index without creating (and then
    # having to drop) a helper 'index' column.
    [reddit_df.drop(columns=finbert_columns, errors='ignore').reset_index(drop=True)]
    + [frame.reset_index(drop=True) for frame in finbert_frames],
    axis=1,
)
reddit_df.head()

Unnamed: 0,created,title,selftext,upvote_ratio,score,gilded,total_awards_received,num_comments,stock,subreddit,...,nltk_text_neg,nltk_text_neu,nltk_text_pos,nltk_text_comb,finbert_title_pos,finbert_title_neg,finbert_title_neu,finbert_text_pos,finbert_text_neg,finbert_text_neu
0,2021-01-22,Hold the line. Stocks have no time limit. Shor...,We have plenty of solid DD that has a fair val...,0.92,9,0,0,0,TWTR,gme,...,0.071,0.778,0.151,0.9149,0.03014,0.061696,0.908164,0.052972,0.311806,0.635222
1,2021-01-23,Is it too late to buy GME??,I’m a 19yr student with around 500 dollars spa...,0.96,35,0,0,22,GME,gme,...,0.0,0.845,0.155,0.8689,0.040854,0.174402,0.784744,0.057967,0.028233,0.9138
2,2021-01-25,BB,"Need some reinforcements at BB stock, trying t...",0.43,0,0,0,1,BB,gme,...,0.0,1.0,0.0,0.0,0.06254,0.04231,0.89515,0.154814,0.125547,0.719639
3,2021-01-26,gme,im nervous,1.0,4,0,0,6,GME,gme,...,0.677,0.323,0.0,-0.2732,0.03981,0.049968,0.910221,0.0287,0.053433,0.917867
4,2021-01-27,10k members of the r/GME community,I know I may have joined late but fuck it I’m ...,1.0,4,0,0,1,GME,gme,...,0.302,0.698,0.0,-0.6956,0.047779,0.020157,0.932064,0.095934,0.162041,0.742025


In [4]:
# Overwrite data/with_sentiments.csv so that it now also contains the FinBERT columns.
reddit_df.to_csv(os.path.join(os.getcwd(), 'data', 'with_sentiments.csv'))