In [1]:
import pandas as pd
from transformers import RobertaForSequenceClassification, RobertaTokenizer
from transformers import pipeline
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
from scipy.special import softmax
import emoji
import re


In [2]:
# Load Pretrained models paths
STOCKWITS = '../Models/Sentiment/roberta-stocktwits-finetuned/snapshots/13_11_22/'
ROBERTA = '../Models/Sentiment/twitter-roberta-latest/snapshots/13_11_22/'

In [3]:
def process_text(texts):

  # remove URLs
  texts = re.sub(r'https?://\S+', "", texts)
  texts = re.sub(r'www.\S+', "", texts)
  # remove '
  texts = texts.replace('&#39;', "'")
  # remove symbol names
  texts = re.sub(r'(\#)(\S+)', r'hashtag_\2', texts)
  texts = re.sub(r'(\$)([A-Za-z]+)', r'cashtag_\2', texts)
  # remove usernames
  texts = re.sub(r'(\@)(\S+)', r'mention_\2', texts)
  # demojize
  texts = emoji.demojize(texts, delimiters=("", " "))

  return texts.strip()

In [4]:
def stockwits_signal(df):
   # Transformer pipeline with Stockwits
   # the model was trained upon below preprocessing
   tokenizer_stockwits = RobertaTokenizer.from_pretrained(STOCKWITS,
                                                          local_files_only=True)
   model_stockwits = RobertaForSequenceClassification.from_pretrained(STOCKWITS, 
                                                         local_files_only=True)

   stance_score = pipeline("text-classification", model=model_stockwits, 
                                                  tokenizer=tokenizer_stockwits,
                                                  top_k=None)
   # Clean Text and score
   df['Cleanned Text'] = df['Text'].apply(process_text)
   df['Stockwits Output'] = df['Cleanned Text'].apply(stance_score)
   stock_df = pd.DataFrame.from_records(df['Stockwits Output'].values)
   stock_df = pd.json_normalize(stock_df[0])

   # 2 labels, label 0 is bearish, label 1 is bullish
   for i in df.index:
      df.at[i, 'Bearish Score'] = stock_df[0][i]['score']
      df.at[i, 'Bullish Score'] = stock_df[1][i]['score']

   return df

In [5]:
def roberta_sentiment(df):
   # Transformer pipeline with Stockwits
   # the model was trained upon below preprocessing
   tokenizer_ROBERTA = AutoTokenizer.from_pretrained(ROBERTA, 
                                                  local_files_only=True)
   config_ROBERTA = AutoConfig.from_pretrained(ROBERTA, 
                                                  local_files_only=True)
   model_roberta = AutoModelForSequenceClassification.from_pretrained(ROBERTA, 
                                                   local_files_only=True)

   sentiment_task = pipeline("sentiment-analysis", model=model_roberta, 
                                                   tokenizer=tokenizer_ROBERTA,
                                                   top_k=None)
   # Clean Text and score
   df['Cleanned Text'] = df['Text'].apply(process_text)
   df['Roberta Output'] = df['Cleanned Text'].apply(sentiment_task)
   
   # Create helper df to keep the score per label
   roberta_df = pd.DataFrame.from_records(df["Roberta Output"].values)
   roberta_df = pd.json_normalize(roberta_df[0])

   roberta_df.head()
   #Return Helper DF scores to original df
   for i in df.index:
      df.at[i, 'Negative Score'] = roberta_df[0][i]['score']
      df.at[i, 'Neutral Score'] = roberta_df[1][i]['score']
      df.at[i, 'Positive Score'] = roberta_df[2][i]['score']

   return df

In [6]:
tweets_df=pd.read_csv("../Data/tweet_sample.csv")

In [7]:
stockwits_df = stockwits_signal(tweets_df)

In [8]:
stockwits_df.head()

Unnamed: 0.1,Unnamed: 0,ID,Text,Created at,Author ID,Retweet Count,Reply Count,Like Count,Quote Count,Cleanned Text,Stockwits Output,Bearish Score,Bullish Score
0,0,1576842927372111872,$AAPL: According to $MS - App Store net rev de...,2022-10-03 07:53:44+00:00,455309376,105,32,560,32,cashtag_AAPL: According to cashtag_MS - App St...,"[[{'label': 'LABEL_0', 'score': 0.989304423332...",0.989304,0.010696
1,1,1576727061150208000,"TSMC is raising prices for all clients, includ...",2022-10-03 00:13:20+00:00,45483286,98,10,347,18,"TSMC is raising prices for all clients, includ...","[[{'label': 'LABEL_1', 'score': 0.998606741428...",0.998607,0.001393
2,2,1576929387462950913,It's ok guys. I'm sure $TSLA and $AAPL are the...,2022-10-03 13:37:18+00:00,223520770,41,38,546,4,It's ok guys. I'm sure cashtag_TSLA and cashta...,"[[{'label': 'LABEL_1', 'score': 0.988406598567...",0.988407,0.011593
3,3,1576625559294996482,"Despite tough YTD performances thus far, these...",2022-10-02 17:30:00+00:00,499993648,43,16,217,5,"Despite tough YTD performances thus far, these...","[[{'label': 'LABEL_1', 'score': 0.998619556427...",0.99862,0.00138
4,4,1576708795983532032,The New Swiss Flag Once Credit Suisse Implodes...,2022-10-02 23:00:45+00:00,1218208096889208834,13,10,160,4,The New Swiss Flag Once Credit Suisse Implodes...,"[[{'label': 'LABEL_1', 'score': 0.622915148735...",0.622915,0.377085


In [9]:
sentiment_df = tweets_df.copy()
sentiment_df = roberta_sentiment(sentiment_df)
sentiment_df.head()

Some weights of the model checkpoint at ../Models/Sentiment/twitter-roberta-latest/snapshots/13_11_22/ were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Unnamed: 0.1,Unnamed: 0,ID,Text,Created at,Author ID,Retweet Count,Reply Count,Like Count,Quote Count,Cleanned Text,Stockwits Output,Bearish Score,Bullish Score,Roberta Output,Negative Score,Neutral Score,Positive Score
0,0,1576842927372111872,$AAPL: According to $MS - App Store net rev de...,2022-10-03 07:53:44+00:00,455309376,105,32,560,32,cashtag_AAPL: According to cashtag_MS - App St...,"[[{'label': 'LABEL_0', 'score': 0.989304423332...",0.989304,0.010696,"[[{'label': 'Negative', 'score': 0.82347857952...",0.823479,0.163567,0.012955
1,1,1576727061150208000,"TSMC is raising prices for all clients, includ...",2022-10-03 00:13:20+00:00,45483286,98,10,347,18,"TSMC is raising prices for all clients, includ...","[[{'label': 'LABEL_1', 'score': 0.998606741428...",0.998607,0.001393,"[[{'label': 'Neutral', 'score': 0.592991113662...",0.592991,0.399191,0.007818
2,2,1576929387462950913,It's ok guys. I'm sure $TSLA and $AAPL are the...,2022-10-03 13:37:18+00:00,223520770,41,38,546,4,It's ok guys. I'm sure cashtag_TSLA and cashta...,"[[{'label': 'LABEL_1', 'score': 0.988406598567...",0.988407,0.011593,"[[{'label': 'Neutral', 'score': 0.579201996326...",0.579202,0.230138,0.19066
3,3,1576625559294996482,"Despite tough YTD performances thus far, these...",2022-10-02 17:30:00+00:00,499993648,43,16,217,5,"Despite tough YTD performances thus far, these...","[[{'label': 'LABEL_1', 'score': 0.998619556427...",0.99862,0.00138,"[[{'label': 'Positive', 'score': 0.78556215763...",0.785562,0.203814,0.010624
4,4,1576708795983532032,The New Swiss Flag Once Credit Suisse Implodes...,2022-10-02 23:00:45+00:00,1218208096889208834,13,10,160,4,The New Swiss Flag Once Credit Suisse Implodes...,"[[{'label': 'LABEL_1', 'score': 0.622915148735...",0.622915,0.377085,"[[{'label': 'Neutral', 'score': 0.864137530326...",0.864138,0.077058,0.058804
