In [1]:
import os
import re
import lox
import time
import emoji
import pickle
import torch
import pandas as pd
from transformers import RobertaForSequenceClassification, RobertaTokenizer
from transformers import pipeline
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
from scipy.special import softmax
from tqdm.notebook import tqdm_notebook
from datetime import datetime
from transformers.utils import logging
logging.set_verbosity_error

<function transformers.utils.logging.set_verbosity_error()>

In [2]:
# Load Pretrained models paths
STOCKWITS = '../Models/Sentiment/roberta-stocktwits-finetuned/snapshots/13_11_22/'
ROBERTA = '../Models/Sentiment/twitter-roberta-latest/snapshots/13_11_22/'

In [3]:
def process_text(text):
  texts = str(text)
  # make text lowercase
  texts = texts.lower()
  # remove URLs
  texts = re.sub(r"www\.[a-z]?\.?(com)+|[a-z]+\.(com)", "", texts)
  texts = re.sub(r'https?://\S+', "", texts)
  texts = re.sub(r'www.\S+', "", texts)
  texts = re.sub(r'&[a-z]+;', "", texts)
  # Remove placeholders
  texts = re.sub(r'{link}', "", texts)
  texts = re.sub(r"\[video\]", "", texts)
  # remove '
  texts = re.sub(r'{&#39;}', "'", texts)
  # remove new lines
  texts = re.sub(r'{\n}', " ", texts)
  
  texts = texts.strip()
  # remove symbol names
  texts = re.sub(r'(\#)(\S+)', r'hashtag_\2', texts)
  texts = re.sub(r'(\$)([A-Za-z]+)', r'cashtag_\2', texts)
  # remove usernames
  texts = re.sub(r'(\@)(\S+)', r'mention_\2', texts)
  # demojize
  texts = emoji.demojize(texts, delimiters=("", " "))

  return texts

In [4]:
def tweets_to_df(tweets_file):
   
   tweets_data = pickle.load(open(tweets_file, 'rb'))

  # Create Tweets DF
   tweets_df = pd.DataFrame(columns = ["ID", "Text", "Created at", 
                                       "Author ID","Retweet Count","Reply Count",
                                       "Like Count","Quote Count","Lang",
                                       ],index=[-1])

   for i in range(len(tweets_data)):
      for twt in range(len(tweets_data[i]['data'])):
         
         twt_id = tweets_data[i]['data'][twt]['id']
         text = tweets_data[i]['data'][twt]['text']
         cleaned_text = process_text(text)
         author_id=tweets_data[i]['data'][twt]['author_id']
         created_at=tweets_data[i]['data'][twt]['created_at']
         retweet_count=tweets_data[i]['data'][twt]['public_metrics']['retweet_count']
         reply_count=tweets_data[i]['data'][twt]['public_metrics']['reply_count']
         like_count=tweets_data[i]['data'][twt]['public_metrics']['like_count']
         quote_count=tweets_data[i]['data'][twt]['public_metrics']['quote_count']
         lang=tweets_data[i]['data'][twt]['lang']

         #Create temp df to hold the new line
         tmp_tweet_df = pd.DataFrame({ "ID": twt_id,
                                 "Text": text,
                                 "Cleaned Text": cleaned_text,
                                 "Author ID": author_id,
                                 "Created at": created_at,
                                 "Retweet Count": retweet_count, 
                                 "Reply Count": reply_count,
                                 "Like Count": like_count, 
                                 "Quote Count": quote_count,
                                 "Lang": lang,
                                  },index=[str(i)+str(twt)])
         # Wrap text in quotes
         tmp_tweet_df['Text'] = '"' + tmp_tweet_df['Text'].astype(str) +'"'
         
         # Concat the two dataframes
         tweets_df = pd.concat([tweets_df, tmp_tweet_df],ignore_index=True)
   return tweets_df

In [5]:

# Transformer pipeline with Roberta Sentimet Score
device = "cuda:0" if torch.cuda.is_available() else "cpu"
   
tokenizer_ROBERTA = AutoTokenizer.from_pretrained(ROBERTA, 
                                                  local_files_only=True)
config_ROBERTA = AutoConfig.from_pretrained(ROBERTA, 
                                                  local_files_only=True)
model_roberta = AutoModelForSequenceClassification.from_pretrained(ROBERTA, 
                                                  local_files_only=True)

sentiment_task = pipeline("sentiment-analysis", model=model_roberta, 
                                                tokenizer=tokenizer_ROBERTA,
                                                device=device)
@lox.thread(8)
def roberta_sentiment(df):
   # Clean Text again to ensure consistency
   df['Cleaned Text']=df['Text'].apply(process_text)
   #Get the Ouput as columns
   df['Roberta Output']=df['Cleaned Text'].apply(sentiment_task)
   df=df.explode('Roberta Output') #unnest the list
   df=pd.concat([df.drop(['Roberta Output'], axis=1), df['Roberta Output'].apply(pd.Series)], axis=1)
   df=df.rename(columns = {'label':'Sentiment Label','score':'Sentiment Score'})
   
   return df

Some weights of the model checkpoint at ../Models/Sentiment/twitter-roberta-latest/snapshots/13_11_22/ were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:

# Transformer pipeline with Stockwits Signal Scorer
device = "cuda:0" if torch.cuda.is_available() else "cpu"
   
tokenizer_stockwits = RobertaTokenizer.from_pretrained(STOCKWITS,
                                                       local_files_only=True)
model_stockwits = RobertaForSequenceClassification.from_pretrained(STOCKWITS, 
                                                         local_files_only=True)

stance_score = pipeline("text-classification", model=model_stockwits, 
                                               tokenizer=tokenizer_stockwits,
                                               device=device)
@lox.thread(8)
def stockwits_signal(df):
   # Clean Text again to ensure consistency
   df['Cleaned Text']=df['Text'].apply(process_text)
   #Score and Get the Ouput as columns
   df['Stockwits Output']=df['Cleaned Text'].apply(stance_score)   
   df=df.explode('Stockwits Output') #unnest the list
   df=pd.concat([df.drop(['Stockwits Output'], axis=1), df['Stockwits Output'].apply(pd.Series)], axis=1)
   # 2 labels, label 0 is bearish, label 1 is bullish
   df=df.replace({'LABEL_0':"Bearish", "LABEL_1": "Bullish"})
   df=df.rename(columns = {'label':'Signal Label','score':'Signal Score'})

   return df

In [7]:
raw_path ="../Data/RawTweets/"
#we shall store all the file names in this list
raw_data_files = []

for root, dirs, files in os.walk(raw_path):
	raw_data_files.extend(os.path.join(root,file) for file in files)
raw_data_files = list(map(lambda st: str.replace(st, "\\", "/"), raw_data_files))

# Replace RawTweets with  in list of strings
cleaned_data_files = list(map(lambda st: str.replace(st, "pkl", "csv"), raw_data_files))
cleaned_data_files = list(map(lambda st: str.replace(st, "RawTweets", "CleanedTweets"), cleaned_data_files))

In [8]:
#Check if files are already converted and remove them from both lists
#Find Already converted files
scored_path ="../Data/CleanedTweets/"
cleaned_files=[]
ts = datetime.now()
for root, dirs, files in os.walk(scored_path):
	cleaned_files.extend(os.path.join(root,file) for file in files)
cleaned_files = list(map(lambda st: str.replace(st, "\\", "/"), cleaned_files))

#Remove the already converted files
with open('../Logs/Conversions.txt', 'a') as f:
      f.write(f'{ts}-[Resume] - Resuming Conversion. \n')
      f.write(f'{ts}-[INFO] - Found Already {len(cleaned_files)} converted Files.\n')
        
cleaned_data_files_set = set(cleaned_data_files)
already_cleaned_set = set(cleaned_files)
cleaned_data_files = list(cleaned_data_files_set - already_cleaned_set)
raw_data_files = list(map(lambda st: str.replace(st, "csv","pkl"), cleaned_data_files))
raw_data_files = list(map(lambda st: str.replace(st, "CleanedTweets","RawTweets"), raw_data_files))

In [9]:
#Wrapper Function for the Conversion to enable parrallelisation
@lox.process(10)
def conversion_scorrer_wrapper(i, raw_file):
   ts = datetime.now()
   with open('../Logs/Conversions.txt', 'a') as f:
        f.write(f'{ts}-[INFO] - Converting tweets from file: {raw_file},\n')
        df=tweets_to_df(raw_file)
        
        f.write(f'{ts}-[INFO] - Scoring file: {raw_file}, with Stockwits and Roberta Models. \n')
        df=roberta_sentiment(df)
        df=stockwits_signal(df)
        ts = datetime.now()
        f.write(f'{ts}-[INFO] - Conversion Done. Saving as csv to file: {cleaned_data_files[i]} \n')
        df.to_csv(cleaned_data_files[i], sep='~', encoding='utf-8')
        f.write(f'{ts}-[INFO] - File saved. Files Remaining: {len(raw_data_files)-i} \n')

In [10]:
#iterate over the raw files and clean and score them
failed_rawfiles =[]
for i, raw_file in tqdm_notebook(enumerate(raw_data_files), unit='files', total=len(raw_data_files)):
   try:
      conversion_scorrer_wrapper(i, raw_file)
   except:
          ts = datetime.now()
          with open('../Logs/Conversions.txt', 'a') as f:
            f.write(f'{ts}-[ERROR] - Failed to score file: {raw_file}. Adding it to list of raw files to be manual inspected. \n')
            failed_rawfiles.append(raw_file)
          with open('../Logs/Conversion_Errors.txt', 'a') as f:
            f.write(f'{ts}-[Failed] - Failed to score file: {raw_file}. Adding it to list of raw files to be manual inspected. \n')  
   time.sleep(0.01)
   continue

  0%|          | 0/1914 [00:00<?, ?files/s]



In [None]:
#Split the csv's into Ticker and Ceo ones to create seperate Dataframes
cleaned_tickers_path ="../Data/CleanedTweets/Tickers"
#we shall store all the file names in this list
cleaned_tickers_files = []

for root, dirs, files in os.walk(cleaned_tickers_path):
	cleaned_tickers_files.extend(os.path.join(root,file) for file in files)
cleaned_tickers_files = list(map(lambda st: str.replace(st, "\\", "/"), cleaned_tickers_files))

cleaned_ceos_path ="../Data/CleanedTweets/Ceos"
cleaned_ceos_files = []

for root, dirs, files in os.walk(cleaned_ceos_path):
	cleaned_ceos_files.extend(os.path.join(root,file) for file in files)
cleaned_ceos_files = list(map(lambda st: str.replace(st, "\\", "/"), cleaned_ceos_files))

In [None]:
# Create Tickers Dataframe
tickers_dfs = []
failed_tickers_dfs =[]
ts = datetime.now()
for i, csv_file in tqdm_notebook(enumerate(cleaned_tickers_files), unit='files', total=len(cleaned_tickers_files)):
   with open('../Logs/CSV Processing.txt', 'a') as f:
        f.write(f'{ts}-[INFO] - Processing tweets from csv file: {csv_file},\n')
        ts = datetime.now()
        try:
           data = pd.read_csv(csv_file, sep='~', encoding='utf-8', low_memory=False)
           # Clean Text again to ensure consistency
           data['Cleaned Text']=data['Text'].apply(process_text)

           f.write(f'{ts}-[INFO] - Appending file: {csv_file}, to list of daraframes. \n')
           tickers_dfs.append(data)
        except:
           f.write(f'{ts}-[ERROR] - Failed to read file: {csv_file}. Adding it to list of daraframes to be redownloaded. \n')
           failed_tickers_dfs.append(csv_file)
        ts = datetime.now()
        f.write(f'{ts}-[INFO] - Files remaining: {len(cleaned_tickers_files)-i},\n')
tickers_df = pd.concat(tickers_dfs, ignore_index=True).dropna(axis=0)

In [None]:
# Create Ceos Dataframe
ceos_dfs = []
failed_ceos_dfs =[]
ts = datetime.now()
for i, csv_file in tqdm_notebook(enumerate(cleaned_ceos_files), unit='files', total=len(cleaned_ceos_files)):
   with open('../Logs/CSV Processing.txt', 'a') as f:
        f.write(f'{ts}-[INFO] - Processing tweets from csv file: {csv_file},\n')
        ts = datetime.now()
        try:
           data = pd.read_csv(csv_file, sep='~', encoding='utf-8', low_memory=False)
           # Clean Text again to ensure consistency
           data['Cleaned Text']=data['Text'].apply(process_text)
           f.write(f'{ts}-[INFO] - Appending file: {csv_file}, to list of daraframes. \n')
           ceos_dfs.append(data)
        except:
           f.write(f'{ts}-[ERROR] - Failed to read file: {csv_file}. Adding it to list of daraframes to be redownloaded. \n')
           failed_ceos_dfs.append(csv_file)
        ts = datetime.now()
        f.write(f'{ts}-[INFO] - Files remaining: {len(cleaned_ceos_files)-i},\n')

ceos_df = pd.concat(ceos_dfs, ignore_index=True).dropna(axis=0)

In [None]:
tickers_df.to_csv('../Data/ScoredDf/Tickers.csv', sep='~', encoding='utf-8')

In [None]:
ceos_df.to_csv('../Data/ScoredDf/Ceos.csv', sep='~', encoding='utf-8')

In [None]:
len(failed_ceos_dfs)+len(failed_tickers_dfs)

## Investigate missing values in Sentiment and Signal Scoring

In [None]:
x=ceos_df.head(100).copy()

In [None]:
x = roberta_sentiment(x)

In [None]:
x.head()