# Financial News Sentiment Analyzer

### Readme

- No need for lemmatizing/stemming
- No need for domain stop-words
- Used SentBert for vectorization

## Imports

In [14]:
'''Kernel Python Version 3.6.10 '''

# Standard libs
import os
import sys
import json
import warnings
import re
import io
from io import StringIO
import inspect
import shutil
import ast
import string
import time
import pickle
import glob
import traceback
import multiprocessing
import requests
import logging
import math
import pytz
from itertools import chain
from string import Template
from datetime import datetime, timedelta
from dateutil import parser
import base64
from collections import defaultdict, Counter, OrderedDict
from contextlib import contextmanager
import unicodedata
from functools import reduce
import itertools
import tempfile
import jsonschema
from typing import Any, Dict, List, Callable, Optional, Tuple, NamedTuple, Union
from functools import wraps
import tqdm

# graph
import networkx as nx

# Required pkgs
import numpy as np
from numpy import array, argmax
import pandas as pd
import ntpath
import tqdm

# General text correction - fit text for you (ftfy) and others
import ftfy
from fuzzywuzzy import fuzz
from wordcloud import WordCloud
from spellchecker import SpellChecker

# imbalanced-learn
from imblearn.over_sampling import SMOTE, SVMSMOTE, ADASYN

# scikit-learn
from sklearn.utils import shuffle
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, jaccard_score, silhouette_score, homogeneity_score, calinski_harabasz_score
from sklearn.metrics.pairwise import euclidean_distances, cosine_similarity
from sklearn.neighbors import NearestNeighbors, LocalOutlierFactor
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.base import BaseEstimator, TransformerMixin

# scipy
from scipy import spatial, sparse
from scipy.sparse import coo_matrix, vstack, hstack
from scipy.spatial.distance import euclidean, jensenshannon, cosine, cdist
from scipy.io import mmwrite, mmread
from scipy.stats import entropy
from scipy.cluster.hierarchy import dendrogram, ward, fcluster
import scipy.cluster.hierarchy as sch
from scipy.sparse.csr import csr_matrix
from scipy.sparse.lil import lil_matrix
from scipy.sparse.csgraph import connected_components

# sparse_dot_topn: matrix multiplier
from sparse_dot_topn import awesome_cossim_topn
import sparse_dot_topn.sparse_dot_topn as ct

# Gensim
import gensim
from gensim.models import Phrases, Word2Vec, KeyedVectors, FastText, LdaModel
from gensim import utils
from gensim.utils import simple_preprocess
from gensim.test.utils import datapath, get_tmpfile
from gensim.scripts.glove2word2vec import glove2word2vec
import gensim.downloader as api
from gensim import models, corpora, similarities

# NLTK
import nltk
#nltk_model_data_path = "/someppath/"
#nltk.data.path.append(nltk_model_data_path)
from nltk import FreqDist, tokenize, sent_tokenize, word_tokenize, pos_tag
from nltk.corpus import stopwords, PlaintextCorpusReader
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.porter import *
from nltk.translate.bleu_score import sentence_bleu
print("NLTK loaded.")

# Spacy
import spacy
from spacy import displacy
from spacy.matcher import Matcher
from spacy.lang.en import English
print("Spacy loaded.")

# TF & Keras
# import tensorflow as tf
# from keras.preprocessing.text import Tokenizer, text_to_word_sequence
# from keras.preprocessing.sequence import pad_sequences
# from keras.utils import CustomObjectScope
# from keras.utils.np_utils import to_categorical
# from keras.engine.topology import Layer
# from keras import backend as K
# from keras import initializers as initializers, regularizers, constraints, optimizers
# from keras.layers import *
# from keras.layers.normalization import BatchNormalization
# from keras.layers.recurrent import LSTM
# # from keras.layers.core import Input, Dense, Activation
# from keras.layers.embeddings import Embedding
# from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau, TensorBoard
# from keras.models import Sequential, Model, load_model
# import tensorflow_hub as hub
# print("TensorFlow loaded.")

# Pytorch
import torch
from torch import optim, nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import transformers
from transformers import AutoTokenizer
from transformers import AutoModelWithLMHead
from transformers import pipeline
from transformers import AutoModel
from transformers import AutoModelForSequenceClassification
print("PyTorch loaded.")

# Plots
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly import offline
%matplotlib inline

# Theme settings
pd.set_option("display.max_columns", 80)
sns.set_context('talk')
sns.set(rc={'figure.figsize':(15,10)})
sns.set_style("darkgrid")
warnings.filterwarnings('ignore')

NLTK loaded.
Spacy loaded.
PyTorch loaded.


## Paths

In [15]:
root_dir = os.path.abspath("../")
data_dir = os.path.join(root_dir, "data")
model_dir = os.path.join(root_dir, "models")
output_dir = os.path.join(root_dir, "outputs")

## Load model files

# Spacy
spacy_model_data_path = "../../../models/en_core_web_lg/en_core_web_lg-2.2.5"
nlp = spacy.load(spacy_model_data_path, disable=['ner'])

# NLP Resources
PATH_RES_DIR = "../../../models/Resources"

# FinBert v1
PATH_BERT_MODEL = "../../../models/finbert_v1"

# [OLD] Distilled Roberta v1
# PATH_BERT_MODEL = "../../../models/all-distilroberta-v1"

## Data

In [16]:
news_data_path = os.path.join(os.path.join(data_dir, "datasets"), "news_data")
os.listdir(news_data_path)

['Twitter_Microsoft_2015.csv',
 'Twitter_Microsoft_2016.csv',
 'Twitter_Microsoft_2017.csv',
 'Twitter_Microsoft_2018.csv',
 'Twitter_Microsoft_2019.csv']

In [17]:
data = []
for f in os.listdir(news_data_path):
    data.append(pd.read_csv(os.path.join(news_data_path, f)))

In [18]:
def remove_errors(df, d_format):
    dates = []
    for i,d in enumerate(df.date.values):
        try:
            if "-" in d:
                d_format = "%Y-%m-%d"
            date = datetime.strptime(d, d_format)
            dates.append(date)
        except Exception as e:
            print(i, str(e))
            dates.append("NULL")
    df['date'] = dates
    df = df[df['date']!="NULL"].reset_index(drop=True)
    return df

In [19]:
data[0] = remove_errors(data[0], d_format='%m/%d/%y')
data[1] = remove_errors(data[1], d_format='%m/%d/%Y')
data[2] = remove_errors(data[2], d_format='%m/%d/%Y')
data[3] = remove_errors(data[3], d_format='%Y-%m-%d')
data[4] = remove_errors(data[4], d_format='%Y-%m-%d')

data = pd.concat(data).reset_index(drop=True)

2796 time data 'http://t.co/HGfsrYaRKY' does not match format '%m/%d/%y'
6071 time data 'http://t.co/aIsdgFEzUG' does not match format '%m/%d/%y'
7602 time data 'http://t.co/ep3toGJqiz' does not match format '%m/%d/%y'
8530 time data 'Sin novia.....' does not match format '%m/%d/%y'
8531 time data 'No fumo.....' does not match format '%m/%d/%y'
8532 time data 'No bebo....' does not match format '%m/%d/%y'
8533 time data '@Bllgates @microsoft @Xbox me regaláis una XBOX ONE? ' does not match format '%m/%d/%y'
8534 time data 'CON LO MAJO QUE SOY!!!!!' does not match format '%m/%d/%y'
11337 time data ' to #Google #Android. #Micoshit' does not match format '%m/%d/%y'
14580 time data ' https://t.co/mdbo57hwjC https://t.co/MAwpb7syAd' does not match format '%m/%d/%y'
14701 time data ' https://t.co/b6o7agKtLM https://t.co/5R1yaYGaW8' does not match format '%m/%d/%y'
14833 time data ' https://t.co/mdbo57hwjC https://t.co/Gay9PyP4SC' does not match format '%m/%d/%y'
15229 time data 'https://t.co

In [20]:
data.shape

(91298, 3)

In [21]:
data

Unnamed: 0,date,tweet,relevance_count
0,2015-01-01 00:00:00,＜CNET Japan＞シリコンバレーを気にするな--マイクロソフト流のスタートアップ支援 ...,127711.0
1,2015-01-01 00:00:00,@jtheakstone_3 ...commerce.microsoft.com and t...,37120.0
2,2015-01-01 00:00:00,[http://t.co/C5WOfESi6C] Microsoft planea un n...,155060.0
3,2015-01-01 00:00:00,"@Samuel_4000 Currently, Microsoft Account fund...",1905461.0
4,2015-01-01 00:00:00,"Microsoft has a patent, for opening a new wind...",74591.0
...,...,...,...
91293,2019-12-31 00:00:00,Microsoft wraps up its best year since 2009 ht...,69243.0
91294,2019-12-31 00:00:00,"💧 Así es como Amazon superó a Microsoft, Apple...",44775.0
91295,2019-12-31 00:00:00,"🧩 Así es como Amazon superó a Microsoft, Apple...",54823.0
91296,2019-12-31 00:00:00,Microsoft seizes 'https://t.co/fJY4nvO9SV' fro...,2785818.0


In [22]:
col_txt = "tweet"

## Preprocessinng

In [23]:
class preprocessText:
    
    def __init__(self, resources_dir_path, custom_vocab=[], do_lemma=False):
        self.stopwords_file = os.path.join(resources_dir_path, "stopwords.txt")
        self.special_stopwords_file = os.path.join(resources_dir_path, "special_stopwords.txt")
        self.special_characters_file = os.path.join(resources_dir_path, "special_characters.txt")
        self.contractions_file = os.path.join(resources_dir_path, "contractions.json")
        self.chatwords_file = os.path.join(resources_dir_path, "chatwords.txt")
        self.emoticons_file = os.path.join(resources_dir_path, "emoticons.json")
        self.greeting_file = os.path.join(resources_dir_path, "greeting_words.txt")
        self.signature_file = os.path.join(resources_dir_path, "signature_words.txt")
        self.preserve_key = "<$>" # preserve special vocab
        self.vocab_list = custom_vocab
        self.preseve = True if len(custom_vocab) > 0 else False
        self.load_resources()
        self.do_lemma = do_lemma
        return
    
    def load_resources(self):
        
        ### Build Vocab Model --> Words to keep
        self.vocab_list = set(map(str.lower, self.vocab_list))
        self.vocab_dict = {w: self.preserve_key.join(w.split()) for w in self.vocab_list}
        self.re_retain_words = re.compile('|'.join(sorted(map(re.escape, self.vocab_dict), key=len, reverse=True)))
        
        ### Build Stopwords Model --> Words to drop/delete
        with open(self.stopwords_file, 'r', encoding='utf-8', errors='ignore') as f:
            self.stopwords = [x.rstrip() for x in f.readlines()]
        with open(self.special_stopwords_file, 'r', encoding='utf-8', errors='ignore') as f:
            self.stopwords.extend([x.rstrip() for x in f.readlines()])
        with open(self.special_characters_file, 'r', encoding='utf-8', errors='ignore') as f:
            self.stopwords.extend([x.rstrip() for x in f.readlines()])
        self.stopwords = list(sorted(set(self.stopwords).difference(self.vocab_list)))

        ### Build Contractions
        with open(self.contractions_file, 'r', encoding='utf-8', errors='ignore') as f:
            self.contractions = dict(json.load(f))
        
        ### Build Chat-words
        with open(self.chatwords_file, 'r', encoding='utf-8', errors='ignore') as f:
            self.chat_words_map_dict, self.chat_words_list = {}, []
            chat_words = [x.rstrip() for x in f.readlines()]
            for line in chat_words:
                cw = line.split("=")[0]
                cw_expanded = line.split("=")[1]
                self.chat_words_list.append(cw)
                self.chat_words_map_dict[cw] = cw_expanded
            self.chat_words_list = set(self.chat_words_list)
        
        ### Bukd social markups
        # emoticons
        with open(self.emoticons_file, "r") as f:
            self.emoticons = re.compile(u'(' + u'|'.join(k for k in json.load(f)) + u')')
        # emojis
        self.emojis = re.compile("["
                                   u"\U0001F600-\U0001F64F"  # emoticons
                                   u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                                   u"\U0001F680-\U0001F6FF"  # transport & map symbols
                                   u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                                   u"\U00002702-\U000027B0"
                                   u"\U000024C2-\U0001F251"
                                   "]+", flags=re.UNICODE)
        # greeting
        with open(self.greeting_file, 'r', encoding='utf-8', errors='ignore') as f:
            self.greeting_words = [x.rstrip() for x in f.readlines()]
        # signature
        with open(self.signature_file, 'r', encoding='utf-8', errors='ignore') as f:
            self.signature_words = [x.rstrip() for x in f.readlines()]
        # spell-corrector
        self.spell_checker = SpellChecker()   
        return
    
    
    def reserve_keywords_from_cleaning(self, text, reset=False):
        """ 
        Finds common words from a user-provided list of special keywords to preserve them from 
        cleaning steps. Identifies every special keyword and joins them using `self.preserve_key` during the 
        cleaning steps, and later resets it back to original word in the end.
        """
        if reset is False:
            # compile using a dict of words and their expansions, and sub them if found!
            match_and_sub = self.re_retain_words.sub(lambda x: self.vocab_dict[x.string[x.start():x.end()]], text)
            return re.sub(r"([\s\n\t\r]+)", " ", match_and_sub).strip()
        else:
            # reverse the change! - use this at the end of preprocessing
            text = text.replace(self.preserve_key, " ")
            return re.sub(r"([\s\n\t\r]+)", " ", text).strip()


    def basic_clean(self, input_sentences):
        cleaned_sentences = []
        for sent in tqdm.tqdm(input_sentences, position=0):
            sent = str(sent).strip()
            # FIX text
            sent = ftfy.fix_text(sent)
            # Normalize accented chars
            sent = unicodedata.normalize('NFKD', sent).encode('ascii', 'ignore').decode('utf-8', 'ignore')
            # Removing <…> web scrape tags
            sent = re.sub(r"\<(.*?)\>", " ", sent)
            # Expanding contractions using contractions_file
            sent = re.sub(r"(\w+\'\w+)", lambda x: self.contractions.get(x.group().lower(), x.group().lower()), sent)
            # Removing web urls
            sent = re.sub(r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0–9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»""'']))''', " ", sent)
            # Removing date formats
            sent = re.sub(r"(\d{4}\-\d{2}\-\d{2}\s\d{2}\:\d{2}\:\d{2}\s\:)", " ", sent)
            # Removing extra whitespaces
            sent = re.sub(r"([\s\n\t\r]+)", " ", sent).strip()
            cleaned_sentences.append(sent)
        return cleaned_sentences


    def deep_clean(self, input_sentences):
        cleaned_sentences = []
        for sent in tqdm.tqdm(input_sentences, position=0):
            # normalize text to "utf-8" encoding
            sent = unicodedata.normalize('NFKD', str(sent)).encode('ascii', 'ignore').decode('utf-8', 'ignore')
            # lowercasing
            sent = str(sent).strip().lower()

            # <----------------------------- CUSTOM CLEANING ----------------------------- >
            #
            # *** Mark important keywords such as: Domain specific, Question words(wh-words), etc, using 
            # "self.vocab_list". Words from this list if found in any input sentence shall be joined using 
            # a key (self.preserve_key) during pre-processing step, and later un-joined to retain them.
            #
            if self.preseve: 
                sent = self.reserve_keywords_from_cleaning(sent, reset=False)
            #
            # <----------------------------- CUSTOM CLEANING ----------------------------- >

            # remove Emojis
            sent = self.emojis.sub(r'', sent)
            # remove emoticons
            sent = self.emoticons.sub(r'', sent)
            # remove common chat-words
            sent = " ".join([self.chat_words_map_dict[w.upper()] if w.upper() in self.chat_words_list else w for w in sent.split()])
            # FIX text
            sent = ftfy.fix_text(sent)
            # Normalize accented chars
            sent = unicodedata.normalize('NFKD', sent).encode('ascii', 'ignore').decode('utf-8', 'ignore')
            # Removing <…> web scrape tags
            sent = re.sub(r"\<(.*?)\>", " ", sent)
            # Expanding contractions using contractions_file
            sent = re.sub(r"(\w+\'\w+)", lambda x: self.contractions.get(x.group().lower(), x.group().lower()), sent)
            # Removing web urls
            sent = re.sub(r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0–9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»""'']))''', " ", sent)
            # Removing date formats
            sent = re.sub(r"(\d{4}\-\d{2}\-\d{2}\s\d{2}\:\d{2}\:\d{2}\s\:)", " ", sent)

            # <----------------------------- OPTIONAL CLEANING ----------------------------- >
            #
            # removing punctuations 🔥🔥
            # *** disable them, when sentence structure needs to be retained ***
            sent = re.sub(r"[\$|\#\@\*\%]+\d+[\$|\#\@\*\%]+", " ", sent)
            sent = re.sub(r"\'s", " \'s", sent)
            sent = re.sub(r"\'ve", " \'ve", sent)
            sent = re.sub(r"n\'t", " n\'t", sent)
            sent = re.sub(r"\'re", " \'re", sent)
            sent = re.sub(r"\'d", " \'d", sent)
            sent = re.sub(r"\'ll", " \'ll", sent)
            sent = re.sub(r"[\/,\@,\#,\\,\{,\},\(,\),\[,\],\$,\%,\^,\&,\*,\<,\>]", " ", sent)
            sent = re.sub(r"[\,,\;,\:,\-]", " ", sent)      # main puncts
            
            # remove sentence de-limitters 🔥🔥
            # *** disable them, when sentence boundary/ending is important ***
            # sent = re.sub(r"[\!,\?,\.]", " ", sent)

            # keep only text & numbers 🔥🔥
            # *** enable them, when only text and numbers matter! *** 
            # sent = re.sub(r"\s+", " ", re.sub(r"[\\|\/|\||\{|\}|\[|\]\(|\)]+", " ", re.sub(r"[^A-z0-9]", " ", str(sent))))
            
            # correct spelling mistakes 🔥🔥
            # *** enable them when english spelling mistakes matter *** 
            # sent = " ".join([self.spell_checker.correction(w) if w in self.spell_checker.unknown(sent.split()) else w for w in sent.split()])
            #
            # <----------------------------- OPTIONAL CLEANING ----------------------------- >

            # Remove stopwords
            sent = " ".join(token.text for token in nlp(sent) if token.text not in self.stopwords and 
                                                                 token.lemma_ not in self.stopwords)
            # Lemmatize
            if self.do_lemma:
                sent = " ".join(token.lemma_ for token in nlp(sent))
            # Removing extra whitespaces
            sent = re.sub(r"([\s\n\t\r]+)", " ", sent).lower().strip()

            # <----------------------------- CUSTOM CLEANING ----------------------------- >
            #
            # *** Reverse the custom joining now to un-join the special words found!
            if self.preseve: 
                sent = self.reserve_keywords_from_cleaning(sent, reset=True)
            # <----------------------------- CUSTOM CLEANING ----------------------------- >

            cleaned_sentences.append(sent.strip().lower())
        return cleaned_sentences


    def spacy_get_pos_list(self, results):
        word_list, pos_list, lemma_list, ner_list, start_end_list = [], [], [], [], []
        indices = results['sentences']
        for line in indices:
            tokens = line['tokens']
            for token in tokens:
                # (1). save tokens
                word_list.append(token['word'])
                # (2). save pos
                pos_list.append(token['pos'])
                # (3). save lemmas
                lemma = token['lemma'].lower()
                if lemma in self.stopwords: continue
                lemma_list.append(lemma)
                # (4). save NER
                ner_list.append(token['ner'])
                # (5). save start
                start_end_list.append(str(token['characterOffsetBegin']) + "_" + str(token['characterOffsetEnd']))
        output = {"word_list": word_list, 
                  "lemma_list": lemma_list, 
                  "token_start_end_list": start_end_list,
                  "pos_list": pos_list, "ner_list": ner_list}
        return output

    def spacy_generate_features(self, doc, operations='tokenize,ssplit,pos,lemma,ner'):
        """
        Spacy nlp pipeline to generate features such as pos, tokens, ner, dependency. Accepts doc=nlp(text)
        """
        # spacy doc
        doc_json = doc.to_json()  # Includes all operations given by spacy pipeline

        # Get text
        text = doc_json['text']

        # ---------------------------------------- OPERATIONS  ---------------------------------------- #
        # 1. Extract Entity List
        entity_list = doc_json["ents"]

        # 2. Create token lib
        token_lib = {token["id"]: token for token in doc_json["tokens"]}

        # init output json
        output_json = {}
        output_json["sentences"] = []

        # Perform spacy operations on each sent in text
        for i, sentence in enumerate(tqdm.tqdm(doc_json["sents"], position=0)):
            # init parsers
            parse = ""
            basicDependencies = []
            enhancedDependencies = []
            enhancedPlusPlusDependencies = []

            # init output json
            out_sentence = {"index": i, "line": 1, "tokens": []}
            output_json["sentences"].append(out_sentence)

            # 3. Split sentences by indices(i), add labels (pos, ner, dep, etc.)
            for token in doc_json["tokens"]:

                if sentence["start"] <= token["start"] and token["end"] <= sentence["end"]:
                    
                    # >>> Extract Entity label
                    ner = "O"
                    for entity in entity_list:
                        if entity["start"] <= token["start"] and token["end"] <= entity["end"]:
                            ner = entity["label"]

                    # >>> Extract dependency info
                    dep = token["dep"]
                    governor = 0 if token["head"] == token["id"] else (token["head"] + 1)  # CoreNLP index = pipeline index +1
                    governorGloss = "ROOT" if token["head"] == token["id"] else text[token_lib[token["head"]]["start"]:
                                                                                     token_lib[token["head"]]["end"]]
                    dependent = token["id"] + 1
                    dependentGloss = text[token["start"]:token["end"]]

                    # >>> Extract lemma
                    lemma = doc[token["id"]].lemma_

                    # 4. Add dependencies
                    basicDependencies.append({"dep": dep,
                                              "governor": governor,
                                              "governorGloss": governorGloss,
                                              "dependent": dependent,
                                              "dependentGloss": dependentGloss})
                    # 5. Add tokens
                    out_token = {"index": token["id"] + 1,
                                 "word": dependentGloss,
                                 "originalText": dependentGloss,
                                 "characterOffsetBegin": token["start"],
                                 "characterOffsetEnd": token["end"]}

                    # 6. Add lemmas
                    if "lemma" in operations:
                        out_token["lemma"] = lemma

                    # 7. Add POS tagging
                    if "pos" in operations:
                        out_token["pos"] = token["tag"]

                    # 8. Add NER
                    if "ner" in operations:
                        out_token["ner"] = ner

                    # Update output json
                    out_sentence["tokens"].append(out_token)

            # 9. Add dependencies operation
            if "parse" in operations:
                out_sentence["parse"] = parse
                out_sentence["basicDependencies"] = basicDependencies
                out_sentence["enhancedDependencies"] = out_sentence["basicDependencies"]
                out_sentence["enhancedPlusPlusDependencies"] = out_sentence["basicDependencies"]
        # ---------------------------------------- OPERATIONS  ---------------------------------------- #
        return output_json
    
    def spacy_clean(self, input_sentences):
        batch_size = min(int(np.ceil(len(input_sentences)/100)), 500)
        
        # Part 1: generate spacy textual features (pos, ner, lemma, dependencies)
        sentences = [self.spacy_generate_features(doc) for doc in nlp.pipe(input_sentences, batch_size=batch_size, n_threads=-1)]
        
        # Part 2: collect all the features for each sentence
        spacy_sentences = [self.spacy_get_pos_list(sent) for sent in sentences]

        return spacy_sentences


    ## MAIN ##
    def run_pipeline(self, sentences, operation):
        """
        Main module to execute pipeline. Accepts list of strings, and desired operation.
        """
        if operation=="":
            raise Exception("Please pass a cleaning type - `basic`, `deep` or `spacy` !!")

        # run basic cleaning
        if "basic" == operation.lower(): 
            return self.basic_clean(sentences)

        # run deep cleaning
        if "deep" == operation.lower(): 
            return self.deep_clean(sentences)

        # run spacy pipeline
        if "spacy" == operation.lower(): 
            return self.spacy_clean(sentences)

In [24]:
"""
CUSTOM VOCABULARY
- List of words you wish to mark and retain them across the preprocessing steps. 
- Example, task-specific, domain-specific keywords.
"""
custom_vocab = ["google", "goog", "alphabet", "googlee", "netflix", "netflx", "amazon", "amz", 
                "apple", "aple", "aws", "iphone", "mac", "ipad"]

"""
LEMMATIZER
- Truncate words to their root-known-word form, stripping off their adjectives, verbs, etc.
- Example: "running" becomes "run", "is" becomes "be"
"""
do_lemmatizing = False

In [25]:
## Preprocessing

resources_dir_path = PATH_RES_DIR
preprocessText_obj = preprocessText(resources_dir_path, custom_vocab, do_lemmatizing)

def cleaning(data, text_col):
    #data["Basic_%s" % text_col] = preprocessText_obj.run_pipeline(data[text_col], "basic")
    data["Deep_%s" % text_col] = preprocessText_obj.run_pipeline(data[text_col], "deep")
    #data["Spacy_%s" % text_col] = preprocessText_obj.run_pipeline(data[text_col], "spacy")
    return data


## SAMPLE
# df = cleaning(df, <_TEXT_COLUMN_>)

In [26]:
df = cleaning(data, col_txt)

100%|██████████| 91298/91298 [11:41<00:00, 130.08it/s]


## Vectorization + Classification using sentence-Bert

In [40]:
class GenerateSentiments:
    
    def __init__(self, model_fp):
        # https://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment
        self.tokenizer = AutoTokenizer.from_pretrained(model_fp)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_fp)
        self.max_length = 140
        
    def get_sentiments(self, df_text_col):
        
        if isinstance(df_text_col, pd.Series):
            df_text_col = df_text_col.tolist()
        
        size = len(df_text_col)
        if size > 2000:
            senti_neg_scores, senti_neu_scores, senti_pos_scores = [], [], []
            try:
                for sent in tqdm.tqdm(df_text_col, position=0):
                    # vectorize
                    encoded_input = self.tokenizer(sent, return_tensors='pt')
                    # AutoModelForSequenceClassification outputs sentiments [0:Neg, 1:Neu, 2:Pos] absolute scores
                    output = self.model(**encoded_input)
                    # converting to labels
                    scaled_output = torch.nn.functional.softmax(output.logits, dim=-1)
                    senti_neg_scores.append(scaled_output[:, 0].tolist()[0])
                    senti_neu_scores.append(scaled_output[:, 1].tolist()[0])
                    senti_pos_scores.append(scaled_output[:, 2].tolist()[0])
            except Exception as e:
                pass#print(e)
        else:
            # vectorize
            encoded_input = self.tokenizer(df_text_col, padding=True, truncation=True, max_length=self.max_length, return_tensors='pt')
            # AutoModelForSequenceClassification outputs sentiments [0:Neg, 1:Neu, 2:Pos] absolute scores
            output = self.model(**encoded_input)
            # converting to labels
            scaled_output = torch.nn.functional.softmax(output.logits, dim=-1)
            senti_neg_scores = scaled_output[:, 0].tolist()
            senti_neu_scores = scaled_output[:, 1].tolist()
            senti_pos_scores = scaled_output[:, 2].tolist()
        
        return senti_neg_scores, senti_neu_scores, senti_pos_scores


## SAMPLE
# txt_col_chosen = "<TEXT_COL>" % col_txt
# df['Negative'], df['Neutral'], df['Positive'] = get_sentiments(df[txt_col_chosen])

In [46]:
# get sentiments: Neg, Neu, Pos abs scores:

txt_col_chosen = "Deep_%s" % col_txt


df_text_col = df[txt_col_chosen]
senti = GenerateSentiments(PATH_BERT_MODEL)
df['Negative'], df['Neutral'], df['Positive'] = senti.get_sentiments(df_text_col)

100%|██████████| 73050/73050 [6:01:33<00:00,  3.37it/s]      


In [47]:
df['Sentiment'] = df[['Negative', 'Neutral', 'Positive']].idxmax(axis=1)
df['Polarity'] = np.select(
    [df.Sentiment=='Positive', df.Sentiment=='Negative'], [1.0*df.Positive, -1.0*df.Negative], default=0.0)

df.to_csv("../outputs/Twitter_Microsoft_Sentiments_2015_2019.csv")

In [97]:
# grouping by each day


df_final = df.groupby(['date'])['Polarity'].mean().reset_index().rename(columns={'Polarity':'Polarity_mean'})
df_final = df_final.sort_values(by=['date']).reset_index(drop=True).drop_duplicates(subset=['date'])
df_final['date'] = pd.to_datetime(df_final['date'])
df_final.insert(1, 'year', df_final.date.dt.year)
df_final

Unnamed: 0,date,year,Polarity_mean
0,2015-01-01,2015,-0.041227
1,2015-01-02,2015,0.280970
2,2015-01-03,2015,0.062588
3,2015-01-04,2015,0.003251
4,2015-01-05,2015,0.093248
...,...,...,...
1821,2019-12-27,2019,0.766002
1822,2019-12-28,2019,0.686188
1823,2019-12-29,2019,0.826388
1824,2019-12-30,2019,0.535143


- Feb 2016 leap year