# Financial News Sentiment Analyzer

### Readme

- No need for lemmatizing/stemming
- No need for domain stop-words
- Used SentBert for vectorization

## Imports

In [3]:
'''Kernel Python Version 3.6.10 '''

# Standard libs
import os
import sys
import json
import warnings
import re
import io
from io import StringIO
import inspect
import shutil
import ast
import string
import time
import pickle
import glob
import traceback
import multiprocessing
import requests
import logging
import math
import pytz
from itertools import chain
from string import Template
from datetime import datetime, timedelta
from dateutil import parser
import base64
from collections import defaultdict, Counter, OrderedDict
from contextlib import contextmanager
import unicodedata
from functools import reduce
import itertools
import tempfile
import jsonschema
from typing import Any, Dict, List, Callable, Optional, Tuple, NamedTuple, Union
from functools import wraps

# graph
import networkx as nx

# Required pkgs
import numpy as np
from numpy import array, argmax
import pandas as pd
import ntpath
import tqdm

# General text correction - fit text for you (ftfy) and others
import ftfy
from fuzzywuzzy import fuzz
from wordcloud import WordCloud
from spellchecker import SpellChecker

# imbalanced-learn
from imblearn.over_sampling import SMOTE, SVMSMOTE, ADASYN

# scikit-learn
from sklearn.utils import shuffle
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, jaccard_score, silhouette_score, homogeneity_score, calinski_harabasz_score
from sklearn.metrics.pairwise import euclidean_distances, cosine_similarity
from sklearn.neighbors import NearestNeighbors, LocalOutlierFactor
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.base import BaseEstimator, TransformerMixin

# scipy
from scipy import spatial, sparse
from scipy.sparse import coo_matrix, vstack, hstack
from scipy.spatial.distance import euclidean, jensenshannon, cosine, cdist
from scipy.io import mmwrite, mmread
from scipy.stats import entropy
from scipy.cluster.hierarchy import dendrogram, ward, fcluster
import scipy.cluster.hierarchy as sch
from scipy.sparse.csr import csr_matrix
from scipy.sparse.lil import lil_matrix
from scipy.sparse.csgraph import connected_components

# sparse_dot_topn: matrix multiplier
from sparse_dot_topn import awesome_cossim_topn
import sparse_dot_topn.sparse_dot_topn as ct

# Gensim
import gensim
from gensim.models import Phrases, Word2Vec, KeyedVectors, FastText, LdaModel
from gensim import utils
from gensim.utils import simple_preprocess
from gensim.test.utils import datapath, get_tmpfile
from gensim.scripts.glove2word2vec import glove2word2vec
import gensim.downloader as api
from gensim import models, corpora, similarities

# NLTK
import nltk
#nltk_model_data_path = "/someppath/"
#nltk.data.path.append(nltk_model_data_path)
from nltk import FreqDist, tokenize, sent_tokenize, word_tokenize, pos_tag
from nltk.corpus import stopwords, PlaintextCorpusReader
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.porter import *
from nltk.translate.bleu_score import sentence_bleu
print("NLTK loaded.")

# Spacy
import spacy
# spacy_model_data_path = "/Users/pranjalpathak/opt/anaconda3/envs/Python_3.6/lib/python3.6/site-packages/en_core_web_lg/en_core_web_lg-2.2.5"
nlp = spacy.load('en_core_web_lg')  # disabling: nlp = spacy.load(spacy_data_path, disable=['ner'])
from spacy import displacy
from spacy.matcher import Matcher
from spacy.lang.en import English
print("Spacy loaded.")

# TF & Keras
# import tensorflow as tf
# from keras.preprocessing.text import Tokenizer, text_to_word_sequence
# from keras.preprocessing.sequence import pad_sequences
# from keras.utils import CustomObjectScope
# from keras.utils.np_utils import to_categorical
# from keras.engine.topology import Layer
# from keras import backend as K
# from keras import initializers as initializers, regularizers, constraints, optimizers
# from keras.layers import *
# from keras.layers.normalization import BatchNormalization
# from keras.layers.recurrent import LSTM
# # from keras.layers.core import Input, Dense, Activation
# from keras.layers.embeddings import Embedding
# from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau, TensorBoard
# from keras.models import Sequential, Model, load_model
# import tensorflow_hub as hub
# print("TensorFlow loaded.")

# Pytorch
import torch
from torch import optim, nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import transformers
from transformers import AutoTokenizer
from transformers import AutoModelWithLMHead
from transformers import pipeline
from transformers import AutoModel
print("PyTorch loaded.")

# Plots
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly import offline
%matplotlib inline

# Theme settings
pd.set_option("display.max_columns", 80)
# Configure seaborn in a single call. `sns.set` re-applies its own default
# context ("notebook") whenever `context` is omitted, so calling
# `sns.set_context('talk')` before `sns.set(rc=...)` would be overridden.
sns.set(context='talk', style='darkgrid', rc={'figure.figsize': (15, 10)})
# Silence all warnings for the rest of the session.
warnings.filterwarnings('ignore')

NLTK loaded.
Spacy loaded.
PyTorch loaded.


## Paths

In [53]:
# Project directories are resolved relative to the parent of the current
# working directory.
root_dir = os.path.abspath("../")
data_dir, output_dir = (os.path.join(root_dir, sub_dir) for sub_dir in ("data", "outputs"))

# Directory holding the NLP resource files (stop-words, contractions, ...).
PATH_RES_DIR = "/Volumes/Local Drive/WORK/Machine Learning/!!! Resources !!!!/NLP Resource Files"

# Location of the pretrained BERT checkpoint inside the project tree.
PATH_BERT_MODEL = os.path.join(root_dir, "models", "finbert_v1")

    
# bert_model_fp = "/v/region/na/appl/mswm/ainlp/data/ainlp_dev/Pretrained_Models/sentence-transformers-models/all-distilroberta-v1"


## Data

In [13]:
# Read the sample news CSV; its first column is used as the row index.
news_csv_path = os.path.join(data_dir, "Sample_News.csv")
data = pd.read_csv(news_csv_path, index_col=0)
data.shape

(645, 7)

In [15]:
# Name of the DataFrame column whose text is passed to the cleaning step.
col_txt = "title"

## Preprocessing

In [22]:
class preprocessText:
    """Rule-based text cleaner with three modes: ``basic``, ``deep`` and ``spacy``.

    Resource files (stop-words, special characters, contractions, chat-words,
    emoticons, greeting and signature words) are read from
    ``resources_dir_path`` when the object is constructed. The ``deep`` and
    ``spacy`` modes use the module-level spaCy pipeline ``nlp``.

    Args:
        resources_dir_path: Directory containing the resource files.
        custom_vocab: Optional iterable of keywords to protect during deep
            cleaning. They are lower-cased and removed from the stop-word list.
        do_lemma: When true, ``deep_clean`` replaces every token by its lemma.
    """

    # Patterns shared by the cleaning modes, compiled once at class creation.
    _RE_WHITESPACE = re.compile(r"([\s\n\t\r]+)")
    _RE_MARKUP_TAG = re.compile(r"\<(.*?)\>")
    _RE_CONTRACTION = re.compile(r"(\w+\'\w+)")
    # The domain character class uses an ASCII hyphen ("0-9"); the previous
    # en dash ("0–9") made the class match only the digits 0 and 9.
    _RE_URL = re.compile(r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»""'']))''')
    _RE_TIMESTAMP = re.compile(r"(\d{4}\-\d{2}\-\d{2}\s\d{2}\:\d{2}\:\d{2}\s\:)")
    # (pattern, replacement) pairs applied in this order by `deep_clean`.
    _PUNCTUATION_RULES = (
        (re.compile(r"[\$|\#\@\*\%]+\d+[\$|\#\@\*\%]+"), " "),
        (re.compile(r"\'s"), " 's"),
        (re.compile(r"\'ve"), " 've"),
        (re.compile(r"n\'t"), " n't"),
        (re.compile(r"\'re"), " 're"),
        (re.compile(r"\'d"), " 'd"),
        (re.compile(r"\'ll"), " 'll"),
        (re.compile(r"[\/,\@,\#,\\,\{,\},\(,\),\[,\],\$,\%,\^,\&,\*,\<,\>]"), " "),
        (re.compile(r"[\,,\;,\:,\-]"), " "),      # main puncts
    )

    def __init__(self, resources_dir_path, custom_vocab=None, do_lemma=False):
        self.stopwords_file = os.path.join(resources_dir_path, "stopwords.txt")
        self.special_stopwords_file = os.path.join(resources_dir_path, "special_stopwords.txt")
        self.special_characters_file = os.path.join(resources_dir_path, "special_characters.txt")
        self.contractions_file = os.path.join(resources_dir_path, "contractions.json")
        self.chatwords_file = os.path.join(resources_dir_path, "chatwords.txt")
        self.emoticons_file = os.path.join(resources_dir_path, "emoticons.json")
        self.greeting_file = os.path.join(resources_dir_path, "greeting_words.txt")
        self.signature_file = os.path.join(resources_dir_path, "signature_words.txt")
        self.preserve_key = "<$>"  # joins the words of a multi-word keyword while cleaning
        # `None` replaces the former mutable `[]` default.
        self.vocab_list = list(custom_vocab) if custom_vocab is not None else []
        self.preseve = len(self.vocab_list) > 0
        self.do_lemma = do_lemma
        self.load_resources()

    @staticmethod
    def _read_lines(path):
        """Return the lines of a text file with trailing whitespace removed."""
        with open(path, 'r', encoding='utf-8', errors='ignore') as f:
            return [line.rstrip() for line in f]

    @staticmethod
    def _to_ascii(text):
        """Decompose accented characters (NFKD) and drop everything non-ASCII."""
        return unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')

    def _expand_contraction(self, match):
        """Regex callback: expand a known contraction, leave other words untouched."""
        word = match.group()
        # Unknown apostrophe words keep their original casing (they used to be
        # lower-cased even in the otherwise case-preserving basic mode).
        return self.contractions.get(word.lower(), word)

    def _expand_chat_word(self, word):
        """Return the expansion of a chat abbreviation, or the word itself."""
        key = word.upper()
        return self.chat_words_map_dict[key] if key in self.chat_words_list else word

    def _common_clean(self, sent):
        """Apply the cleaning steps shared by the basic and deep modes."""
        # FIX text
        sent = ftfy.fix_text(sent)
        # Normalize accented chars
        sent = self._to_ascii(sent)
        # Removing <…> web scrape tags
        sent = self._RE_MARKUP_TAG.sub(" ", sent)
        # Expanding contractions using contractions_file
        sent = self._RE_CONTRACTION.sub(self._expand_contraction, sent)
        # Removing web urls
        sent = self._RE_URL.sub(" ", sent)
        # Removing date formats
        sent = self._RE_TIMESTAMP.sub(" ", sent)
        return sent

    def load_resources(self):
        """Read every resource file and build the lookup structures used by the cleaners."""

        ### Build Vocab Model --> Words to keep
        self.vocab_list = set(map(str.lower, self.vocab_list))
        self.vocab_dict = {w: self.preserve_key.join(w.split()) for w in self.vocab_list}
        # Longest keywords first so the alternation prefers the longest match.
        self.re_retain_words = re.compile('|'.join(sorted(map(re.escape, self.vocab_dict), key=len, reverse=True)))

        ### Build Stopwords Model --> Words to drop/delete
        self.stopwords = []
        for path in (self.stopwords_file, self.special_stopwords_file, self.special_characters_file):
            self.stopwords.extend(self._read_lines(path))
        self.stopwords = list(sorted(set(self.stopwords).difference(self.vocab_list)))

        ### Build Contractions
        with open(self.contractions_file, 'r', encoding='utf-8', errors='ignore') as f:
            self.contractions = dict(json.load(f))

        ### Build Chat-words
        self.chat_words_map_dict = {}
        for line in self._read_lines(self.chatwords_file):
            parts = line.split("=")
            if len(parts) < 2:
                # Blank or malformed line: nothing to map.
                continue
            self.chat_words_map_dict[parts[0]] = parts[1]
        self.chat_words_list = set(self.chat_words_map_dict)

        ### Build social markups
        # emoticons
        # NOTE(review): the JSON keys are inserted into the pattern unescaped,
        # so they are presumably stored as ready-made regex fragments - confirm.
        with open(self.emoticons_file, "r", encoding='utf-8') as f:
            self.emoticons = re.compile(u'(' + u'|'.join(k for k in json.load(f)) + u')')
        # emojis
        self.emojis = re.compile("["
                                   u"\U0001F600-\U0001F64F"  # emoticons
                                   u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                                   u"\U0001F680-\U0001F6FF"  # transport & map symbols
                                   u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                                   u"\U00002702-\U000027B0"
                                   u"\U000024C2-\U0001F251"
                                   "]+", flags=re.UNICODE)
        # greeting
        self.greeting_words = self._read_lines(self.greeting_file)
        # signature
        self.signature_words = self._read_lines(self.signature_file)
        # spell-corrector
        self.spell_checker = SpellChecker()

    def reserve_keywords_from_cleaning(self, text, reset=False):
        """
        Protect user-provided keywords during cleaning.

        With ``reset=False`` every keyword found in ``text`` is replaced by its
        `self.preserve_key`-joined form; with ``reset=True`` the key is turned
        back into a space. Both directions collapse whitespace and strip.

        NOTE(review): the key "<$>" itself matches the markup-tag pattern that
        runs later in `deep_clean`, so the joined form of multi-word keywords
        does not appear to survive that step - confirm whether this is intended.
        """
        if reset is False:
            # Substitute each match by its joined form from the vocab dict.
            joined = self.re_retain_words.sub(lambda m: self.vocab_dict[m.group()], text)
            return self._RE_WHITESPACE.sub(" ", joined).strip()
        # Reverse the change - used at the end of preprocessing.
        restored = text.replace(self.preserve_key, " ")
        return self._RE_WHITESPACE.sub(" ", restored).strip()

    def basic_clean(self, input_sentences):
        """Return a case-preserving, lightly cleaned copy of each input sentence.

        Fixes encoding glitches, strips accents/non-ASCII, removes markup tags,
        expands contractions, removes URLs and timestamps, and collapses
        whitespace. Every item is converted with ``str`` first.
        """
        cleaned_sentences = []
        for sent in input_sentences:
            sent = self._common_clean(str(sent).strip())
            # Removing extra whitespaces
            cleaned_sentences.append(self._RE_WHITESPACE.sub(" ", sent).strip())
        return cleaned_sentences

    def deep_clean(self, input_sentences):
        """Return a lower-cased, punctuation- and stop-word-free copy of each sentence.

        Runs the basic steps plus emoji/emoticon removal, chat-word expansion,
        punctuation stripping, stop-word removal through the spaCy pipeline
        ``nlp`` and, when ``self.do_lemma`` is set, lemmatization.
        """
        # Built once per call: set membership instead of scanning the list per token.
        stopword_set = set(self.stopwords)
        cleaned_sentences = []
        for sent in input_sentences:
            # normalize to ASCII, then lowercase
            sent = self._to_ascii(str(sent)).strip().lower()

            # <----------------------------- CUSTOM CLEANING ----------------------------- >
            # *** Mark important keywords (domain specific, wh-words, etc.) from the
            # custom vocabulary: they are joined with `self.preserve_key` here and
            # un-joined again at the end.
            if self.preseve:
                sent = self.reserve_keywords_from_cleaning(sent, reset=False)
            # <----------------------------- CUSTOM CLEANING ----------------------------- >

            # remove Emojis
            sent = self.emojis.sub(r'', sent)
            # remove emoticons
            sent = self.emoticons.sub(r'', sent)
            # expand common chat-words
            sent = " ".join(self._expand_chat_word(w) for w in sent.split())
            # fix text, strip accents, remove tags / urls / timestamps, expand contractions
            sent = self._common_clean(sent)

            # <----------------------------- OPTIONAL CLEANING ----------------------------- >
            #
            # removing punctuations 🔥🔥
            # *** disable them, when sentence structure needs to be retained ***
            for pattern, replacement in self._PUNCTUATION_RULES:
                sent = pattern.sub(replacement, sent)

            # remove sentence de-limitters 🔥🔥
            # *** disable them, when sentence boundary/ending is important ***
            # sent = re.sub(r"[\!,\?,\.]", " ", sent)

            # keep only text & numbers 🔥🔥
            # *** enable them, when only text and numbers matter! ***
            # sent = re.sub(r"\s+", " ", re.sub(r"[\\|\/|\||\{|\}|\[|\]\(|\)]+", " ", re.sub(r"[^A-z0-9]", " ", str(sent))))

            # correct spelling mistakes 🔥🔥
            # *** enable them when english spelling mistakes matter ***
            # sent = " ".join([self.spell_checker.correction(w) if w in self.spell_checker.unknown(sent.split()) else w for w in sent.split()])
            #
            # <----------------------------- OPTIONAL CLEANING ----------------------------- >

            # Remove stopwords (a token is dropped when its text or its lemma is a stop-word)
            sent = " ".join(token.text for token in nlp(sent)
                            if token.text not in stopword_set and token.lemma_ not in stopword_set)
            # Lemmatize
            if self.do_lemma:
                sent = " ".join(token.lemma_ for token in nlp(sent))
            # Removing extra whitespaces
            sent = self._RE_WHITESPACE.sub(" ", sent).lower().strip()

            # <----------------------------- CUSTOM CLEANING ----------------------------- >
            # *** Reverse the custom joining now to un-join the special words found!
            if self.preseve:
                sent = self.reserve_keywords_from_cleaning(sent, reset=True)
            # <----------------------------- CUSTOM CLEANING ----------------------------- >

            cleaned_sentences.append(sent.strip().lower())
        return cleaned_sentences

    def spacy_get_pos_list(self, results):
        """Flatten the output of `spacy_generate_features` into per-feature lists.

        Returns a dict with ``word_list``, ``lemma_list``,
        ``token_start_end_list``, ``pos_list`` and ``ner_list``.

        Words and POS tags are recorded for every token, whereas lemmas, NER
        labels and offsets are recorded only for tokens whose lower-cased lemma
        is not a stop-word; the two groups of lists can therefore differ in
        length. NOTE(review): confirm this asymmetry is intended.
        """
        stopword_set = set(self.stopwords)
        word_list, pos_list, lemma_list, ner_list, start_end_list = [], [], [], [], []
        for line in results['sentences']:
            for token in line['tokens']:
                # (1). save tokens
                word_list.append(token['word'])
                # (2). save pos
                pos_list.append(token['pos'])
                # (3). save lemmas
                lemma = token['lemma'].lower()
                if lemma in stopword_set:
                    continue
                lemma_list.append(lemma)
                # (4). save NER
                ner_list.append(token['ner'])
                # (5). save start/end offsets as "<begin>_<end>"
                start_end_list.append(str(token['characterOffsetBegin']) + "_" + str(token['characterOffsetEnd']))
        return {"word_list": word_list,
                "lemma_list": lemma_list,
                "token_start_end_list": start_end_list,
                "pos_list": pos_list, "ner_list": ner_list}

    def spacy_generate_features(self, doc, operations='tokenize,ssplit,pos,lemma,ner'):
        """
        Convert a spaCy doc (``doc = nlp(text)``) into a sentence/token JSON structure.

        ``operations`` is searched as a plain string for "lemma", "pos", "ner"
        and "parse" to decide which fields are emitted. Token and governor
        indices are 1-based (pipeline index + 1); a token that is its own head
        gets governor 0 and gloss "ROOT".
        """
        # Includes all operations given by the spacy pipeline
        doc_json = doc.to_json()
        text = doc_json['text']

        # 1. Entity list and 2. token lookup by id
        entity_list = doc_json["ents"]
        token_lib = {token["id"]: token for token in doc_json["tokens"]}

        output_json = {"sentences": []}

        # Perform spacy operations on each sent in text
        for i, sentence in enumerate(doc_json["sents"]):
            basicDependencies = []
            out_sentence = {"index": i, "line": 1, "tokens": []}
            output_json["sentences"].append(out_sentence)

            # 3. Keep the tokens lying inside this sentence, add labels (pos, ner, dep, etc.)
            for token in doc_json["tokens"]:
                if not (sentence["start"] <= token["start"] and token["end"] <= sentence["end"]):
                    continue

                # >>> Entity label: the last entity span containing the token wins, "O" otherwise
                ner = "O"
                for entity in entity_list:
                    if entity["start"] <= token["start"] and token["end"] <= entity["end"]:
                        ner = entity["label"]

                # >>> Dependency info
                is_root = token["head"] == token["id"]
                head = token_lib[token["head"]]
                governor = 0 if is_root else (token["head"] + 1)  # CoreNLP index = pipeline index +1
                governorGloss = "ROOT" if is_root else text[head["start"]:head["end"]]
                dependentGloss = text[token["start"]:token["end"]]

                # 4. Add dependencies
                basicDependencies.append({"dep": token["dep"],
                                          "governor": governor,
                                          "governorGloss": governorGloss,
                                          "dependent": token["id"] + 1,
                                          "dependentGloss": dependentGloss})
                # 5. Add tokens
                out_token = {"index": token["id"] + 1,
                             "word": dependentGloss,
                             "originalText": dependentGloss,
                             "characterOffsetBegin": token["start"],
                             "characterOffsetEnd": token["end"]}

                # 6. Add lemmas
                if "lemma" in operations:
                    out_token["lemma"] = doc[token["id"]].lemma_

                # 7. Add POS tagging
                if "pos" in operations:
                    out_token["pos"] = token["tag"]

                # 8. Add NER
                if "ner" in operations:
                    out_token["ner"] = ner

                out_sentence["tokens"].append(out_token)

            # 9. Add dependencies operation (all three dependency keys share one list)
            if "parse" in operations:
                out_sentence["parse"] = ""
                out_sentence["basicDependencies"] = basicDependencies
                out_sentence["enhancedDependencies"] = out_sentence["basicDependencies"]
                out_sentence["enhancedPlusPlusDependencies"] = out_sentence["basicDependencies"]
        return output_json

    def spacy_clean(self, input_sentences):
        """Return one feature dict (see `spacy_get_pos_list`) per input sentence."""
        # One hundredth of the input per batch, at least 1 and at most 500.
        batch_size = max(1, min(int(np.ceil(len(input_sentences) / 100)), 500))

        # Part 1: generate spacy textual features (pos, ner, lemma, dependencies).
        # Items are converted with `str`, as the other modes do. `n_threads` is
        # no longer passed: -1 was already its default, and the keyword is not
        # accepted by newer spaCy releases.
        texts = (str(sent) for sent in input_sentences)
        sentences = [self.spacy_generate_features(doc) for doc in nlp.pipe(texts, batch_size=batch_size)]

        # Part 2: collect all the features for each sentence
        return [self.spacy_get_pos_list(sent) for sent in sentences]

    ## MAIN ##
    def run_pipeline(self, sentences, operation):
        """
        Main module to execute pipeline. Accepts list of strings, and desired operation.

        Args:
            sentences: Iterable of texts to clean.
            operation: One of "basic", "deep" or "spacy" (case-insensitive).

        Returns:
            The result of `basic_clean`, `deep_clean` or `spacy_clean`.

        Raises:
            ValueError: If ``operation`` is empty or not a supported mode
                (an unknown mode previously returned ``None`` silently).
        """
        if not operation:
            raise ValueError("Please pass a cleaning type - `basic`, `deep` or `spacy` !!")

        handlers = {
            "basic": self.basic_clean,
            "deep": self.deep_clean,
            "spacy": self.spacy_clean,
        }
        handler = handlers.get(operation.lower())
        if handler is None:
            raise ValueError("Unknown cleaning type %r - use `basic`, `deep` or `spacy` !!" % (operation,))
        return handler(sentences)

In [23]:
# CUSTOM VOCABULARY
# - List of words you wish to mark and retain across the preprocessing steps.
# - Example: task-specific, domain-specific keywords.
custom_vocab = [
    "google",
    "goog",
    "alphabet",
    "googlee",
    "netflix",
    "netflx",
    "amazon",
    "amz",
    "apple",
    "aple",
    "aws",
    "iphone",
    "mac",
    "ipad",
]

# LEMMATIZER
# - Reduce words to their root (dictionary) form.
# - Example: "running" becomes "run", "is" becomes "be"
do_lemmatizing = False

In [18]:
## Preprocessing

# Build the shared cleaner once; `cleaning` below reuses this instance.
resources_dir_path = PATH_RES_DIR
preprocessText_obj = preprocessText(
    resources_dir_path,
    custom_vocab=custom_vocab,
    do_lemma=do_lemmatizing,
)

def cleaning(data, text_col):
    """Add the basic, deep and spaCy cleaned versions of `text_col` to `data`.

    The three new columns are named "Basic_<text_col>", "Deep_<text_col>" and
    "Spacy_<text_col>". `data` is modified in place and also returned.
    """
    column_modes = (
        ("Basic_%s", "basic"),
        ("Deep_%s", "deep"),
        ("Spacy_%s", "spacy"),
    )
    for column_template, mode in column_modes:
        data[column_template % text_col] = preprocessText_obj.run_pipeline(data[text_col], mode)
    return data


## SAMPLE
# df = cleaning(df, <_TEXT_COLUMN_>)

In [19]:
# Adds the Basic_/Deep_/Spacy_ columns for `col_txt`. `cleaning` returns the
# frame it was given, so `df` and `data` refer to the same object.
df = cleaning(data, col_txt)

## Vectorization using sentence-Bert

In [324]:
class BertTransformer(BaseEstimator, TransformerMixin):
    """scikit-learn style wrapper around a Hugging Face tokenizer/model pair.

    `transform` returns the tokenizer output (padded, truncated PyTorch
    tensors) rather than sentence embeddings; the stored `model` and
    `embedding_func` are kept on the instance but not applied by `transform`.

    Args:
        tokenizer: Callable tokenizer applied to the list of texts.
        model: Model switched to evaluation mode via ``model.eval()``.
        max_length: Maximum sequence length passed to the tokenizer.
        embedding_func: Optional function applied to a model output; defaults
            to selecting index 0 along the second axis of the first output.
    """

    def __init__(self, tokenizer, model, max_length=128, embedding_func: Optional[Callable[[torch.Tensor], torch.Tensor]] = None,):
        self.tokenizer = tokenizer
        self.model = model
        self.model.eval()
        self.max_length = max_length
        self.embedding_func = embedding_func
        if self.embedding_func is None:
            self.embedding_func = lambda x: x[0][:, 0, :].squeeze()

    def _tokenize(self, text):
        """Tokenize `text` with the tokenizer given to the constructor.

        Uses ``self.tokenizer``; the previous implementation silently relied on
        a module-level ``tokenizer`` variable instead.
        """
        return self.tokenizer(text, padding=True, truncation=True,
                              max_length=self.max_length, return_tensors='pt')

    def transform(self, text: List[str]):
        """Return the tokenizer encoding for `text` (a list or pandas Series of strings)."""
        if isinstance(text, pd.Series):
            text = text.tolist()
        return self._tokenize(text)

    def fit(self, X, y=None):
        """No fitting required so we just return ourselves. For fine-tuning, refer to shared gpu-code!"""
        return self
    
# BERT Model
# Imported here because the imports cell does not bring this class into scope;
# without it the line building `classify_model_bert` fails on a fresh kernel.
from transformers import AutoModelForSequenceClassification

bert_model_fp = PATH_BERT_MODEL

# load tokenizer, model classes (encoder and sequence-classification head
# are both read from the same checkpoint directory)
tokenizer = AutoTokenizer.from_pretrained(bert_model_fp)
model_bert = AutoModel.from_pretrained(bert_model_fp)
classify_model_bert = AutoModelForSequenceClassification.from_pretrained(bert_model_fp)

# load vectorizer
bert_vectorizer = BertTransformer(tokenizer, model_bert, embedding_func=lambda x: x[0][:, 0, :].squeeze())
print("Bert Model '%s' loaded." % ntpath.basename(bert_model_fp))

Some weights of the model checkpoint at /Volumes/Local Drive/CU Boulder/Google Drive/My Drive/CU Boulder/Academics/2. CSCI 5502-002 - Data Mining/Course Project/StockBuff-main/models/finbert_v1 were not used when initializing BertModel: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Bert Model 'finbert_v1' loaded.


In [326]:
def get_sentiments(df_text_col, batch_size=64):
    """Score texts with the sequence-classification model and return class probabilities.

    Args:
        df_text_col: pandas Series or other iterable of strings.
        batch_size: Number of texts tokenized and scored per forward pass
            (must be positive). Batching keeps memory bounded on long columns.

    Returns:
        Three lists ``(positive, negative, neutral)`` with one probability per
        input text; for each text the three values sum to 1. Empty input
        yields three empty lists.

    NOTE(review): the class order [Pos, Neg, Neu] is taken from the original
    comment and hard-coded as logit columns 0, 1, 2 - verify it against
    ``classify_model_bert.config.id2label`` for the checkpoint in use.
    """
    texts = df_text_col.tolist() if isinstance(df_text_col, pd.Series) else list(df_text_col)

    senti_pos_scores, senti_neg_scores, senti_neu_scores = [], [], []
    for start in range(0, len(texts), batch_size):
        # tokenize this batch
        encoded_batch = bert_vectorizer.transform(texts[start:start + batch_size])

        # inference only: no autograd graph is needed
        with torch.no_grad():
            output = classify_model_bert(**encoded_batch)

        # need to predict probability that sums to 1.0, SUM(pos_prob + neg_prob + neu_prob) = 1
        scaled_output = torch.nn.functional.softmax(output.logits, dim=-1)

        senti_pos_scores.extend(scaled_output[:, 0].tolist())
        senti_neg_scores.extend(scaled_output[:, 1].tolist())
        senti_neu_scores.extend(scaled_output[:, 2].tolist())
    return senti_pos_scores, senti_neg_scores, senti_neu_scores

# getting sentiments: score the deep-cleaned titles and store the three
# returned per-row lists as new columns
df['Positive'], df['Negative'], df['Neutral'] = get_sentiments(df['Deep_title'])

In [335]:
df

Unnamed: 0,date,title,content,link,symbols,tags,sentiment,Basic_title,Deep_title,Spacy_title,Positive,Negative,Neutral
0,2021-03-01T23:04:15+00:00,SoftBank-Backed Mapbox Names New CEO to Go Aft...,"(Bloomberg) -- Mapbox Inc., which makes mappin...",https://finance.yahoo.com/news/softbank-backed...,"['AAPL.US', 'AMZN.US', 'AONE-UN.US', 'AONE.US'...","['BLOOMBERG', 'BY\xa0INSTACART INC', 'CHIEF EX...","{'polarity': 0.974, 'neg': 0, 'neu': 0.934, 'p...",SoftBank-Backed Mapbox Names New CEO to Go Aft...,softbank backed mapbox names new ceo go carmakers,"{'word_list': ['SoftBank', '-', 'Backed', 'Map...",0.178975,0.087238,0.733787
1,2021-03-01T22:05:12+00:00,"Dow Jones Rips 600 Points Higher, Led By Boein...",The Dow Jones rallied sharply along with the o...,https://finance.yahoo.com/m/ae1cdff5-1fcf-3c5e...,"['AAPL.US', 'BA.US', 'JNJ.US', 'SMG.US', 'ZM.US']","['APPLE STOCK', 'DOW JONES', 'NASDAQ', 'NASDAQ...","{'polarity': 0.494, 'neg': 0, 'neu': 0.887, 'p...","Dow Jones Rips 600 Points Higher, Led By Boein...",dow jones rips 600 points higher led by boeing...,"{'word_list': ['Dow', 'Jones', 'Rips', '600', ...",0.311447,0.477432,0.211121
2,2021-03-01T20:20:54+00:00,Dow Jones Rises Over 700 Points As Indexes Rec...,The Dow Jones Industrial Average held strong g...,https://finance.yahoo.com/m/fa4aff04-adc3-352a...,"['AAPL.US', 'BA.US', 'INTC.US', 'JNJ.US', 'TER...","['DOW JONES', 'STOCK MARKET', 'THE DOW', 'U.S....","{'polarity': 0.896, 'neg': 0, 'neu': 0.59, 'po...",Dow Jones Rises Over 700 Points As Indexes Rec...,dow jones rises 700 points indexes recover las...,"{'word_list': ['Dow', 'Jones', 'Rises', 'Over'...",0.112521,0.619718,0.267761
3,2021-03-01T20:20:54+00:00,Dow Jones Rises Over 700 Points As Indexes Rec...,The Dow Jones Industrial Average held strong g...,https://finance.yahoo.com/m/fa4aff04-adc3-352a...,"['AAPL.US', 'BA.US', 'INTC.US', 'JNJ.US', 'TER...","['DOW JONES', 'STOCK MARKET', 'THE DOW', 'U.S....","{'polarity': 0.896, 'neg': 0, 'neu': 0.59, 'po...",Dow Jones Rises Over 700 Points As Indexes Rec...,dow jones rises 700 points indexes recover las...,"{'word_list': ['Dow', 'Jones', 'Rises', 'Over'...",0.112521,0.619718,0.267761
4,2021-03-01T19:01:11+00:00,Warren Buffett signals more stock buybacks com...,"Jonathan Boyar, Boyar Asset Management Princip...",https://finance.yahoo.com/video/warren-buffett...,"['AAPL.US', 'AXP.US', 'BAC-PP.US', 'BAC.US', '...","['BOYAR', 'JONATHAN BOYAR', 'KRISTIN MYERS', '...","{'polarity': 0.999, 'neg': 0.04, 'neu': 0.821,...",Warren Buffett signals more stock buybacks com...,warren buffett signals stock buybacks coming y...,"{'word_list': ['Warren', 'Buffett', 'signals',...",0.321768,0.145025,0.533206
...,...,...,...,...,...,...,...,...,...,...,...,...,...
640,2018-01-31T12:55:00+00:00,Investor Expectations to Drive Momentum within...,"NEW YORK, Jan. 31, 2018 (GLOBE NEWSWIRE) -- ...",https://www.globenewswire.com/news-release/201...,"['AAPL.US', 'AVHI.US', 'FARM.US', 'GM.US', 'SG...",['FUNDAMENTAL MARKETS'],"{'polarity': 0.995, 'neg': 0.009, 'neu': 0.937...",Investor Expectations to Drive Momentum within...,investor expectations drive momentum within co...,"{'word_list': ['Investor', 'Expectations', 'to...",0.073805,0.005399,0.920796
641,2017-11-30T14:27:00+00:00,"BioTelemetry, Inc. Enters Agreement to Provide...","MALVERN, Pa., Nov. 30, 2017 (GLOBE NEWSWIRE)...",https://www.globenewswire.com/news-release/201...,"['AAPL.US', 'BEAT.US']","['BIOTELEMETRY', 'INC', 'NASDAQ:BEAT']","{'polarity': 0.989, 'neg': 0.021, 'neu': 0.804...","BioTelemetry, Inc. Enters Agreement to Provide...",biotelemetry inc enters agreement provide card...,"{'word_list': ['BioTelemetry', ',', 'Inc.', 'E...",0.657360,0.008309,0.334330
642,2017-11-27T13:00:00+00:00,"Factors of Influence in 2018, Key Indicators a...","NEW YORK, Nov. 27, 2017 (GLOBE NEWSWIRE) -- ...",https://www.globenewswire.com/news-release/201...,"['AAPL.US', 'CSCO.US', 'GD.US', 'HPE.US', 'NVD...",['FUNDAMENTAL MARKETS'],"{'polarity': 0.997, 'neg': 0.008, 'neu': 0.926...","Factors of Influence in 2018, Key Indicators a...",factors influence 2018 key indicators opportun...,"{'word_list': ['Factors', 'of', 'Influence', '...",0.055207,0.009103,0.935690
643,2017-10-05T15:58:00+00:00,New Research: Key Drivers of Growth for Micros...,"NEW YORK, Oct. 05, 2017 (GLOBE NEWSWIRE) -- ...",https://www.globenewswire.com/news-release/201...,"['AAPL.US', 'AMZN.US', 'INTC.US', 'MSFT.US', '...",['FUNDAMENTAL MARKETS'],"{'polarity': 0.997, 'neg': 0.008, 'neu': 0.925...",New Research: Key Drivers of Growth for Micros...,new research key drivers growth microsoft appl...,"{'word_list': ['New', 'Research', ':', 'Key', ...",0.034298,0.011561,0.954141
