In [1]:
import re
import string
import nltk
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
import spacy
import pandas as pd
import pickle

# Load the spaCy English model under a dedicated name. A later cell rebinds the
# global `nlp` to a Hugging Face sentiment pipeline, so clean_string must not
# look the spaCy model up through `nlp` at call time.
spacy_nlp = spacy.load('en_core_web_sm')
nlp = spacy_nlp  # alias kept so existing references to `nlp` keep working

# Lazily filled cache of NLTK's English stop words (see _english_stop_words).
_STOP_WORDS = None


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(nltk.corpus.stopwords.words("english"))
    return _STOP_WORDS


def clean_string(text, stem="None"):
    """Normalise a piece of text into a space-separated string of tokens.

    Steps, in order: lower-case, turn line breaks into spaces, delete every
    character in ``string.punctuation``, split on whitespace, drop NLTK
    English stop words, strip digit-bearing word runs from each token,
    optionally stem/lemmatize, and join the tokens with single spaces.

    Parameters
    ----------
    text : str
        The raw text to clean.
    stem : str, default "None"
        ``'Stem'``  -> NLTK ``PorterStemmer``;
        ``'Lem'``   -> NLTK ``WordNetLemmatizer``;
        ``'Spacy'`` -> spaCy ``lemma_`` of each token;
        any other value leaves the tokens unchanged.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces (``""`` if none survive).
    """
    text = text.lower()

    # Replace line breaks with a space rather than deleting them, otherwise the
    # last word of one line fuses with the first word of the next.
    text = re.sub(r'\n', ' ', text)

    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Remove stop words (set membership instead of scanning a list per token)
    stop_words = _english_stop_words()
    tokens = [word for word in text.split() if word not in stop_words]

    # Remove numbers: blank out any run of word characters that contains a
    # digit, then discard tokens that ended up empty so the joined result has
    # no doubled spaces.
    tokens = [re.sub(r'\w*\d\w*', '', token) for token in tokens]
    tokens = [token for token in tokens if token]

    # Stem or lemmatize
    if stem == 'Stem':
        stemmer = PorterStemmer()
        tokens = [stemmer.stem(token) for token in tokens]
    elif stem == 'Lem':
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(token) for token in tokens]
    elif stem == 'Spacy':
        doc = spacy_nlp(' '.join(tokens))
        tokens = [token.lemma_ for token in doc]

    return ' '.join(tokens)

In [2]:
def splitToList(df, min_length=30, stem='Stem'):
    """Split each row's 'MDA' text into cleaned sentence fragments.

    The text is split on '.', and a fragment is kept only if it is longer
    than ``min_length`` characters, its first five characters are not
    upper-case, and it does not contain '#160'. Kept fragments are passed
    through ``clean_string``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have an 'MDA' column.
    min_length : int, default 30
        A fragment must be strictly longer than this to be kept.
    stem : str, default 'Stem'
        Forwarded to ``clean_string`` as its ``stem`` argument.

    Returns
    -------
    list[list[str]]
        One list per row of ``df``, in row order. Rows whose 'MDA' value is
        not a string (e.g. None/NaN) produce an empty list, so the result can
        still be assigned back as a column.
    """
    sentence_lists = []
    for mda_text in df['MDA']:
        if not isinstance(mda_text, str):
            # Missing text: keep the output aligned with the frame's rows
            # instead of failing on .split().
            sentence_lists.append([])
            continue

        sentence_lists.append([
            clean_string(fragment, stem=stem)
            for fragment in mda_text.split('.')
            # The upper-case test presumably skips section headings and '#160'
            # presumably marks leftover HTML entities -- TODO confirm against
            # the raw filings.
            if len(fragment) > min_length
            and not fragment[:5].isupper()
            and '#160' not in fragment
        ])
    return sentence_lists

In [3]:
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import pipeline
import tensorflow as tf

# Hugging Face checkpoint used for both the classifier weights and the tokenizer.
FINBERT_CHECKPOINT = 'yiyanghkust/finbert-tone'

tokenizer = BertTokenizer.from_pretrained(FINBERT_CHECKPOINT)
finbert = TFBertForSequenceClassification.from_pretrained(FINBERT_CHECKPOINT, num_labels=3)

# NOTE: this rebinds the global `nlp`, which the first cell assigned to the
# spaCy model; from here on `nlp` is the sentiment-analysis pipeline.
nlp = pipeline(task="sentiment-analysis", model=finbert, tokenizer=tokenizer)
def modeling(df):
    """Run the global `nlp` pipeline on every entry of the 'MDAList' column.

    Each row's 'MDAList' value is iterated and every element is passed to
    `nlp` on its own.

    Returns a list with one entry per row of ``df`` (in row order); each
    entry is the list of `nlp` results for that row's elements, in order.
    """
    return [
        [nlp(sentence) for sentence in sentences]
        for sentences in df['MDAList']
    ]

Metal device set to: Apple M1 Max

systemMemory: 32.00 GB
maxCacheSize: 10.67 GB



2022-06-21 19:01:05.947505: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-06-21 19:01:05.947660: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
All model checkpoint layers were used when initializing TFBertForSequenceClassification.

All the layers of TFBertForSequenceClassification were initialized from the model checkpoint at yiyanghkust/finbert-tone.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.


In [4]:
# Tickers to process. Commented-out entries are currently disabled.
# Every entry ends with a comma: without it, un-commenting the next line would
# silently concatenate adjacent string literals (e.g. 'GS' 'JPM' -> 'GSJPM').
CompanyTickers = [
    # 'AAPL',
    # 'TSLA',
    # 'LULU',
    # 'SQ',
    # 'MSFT',
    # 'GOOG',
    # 'KO',
    # 'XOM',
    # 'MCD',
    'GS',
    # 'JPM',
]

In [None]:
# Input data file: /Users/chrisjackson/LHL/Final Project/Initial Data/GS.pkl

In [7]:
# Directory the input pickles are read from.
# Assumes each ticker has a '<TICKER>.pkl' file here (only GS.pkl is
# referenced in this notebook) -- TODO confirm for the other tickers.
INITIAL_DATA_DIR = '/Users/chrisjackson/LHL/Final Project/Initial Data'

for ticker in CompanyTickers:
    # Load the pickle belonging to *this* ticker; the path used to be fixed to
    # GS.pkl, so any other ticker would have re-processed the GS data.
    df = pd.read_pickle(f'{INITIAL_DATA_DIR}/{ticker}.pkl')

    df['MDAList'] = splitToList(df)
    print(ticker, 'split')

    df['analyzed'] = modeling(df)
    print(ticker, 'analyzed')

    # Write '<TICKER>Analyzed.pkl' to the working directory; the context
    # manager closes the file even if pickling raises.
    with open(ticker + 'Analyzed.pkl', "wb") as out_file:
        pickle.dump(df, out_file)
    print(ticker, 'pickled')
    

GS split
GS analyzed
GS pickled
