In [3]:
import os
import zipfile
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModelForTokenClassification, pipeline, TrainingArguments, Trainer
import torch
import string
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
from sklearn.model_selection import train_test_split
import pyarrow as pa
from datasets import Dataset
from datetime import datetime
import pandas_market_calendars as mcal

# Notebook display configuration
from IPython.core.interactiveshell import InteractiveShell

# Echo the value of every bare expression in a cell, not only the last one
InteractiveShell.ast_node_interactivity = 'all'

# Raise pandas' display limits so wide / long frames are not truncated
pd.set_option('display.max_rows', 999)
pd.set_option('display.max_columns', 3000)

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/adammiyauchi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/adammiyauchi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
def parse_news():
    '''
    Parse zipped json news into tabular data for further processing. Final tabular data is written to 'data/news/raw_newsparsed.csv'

    Every zip archive found in 'data/news/zipped_json/' is extracted into its own folder under
    'data/news/json/' (named after the archive without its final extension). Every file in that
    folder is then read as line-delimited JSON.

    Returns the combined dataframe (an empty dataframe if no archive was found).
    '''
    zipped_dir = 'data/news/zipped_json/'
    json_root = 'data/news/json'
    frames = []

    # Sorted so the row order of the output does not depend on the OS directory listing order
    for archive_name in sorted(os.listdir(zipped_dir)):
        archive_path = os.path.join(zipped_dir, archive_name)
        # Skip stray non-archive entries (e.g. .DS_Store) instead of failing on them
        if not zipfile.is_zipfile(archive_path):
            continue

        # splitext strips only the final extension, so archives such as 'news.2018.09.zip' and
        # 'news.2018.10.zip' are extracted into separate folders and are not read twice
        extract_dir = os.path.join(json_root, os.path.splitext(archive_name)[0])
        with zipfile.ZipFile(archive_path, 'r') as archive:
            archive.extractall(extract_dir)

        # Parse each extracted json file into a dataframe
        for json_name in sorted(os.listdir(extract_dir)):
            frames.append(pd.read_json(os.path.join(extract_dir, json_name), lines=True))

    # Concatenate once at the end; growing a dataframe inside the loop copies all rows every iteration
    df = pd.concat(frames) if frames else pd.DataFrame()

    df.to_csv('data/news/raw_newsparsed.csv', index=False)
    return df

In [3]:
# Hugging Face checkpoints used by the sentiment and entity helpers below
FINBERT_CHECKPOINT = 'ProsusAI/finbert'
NER_CHECKPOINT = 'Jean-Baptiste/camembert-ner'

# FinBERT: sequence classifier used for sentiment scoring
# Model API ref: https://huggingface.co/docs/transformers/main/en/model_doc/bert#transformers.BertForSequenceClassification
finbert_tokenizer = AutoTokenizer.from_pretrained(FINBERT_CHECKPOINT)
finbert = AutoModelForSequenceClassification.from_pretrained(FINBERT_CHECKPOINT)

# NER: pretrained token classifier, https://huggingface.co/Jean-Baptiste/camembert-ner
# NOTE(review): the checkpoint name suggests a CamemBERT (French) base model - confirm it is
# the intended choice for English-language news.
ner_tokenizer = AutoTokenizer.from_pretrained(NER_CHECKPOINT)
Bert_NER = AutoModelForTokenClassification.from_pretrained(NER_CHECKPOINT)
# aggregation_strategy='simple' is passed so the pipeline returns grouped entities (see bert_ner)
ner_pipeline = pipeline(
    'ner',
    model=Bert_NER,
    tokenizer=ner_tokenizer,
    aggregation_strategy='simple',
)


def finbert_sentiment(text: str) -> pd.Series:
    '''
    Given an input text predicts positive, negative, neutral sentiment using FinBERT model.

    The text is first passed through preprocess_text, then tokenized (truncated to the tokenizer's
    maximum length) and scored by the model.

    Returns a series with one softmax probability per model label. The label order is assumed to be
    positive, negative, neutral - TODO confirm against finbert.config.id2label.
    '''
    # The tokenizer breaks the text into tokens and generates a tensor representation which is passed as input to the FinBert model
    tokenized_text = finbert_tokenizer(preprocess_text(text), padding=True, truncation=True, return_tensors='pt')
    # Inference only: without no_grad every call builds an autograd graph that is never used,
    # which wastes memory and time when scoring tens of thousands of articles
    with torch.no_grad():
        outputs = finbert(**tokenized_text)
        # Pass the model outputs through softmax function to turn the logits into probabilities
        predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
    return pd.Series(predictions[0].tolist())


def finbert_entity_sentiment(text: str, surrounding_count=0) -> pd.Series:
    '''
    Scores only the sentences of `text` that mention a relevant entity (Apple and/or Amazon).

    `surrounding_count` is the number of neighbouring sentences, on each side, that are joined to
    every matching sentence before it is scored. A value of 0 scores the matching sentence alone.

    Returns the simple average of the per-sentence FinBERT scores, or a series of three zeros when
    no sentence mentions a relevant entity.
    '''
    # TODO change hardcoded relevant entities to a more sophisticated approach
    mentions_apple = 'AAPL' in text or 'Apple' in text
    mentions_amazon = 'AMZN' in text or 'Amazon' in text
    if mentions_apple and mentions_amazon:
        relevant_entities = ['AAPL', 'Apple', 'AMZN', 'Amazon']
    elif mentions_apple:
        relevant_entities = ['AAPL', 'Apple']
    else:
        relevant_entities = ['AMZN', 'Amazon']

    # Positions of the sentences that contain at least one relevant entity
    sentences = extract_sentences(text)
    matching_positions = [
        position
        for position, sentence in enumerate(sentences)
        if any(entity in sentence for entity in relevant_entities)
    ]

    if not matching_positions:
        return pd.Series([0, 0, 0])

    # Score every matching sentence together with its optional surrounding window
    window_scores = []
    for position in matching_positions:
        window_start = max(position - surrounding_count, 0)
        window_end = position + surrounding_count + 1
        window_scores.append(finbert_sentiment(' '.join(sentences[window_start:window_end])))

    # TODO Optionally try merging methods other than simple average
    return pd.DataFrame(window_scores).mean()


def bert_ner(text: str) -> list:
    '''
    Runs the notebook's NER pipeline over the input text.
    Returns the pipeline's list of entities. Each entity has {entity_group, score, word, start, end}
    '''
    entities = ner_pipeline(text)
    return entities


def extract_sentences(text: str) -> list:
    '''
    Splits a body of text into sentences with NLTK's sentence tokenizer. Returns the list of sentences.
    '''
    sentences = nltk.sent_tokenize(text)
    return sentences


def remove_punctuation(text):
    '''
    Returns `text` with every character listed in string.punctuation (. , ! ? etc) removed.
    '''
    return ''.join(char for char in text if char not in string.punctuation)


def remove_stopwords(text):
    '''
    Removes stopwords using the NLTK stopword dictionary from the string of text.

    The text is split on single spaces and words are compared to the stopword list exactly as
    written (case-sensitive). The remaining words are re-joined with single spaces.
    '''
    # Build the lookup set once per call; doing it inside the comprehension reloads the whole
    # stopword list for every single word of the text
    english_stopwords = set(stopwords.words('english'))
    return ' '.join([word for word in text.split(" ") if word not in english_stopwords])


def preprocess_text(text: str) -> str:
    '''
    Cleans text for the sentiment model: strips punctuation first, then drops stop words.
    '''
    without_punctuation = remove_punctuation(text)
    return remove_stopwords(without_punctuation)


Using the flattened news data:
- drop unneeded columns
- drop articles that don't have a title
- drop articles that are not written in English
- determine if the article is related to Apple or Amazon
    - `title_aapl`: True if the article title contains 'AAPL' or 'Apple'
    - `text_aapl`: True if the article body contains 'AAPL' or 'Apple'
    - `title_amzn`: True if the article title contains 'AMZN' or 'Amazon'
    - `text_amzn`: True if the article body contains 'AMZN' or 'Amazon'

In [4]:
# Load the flattened news and discard the columns that are not needed downstream
df = pd.read_csv('data/news/raw_newsparsed.csv')
df = df.drop(columns=[
    'organizations', 'thread', 'ord_in_thread', 'locations', 'entities',
    'highlightText', 'persons', 'external_links', 'crawled', 'highlightTitle',
])

# Keep only articles that have a title and are written in english
df = df[df['title'].notna() & (df['language'] == 'english')]
df['published'] = pd.to_datetime(df['published'], utc=True)

# Flag column -> (column searched, keywords that mark the article as related to the company)
entity_flags = {
    'title_aapl': ('title', ('AAPL', 'Apple')),
    'text_aapl': ('text', ('AAPL', 'Apple')),
    'title_amzn': ('title', ('AMZN', 'Amazon')),
    'text_amzn': ('text', ('AMZN', 'Amazon')),
}
for flag_column, (searched_column, keywords) in entity_flags.items():
    # keywords is bound as a default argument so each lambda keeps its own keyword pair
    df[flag_column] = df[searched_column].apply(
        lambda content, keywords=keywords: any(keyword in content for keyword in keywords)
    )

df.shape
df.head()
df.dtypes

(70729, 11)

Unnamed: 0,uuid,author,url,title,language,text,published,title_aapl,text_aapl,title_amzn,text_amzn
0,3f884eef1ee888c2053c8eed3dda913e54a568f9,admin,http://www.wallstreetreporter.com/2018/09/the-...,The evidence is mounting that Apple&apos;s nex...,english,"September 2, 2018 admin Trading Ideas Comments...",2018-09-02 12:00:00+00:00,True,True,False,False
1,d958bfe53e5d2e55c480c37e0c8d1b78232f7dd3,Zacks Equity Research,https://www.zacks.com/stock/news/325190/jabils...,Jabil's (JBL) Q4 Earnings and Revenues Surpass...,english,"Jabil, Inc. ( JBL - Free Report ) reported bet...",2018-09-27 00:00:00+00:00,False,True,False,False
2,3b7327f6858be72e780b02b863d941fe7ce81eab,,https://money.cnn.com/2018/09/05/investing/pre...,6 things to know before the opening bell,english,Click chart for more in-depth data. 1. Big Tec...,2018-09-05 05:12:00+00:00,False,True,False,True
3,efde7c246b7d5bc1b383f6f795e7db179510bc66,Troy Wolverton,http://finance.yahoo.com/news/apple-apos-color...,Apple's colorful new iPhone XR could trigger a...,english,Apple iPhone XR (yellow) More Apple\nApple's a...,2018-09-13 05:35:00+00:00,True,True,False,False
4,5b4f0affc94431cc61997167db92199c138beab0,David Gewirtz,https://www.zdnet.com/article/will-there-be-an...,Will there be an October Apple event? Signs po...,english,ZDNet Japan Will there be an October Apple eve...,2018-09-27 17:21:00+00:00,True,True,False,False


uuid                       object
author                     object
url                        object
title                      object
language                   object
text                       object
published     datetime64[ns, UTC]
title_aapl                   bool
text_aapl                    bool
title_amzn                   bool
text_amzn                    bool
dtype: object

Use the FinBERT model to extract sentiment scores for each article's title, its body, and the sentences that contain entities relevant to AAPL or AMZN. Sentiment is computed in batches to avoid memory issues.

In [None]:
# Number of articles scored and written per staging file
size = 1000

# to_csv does not create missing folders: create the staging folder up front so the first
# (expensive) batch is not computed only to fail when it is written
os.makedirs('data/news/stg_sentiment', exist_ok=True)

for i, pos in enumerate(range(0, df.shape[0], size)):
    batch = df.iloc[pos:pos + size, :].copy()
    # Each sentiment helper returns three scores per article, unpacked into three columns.
    # NOTE(review): 'title_posSent' does not follow the '<field>_pos_sent' pattern of the other
    # columns; the names are kept as-is because they are written to the staging files.
    batch[['title_posSent', 'title_neg_sent', 'title_net_sent']] = batch['title'].apply(finbert_sentiment)
    batch[['text_pos_sent', 'text_neg_sent', 'text_net_sent']] = batch['text'].apply(finbert_sentiment)
    batch[['entity_pos_sent', 'entity_neg_sent', 'entity_net_sent']] = batch['text'].apply(finbert_entity_sentiment)
    batch['title_entities'] = batch['title'].apply(bert_ner)
    batch['text_entities'] = batch['text'].apply(bert_ner)
    # One staging file per batch, numbered in processing order
    batch.to_csv(f'data/news/stg_sentiment/news_{i}.csv', index=False)

Combine all news sentiment data into a single csv file.

In [2]:
stg_dir = 'data/news/stg_sentiment'


def _batch_number(filename):
    '''
    Sort key that orders staging files by the number after the last underscore
    (news_2.csv before news_10.csv). Names without such a number sort last, alphabetically.
    '''
    suffix = os.path.splitext(filename)[0].rsplit('_', 1)[-1]
    return (0, int(suffix), filename) if suffix.isdecimal() else (1, 0, filename)


# Only csv files are read, so stray entries such as .DS_Store do not break the load;
# sorting makes the combined row order independent of the OS directory listing order
batch_files = sorted(
    (name for name in os.listdir(stg_dir) if name.endswith('.csv')),
    key=_batch_number,
)
frames = [pd.read_csv(os.path.join(stg_dir, name)) for name in batch_files]

# Concatenate once, with a fresh 0..n-1 index: every staging file starts its own index at 0,
# and duplicate index labels break label-aligned assignments later on
df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()


Assign a prediction date to each news article. Articles published before the market open (09:30 New York time, which is 14:30 UTC outside daylight saving time) are used for predicting the current day's price direction. Articles published at or after the market open are used for predicting the next trading day's price direction. Articles published on non-trading days (weekends and holidays) are used for predicting the next valid trading day.


In [12]:
# NYSE opens at 09:30 New York time, which is 13:30 UTC during daylight saving time and 14:30 UTC
# otherwise. Instead of hard-coding one of the two, every article is compared against the actual
# opening timestamps taken from the exchange calendar.
df['published'] = pd.to_datetime(df['published'], utc=True)

nyse = mcal.get_calendar('NYSE')
market_calendar = nyse.schedule(start_date='2017-01-01', end_date='2019-12-31')


def get_trading_day(date, n=1):
    '''
    Returns the calendar date of the trading session `n` sessions after `date`.

    `date` is looked up by its string form in `market_calendar`, so it must be a session that is
    present in the calendar (e.g. a datetime.date or an ISO date string).
    '''
    return market_calendar.shift(-1*n).loc[str(date)]['market_open'].date()


# Opening timestamp of every session, ascending, normalised to UTC so it is comparable with 'published'
market_opens = pd.DatetimeIndex(pd.to_datetime(market_calendar['market_open'], utc=True))
published = pd.DatetimeIndex(df['published'])
missing_published = published.isna()

# Position of the first session that opens strictly after the publication time:
# - published before the open on a trading day -> that same day
# - published at or after the open             -> the next trading day
# - published on a weekend or holiday          -> the next valid trading day
session_pos = market_opens.searchsorted(published, side='right')

beyond_calendar = (session_pos >= len(market_opens)) & ~missing_published
if beyond_calendar.any():
    raise ValueError(
        f'{int(beyond_calendar.sum())} article(s) were published after the last session in '
        'market_calendar; extend end_date of the schedule'
    )

# Only rows without a publication time can still be out of range here; clamp them so the lookup
# succeeds, they are blanked out again just below
session_pos = np.minimum(session_pos, len(market_opens) - 1)
prediction_dates = market_opens[session_pos].tz_localize(None).normalize()

# Assigned as a plain array (positionally), so the result does not depend on df's index labels
df['prediction_date'] = prediction_dates.where(~missing_published).to_numpy()

df.to_csv('data/news/news.csv', index=False)

Final preprocessed news dataset

In [13]:
# Reload the exported file to confirm it was written and show its first rows
final_news = pd.read_csv('data/news/news.csv')
final_news.head()

Unnamed: 0,uuid,author,url,title,language,text,published,title_aapl,text_aapl,title_amzn,text_amzn,title_posSent,title_neg_sent,title_net_sent,text_pos_sent,text_neg_sent,text_net_sent,entity_pos_sent,entity_neg_sent,entity_net_sent,title_entities,text_entities,prediction_date
0,3f884eef1ee888c2053c8eed3dda913e54a568f9,admin,http://www.wallstreetreporter.com/2018/09/the-...,The evidence is mounting that Apple&apos;s nex...,english,"September 2, 2018 admin Trading Ideas Comments...",2018-09-02 12:00:00+00:00,True,True,False,False,0.292428,0.009908,0.697664,0.041162,0.033188,0.925649,0.096885,0.029284,0.873831,"[{'entity_group': 'ORG', 'score': 0.86504984, ...","[{'entity_group': 'MISC', 'score': 0.8753462, ...",2018-09-04
1,d958bfe53e5d2e55c480c37e0c8d1b78232f7dd3,Zacks Equity Research,https://www.zacks.com/stock/news/325190/jabils...,Jabil's (JBL) Q4 Earnings and Revenues Surpass...,english,"Jabil, Inc. ( JBL - Free Report ) reported bet...",2018-09-27 00:00:00+00:00,False,True,False,False,0.949557,0.028038,0.022405,0.281226,0.06185,0.656924,0.306846,0.017357,0.675797,"[{'entity_group': 'LOC', 'score': 0.83176327, ...","[{'entity_group': 'ORG', 'score': 0.99474525, ...",2018-09-27
2,3b7327f6858be72e780b02b863d941fe7ce81eab,,https://money.cnn.com/2018/09/05/investing/pre...,6 things to know before the opening bell,english,Click chart for more in-depth data. 1. Big Tec...,2018-09-05 05:12:00+00:00,False,True,False,True,0.060998,0.040879,0.898123,0.029282,0.568029,0.402689,0.282342,0.013111,0.704547,"[{'entity_group': 'MISC', 'score': 0.8700097, ...","[{'entity_group': 'MISC', 'score': 0.96541995,...",2018-09-05
3,efde7c246b7d5bc1b383f6f795e7db179510bc66,Troy Wolverton,http://finance.yahoo.com/news/apple-apos-color...,Apple's colorful new iPhone XR could trigger a...,english,Apple iPhone XR (yellow) More Apple\nApple's a...,2018-09-13 05:35:00+00:00,True,True,False,False,0.083479,0.191922,0.724599,0.075078,0.036043,0.888879,0.361608,0.090258,0.548134,"[{'entity_group': 'ORG', 'score': 0.93689966, ...","[{'entity_group': 'MISC', 'score': 0.98870397,...",2018-09-13
4,5b4f0affc94431cc61997167db92199c138beab0,David Gewirtz,https://www.zdnet.com/article/will-there-be-an...,Will there be an October Apple event? Signs po...,english,ZDNet Japan Will there be an October Apple eve...,2018-09-27 17:21:00+00:00,True,True,False,False,0.290154,0.034264,0.675582,0.029674,0.046009,0.924317,0.113821,0.102926,0.783253,"[{'entity_group': 'MISC', 'score': 0.9934734, ...","[{'entity_group': 'MISC', 'score': 0.5556212, ...",2018-09-28
