In [1]:
import os
import re
import glob
from datetime import datetime
import sys
sys.path.append("..") # Adds higher directory to python modules path for importing from src dir

import pandas as pd
import numpy as np
import tqdm
import matplotlib
from matplotlib import pyplot as plt
from tqdm import tqdm_notebook as tqdm

import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
nlp = en_core_web_sm.load()

%matplotlib inline
%load_ext autotime
%load_ext autoreload
%autoreload 2

In [2]:
HOME = ".."
DATA_DIR = "data"
REUTERS = os.path.join(HOME, DATA_DIR, "preprocessed", "news_reuters.csv")
BLOOMBERG = os.path.join(HOME, DATA_DIR, "preprocessed", "news_bloomberg.csv")
NEWS = os.path.join(HOME, DATA_DIR, "preprocessed", "news.csv")
# Columns: 'date', 'filename', 'content'

time: 20.1 ms


### Get companies

In [4]:
from src.datasets import NyseSecuritiesDataset
from src.datasets import NyseStocksDataset
import src.nlp_utils as nlp_utils

time: 558 ms


In [5]:
securities_ds = NyseSecuritiesDataset(file_path='../data/nyse/securities.csv')
companies = securities_ds.get_all_company_names()  # List[Tuple[symbol, name]]

time: 42.2 ms


Reuters:
- 106519 articles
- From 2006-10-20 to 2013-11-20
- 45363 before 2010-01-04

Bloomberg:
- 448395 articles
- From 2006-10-20 to 2013-11-26
- 1148 before 2010-01-04

Nyse:
- From 2010-01-04 to 2016-12-30

In [None]:
dates_r = pd.read_csv(REUTERS, usecols=[0, 1], index_col=0)
dates_r['date'] = pd.to_datetime(dates_r['date'], errors='coerce')
dates_r['date'].hist()
print(sum(dates_r['date'] <= pd.to_datetime('2010-01-04')))

In [None]:
dates_b = pd.read_csv(BLOOMBERG, usecols=[0, 1], index_col=0)
dates_b['date'] = pd.to_datetime(dates_b['date'], errors='coerce')
dates_b['date'].hist()
print(sum(dates_b['date'] <= pd.to_datetime('2010-01-04')))

# Multi Run

#### Before NYSE
- All from Reuters and Bloomberg before first entry of NYSE dataset
- 45363 article from Reuters
- 1148 articles from Bloomber
- Resulted in about 102.735 company occurrences
- `./occurrences-before-nyse.csv` [3.9 MB]

#### Reuters

- Took 53h (2d 2h 58min 18s)
- Analysed 106.519 articles (106.494 included content)
- Resulted in 217.518 company occurrences
- ... in 52.210 different articles
- `./occurrences-reuters.csv`[8.5 MB]

In [5]:
reuters = pd.read_csv(REUTERS, index_col=0)  # nrows=45363
print(len(reuters))
reuters = reuters[reuters['content'].notna()]
print(len(reuters))

106519
106494
time: 5.72 s


In [None]:
results = pd.DataFrame(
    index=range(5 * len(reuters)),
    columns=['article_id', 'stock_symbol', 'match_text', 'start_idx', 'end_idx'])
counter = 0

In [None]:
pbar = tqdm(reuters.iterrows(), total=len(reuters))
for i, article in pbar:
    # One full article takes about 7 seconds
    found_entities = nlp_utils.find_nyse_corporations(article[2], quiet=True)
    for ent, symbol in found_entities:
        results.iloc[counter] = (f'r{i}', symbol, ent.text, ent.start_char, ent.end_char)
        counter += 1
    if (counter % 500) + len(found_entities) != (counter + len(found_entities)) % 500:
        results.dropna().to_csv('occurrences-reuters.csv')
        pbar.set_description(f"Stored {counter} entries")
results.dropna().to_csv('occurrences-reuters.csv')

#### Bloomberg
- Took ?
- Analysed 448.395 articles (447.769 included content)
- Resulted in ? company occurrences
- ... in ? different articles
- `./occurrences-bloomberg.csv` [? MB]

In [6]:
bloombergs = pd.read_csv(BLOOMBERG, index_col=0)  # nrows=1148
print(len(bloombergs))
# bloombergs = bloombergs[bloombergs['content'].notna()]
# print(len(bloombergs))

448395
time: 25.7 s


In [33]:
results = pd.DataFrame(
    index=range(10 * len(bloombergs)),
    columns=['article_id', 'stock_symbol', 'match_text', 'start_idx', 'end_idx'])

time: 1.25 s


In [52]:
# _results = pd.read_csv('occurrences-b1.csv', index_col=0)
# print(results.shape, _results.shape)
# results.iloc[:len(_results)] = _results
# TODO: set start to last article id

time: 34.2 ms


In [94]:
# x = results2.iloc[:counter]
# x.index = x.index + 34996
# results.iloc[34996:34996+counter] = x

time: 28.2 ms


In [None]:
# start = 0
# start = 48087
counter = start
pbar = tqdm(bloombergs.iloc[start:].iterrows(), total=len(bloombergs)-start)
for i, article in pbar:
    if article.content is np.nan:
        continue
    # One full article takes about 7 seconds
    found_entities = nlp_utils.find_nyse_corporations(article.content, quiet=True)
    
    for ent, symbol in found_entities:
        results.iloc[counter] = (f'b{i}', symbol, ent.text, ent.start_char, ent.end_char)
        counter += 1
    if (counter % 1000) - len(found_entities) != (counter - len(found_entities)) % 1000:
        results.dropna().to_csv('occurrences-b1.csv')
        pbar.set_description(f"Stored {counter} entries")
        print(f"Stored {counter} entries")
results.dropna().to_csv('occurrences-b1.csv')

HBox(children=(IntProgress(value=0, max=400308), HTML(value='')))

Stored 49003 entries
Stored 50000 entries
Stored 51006 entries


IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Stored 53002 entries


IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Stored 55001 entries
Stored 56011 entries


IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Stored 58000 entries


IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Stored 60001 entries
Stored 61000 entries


IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Stored 63001 entries


IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Stored 65012 entries
Stored 66000 entries
Stored 67001 entries


## Remove headlines from articles and the found entities

In [9]:
reuters = pd.read_csv(REUTERS, index_col=0)
reuters = reuters[reuters.content.notna()]
print('Reuters Articles', len(reuters))
reuters['title_start_idx'] = 0
reuters['title_end_idx'] = 0
reuters['head_end_idx'] = 0

# Don't remove entries with empty content because the bb occurrences are matching the original index values
bloombergs = pd.read_csv(BLOOMBERG, index_col=0)
bloombergs['title_start_idx'] = 0
bloombergs['title_end_idx'] = 0
bloombergs['head_end_idx'] = 0

# The articles keep their IDs which is necessary for matching them with the occurrences
# Both are not necessary for reuters
bloombergs = bloombergs[bloombergs.content.notna()]  # 626 article with empty content, e.g. b821, b822, b835, ..., b382080
# Regex wouldn't match because only the title's existing: e.g. b19841, b7498, b8401, b11860 (so far all have below 100 chars)
# For reuters there are always more than 100 chars, but sometimes only the meta data (~1000 entries)
bloombergs = bloombergs[bloombergs.content.str.len() > 100]  # 391 articles, regexes wouldn't match
print('Bloomberg Articles', len(bloombergs))

Reuters Articles 106494
Bloomberg Articles 447378
time: 20.7 s


#### Data Cleansing

In [10]:
orig_occ_r = pd.read_csv('occurrences-reuters.csv', index_col=0)
x = orig_occ_r
# All 3406 links in reuters articles are those in the header for the article link (and one in r49915 to uscourts.gov)
x = x[~x.match_text.str.contains('www.')]
orig_occ_r = x

time: 416 ms


In [11]:
orig_occ_b = pd.read_csv('occurrences-b1.csv', index_col=0)
# BB: Links starting before inde 400 are eiter "http://ww.bloomberg..." or "Ministry of Economy, Trade and Industry  http://www.meti.go.jp"
x = orig_occ_b
x = x[~x.match_text.str.contains("Ministry of Economy, Trade and Industry")]
x = x[~x.match_text.str.contains('http:') | (x.start_idx > 300)]
print(x.shape)
print(f'{x.match_text.str.contains("http:").sum()} out of {orig_occ_b.match_text.str.contains("http:").sum()} links are left')
orig_occ_b = x

(291934, 5)
122 out of 111783 links are left
time: 1.85 s


#### Filter matches from header

In [14]:
# remove_meta = re.compile(r'(--.*\n)+[\n\s]*')
remove_meta = re.compile(r'-- (.*)\n(?:--.*\n)+[\n\s]*')
# remove_meta_2 = re.compile(r'[\s\S]*(--.*\n)(?:(?:[^-]-[^-]|[^-])*\n){4,}')
remove_meta_2 = re.compile(r'[\s\S]{0,250}(--.*\.html\s*\n)(?:[^-](?:-[^-])*|-(?:[^-]-)*){250}')

def filter_meta_matches(r, article, article_id):
    # If there's no occurence in this article, we're done
    if not len(r[r.article_id == article_id]):
        return r
    match = remove_meta.match(article.content)
    if match:
        article.title_start_idx = match.start(1)
        article.title_end_idx = match.end(1)
        article.head_end_idx = match.end()
        return r[(r.article_id != article_id) | (r.start_idx >= article.head_end_idx) |
             (r.start_idx.between(article.title_start_idx, article.title_end_idx) &
             r.end_idx.between(article.title_start_idx, article.title_end_idx))]
    match = remove_meta_2.match(article.content)
    if match:
        article.title_start_idx = -1
        article.title_end_idx = -1
        article.head_end_idx = match.end(1)
        return r[(r.article_id != article_id) | (r.start_idx >= article.head_end_idx)]
    print(f"No regex worked for article {article_id}")
    return r

time: 23 ms


In [16]:
occ_r = orig_occ_r
print('Before - Reuters:', orig_occ_r.shape)

for i, article in tqdm(reuters.iterrows(), total=len(reuters)):
    occ_r = filter_meta_matches(occ_r, article, f'r{i}')

print('After - Reuters:', occ_r.shape)
# Reduced from 214112 to 214109
occ_r.to_csv('occurrences-reuters-v2.csv')

Before - Reuters: (214112, 5)


HBox(children=(IntProgress(value=0, max=106494), HTML(value='')))

After - Reuters: (214109, 5)
time: 58min 28s


In [15]:
occ_b = orig_occ_b
print('Before - Bloomberg:', orig_occ_b.shape)
# Fails: b69482, b76189, b83057, b88059, b97550, b98027, b100132, b107901, b113886, b124492, b134323, b136283, b151695, b163888, b164659, b164663, b166778, b166962, b173433, b176106, b176183, b181211, b185876
for i, article in tqdm(bloombergs.iterrows(), total=len(bloombergs)):
    occ_b = filter_meta_matches(occ_b, article, f'b{i}')
print('After - Bloomberg:', occ_b.shape)
# Reduced from 403596 to , from 111778 links to links (which are not in the header)
occ_b.to_csv('occurrences-bloomberg-v2.csv')

Before - Bloomberg: (291934, 5)


HBox(children=(IntProgress(value=0, max=447378), HTML(value='')))

No regex worked for article b69482
No regex worked for article b76189
No regex worked for article b83057
No regex worked for article b88059
No regex worked for article b97550
No regex worked for article b98027
No regex worked for article b100132
No regex worked for article b107901
No regex worked for article b113886
No regex worked for article b124492
No regex worked for article b134323
No regex worked for article b136283
No regex worked for article b151695
No regex worked for article b163888
No regex worked for article b164659
No regex worked for article b164663
No regex worked for article b166778
No regex worked for article b166962
No regex worked for article b173433
No regex worked for article b176106
No regex worked for article b176183
No regex worked for article b181211
No regex worked for article b185876
After - Bloomberg: (287426, 5)
time: 4h 10min 56s


### Apply on Reuters Article

In [None]:
# idxmax reuters: 61727  (FB 26, MSFT 1, NWSA 44, NWS 44, YHOO 1)
reuters = pd.read_csv(REUTERS, skiprows=61727, nrows=1, index_col=0)
print(reuters.loc[61727][2])

In [None]:
article1 = nlp(reuters.loc[61727][2])
labels = [x.label_ for x in article1.ents]
print(Counter(labels))
items = [x.text for x in article1.ents if x.label_ == 'ORG']
print(Counter(items))  # .most_common(3)
sentences = [x for x in article1.sents]
print(sentences[20])
displacy.render(nlp(str(sentences[20])), jupyter=True, style='ent')

In [None]:
counts = Counter([ent for ent, label in zip(items, labels) if label == 'ORG'])
matches = [
    [key, counts[key], securities_ds.get_most_similar_company(key)] for key in counts]
matches = [x for x in matches if x[2] is not None]
matched_stocks = dict([(x[0], x[2]) for x in matches])
matches

In [None]:
securities_ds.get_most_similar_company('AOL-Time Warner', quiet=False)

In [None]:
found_entities = nlp_utils.find_nyse_corporations(reuters.loc[61727][2], quiet=False)

### Apply on Bloomberg Article

In [None]:
# idxmax bloomberg: 316777  (AAPL 1, FB 79, JPM 1, MSFT 1, MS 7)
bloomberg = pd.read_csv(BLOOMBERG, skiprows=316777, nrows=1, index_col=0)
# print(bloomberg.loc[316777][2])

In [None]:
found_entities = nlp_utils.find_nyse_corporations(bloomberg.loc[316777][2], quiet=False)