### Content:
- Use SpaCy NLP to find companies
- Match companies in news with stock companies (using regex)
- Exclude matches in headers afterwards (using regex)
- Apply NLP on one Reuters and one Bloomberg article as showcases
- Fix matchings for 3M Company
- Generate cooccurrences (value represents number of articles in which two companies occur together)

In [1]:
import os
import re
import glob
from datetime import datetime
import sys
sys.path.append("..") # Adds higher directory to python modules path for importing from src dir

import pandas as pd
import numpy as np
import tqdm
import matplotlib
from matplotlib import pyplot as plt
from tqdm import tqdm_notebook as tqdm

import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
nlp = en_core_web_sm.load()

%matplotlib inline
%load_ext autotime
%load_ext autoreload
%autoreload 2

  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)


In [3]:
# Locations of the preprocessed news CSV files, relative to the notebook directory.
HOME = ".."
DATA_DIR = "data"
_PREPROCESSED_DIR = os.path.join(HOME, DATA_DIR, "preprocessed")
REUTERS = os.path.join(_PREPROCESSED_DIR, "news_reuters.csv")
BLOOMBERG = os.path.join(_PREPROCESSED_DIR, "news_bloomberg.csv")
NEWS = os.path.join(_PREPROCESSED_DIR, "news-v2.csv")
# Columns: 'date', 'filename', 'content'

time: 171 ms


### Get companies

In [4]:
from src.datasets import NyseSecuritiesDataset
from src.datasets import NyseStocksDataset
import src.nlp_utils as nlp_utils

time: 4.46 s


In [5]:
# Build the NYSE price and securities datasets from the CSV files under ../data/nyse.
stocks_ds = NyseStocksDataset(file_path='../data/nyse/prices-split-adjusted.csv')
securities_ds = NyseSecuritiesDataset(file_path='../data/nyse/securities.csv')
companies = securities_ds.get_all_company_names()  # List[Tuple[symbol, name]]
# Only the price dataset is loaded explicitly here.
stocks_ds.load()

HBox(children=(IntProgress(value=0, max=470), HTML(value='')))


time: 18.3 s


Reuters:
- 106519 articles
- From 2006-10-20 to 2013-11-20
- 45363 before 2010-01-04

Bloomberg:
- 448395 articles
- From 2006-10-20 to 2013-11-26
- 1148 before 2010-01-04

Nyse:
- From 2010-01-04 to 2016-12-30

# Multi Run

#### Articles before the NYSE dataset (before 2010-01-04)
- All from Reuters and Bloomberg before the first entry of the NYSE dataset
- 45363 articles from Reuters
- 1148 articles from Bloomberg
- Resulted in about 102.735 company occurrences
- `./occurrences-before-nyse.csv` [3.9 MB]

#### Reuters

- Took 53h (2d 2h 58min 18s)
- Analysed 106.519 articles (106.494 included content)
- Resulted in 217.518 company occurrences
- ... in 52.210 different articles
- `./occurrences-reuters.csv` [8.5 MB]

In [6]:
# Load the Reuters articles and print the article count before and after
# discarding rows whose content is missing.
reuters = pd.read_csv(REUTERS, index_col=0)  # nrows=45363
print(len(reuters.index))
reuters = reuters.loc[reuters['content'].notna()]
print(len(reuters.index))

FileNotFoundError: File b'..\\data\\preprocessed\\news_reuters.csv' does not exist

time: 904 ms


In [None]:
# `counter` is the next free row of the pre-allocated `results` table, which
# reserves five rows per Reuters article.
counter = 0
results = pd.DataFrame(
    columns=['article_id', 'stock_symbol', 'match_text', 'start_idx', 'end_idx'],
    index=range(5 * len(reuters)))

In [None]:
# Run the company matcher over every Reuters article and collect the matches in
# `results`, checkpointing to CSV roughly every 500 stored matches.
pbar = tqdm(reuters.iterrows(), total=len(reuters))
for i, article in pbar:
    # One full article takes about 7 seconds
    # Access the text by column name (same as the Bloomberg loop) instead of
    # relying on the positional fallback of `article[2]`.
    found_entities = nlp_utils.find_nyse_corporations(article.content, quiet=True)
    for ent, symbol in found_entities:
        # Article IDs are prefixed with 'r' to mark them as Reuters articles.
        results.iloc[counter] = (f'r{i}', symbol, ent.text, ent.start_char, ent.end_char)
        counter += 1
    # `counter` already includes this article's matches, so compare it with its
    # value before the article to detect that a multiple of 500 was crossed.
    if counter // 500 != (counter - len(found_entities)) // 500:
        results.dropna().to_csv('occurrences-reuters.csv')
        pbar.set_description(f"Stored {counter} entries")
# Final save so matches found after the last checkpoint are not lost.
results.dropna().to_csv('occurrences-reuters.csv')

#### Bloomberg
- Took ~4 days
- Analysed 448.395 articles (447.769 included content)
- Resulted in ? company occurrences
- ... in ? different articles
- `./occurrences-bloomberg.csv` [? MB]

In [6]:
# Load the Bloomberg articles and print how many there are.
bloombergs = pd.read_csv(BLOOMBERG, index_col=0)  # nrows=1148
print(len(bloombergs.index))
# Rows without content are deliberately kept at this point:
# the final used indexes would be confused if this line is executed
# bloombergs = bloombergs[bloombergs['content'].notna()]

448395
time: 25.7 s


In [33]:
# Pre-allocated result table reserving ten rows per Bloomberg article.
results = pd.DataFrame(
    columns=['article_id', 'stock_symbol', 'match_text', 'start_idx', 'end_idx'],
    index=range(10 * len(bloombergs)))

time: 1.25 s


In [None]:
# Position in `bloombergs` at which processing starts; 0 processes every article.
# Defined here so the cell also runs on a fresh kernel.
start = 0

# Required if it was stopped during execution (restarting at the last save point)
# _results = pd.read_csv('occurrences-bloomberg.csv', index_col=0)
# print(results.shape, _results.shape)
# results.iloc[:len(_results)] = _results
# start = 48087

# NOTE(review): `start` is used both as the article offset and as the first free
# row of `results`; when resuming, confirm this cannot overwrite restored rows.
counter = start
pbar = tqdm(bloombergs.iloc[start:].iterrows(), total=len(bloombergs)-start)
for i, article in pbar:
    # Skip articles without content; pd.isna does not depend on the missing
    # value being the very same object as np.nan.
    if pd.isna(article.content):
        continue
    # One full article takes about 7 seconds
    found_entities = nlp_utils.find_nyse_corporations(article.content, quiet=True)

    for ent, symbol in found_entities:
        # Article IDs are prefixed with 'b' to mark them as Bloomberg articles.
        results.iloc[counter] = (f'b{i}', symbol, ent.text, ent.start_char, ent.end_char)
        counter += 1
    # Checkpoint whenever this article's matches pushed `counter` across a multiple of 1000.
    if counter // 1000 != (counter - len(found_entities)) // 1000:
        results.dropna().to_csv('occurrences-bloomberg.csv')
        pbar.set_description(f"Stored {counter} entries")
        print(f"Stored {counter} entries")
# Final save so matches found after the last checkpoint are not lost.
results.dropna().to_csv('occurrences-bloomberg.csv')

## Remove headlines from articles and the found entities

In [9]:
# Reload both news sources and add zero-initialised columns for the title and header offsets.
reuters = pd.read_csv(REUTERS, index_col=0)
reuters = reuters[reuters.content.notna()]
print('Reuters Articles', len(reuters))
for _col in ('title_start_idx', 'title_end_idx', 'head_end_idx'):
    reuters[_col] = 0

# The Bloomberg occurrences refer to the original index values, so the index is
# never reset; dropping rows below keeps the remaining labels unchanged.
bloombergs = pd.read_csv(BLOOMBERG, index_col=0)
for _col in ('title_start_idx', 'title_end_idx', 'head_end_idx'):
    bloombergs[_col] = 0

# The articles keep their IDs which is necessary for matching them with the occurrences
# Both filters are not necessary for reuters
# 626 articles with empty content, e.g. b821, b822, b835, ..., b382080
bloombergs = bloombergs[bloombergs.content.notna()]
# Regex wouldn't match because only the title exists: e.g. b19841, b7498, b8401, b11860 (so far all have below 100 chars)
# For reuters there are always more than 100 chars, but sometimes only the meta data (~1000 entries)
# 391 articles, regexes wouldn't match
bloombergs = bloombergs[bloombergs.content.str.len() > 100]
print('Bloomberg Articles', len(bloombergs))

Reuters Articles 106494
Bloomberg Articles 447378
time: 20.7 s


### Data Cleansing

In [10]:
# Load the raw Reuters occurrences and drop matches that are web links.
orig_occ_r = pd.read_csv('occurrences-reuters.csv', index_col=0)
x = orig_occ_r
# All 3406 links in reuters articles are those in the header for the article link (and one in r49915 to uscourts.gov)
# regex=False: match the literal text 'www.'; as a regex the dot would match any character.
x = x[~x.match_text.str.contains('www.', regex=False)]
orig_occ_r = x

time: 416 ms


In [11]:
# Load the raw Bloomberg occurrences and drop link matches near the start of an article.
orig_occ_b = pd.read_csv('occurrences-bloomberg.csv', index_col=0)
# BB: Links starting before index 400 are either "http://ww.bloomberg..." or "Ministry of Economy, Trade and Industry  http://www.meti.go.jp"
# NOTE(review): the comment above says 400, but the filter below uses a cut-off of 300 — confirm which is intended.
x = orig_occ_b
x = x[~x.match_text.str.contains("Ministry of Economy, Trade and Industry")]
# Keep a row if it is not an http link, or if it starts after character 300.
x = x[~x.match_text.str.contains('http:') | (x.start_idx > 300)]
print(x.shape)
print(f'{x.match_text.str.contains("http:").sum()} out of {orig_occ_b.match_text.str.contains("http:").sum()} links are left')
orig_occ_b = x

(291934, 5)
122 out of 111783 links are left
time: 1.85 s


### Filter matches from header

In [None]:
# Pass the cleansed Reuters occurrences through filter_meta_matches once per
# article and store the result as the v2 occurrence file.
# NOTE(review): filter_meta_matches is neither defined nor imported in the
# visible cells — confirm where it comes from before running on a fresh kernel.
occ_r = orig_occ_r
print('Before - Reuters:', orig_occ_r.shape)

# The article ID passed to the filter uses the same 'r' prefix as the occurrence rows.
for i, article in tqdm(reuters.iterrows(), total=len(reuters)):
    occ_r = filter_meta_matches(occ_r, article, f'r{i}')

print('After - Reuters:', occ_r.shape)
# Reduced from 214112 to 214109
occ_r.to_csv('occurrences-reuters-v2.csv')

In [None]:
# Pass the cleansed Bloomberg occurrences through filter_meta_matches once per
# article and store the result as the v2 occurrence file.
# NOTE(review): filter_meta_matches is neither defined nor imported in the
# visible cells — confirm where it comes from before running on a fresh kernel.
occ_b = orig_occ_b
print('Before - Bloomberg:', orig_occ_b.shape)
# Fails: b69482, b76189, b83057, b88059, b97550, b98027, b100132, b107901, b113886, b124492, b134323, b136283, b151695, b163888, b164659, b164663, b166778, b166962, b173433, b176106, b176183, b181211, b185876
# The article ID passed to the filter uses the same 'b' prefix as the occurrence rows.
for i, article in tqdm(bloombergs.iterrows(), total=len(bloombergs)):
    occ_b = filter_meta_matches(occ_b, article, f'b{i}')
print('After - Bloomberg:', occ_b.shape)
# Reduced from 403596 to , from 111778 links to links (which are not in the header)
occ_b.to_csv('occurrences-bloomberg-v2.csv')

### Apply on Reuters Article

In [322]:
# idxmax reuters: 61727  (FB 26, MSFT 1, NWSA 44, NWS 44, YHOO 1)
# skiprows=61727 skips the first 61727 lines of the file, including the real
# header line, so the next line is parsed as the header and nrows=1 reads the
# single row after it. Columns therefore have to be accessed by position.
# NOTE(review): this replaces the full `reuters` frame with a one-row frame.
reuters = pd.read_csv(REUTERS, skiprows=61727, nrows=1, index_col=0)
# print(reuters.loc[61727][2])

time: 943 ms


In [None]:
# Run the spaCy pipeline on the showcase article, print how often each entity
# label and each ORG entity text occurs, then render one sentence with displaCy.
article1 = nlp(reuters.loc[61727][2])
labels = [ent.label_ for ent in article1.ents]
print(Counter(labels))
items = [ent.text for ent in article1.ents if ent.label_ == 'ORG']
print(Counter(items))  # .most_common(3)
sentences = list(article1.sents)
print(sentences[20])
displacy.render(nlp(str(sentences[20])), jupyter=True, style='ent')

In [None]:
# `items` already holds only the texts of ORG entities, so it is counted
# directly. Zipping it with `labels` (the labels of *all* entities) would pair
# each text with the label of an unrelated entity and drop ORG names arbitrarily.
counts = Counter(items)
# For every distinct ORG text: [text, number of mentions, most similar listed company].
matches = [[key, counts[key], securities_ds.get_most_similar_company(key)] for key in counts]
# Keep only the texts for which a company was found.
matches = [x for x in matches if x[2] is not None]
matched_stocks = {x[0]: x[2] for x in matches}
matches

In [None]:
# print(securities_ds.get_most_similar_company('AOL-Time Warner', quiet=False))
# Run the project's matcher on the showcase Reuters article with quiet=False.
found_entities = nlp_utils.find_nyse_corporations(reuters.loc[61727][2], quiet=False)

### Apply on Bloomberg Article

In [71]:
# idxmax bloomberg: 316777  (AAPL 1, FB 79, JPM 1, MSFT 1, MS 7)
# skiprows=316777 also skips the real header line, so the following line is
# parsed as the header and columns have to be accessed by position.
bloomberg = pd.read_csv(BLOOMBERG, skiprows=316777, nrows=1, index_col=0)
# print(bloomberg.loc[316777][2])

time: 4.14 s


In [None]:
# Run the project's matcher on the showcase Bloomberg article with quiet=False.
found_entities = nlp_utils.find_nyse_corporations(bloomberg.loc[316777][2], quiet=False)

### Fix matches for short company name (3M matches too many strings)

In [155]:
occ_b = pd.read_csv('occurrences-bloomberg-v2.csv', index_col=0)
# 32189 entries, this problem seems to only exist for 3M co. [MMM]
# Keep every row of another company; keep an MMM row only if its matched text contains '3M'.
cleaned_up = occ_b[(occ_b['stock_symbol'] != 'MMM') | occ_b.match_text.str.contains('3M')]
# The result is only written back to disk when the next line is uncommented.
# cleaned_up.to_csv('occurrences-bloomberg-v2.csv')

time: 2.41 s


In [156]:
occ_r = pd.read_csv('occurrences-reuters-v2.csv', index_col=0)
# 5751 entries, this problem seems to only exist for 3M co. [MMM]
# Keep every row of another company; keep an MMM row only if its matched text contains '3M'.
# Note that this reuses the name `cleaned_up` from the Bloomberg cell.
cleaned_up = occ_r[(occ_r['stock_symbol'] != 'MMM') | occ_r.match_text.str.contains('3M')]
# The result is only written back to disk when the next line is uncommented.
# cleaned_up.to_csv('occurrences-reuters-v2.csv')

time: 1.92 s


## Reading and analysing highly important articles

In [5]:
def get_barticle(idx):
    """Read a single Bloomberg article from disk and return its third column.

    Only one row of the Bloomberg CSV is parsed. Because ``skiprows=idx`` also
    skips the real header line, the column labels of the resulting frame are
    not meaningful, so the value is taken by position with ``.iloc`` rather
    than through the deprecated positional fallback of ``Series[2]``.

    idx: integer index label of the article in the Bloomberg CSV.
    """
    row = pd.read_csv(BLOOMBERG, skiprows=idx, nrows=1, index_col=0).loc[idx]
    return row.iloc[2]

def get_article(idx):
    """Return the text of the article with the prefixed ID `idx`.

    IDs starting with 'r' are looked up in the in-memory `reuters` frame; every
    other ID is read from the Bloomberg CSV via `get_barticle`.
    """
    number = int(idx[1:])
    if idx[0] != 'r':
        return get_barticle(number)
    return reuters.loc[number].content

def display_article(idx):
    """Render article `idx` with displaCy, highlighting its stored company matches."""
    # Pick the occurrence table that belongs to the article's source.
    if idx[0] == 'r':
        occs = occs_r
    else:
        occs = occs_b

    # One manual entity span per stored match of this article.
    hits = []
    for _, match in occs[occs.article_id == idx].iterrows():
        hits.append({
            'start': match.start_idx,
            'end': match.end_idx,
            'label': f'ORG ({match.stock_symbol})',
        })

    ent_names = np.unique([hit['label'] for hit in hits])
    options = {'ents': ent_names, 'colors': nlp_utils.get_colors(ent_names), 'collapse_punct': True}
    rendered_doc = {
        'text': get_article(idx),  # .replace('$', '\$')
        'ents': hits,
        'title': None
    }
    displacy.render(rendered_doc, options=options, style='ent', manual=True, jupyter=True)

def get_occ_per_article(occs):
    """Count how often each stock symbol is matched in each article.

    occs: DataFrame with one row per match and at least the columns
        'article_id' and 'stock_symbol'.

    Returns a DataFrame indexed by article_id with one integer column per
    stock symbol; pairs that never occur together are 0.
    """
    pair_counts = occs.groupby(['article_id', 'stock_symbol'], sort=False).size()
    # pivot() takes keyword-only arguments since pandas 2.0.
    occ_per_article = (
        pair_counts.reset_index()
        .pivot(index='article_id', columns='stock_symbol')
        .fillna(0)
        .astype(int)
    )
    # Drop the outer column level that stems from the unnamed size column.
    occ_per_article.columns = occ_per_article.columns.droplevel(0)
    return occ_per_article

# Reload the Reuters articles (rows without content removed); Bloomberg articles
# are read from disk one at a time by get_barticle instead.
reuters = pd.read_csv(REUTERS, index_col=0)
reuters = reuters[reuters['content'].notna()]
# bloomberg = pd.read_csv(BLOOMBERG, index_col=0)

# NOTE(review): the v2 files are read from 'reports/' here, while the cells that
# create them write to the current directory — confirm the intended location.
occs_b = pd.read_csv('reports/occurrences-bloomberg-v2.csv', index_col=0)
occs_r = pd.read_csv('reports/occurrences-reuters-v2.csv', index_col=0)

# Article x symbol count matrices.
r = get_occ_per_article(occs_r)
b = get_occ_per_article(occs_b)

# IDs of the 50 articles with the highest total number of matches per source.
r_ids = r.sum(axis=1).nlargest(50).index
b_ids = b.sum(axis=1).nlargest(50).index

time: 5.71 s


In [111]:
# Score each Reuters article by the mean count of its two most frequent companies.
r_score = pd.Series(
    [counts.nlargest(2).mean() for _, counts in tqdm(r.iterrows(), total=len(r))],
    index=r.index,
)

HBox(children=(IntProgress(value=0, max=47179), HTML(value='')))

time: 36.4 s


In [181]:
# Score each Reuters article by the count of its second most frequent company.
r_score2 = pd.Series(
    [counts.nlargest(2).min() for _, counts in tqdm(r.iterrows(), total=len(r))],
    index=r.index,
)

HBox(children=(IntProgress(value=0, max=47179), HTML(value='')))

time: 29.5 s


In [121]:
# Score each Bloomberg article by the mean count of its two most frequent companies.
b_score = pd.Series(
    [counts.nlargest(2).mean() for _, counts in tqdm(b.iterrows(), total=len(b))],
    index=b.index,
)

HBox(children=(IntProgress(value=0, max=79404), HTML(value='')))

time: 1min


In [182]:
# IDs of the 50 highest-scoring articles for each of the three scores.
r_ids2 = r_score.nlargest(50).index
r_ids3 = r_score2.nlargest(50).index
b_ids2 = b_score.nlargest(50).index
# IDs are very similar to each other
# print('\n'.join(f'{x} vs {y}' for x, y in zip(b_ids[:20], b_ids2[:20])))
# print('\n'.join(f'{x} vs {y} vs {z}' for x, y, z in zip(r_ids[:20], r_ids2[:20], r_ids3[:20])))

time: 194 ms


### Outcomes

##### Article r55716
81x Microsoft, 12x Apple

"Goldman" (lvt=0.46), "Google" (vs. Alphabet), "iPad" (vs. Apple), "Hewlett-Packard" (vs. HP Inc.) are not recognized.
Overall analysis of Microsoft's current success and its future plans. Skepticism (downgraded by Goldman Sachs) but also optimism (by its biggest shareholder BlackRock).

##### Article r58694
87x Boeing, 3x Rockwell Collins, 1x General Motors

##### Article r22321
30x Yahoo, 29x Microsoft, 3x Time Warner Inc
Microsoft wants to buy Yahoo, but Yahoo keeps resisting. Google is often mentioned as the big competitor (but is not recognized as a company by the NLP).

In [178]:
# Look up the company name for the ticker symbol 'COL'.
securities_ds.get_company_name('COL')

'Rockwell Collins'

time: 159 ms


In [172]:
# Check which listed company the matcher returns for the text 'Alphabet'.
securities_ds.get_most_similar_company('Alphabet', debug=True, acceptance_rate=0.47)

('Alphabet Inc', 0.0)

time: 185 ms


In [184]:
def inspect(idx):
    """Print the match count and the three most frequent symbols of article `idx`, then render it."""
    is_reuters = idx[0] == 'r'
    # Count matrix and occurrence table of the article's source.
    matches = r if is_reuters else b
    occs = occs_r if is_reuters else occs_b
    article_occs = occs[occs.article_id == idx]
    print('Occurrences in article:', len(article_occs))
    print('Top 3: ', matches.loc[idx].nlargest(3))
#     print(occs[occs.article_id == idx].head(10))
    display_article(idx)
# Inspect the first article of the r_ids3 ranking.
inspect(r_ids3[0])
# inspect(b_ids[0])

Occurrences in article: 64
Top 3:  stock_symbol
YHOO    30
MSFT    29
TWX      3
Name: r22321, dtype: int32


time: 303 ms


In [157]:
# Run the plain spaCy pipeline on the second article of r_ids and render all
# recognised entities (not only the stored company matches).
doc = nlp(get_article(r_ids[1]))
displacy.render(doc, style='ent', jupyter=True)

time: 190 ms


# Update IDs & Merge Occurrence Matrices

In [7]:
# Load the merged news file through the NEWS constant from the configuration
# cell instead of repeating the path as a literal.
news = pd.read_csv(NEWS, index_col=0)

time: 21.6 s


In [8]:
# Lookup tables between the index of the merged news file and the old article IDs.
new_to_old_idx = news.old_idx.to_dict()
# Inverse mapping: old ID -> new index.
old_to_new_idx = dict(zip(new_to_old_idx.values(), new_to_old_idx.keys()))

time: 234 ms


In [None]:
# Load the per-source occurrence matrices.
# NOTE(review): the variable `re` shadows the `re` regex module imported in the
# first cell for the rest of the kernel session; a rename has to be done
# together with every later cell that reads `re`.
bb = pd.read_csv('../data/preprocessed/occurrences/bloomberg_occurrences_matrix.csv', index_col=0)
re = pd.read_csv('../data/preprocessed/occurrences/reuters_occurrences_matrix.csv', index_col=0)

In [None]:
# Append the Bloomberg matrix below the Reuters matrix, shifting the Bloomberg
# index by the number of Reuters rows. The shift is applied to a copy so that
# re-running this cell does not shift `bb` a second time.
bb_shifted = bb.copy()
bb_shifted.index = bb.index + re.index.size
occurrences = pd.concat([re, bb_shifted])
# NOTE(review): the cell that merges the concrete occurrences writes to this
# same file name — confirm that overwriting is intended.
occurrences.to_csv('../data/preprocessed/occurrences/occurrences.csv')

# Update Ids & Merge Concrete Occurrences

In [10]:
# Load the per-source v2 occurrence tables (one row per match).
# NOTE(review): the variable `re` shadows the `re` regex module imported in the
# first cell for the rest of the kernel session.
bb = pd.read_csv('../data/preprocessed/occurrences/occurrences-bloomberg-v2.csv', index_col=0)
re = pd.read_csv('../data/preprocessed/occurrences/occurrences-reuters-v2.csv', index_col=0)

time: 456 ms


In [30]:
# Translate the old article IDs into the index of the merged news file.
# dict.get returns None for an unknown ID, which makes the column non-integer,
# so the assertions catch unmapped IDs. is_integer_dtype is used instead of
# `dtype == int` because the latter compares against the platform's default
# integer width and can fail for int64 columns on some platforms.
re.article_id = re.article_id.apply(old_to_new_idx.get)
assert pd.api.types.is_integer_dtype(re.article_id), "unmapped Reuters article IDs"
bb.article_id = bb.article_id.apply(old_to_new_idx.get)
assert pd.api.types.is_integer_dtype(bb.article_id), "unmapped Bloomberg article IDs"

occurrences = pd.concat([re, bb])
# NOTE(review): the cell that merges the occurrence matrices writes to this same
# file name — confirm that overwriting is intended.
occurrences.to_csv('../data/preprocessed/occurrences/occurrences.csv')

time: 100 ms
