In [None]:
import os
import re
import glob
from datetime import datetime
import sys
sys.path.append("..") # Adds higher directory to python modules path for importing from src dir

import pandas as pd
import numpy as np
import tqdm
import matplotlib
from matplotlib import pyplot as plt
from tqdm import tqdm_notebook as tqdm

import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
nlp = en_core_web_sm.load()

%matplotlib inline
%load_ext autotime
%load_ext autoreload
%autoreload 2

In [None]:
# Locations of the preprocessed news dumps, relative to this notebook.
HOME = ".."
DATA_DIR = "data"
_PREPROCESSED_DIR = os.path.join(HOME, DATA_DIR, "preprocessed")
REUTERS = os.path.join(_PREPROCESSED_DIR, "news_reuters.csv")
BLOOMBERG = os.path.join(_PREPROCESSED_DIR, "news_bloomberg.csv")
NEWS = os.path.join(_PREPROCESSED_DIR, "news.csv")
# Columns: 'date', 'filename', 'content'

### Get companies

In [None]:
from src.datasets import NyseSecuritiesDataset
from src.datasets import NyseStocksDataset
import src.nlp_utils as nlp_utils

In [None]:
# Build the securities path from the shared HOME / DATA_DIR constants instead
# of repeating a hard-coded '../data/...' string.
securities_ds = NyseSecuritiesDataset(
    file_path=os.path.join(HOME, DATA_DIR, 'nyse', 'securities.csv'))
companies = securities_ds.get_all_company_names()  # List[Tuple[symbol, name]]

Reuters:
- 106519 articles
- From 2006-10-20 to 2013-11-20
- 45363 before 2010-01-04

Bloomberg:
- 448395 articles
- From 2006-10-20 to 2013-11-26
- 1148 before 2010-01-04

Nyse:
- From 2010-01-04 to 2016-12-30

In [None]:
# Read only the index column and the date column of the Reuters dump.
dates_r = pd.read_csv(REUTERS, index_col=0, usecols=[0, 1])
# errors='coerce' turns unparseable dates into NaT instead of raising.
dates_r['date'] = pd.to_datetime(dates_r['date'], errors='coerce')
dates_r['date'].hist()
# Count the articles dated on or before 2010-01-04.
nyse_start = pd.to_datetime('2010-01-04')
print((dates_r['date'] <= nyse_start).sum())

In [None]:
# Read only the index column and the date column of the Bloomberg dump.
dates_b = pd.read_csv(BLOOMBERG, index_col=0, usecols=[0, 1])
# errors='coerce' turns unparseable dates into NaT instead of raising.
dates_b['date'] = pd.to_datetime(dates_b['date'], errors='coerce')
dates_b['date'].hist()
# Count the articles dated on or before 2010-01-04.
nyse_start = pd.to_datetime('2010-01-04')
print((dates_b['date'] <= nyse_start).sum())

# Multi Run

#### Before NYSE
- All from Reuters and Bloomberg before first entry of NYSE dataset
- 45363 articles from Reuters
- 1148 articles from Bloomberg
- Resulted in about 102.735 company occurrences
- `./occurrences-before-nyse.csv` [3.9 MB]

#### Reuters

- Took 53h (2d 2h 58min 18s)
- Analysed 106.519 articles (106.494 included content)
- Resulted in 217.518 company occurrences
- ... in 52.210 different articles
- `./occurrences-reuters.csv` [8.5 MB]

In [None]:
# Load every Reuters article, then keep only those that have content.
# The article count is printed before and after the filter.
reuters = pd.read_csv(REUTERS, index_col=0)  # nrows=45363
print(len(reuters))
reuters = reuters.dropna(subset=['content'])
print(len(reuters))

In [None]:
# Pre-allocate an empty occurrence table with five rows per Reuters article.
# `counter` starts at the first row of that table.
occurrence_columns = ['article_id', 'stock_symbol', 'match_text', 'start_idx', 'end_idx']
results = pd.DataFrame(index=range(len(reuters) * 5), columns=occurrence_columns)
counter = 0

In [None]:
# Scan every Reuters article for NYSE company mentions, writing each match
# into the pre-allocated `results` table and checkpointing it to disk.
checkpoint_every = 500  # rewrite the CSV each time this many rows were added
pbar = tqdm(reuters.iterrows(), total=len(reuters))
for i, article in pbar:
    rows_before = counter
    # One full article takes about 7 seconds
    # Select the text by column name: an integer key on a label-indexed
    # Series relies on a deprecated positional fallback.
    found_entities = nlp_utils.find_nyse_corporations(article['content'], quiet=True)
    for ent, symbol in found_entities:
        results.iloc[counter] = (f'r{i}', symbol, ent.text, ent.start_char, ent.end_char)
        counter += 1
    # Checkpoint when this article pushed the row count across a multiple of
    # `checkpoint_every`. The comparison uses the count from before the
    # article, so the crossing is detected for the rows just written.
    if counter // checkpoint_every != rows_before // checkpoint_every:
        results.dropna().to_csv('occurrences-reuters.csv')
        pbar.set_description(f"Stored {counter} entries")
# Final write so rows added after the last checkpoint are persisted too.
results.dropna().to_csv('occurrences-reuters.csv')

#### Bloomberg
- Took ?
- Analysed 448.395 articles (447.769 included content)
- Resulted in ? company occurrences
- ... in ? different articles
- `./occurrences-bloomberg.csv` [? MB]

In [None]:
# Load every Bloomberg article, then keep only those that have content.
# The article count is printed before and after the filter.
bloombergs = pd.read_csv(BLOOMBERG, index_col=0)  # nrows=1148
print(len(bloombergs))
bloombergs = bloombergs.dropna(subset=['content'])
print(len(bloombergs))

In [None]:
# Pre-allocate an empty occurrence table with ten rows per Bloomberg article.
# `counter` starts at the first row of that table.
occurrence_columns = ['article_id', 'stock_symbol', 'match_text', 'start_idx', 'end_idx']
results = pd.DataFrame(index=range(len(bloombergs) * 10), columns=occurrence_columns)
counter = 0

In [None]:
# Scan every Bloomberg article for NYSE company mentions, writing each match
# into the pre-allocated `results` table and checkpointing it to disk.
checkpoint_every = 100  # rewrite the CSV each time this many rows were added
pbar = tqdm(bloombergs.iterrows(), total=len(bloombergs))
for i, article in pbar:
    rows_before = counter
    # One full article takes about 7 seconds
    # Select the text by column name: an integer key on a label-indexed
    # Series relies on a deprecated positional fallback.
    found_entities = nlp_utils.find_nyse_corporations(article['content'], quiet=True)
    # The loop header was previously broken across two lines
    # (`foun` / `d_entities:`), which is a syntax error.
    for ent, symbol in found_entities:
        results.iloc[counter] = (f'b{i}', symbol, ent.text, ent.start_char, ent.end_char)
        counter += 1
    # Checkpoint when this article pushed the row count across a multiple of
    # `checkpoint_every`. The comparison uses the count from before the
    # article, so the crossing is detected for the rows just written.
    if counter // checkpoint_every != rows_before // checkpoint_every:
        results.dropna().to_csv('occurrences-b1.csv')
        pbar.set_description(f"Stored {counter} entries")
# Final write so rows added after the last checkpoint are persisted too.
results.dropna().to_csv('occurrences-b1.csv')

## Remove headlines from articles and the found entities

In [None]:
# Add a `head_end` column to both article tables. The previous
# `.loc['head_end'] = 0` appended a *row* labelled 'head_end' whose content
# was 0, which then broke the iterrows() loops below.
# NOTE(review): presumably a column was intended; nothing visible reads it yet.
reuters = reuters.assign(head_end=0)
bloombergs = bloombergs.assign(head_end=0)
# NOTE(review): `results` holds whatever the most recent scan loop produced;
# confirm it contains the occurrences you mean to filter.
r = results.dropna()


def _report(stage):
    """Print the shape of `r` and how many matches look like URLs."""
    print(f'{stage}:', r.shape)
    # regex=False: the '.' in 'www.' must be a literal dot, not a wildcard.
    print('www:', r.match_text.str.contains('www.', regex=False).sum(),
          'http:', r.match_text.str.contains('http:', regex=False).sum())


_report('Before')
# remove_meta = re.compile(r'(--.*\n)+[\n\s]*')
# Leading header: a `-- <title>` line (group 1), one or more further `--`
# lines, then any trailing whitespace/newlines.
remove_meta = re.compile(r'-- (.*)\n(?:--.*\n)+[\n\s]*')

def filter_meta_matches(article, article_id):
    """Drop occurrences of one article that lie inside its metadata header.

    Rows of the module-level frame `r` are kept when they belong to another
    article, start at or after the end of the header, or lie completely
    inside the title line (regex group 1). `r` is rebound to the result.

    Parameters
    ----------
    article : row with a 'content' entry holding the raw article text.
    article_id : id as stored in `r.article_id` (e.g. 'r123' / 'b123').

    Returns
    -------
    None. Articles whose text does not start with a header are left alone.
    """
    # Without this declaration the assignment below makes `r` local and the
    # first read raises UnboundLocalError.
    global r
    match = remove_meta.match(article['content'])
    if match is None:
        return
    title_start_idx, title_end_idx = match.span(1)
    head_end_idx = match.end()
    inside_title = (r.start_idx.between(title_start_idx, title_end_idx) &
                    r.end_idx.between(title_start_idx, title_end_idx))
    r = r[(r.article_id != article_id) | (r.start_idx >= head_end_idx) | inside_title]

for i, article in tqdm(reuters.iterrows(), total=len(reuters)):
    filter_meta_matches(article, f'r{i}')

_report('After Reuters')

for i, article in tqdm(bloombergs.iterrows(), total=len(bloombergs)):
    filter_meta_matches(article, f'b{i}')

_report('After Bloomberg')

### Apply on Reuters Article

In [None]:
# idxmax reuters: 61727  (FB 26, MSFT 1, NWSA 44, NWS 44, YHOO 1)
reuters = pd.read_csv(REUTERS, skiprows=61727, nrows=1, index_col=0)
print(reuters.loc[61727][2])

In [None]:
# Run the spaCy pipeline on the sample article. `.iloc[2]` selects the third
# field explicitly by position; a bare `[2]` on a label-indexed Series relies
# on a deprecated positional fallback.
article1 = nlp(reuters.loc[61727].iloc[2])
# Entity labels of all recognised entities, with their frequencies.
labels = [ent.label_ for ent in article1.ents]
print(Counter(labels))
# Texts of the ORG entities only, with their frequencies.
items = [ent.text for ent in article1.ents if ent.label_ == 'ORG']
print(Counter(items))  # .most_common(3)
# Render the entities of one example sentence.
sentences = list(article1.sents)
print(sentences[20])
displacy.render(nlp(str(sentences[20])), jupyter=True, style='ent')

In [None]:
# `items` already contains only the ORG entity texts, so count them directly.
# The previous zip(items, labels) paired the i-th ORG text with the label of
# the i-th entity of *any* type, which silently dropped ORG names.
counts = Counter(items)
# [entity text, number of mentions, most similar company or None]
matches = [
    [name, n_mentions, securities_ds.get_most_similar_company(name)]
    for name, n_mentions in counts.items()]
# Keep only the entities for which a company was found.
matches = [m for m in matches if m[2] is not None]
matched_stocks = {m[0]: m[2] for m in matches}
matches

In [None]:
# Spot-check the company matcher on one hand-picked name.
# NOTE(review): quiet=False presumably prints matching details — confirm in src.datasets.
securities_ds.get_most_similar_company('AOL-Time Warner', quiet=False)

In [None]:
# Run the corporation finder on the sample Reuters article. `.iloc[2]` selects
# the third field by position; a bare `[2]` on a label-indexed Series relies
# on a deprecated positional fallback.
found_entities = nlp_utils.find_nyse_corporations(reuters.loc[61727].iloc[2], quiet=False)

### Apply on Bloomberg Article

In [None]:
# idxmax bloomberg: 316777  (AAPL 1, FB 79, JPM 1, MSFT 1, MS 7)
bloomberg = pd.read_csv(BLOOMBERG, skiprows=316777, nrows=1, index_col=0)
# print(bloomberg.loc[316777][2])

In [None]:
# Run the corporation finder on the sample Bloomberg article. `.iloc[2]`
# selects the third field by position; a bare `[2]` on a label-indexed Series
# relies on a deprecated positional fallback.
found_entities = nlp_utils.find_nyse_corporations(bloomberg.loc[316777].iloc[2], quiet=False)