### Content:
- Use SpaCy NLP to find companies
- Match companies in news with stock companies (using regex)
- Exclude matches in headers afterwards (using regex)
- Apply NLP on one Reuters and one Bloomberg article as showcases
- Fix matchings for 3M Company
- Generate cooccurrences (value represents number of articles in which two companies occur together)

#### TODO:
- Read through some articles to find a showcase of how the news relates to stock prices

In [None]:
import os
import re
import glob
from datetime import datetime
import sys
sys.path.append("..") # Adds higher directory to python modules path for importing from src dir

import pandas as pd
import numpy as np
import tqdm
import matplotlib
from matplotlib import pyplot as plt
from tqdm import tqdm_notebook as tqdm

import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
# Load the en_core_web_sm spaCy pipeline with the 'parser' and 'tagger' components disabled.
nlp = en_core_web_sm.load(disable=['parser', 'tagger'])

# Notebook setup: inline matplotlib output, the autotime extension, and
# autoreload in mode 2 (NOTE(review): presumably so edits under src/ are picked up
# without restarting the kernel - confirm).
%matplotlib inline
%load_ext autotime
%load_ext autoreload
%autoreload 2

In [None]:
HOME = ".."
DATA_DIR = "data"
# Paths of the preprocessed news exports, all located under HOME/DATA_DIR/preprocessed.
# Columns: 'date', 'filename', 'content'
REUTERS, BLOOMBERG, NEWS = (
    os.path.join(HOME, DATA_DIR, "preprocessed", csv_name)
    for csv_name in ("news_reuters.csv", "news_bloomberg.csv", "news.csv")
)

### Get companies

In [None]:
from src.datasets import NyseSecuritiesDataset
from src.datasets import NyseStocksDataset
import src.nlp_utils as nlp_utils

In [None]:
# Instantiate the project dataset wrappers for the NYSE price and securities CSV files.
stocks_ds = NyseStocksDataset(file_path='../data/nyse/prices-split-adjusted.csv')
securities_ds = NyseSecuritiesDataset(file_path='../data/nyse/securities.csv')
# Company names of the listed securities.
companies = securities_ds.get_all_company_names()  # List[Tuple[symbol, name]]
# NOTE(review): presumably reads the price file into memory - confirm in src.datasets.
stocks_ds.load()

In [None]:
def find_entities(article, quiet=True):
    """Yield the spaCy entities of an article that lie in its title or body.

    The article text is run through the module-level ``nlp`` pipeline and
    ``nlp_utils.get_meta_indizes(article)`` supplies three character offsets:
    ``(title_start_idx, title_end_idx, head_end_idx)``. An offset of ``-1`` is
    treated as "not available". Entities are filtered by their start offset:

    * title known: keep entities that start inside the title span or at/after
      ``head_end_idx``;
    * only ``head_end_idx`` known: keep entities that start at/after it;
    * neither known: keep every entity.

    Args:
        article: Object exposing a ``content`` string (e.g. a row produced by
            ``DataFrame.iterrows()``); also passed to ``get_meta_indizes``.
        quiet: When False, the annotated document is rendered with displaCy
            once all entities have been yielded (i.e. when the generator is
            exhausted).

    Yields:
        The entity spans of the parsed document that pass the filter above.
    """
    # Keep the parsed document itself: the displaCy rendering below needs it.
    # (The original only kept ``.ents`` and then referenced an undefined ``doc``.)
    doc = nlp(article.content)
    title_start_idx, title_end_idx, head_end_idx = nlp_utils.get_meta_indizes(article)
    for ent in doc.ents:
        start = ent.start_char
        if title_start_idx != -1:
            keep = (title_start_idx <= start <= title_end_idx) or head_end_idx <= start
        elif head_end_idx != -1:
            keep = head_end_idx <= start
        else:
            keep = True
        if keep:
            yield ent
    if not quiet:
        displacy.render(doc, style='ent', jupyter=True)

In [None]:
def to_frame(arr, amount=None):
    """Wrap a list of entity tuples in a DataFrame with the entity columns.

    Args:
        arr: Sequence of 5-tuples
            (article_id, match_text, start_idx, end_idx, label).
        amount: Number of rows used to build the 0-based index; when falsy
            (None or 0) the length of ``arr`` is used instead.

    Returns:
        A DataFrame indexed by ``range(n_rows)``.
    """
    column_names = ['article_id', 'match_text', 'start_idx', 'end_idx', 'label']
    n_rows = amount if amount else len(arr)
    return pd.DataFrame(arr, index=range(n_rows), columns=column_names)

Reuters:
- 106519 articles
- From 2006-10-20 to 2013-11-20
- 45363 before 2010-01-04
- Took 2h 26m
- 6.788.173 entities
- `./entities-reuters.csv`[273 MB]

Bloomberg:
- 448395 articles
- From 2006-10-20 to 2013-11-26
- 1148 before 2010-01-04
-
-
-

Nyse:
- From 2010-01-04 to 2016-12-30

# Reuters

In [None]:
# Load the preprocessed Reuters articles and report the row count
# before and after discarding rows without content.
reuters = pd.read_csv(REUTERS, index_col=0)  # nrows=45363
print(len(reuters))
reuters = reuters.dropna(subset=['content'])
print(len(reuters))

In [None]:
# Extract the entities of every Reuters article. Each result row is
# (article id prefixed with 'r', entity text, start char, end char, entity label).
pbar = tqdm(reuters.iterrows(), total=len(reuters))
counter = 0
results = []
checkpoint_every = 300000  # write an intermediate CSV roughly every this many entities
# Deliberate guard: remove it only when the entities file should be regenerated.
assert False, 'prevent overwriting existing entities file'
for i, article in pbar:
    prev_counter = counter
    for ent in find_entities(article):
        results.append((f'r{i}', ent.text, ent.start_char, ent.end_char, ent.label_))
        counter += 1
    # Checkpoint when this article pushed the total past another multiple of
    # `checkpoint_every` (the previous condition used the undefined name `found_entities`).
    if counter // checkpoint_every > prev_counter // checkpoint_every:
        to_frame(results, counter).to_csv('entities-reuters.csv')
        pbar.set_description(f"Stored {counter} entities")
# Final write. Keep the DataFrame in `results`; to_csv returns None when given a path.
results = to_frame(results, counter)
results.to_csv('entities-reuters.csv')

# Bloomberg

In [None]:
# Load the preprocessed Bloomberg articles and report the row count.
bloomberg = pd.read_csv(BLOOMBERG, index_col=0)  # nrows=1148
print(len(bloomberg))
# Rows without content are intentionally kept here (left commented out on purpose):
# The final used indexes would be confused if this line is executed
# bloomberg = bloomberg[bloomberg['content'].notna()]

In [None]:
# Extract the entities of the Bloomberg articles (part 1, starting at the first row).
# Each result row is
# (article id prefixed with 'b', entity text, start char, end char, entity label).
pbar = tqdm(bloomberg.iterrows(), total=len(bloomberg))
counter = 0
results = []
checkpoint_every = 300000  # write an intermediate CSV roughly every this many entities
# Deliberate guard: remove it only when the entities file should be regenerated.
assert False, 'prevent overwriting existing entities file'
for i, article in pbar:
    # Rows with missing content are not strings (e.g. NaN) and are skipped.
    if not isinstance(article.content, str):
        continue
    prev_counter = counter
    for ent in find_entities(article):
        results.append((f'b{i}', ent.text, ent.start_char, ent.end_char, ent.label_))
        counter += 1
    # Checkpoint when this article pushed the total past another multiple of
    # `checkpoint_every` (the previous condition used the undefined name `found_entities`).
    if counter // checkpoint_every > prev_counter // checkpoint_every:
        to_frame(results, counter).to_csv('entities-bloomberg-1.csv')
        pbar.set_description(f"Stored {counter} entities")
# Final write. Keep the DataFrame in `results`; to_csv returns None when given a path.
results = to_frame(results, counter)
results.to_csv('entities-bloomberg-1.csv')

In [None]:
# Extract the entities of the Bloomberg articles (part 2), resuming at row position `start`.
start = 148877
counter = 0
pbar = tqdm(bloomberg.iloc[start:].iterrows(), total=len(bloomberg)-start)
results = []
# Deliberate guard: remove it only when the entities file should be regenerated.
assert False, 'prevent overwriting existing entities file'
for i, article in pbar:
    # Rows with missing content are not strings (e.g. NaN) and are skipped.
    if not isinstance(article.content, str):
        continue
    prev_counter = counter
    for ent in find_entities(article):
        results.append((f'b{i}', ent.text, ent.start_char, ent.end_char, ent.label_))
        counter += 1
    # The remainder wraps around whenever the total passes a multiple of 500000:
    # use that as the trigger for an intermediate checkpoint.
    if (prev_counter % 500000) > (counter % 500000):
        to_frame(results, counter).to_csv('entities-bloomberg-2.csv')
        pbar.set_description(f"Stored {counter} entities")
# Final write. Keep the DataFrame in `results`; to_csv returns None when given a path,
# so assigning its return value would discard the collected entities.
results = to_frame(results, counter)
results.to_csv('entities-bloomberg-2.csv')

In [None]:
# Extract the entities of the Bloomberg articles (part 3), resuming at row position `start`.
start = 335618
counter = 0
pbar = tqdm(bloomberg.iloc[start:].iterrows(), total=len(bloomberg)-start)
results = []
# Deliberate guard against regenerating an existing entities file.
assert False, 'prevent overwriting existing entities file'
for i, article in pbar:
    # Only string content can be parsed; anything else (missing content) is skipped.
    if not isinstance(article.content, str):
        continue
    prev_counter = counter
    results.extend(
        (f'b{i}', ent.text, ent.start_char, ent.end_char, ent.label_)
        for ent in find_entities(article)
    )
    # One row is appended per entity, so the running total equals the list length.
    counter = len(results)
    # A smaller remainder than before means a multiple of 500000 was passed: checkpoint.
    passed_checkpoint = (counter % 500000) < (prev_counter % 500000)
    if passed_checkpoint:
        to_frame(results, counter).to_csv('entities-bloomberg-3.csv')
        pbar.set_description(f"Stored {counter} entities")
results = to_frame(results, counter)
results.to_csv('entities-bloomberg-3.csv')

### Merge Bloomberg Steps

In [None]:
# After duplicating entities-bloomberg-1.csv
# (presumably as entities-bloomberg.csv, so the target already starts with the header row).
# header=False: appending with the default header=True would write the column names
# again as a line in the middle of the file, which corrupts the dtypes on re-read.
pd.read_csv('entities-bloomberg-2.csv', index_col=0).to_csv('entities-bloomberg.csv', mode='a', header=False)
pd.read_csv('entities-bloomberg-3.csv', index_col=0).to_csv('entities-bloomberg.csv', mode='a', header=False)

In [None]:
# index_col=0: the first CSV column is the row index written by to_csv. Without it the
# index is loaded as an extra 'Unnamed: 0' data column that is written out again on save,
# giving the Bloomberg rows one column more than the Reuters rows.
merged = pd.read_csv('entities-bloomberg.csv', index_col=0)

In [None]:
# Original: 33.914.235 entries -> After removing duplicates: 33.913.975 entries
# Duplicate articles: b148877, b335618
# An entity is identified by its article and start offset; keep the first occurrence
# and renumber the rows from 0.
merged = (
    merged
    .drop_duplicates(subset=['article_id', 'start_idx'])
    .reset_index(drop=True)
)

In [None]:
# Overwrite the merged Bloomberg entities file with the de-duplicated frame.
merged.to_csv('entities-bloomberg.csv')

## Apply new mapping

In [4]:
# Build lookup tables between the row index of news-v2.csv (new id)
# and its 'old_idx' column (old id).
news = pd.read_csv('news-v2.csv', usecols=['old_idx'])
new_to_old_idx = news['old_idx'].to_dict()
# Invert the mapping; if an old id occurs more than once, the last new id wins.
old_to_new_idx = dict(zip(new_to_old_idx.values(), new_to_old_idx.keys()))

In [7]:
# Load the Reuters entities and translate every article id to the new index;
# ids missing from the mapping become None (dict.get default).
entities_reuters = pd.read_csv('entities-reuters.csv', index_col=0)
entities_reuters['article_id'] = entities_reuters['article_id'].apply(old_to_new_idx.get)

In [8]:
# Start the combined entities file with the remapped Reuters entities
# (default write mode, so an existing entities.csv is replaced).
entities_reuters.to_csv('entities.csv')

In [5]:
# Load the merged Bloomberg entities and translate every article id to the new index;
# ids missing from the mapping become None (dict.get default).
entities_bloomberg = pd.read_csv('entities-bloomberg.csv', index_col=0)
entities_bloomberg['article_id'] = entities_bloomberg['article_id'].apply(old_to_new_idx.get)

In [9]:
# Append the Bloomberg entities to the file that already holds the Reuters entities.
# header=False: entities.csv already starts with the header line; appending with the
# default header=True added the header line a second time in the middle of the file
# and led to wrong dtypes when reading it back.
entities_bloomberg.to_csv('entities.csv', mode='a', header=False)
# Repairing a file that was written by the earlier version of this cell:
# Find the repeated header line:   awk '/^,article_id/ {print FNR}' entities.csv
# Validate by printing it:   sed '6870969q; d' entities.csv
# Delete line:   sed -i '6870969d' entities.csv

40.702.148 Entities found in 554.068 articles