### Content:
- Use spaCy NLP to find companies
- Match companies in news with stock companies (using regex)
- Exclude matches in headers afterwards (using regex)
- Apply NLP on one Reuters and one Bloomberg article as showcases
- Fix matchings for 3M Company
- Generate cooccurrences (value represents number of articles in which two companies occur together)

#### TODO:
- Read through some articles to find a showcase of how it relates to stock prices

In [1]:
import os
import re
import glob
from datetime import datetime
import sys
sys.path.append("..") # Adds higher directory to python modules path for importing from src dir

import pandas as pd
import numpy as np
import tqdm
import matplotlib
from matplotlib import pyplot as plt
from tqdm import tqdm_notebook as tqdm

import src.nlp_utils as nlp_utils
import src.text_classification_utils as tc_utils

import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
nlp = en_core_web_sm.load(disable=['parser', 'tagger'])

%matplotlib inline
%load_ext autotime
%load_ext autoreload
%autoreload 2

  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)


In [60]:
# Project layout: every input/output file lives below HOME/DATA_DIR/preprocessed.
HOME = ".."
DATA_DIR = "data"
REUTERS = os.path.join(HOME, DATA_DIR, "preprocessed", "news_reuters.csv")
BLOOMBERG = os.path.join(HOME, DATA_DIR, "preprocessed", "news_bloomberg.csv")
# Entity paths are derived from HOME/DATA_DIR like all other paths, so changing
# HOME or DATA_DIR moves every file consistently (values are unchanged today).
ENTITIES_v1 = os.path.join(HOME, DATA_DIR, "preprocessed", "entities.csv")
ENTITIES_v2 = os.path.join(HOME, DATA_DIR, "preprocessed", "entities-v2.csv")
ENTITIES = ENTITIES_v2
OCCS = os.path.join(HOME, DATA_DIR, "preprocessed", "occurrences", "occurrences.csv")
NEWS_v2 = os.path.join(HOME, DATA_DIR, "preprocessed", "news-v2.csv")
NEWS_v3 = os.path.join(HOME, DATA_DIR, "preprocessed", "news-v3.csv")
NEWS = NEWS_v3
# Columns of news-v2 / news-v3: 'old_idx', 'date', 'filename', 'content'

time: 336 ms


## Article Id Mapping

In [None]:
# Load the news table; the first CSV column becomes the index.
# NOTE(review): NEWS resolves to news-v3.csv, but this section reads a `reuters`
# column (see get_old_idx) and writes news-v2, so it presumably ran against an
# earlier news file - confirm which input is intended before re-running.
news = pd.read_csv(NEWS, index_col=0)

# 8650 duplicated filenames, 786 duplicated contents
# d = news.filename.duplicated()
# print(news.loc[1716].iloc[1].content) # 1201 letters
# print(news.loc[1641].iloc[1].content) # 541 letters

In [None]:
def get_old_idx(article):
    """Return the legacy id of an article row: a source prefix plus the row label.

    `article` must expose `.reuters` and `.name`. Rows with a truthy `reuters`
    value get the prefix 'r', all others get 'b'.
    """
    if not article.reuters:
        return f'b{article.name}'
    return f'r{article.name}'

# Row-wise apply (axis=1): every row is passed to get_old_idx and the returned
# id string is stored in the new 'old_idx' column.
news['old_idx'] = news.apply(get_old_idx, axis=1)

In [None]:
# Drop the previous index in favour of a fresh 0..n-1 index, then keep only the
# four columns that are persisted.
news.reset_index(drop=True, inplace=True)
news = news.loc[:, ['old_idx', 'date', 'filename', 'content']]

# Lookup tables: new integer index -> old id, and the inverse.
new_to_old_idx = news['old_idx'].to_dict()
old_to_new_idx = {old_id: new_id for new_id, old_id in new_to_old_idx.items()}

In [None]:
# Write to the configured NEWS_v2 path: the sanity-check section reads the file
# from there, whereas the bare 'news-v2.csv' landed in the current working directory.
news.to_csv(NEWS_v2)

## Sanity Check for Occurrences

In [None]:
# Load news-v2; the first CSV column is used as the index.
news = pd.read_csv(NEWS_v2, index_col=0)

In [None]:
# Lookup tables between the index of `news` and the values of its 'old_idx' column.
new_to_old_idx = news['old_idx'].to_dict()
old_to_new_idx = dict(zip(new_to_old_idx.values(), new_to_old_idx.keys()))

In [None]:
# Per-source occurrence tables (Bloomberg and Reuters), indexed by their first CSV column.
bb = pd.read_csv('../data/preprocessed/occurrences/occurrences-bloomberg-v2.csv', index_col=0)
# NOTE(review): binding this frame to `re` shadows the stdlib `re` module imported
# in the first cell; any later regex call through `re` would hit the DataFrame.
re = pd.read_csv('../data/preprocessed/occurrences/occurrences-reuters-v2.csv', index_col=0)

In [None]:
n_tests = 100000
# Spot-check randomly drawn Bloomberg occurrences: the stored match_text must equal
# the article text between start_idx and end_idx. Article ids are still the old ids
# here, so they are translated through old_to_new_idx. (Swap `bb` for `re` to run
# the same check on the Reuters occurrences.)
sampled_rows = np.random.choice(len(bb), n_tests, replace=False)
for idx in tqdm(sampled_rows):
    occurrence = bb.iloc[idx]
    article = news.loc[old_to_new_idx[occurrence.article_id]]
    str_in_article = article.content[occurrence.start_idx:occurrence.end_idx]
    assert str_in_article == occurrence.match_text, f"Didn't match for occurrence {idx}"

In [None]:
# Translate the old ids into the new integer index, then pool both sources.
# NOTE(review): this overwrites article_id in place, so running the cell a second
# time looks up already-translated ids and `.get` returns None for them.
re['article_id'] = re['article_id'].apply(old_to_new_idx.get)
bb['article_id'] = bb['article_id'].apply(old_to_new_idx.get)
occurrences = pd.concat([re, bb])

# Same spot check as before, now addressing articles directly by their new index.
n_tests = 100000
sampled_rows = np.random.choice(len(occurrences), n_tests, replace=False)
for idx in tqdm(sampled_rows):
    occurrence = occurrences.iloc[idx]
    article_text = news.loc[occurrence.article_id].content
    str_in_article = article_text[occurrence.start_idx:occurrence.end_idx]
    assert str_in_article == occurrence.match_text, f"Didn't match for occurrence {idx}"

## Sanity Check for Entities

In [None]:
# Only the first 10000 rows of each file are read for this check.
# NOTE(review): NEWS points at the date-sorted news-v3 file, so its first 10000 rows
# need not contain every article_id referenced by the first 10000 entities - confirm
# the lookups in the following cells cannot hit a missing id.
entities = pd.read_csv(ENTITIES, index_col=0, nrows=10000)
news = pd.read_csv(NEWS, index_col=0, nrows=10000)

In [None]:
# Collect the article text behind every entity whose match_text was read as NaN.
nan_txt_entities = entities[entities.match_text.isna()]
txts = np.array([
    news.loc[row.article_id].content[row.start_idx:row.end_idx]
    for _, row in nan_txt_entities.iterrows()
])
print(np.unique(txts))  # -> "N/A" or "NA", which were turned into real NaN values on load
# In entities.csv the field is empty -> e.g. line 2335779: "2291077,39064,,1172,1174,ORG"

In [None]:
# Spot-check randomly drawn entities: the stored match_text must equal the article
# text between start_idx and end_idx.
# Cap the sample size: np.random.choice(..., replace=False) raises a ValueError when
# asked for more samples than there are rows (entities is loaded with nrows=10000).
n_tests = min(10000, len(entities))
for idx in tqdm(np.random.choice(len(entities), n_tests, replace=False)):
    entity = entities.iloc[idx]
    # Look the article up in `news`, the frame loaded for this check; the previously
    # used name `news3` is never defined in this notebook.
    str_in_article = news.loc[entity.article_id].content[entity.start_idx:entity.end_idx]
    # A NaN match_text is accepted: it stems from texts such as "N/A"/"NA" that were
    # read as missing values.
    text_is_nan = isinstance(entity.match_text, float) and np.isnan(entity.match_text)
    assert text_is_nan or str_in_article == entity.match_text, f"Didn't match entity at {idx}"

## News-v3
Reuters and Bloomberg are now mixed because they are sorted by date. This does not change the indexes.

In [3]:
# Load news-v2, parse the date column into datetimes and order the rows by date.
# The index labels travel with their rows, so article ids stay the same.
news = pd.read_csv(NEWS_v2, index_col=0)
news['date'] = pd.to_datetime(news['date'])
news = news.sort_values(by='date')

time: 41.1 s


In [44]:
# Write through an explicitly opened handle: newline='\n' switches off the platform
# newline translation, and rows are terminated with '\n' as well.
# NOTE(review): newer pandas releases spell the keyword `lineterminator` - confirm
# against the installed pandas version before upgrading.
with open(NEWS_v3, mode='w', newline='\n', encoding='utf-8') as f:
    news.to_csv(f, line_terminator='\n', encoding='utf-8')

time: 1min 30s


### Testing HDF5
Performance roughly as good as csv. --> http://matthewrocklin.com/blog/work/2015/03/16/Fast-Serialization

In [51]:
# Derive the HDF5 path by swapping the trailing 'csv' of NEWS_v3 for 'h5' (the [:-3]
# slice keeps the dot) and store the frame under the key 'df'.
news.to_hdf(NEWS_v3[:-3]+'h5', 'df')

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block0_values] [items->['old_idx', 'filename', 'content']]

  return pytables.to_hdf(path_or_buf, key, self, **kwargs)


time: 1min 28s


In [52]:
# Read the HDF5 copy back from the same derived path (NEWS_v3 with 'csv' swapped for 'h5').
news = pd.read_hdf(NEWS_v3[:-3]+'h5')

time: 48 s


Note: v3 contained \r\r for each \r because of Windows line formatting rules

## Replace entities

#### Merge Entities and Occs

In [None]:
# Load the date-sorted news (news-v3); the first CSV column is used as the index.
news = pd.read_csv(NEWS_v3, index_col=0)

In [59]:
# Inputs for the merge step: entities-v1 and the occurrences table, each indexed by
# its first CSV column.
entities = pd.read_csv(ENTITIES_v1, index_col=0)
occs = pd.read_csv(OCCS, index_col=0)

time: 40.8 s


In [None]:
# Combine entities and occurrences with the project helper.
# NOTE(review): quiet=False presumably enables progress/log output - confirm in src/nlp_utils.
entities = nlp_utils.merge_entities_and_occs(entities, occs, quiet=False)

In [70]:
# Order by article, and within an article by start_idx descending (back to front).
# The removal loop later in this notebook cuts spans in this row order, so removing
# one span never shifts the offsets of the spans that are still to be removed.
entities.sort_values(by=['article_id', 'start_idx'], ascending=[True, False], inplace=True)

time: 1min 15s


In [71]:
# Persist the merged, sorted entities as entities-v2.
entities.to_csv(ENTITIES_v2)

time: 7min 6s


### Notes
- About 2.3 million ORG entities start with a leading space. This might lead to stock symbols not being matched (example: " Exxon Mobile").
- Entity Labels: ['ORG', 'GPE', 'MONEY', 'DATE', 'QUANTITY', 'CARDINAL', 'NORP', 'PERSON', 'PRODUCT', 'LAW', 'PERCENT']
- Company Occurrences are marked with < SYM >

In [63]:
# Reload the persisted entities-v2 table; the first CSV column is used as the index.
entities = pd.read_csv(ENTITIES_v2, index_col=0)

  mask |= (ar1 == a)


time: 1min 20s


In [72]:
# One group of entity rows per article. sort=False skips sorting the group keys;
# the rows inside each group keep the order they have in `entities`.
grouped = entities.groupby('article_id', sort=False)

time: 366 ms


In [65]:
def remove_substr(s, to_idx, from_idx):
    """Return `s` with the slice [to_idx:from_idx] cut out.

    Everything before `to_idx` and everything from `from_idx` onwards is kept.
    """
    head, tail = s[:to_idx], s[from_idx:]
    return head + tail

time: 211 ms


In [None]:
# Cut every entity span out of its article text and store the shortened text in `news`.
# Relies on the rows of each group being ordered back to front (start_idx descending),
# so removing one span never shifts the offsets of the spans still to be removed.
for article_id, group in tqdm(grouped, total=len(grouped.groups)):
    s = news.loc[article_id, 'content']
    for i, row in group.iterrows():
        # end_idx + 1 also removes the character right after the entity (probably a space).
        s = remove_substr(s, row.start_idx, row.end_idx+1)
    # Assign through a single .loc[row, column] call. The previous
    # `news.loc[article_id].content = s` set the value on a temporary row object,
    # so the cleaned text never reached `news`.
    news.loc[article_id, 'content'] = s

HBox(children=(IntProgress(value=0, max=554065), HTML(value='')))