### Content:
- Use spaCy NLP to find companies
- Match companies in news with stock companies (using regex)
- Exclude matches in headers afterwards (using regex)
- Apply NLP on one Reuters and one Bloomberg article as showcases
- Fix matchings for 3M Company
- Generate cooccurrences (value represents number of articles in which two companies occur together)

#### TODO:
- Read through some articles to find a showcase of how it relates to stock prices

In [None]:
import os
import re
import glob
from datetime import datetime
import sys
sys.path.append("..") # Adds higher directory to python modules path for importing from src dir

import pandas as pd
import numpy as np
import tqdm
import matplotlib
from matplotlib import pyplot as plt
from tqdm import tqdm_notebook as tqdm

import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
# Load spaCy's small English pipeline with the 'parser' and 'tagger' components
# switched off (presumably only entity recognition is needed here -- TODO confirm).
nlp = en_core_web_sm.load(disable=['parser', 'tagger'])

# Notebook setup. Comments are kept on their own lines because text following a
# line magic is passed to the magic as part of its arguments.
# Render matplotlib figures inside the notebook.
%matplotlib inline
# autotime prints the execution time of every cell (the "time: ..." output lines).
%load_ext autotime
# autoreload mode 2: re-import modules before running code, so edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# Repository root relative to this notebook, and the data folder inside it.
HOME = ".."
DATA_DIR = "data"

# Paths of the preprocessed news dumps: Reuters only, Bloomberg only, and both.
REUTERS, BLOOMBERG, NEWS = (
    os.path.join(HOME, DATA_DIR, "preprocessed", csv_name)
    for csv_name in ("news_reuters.csv", "news_bloomberg.csv", "news.csv")
)
# Columns: 'date', 'filename', 'content'

In [None]:
# Load the combined news file; the first CSV column becomes the DataFrame index.
news = pd.read_csv(NEWS, index_col=0)

# Findings from a one-off exploration (code left commented out for reference):
# 8650 duplicated filenames, 786 duplicated contents
# d = news.filename.duplicated()
# Index labels are not unique at this point: .loc[label] returns several rows,
# which is why a positional .iloc[1] is needed to pick one of them.
# print(news.loc[1716].iloc[1].content) # 1201 letters
# print(news.loc[1641].iloc[1].content) # 541 letters

In [None]:
def get_old_idx(article):
    """Build the source-prefixed legacy id of one news row.

    Args:
        article: A row object exposing a truthy/falsy ``reuters`` attribute and
            a ``name`` attribute (for a DataFrame row this is its index label).

    Returns:
        ``'r<name>'`` when ``article.reuters`` is truthy, otherwise ``'b<name>'``.
    """
    source_prefix = 'r' if article.reuters else 'b'
    return f'{source_prefix}{article.name}'

# Earlier row-by-row variant, kept for reference only (its two-argument call no
# longer matches get_old_idx, which takes just the row):
# [get_old_idx(idx, article) for idx, article in news.iterrows()];
# Row-wise apply: each row is passed as a Series whose .name is its index label,
# so 'old_idx' ends up holding the source-prefixed original index of every article.
news['old_idx'] = news.apply(get_old_idx, axis=1)

In [None]:
# Replace the index by a fresh 0..n-1 range and keep only the columns that are
# exported; 'old_idx' preserves the link to the previous, source-prefixed ids.
news = news.reset_index(drop=True)[['old_idx', 'date', 'filename', 'content']]

# Lookup tables in both directions: new integer index <-> old prefixed id.
new_to_old_idx = news.old_idx.to_dict()
old_to_new_idx = dict(zip(new_to_old_idx.values(), new_to_old_idx.keys()))

In [None]:
# Persist the re-indexed news inside the preprocessed data folder. The sanity-check
# section reads news-v2.csv from exactly this location; writing it to the current
# working directory would leave that read pointing at a file that does not exist.
news.to_csv(os.path.join(HOME, DATA_DIR, "preprocessed", "news-v2.csv"))

## Sanity Check for Occurrences

In [15]:
# From here on NEWS refers to the re-indexed file news-v2.csv (this overrides the
# news.csv path assigned to NEWS earlier in the notebook).
NEWS = os.path.join(HOME, DATA_DIR, "preprocessed", "news-v2.csv")
# The first CSV column is used as the index of the reloaded frame.
news = pd.read_csv(NEWS, index_col=0)

time: 29.4 s


In [16]:
# Rebuild the lookup tables from the reloaded frame:
# index label -> 'old_idx' value, and the inverse mapping.
new_to_old_idx = news.old_idx.to_dict()
old_to_new_idx = dict(zip(new_to_old_idx.values(), new_to_old_idx.keys()))

time: 630 ms


In [21]:
# Load the per-source occurrence tables (bb = Bloomberg, re = Reuters); the first
# CSV column is used as the index. Later cells read the columns article_id,
# start_idx, end_idx and match_text from these frames.
# NOTE(review): the name `re` rebinds the standard-library `re` module imported at
# the top of the notebook, so `re.<regex function>` calls stop working once this
# cell has run. Renaming it requires updating every later use of `re` as well.
bb = pd.read_csv('../data/preprocessed/occurrences/occurrences-bloomberg-v2.csv', index_col=0)
re = pd.read_csv('../data/preprocessed/occurrences/occurrences-reuters-v2.csv', index_col=0)

time: 704 ms


In [17]:
# Spot check: for a random sample of Bloomberg occurrences, the slice
# [start_idx:end_idx] of the referenced article must equal the stored match_text.
n_tests = 100000
# To check the Reuters occurrences instead, use `re` in place of `bb` in the
# sampling line and in the row lookup below.
sampled_positions = np.random.choice(len(bb), n_tests, replace=False)
for idx in tqdm(sampled_positions):
    occurrence = bb.iloc[idx]
    # article_id is looked up through old_to_new_idx to get the article's row label.
    article = news.loc[old_to_new_idx[occurrence.article_id]]
    str_in_article = article.content[occurrence.start_idx:occurrence.end_idx]
    assert str_in_article == occurrence.match_text, f"Didn't match for occurrence {idx}"

HBox(children=(IntProgress(value=0, max=100000), HTML(value='')))


time: 38.5 s


In [22]:
# Translate article_id in both occurrence tables from the old prefixed ids to the
# new integer index, then stack the tables.
# NOTE: run this cell once per kernel session. Ids that are not keys of
# old_to_new_idx (e.g. ids that were already translated) are mapped to None by dict.get.
for source_occurrences in (re, bb):
    source_occurrences.article_id = source_occurrences.article_id.apply(old_to_new_idx.get)
occurrences = pd.concat([re, bb])

# Repeat the spot check on the combined table; article_id can now be used directly
# as a row label of `news`.
n_tests = 100000
sampled_positions = np.random.choice(len(occurrences), n_tests, replace=False)
for idx in tqdm(sampled_positions):
    occurrence = occurrences.iloc[idx]
    article = news.loc[occurrence.article_id]
    str_in_article = article.content[occurrence.start_idx:occurrence.end_idx]
    assert str_in_article == occurrence.match_text, f"Didn't match for occurrence {idx}"

HBox(children=(IntProgress(value=0, max=100000), HTML(value='')))


time: 42.3 s


## Sanity Check for Entities

In [None]:
# Inputs of the entity sanity check: the extracted entities and the re-indexed news.
ENTITIES, NEWS = (
    os.path.join("..", "data", "preprocessed", csv_name)
    for csv_name in ("entities.csv", "news-v2.csv")
)

In [None]:
# Load both tables with the first CSV column as index. Later cells read the entity
# columns article_id, match_text, start_idx and end_idx, and use article_id as a
# row label of `news`.
entities = pd.read_csv(ENTITIES, index_col=0)
news = pd.read_csv(NEWS, index_col=0)

In [41]:
# Inspect the entities whose match_text was read as NaN: recover the text they
# actually cover from the article content.
missing_text_mask = entities.match_text.isna()
nan_txt_entities = entities[missing_text_mask]
txts = []
for _, row in nan_txt_entities.iterrows():
    article_text = news.loc[row.article_id].content
    txts.append(article_text[row.start_idx:row.end_idx])
txts = np.array(txts)
print(np.unique(txts))  # -> "N/A" or "NA", which pandas transformed into real NaN values
# In entities.csv the match_text field appears empty for such rows,
# e.g. line 2335779: "2291077,39064,,1172,1174,ORG"

time: 2.34 s


In [63]:
# Spot check for entities: the article slice [start_idx:end_idx] must equal the
# stored match_text. Rows whose match_text is a float NaN are accepted as they are.
n_tests = 100000
sampled_positions = np.random.choice(len(entities), n_tests, replace=False)
for idx in tqdm(sampled_positions):
    entity = entities.iloc[idx]
    article = news.loc[entity.article_id]
    str_in_article = article.content[entity.start_idx:entity.end_idx]
    expected_text = entity.match_text
    # The isinstance test comes first so np.isnan is never called on a string.
    text_is_nan = isinstance(expected_text, float) and np.isnan(expected_text)
    assert text_is_nan or str_in_article == expected_text, f"Didn't match entity at {idx}"

HBox(children=(IntProgress(value=0, max=100000), HTML(value='')))

time: 2min 45s


## News-v3
Reuters and Bloomberg articles are now interleaved because the rows are sorted by date. Sorting does not change the index values.

In [None]:
# Build news-v3: the same articles as news-v2, ordered chronologically.
news = pd.read_csv(NEWS, index_col=0)
# Parse the date column of the frame that was just loaded. The previous version
# referenced an undefined name `all_news`, which raised a NameError.
news['date'] = pd.to_datetime(news['date'])
# sort_values reorders the rows but keeps every row's index label.
news = news.sort_values(by=['date'])
news.to_csv('../data/preprocessed/news-v3.csv')