In [1]:
import os
import re
import glob
from datetime import datetime

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from tqdm import tqdm_notebook as tqdm

%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
%load_ext autotime

In [3]:
print("Define constants")

# Project root and data folder name, expressed as relative path components.
HOME, DATA_DIR = "..", "data"

# Raw article dumps: one directory per news source under the shared raw folder.
REUTERS, BLOOMBERG = (
    os.path.join(HOME, DATA_DIR, "bloomberg_reuters", "raw", source)
    for source in ("reuters", "bloomberg")
)

Define constants
time: 256 ms


In [93]:
# Collect every Reuters article path: REUTERS/<folder>/<article file>.
# glob returns entries in arbitrary, filesystem-dependent order, so both levels
# are sorted to give the same list (and later DataFrame row) order on every run.
# The former `recursive=True` was dropped: it only affects patterns containing '**'.
files_re = []
for folder in sorted(glob.glob(os.path.join(REUTERS, '*'))):
    for filename in sorted(glob.glob(os.path.join(folder, '*'))):
        # Everything one level below a folder must be a regular file.
        assert os.path.isfile(filename), filename
        files_re.append(filename)
print(len(files_re))

106519


In [4]:
# Collect every Bloomberg article path: BLOOMBERG/<folder>/<article file>.
# glob returns entries in arbitrary, filesystem-dependent order, so both levels
# are sorted to give the same list order on every run. Stable order matters
# because the Bloomberg articles are later processed in positional chunks.
# The former `recursive=True` was dropped: it only affects patterns containing '**'.
files_bb = []
for folder in sorted(glob.glob(os.path.join(BLOOMBERG, '*'))):
    for filename in sorted(glob.glob(os.path.join(folder, '*'))):
        # Everything one level below a folder must be a regular file.
        assert os.path.isfile(filename), filename
        files_bb.append(filename)
print(len(files_bb))

448395
time: 1min 11s


In [5]:
print('Define functions')

def split(path):
    """Break ``path`` into ``(remaining_dir, parent_dir_name, filename)``.

    The last two components are peeled off with ``os.path.split``: first the
    file name, then the name of the directory that directly contains it.
    Whatever is left of the path is returned as the first element.
    """
    parent_path, filename = os.path.split(path)
    return (*os.path.split(parent_path), filename)

def read_content(path, datef='%Y-%m-%d'):
    """Read one article file and derive its publish date from its folder name.

    Parameters
    ----------
    path : str
        Path of the article; the name of its parent directory is parsed as
        the publish date.
    datef : str
        ``strptime`` format of that parent directory name.

    Returns
    -------
    tuple
        ``(publish_date, filename, content)`` where ``publish_date`` is a
        ``datetime`` and ``content`` is the file text decoded as UTF-8.

    Any error raised while reading is reported with the offending path and
    then re-raised; a folder name that does not match ``datef`` raises
    ``ValueError`` from ``strptime``.
    """
    _, date_folder, filename = split(path)
    with open(path, encoding='utf8') as handle:
        try:
            content = handle.read()
        except Exception as err:
            # Name the file before propagating so the bad input can be located.
            print('Failed reading', path)
            raise err
    return datetime.strptime(date_folder, datef), filename, content

Define functions
time: 276 ms


In [6]:
print('Reading all Bloomberg files')
# One record per file: (publish date, file name, article text). The folder
# names are parsed with read_content's default '%Y-%m-%d' format.
df_bloomberg = pd.DataFrame(
    list(map(read_content, files_bb)),
    columns=['date', 'filename', 'content'],
)

Reading all Bloomberg files
time: 4min 1s


In [149]:
print('Reading all Reuters files')
# One record per file: (publish date, file name, article text). Reuters folder
# names are parsed with the compact '%Y%m%d' format instead of the default.
df_reuters = pd.DataFrame(
    [read_content(article_path, datef='%Y%m%d') for article_path in files_re],
    columns=['date', 'filename', 'content'],
)

Reading all Reuters files


In [7]:
import nyse

time: 446 ms


In [108]:
# NOTE(review): presumably loads the listing data behind nyse.securities and
# nyse.get_name, which count_occurrences relies on — confirm in the nyse module.
nyse.load()

def find_all(text, substring):
    """Return the start offsets of every literal occurrence of ``substring``.

    Matches are non-overlapping and reported left to right. ``substring`` is
    treated as plain text, not as a regular expression, so names containing
    regex metacharacters (``.``, ``(``, ``+``, ``&`` ...) match only themselves.

    Parameters
    ----------
    text : str
        Text to search.
    substring : str
        Literal text to look for.

    Returns
    -------
    list of int
        Start index of each match; empty when there is none or when
        ``substring`` is the empty string.
    """
    if substring == "":
        # An empty pattern would "match" at every position of the text.
        return []
    return [m.start() for m in re.finditer(re.escape(substring), text)]

def count_occurrences(df_articles, start=0, end=None):
    """Count how often each listed company's name appears in each article.

    Parameters
    ----------
    df_articles : pandas.DataFrame
        Must have a ``content`` column holding the article text.
    start : int
        Positional (``iloc``) index of the first row to process.
    end : int or None
        Positional index one past the last row to process; ``None`` means
        through the end of ``df_articles``.

    Returns
    -------
    pandas.DataFrame
        Integer match counts, indexed like the selected slice of
        ``df_articles`` with one column per entry of
        ``nyse.securities['Ticker symbol']``.
    """
    subset = df_articles.iloc[start:end]
    symbols = nyse.securities['Ticker symbol']
    # Resolve every company name once instead of once per (article, symbol) pair.
    # NOTE(review): assumes nyse.get_name is a pure lookup — confirm.
    companies = [nyse.get_name(symbol) for symbol in symbols]

    # Counts are written into a plain array by position. The previous chained
    # assignment df[symbol][idx] = n may write to a temporary copy and is a
    # no-op under pandas copy-on-write.
    counts = np.zeros((len(subset), len(companies)), dtype=np.int64)
    # total uses the real slice length so the bar is right even if end overshoots.
    for row_pos, content in enumerate(tqdm(subset['content'], total=len(subset))):
        for col_pos, company in enumerate(companies):
            counts[row_pos, col_pos] = len(find_all(content, company))
    return pd.DataFrame(counts, index=subset.index, columns=symbols)

time: 3 s


### Occurrences Reuters

In [230]:
# Count company-name matches for every Reuters article (whole frame, no slicing).
df_occurrences_reuters = count_occurrences(df_reuters)

HBox(children=(IntProgress(value=0, max=100519), HTML(value='')))

time: 6h 51min


In [238]:
# Grand total of matches: sum over articles per symbol, then over all symbols.
df_occurrences_reuters.sum().sum()

112560

time: 622 ms


In [239]:
# Persist the Reuters counts to CSV in the current working directory.
df_occurrences_reuters.to_csv('reuters_occurrences.csv')

time: 58.3 s


### Occurrences Bloomberg

In [None]:
# Bloomberg chunk 1: positional rows 0 up to (not including) 100000.
df_occurrences_bloomberg = count_occurrences(df_bloomberg, start=0, end=100000)

In [25]:
# Persist Bloomberg chunk 1 to CSV.
# NOTE(review): the cell that builds df_occurrences_bloomberg shows no execution
# count in this saved notebook — confirm the variable exists before running this.
df_occurrences_bloomberg.to_csv('bloomberg_occurrences_1_100k.csv')

time: 41.9 s


In [10]:
# Bloomberg chunk 2: positional rows 100000 up to (not including) 200000.
df_occurrences_bloomberg2 = count_occurrences(df_bloomberg, start=100000, end=200000)

HBox(children=(IntProgress(value=0, max=100000), HTML(value='')))


time: 8h 8min 57s


In [11]:
# Persist Bloomberg chunk 2 to CSV.
df_occurrences_bloomberg2.to_csv('bloomberg_occurrences_2_100k.csv')

time: 44.9 s


In [104]:
# Bloomberg chunk 3: positional rows 200000 up to (not including) 300000.
df_occurrences_bloomberg3 = count_occurrences(df_bloomberg, start=200000, end=300000)

HBox(children=(IntProgress(value=0, max=100000), HTML(value='')))


time: 6h 30min 28s


In [105]:
# Persist Bloomberg chunk 3 to CSV.
df_occurrences_bloomberg3.to_csv('bloomberg_occurrences_3_100k.csv')

time: 3min 13s


In [106]:
# Bloomberg chunk 4: positional rows 300000 up to (not including) 400000.
df_occurrences_bloomberg4 = count_occurrences(df_bloomberg, start=300000, end=400000)

HBox(children=(IntProgress(value=0, max=100000), HTML(value='')))


time: 6h 29min 12s


In [107]:
# Persist Bloomberg chunk 4 to CSV.
df_occurrences_bloomberg4.to_csv('bloomberg_occurrences_4_100k.csv')

time: 3min 14s


In [109]:
# Bloomberg chunk 5: positional row 400000 through the end of the frame.
df_occurrences_bloomberg5 = count_occurrences(df_bloomberg, start=400000)

HBox(children=(IntProgress(value=0, max=48395), HTML(value='')))


time: 3h 6min 38s


In [110]:
# Persist Bloomberg chunk 5 (the remaining rows) to CSV.
df_occurrences_bloomberg5.to_csv('bloomberg_occurrences_5_50k.csv')

time: 21 s


TODO: look into the Jupyter notebook extensions described at https://ndres.me/post/best-jupyter-notebook-extensions/