In [1]:
import os
import re
import glob
from datetime import datetime

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from tqdm import tqdm_notebook as tqdm

%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
%load_ext autotime

In [217]:
print("Define constants")
HOME = ".."
DATA_DIR = "data"
REUTERS = os.path.join(HOME, DATA_DIR, "bloomberg_reuters", "raw", "reuters")
BLOOMBERG = os.path.join(HOME, DATA_DIR, "bloomberg_reuters", "raw", "bloomberg")

Define constants


In [222]:
files_re = []
for folder in glob.iglob(os.path.join(REUTERS, '*'), recursive=True):
    for filename in glob.iglob(os.path.join(folder, '*'), recursive=True):
        assert os.path.isfile(filename), filename
        files_re.append(filename)
print(len(files_re))

106519


In [218]:
files_bb = []
for folder in glob.iglob(os.path.join(BLOOMBERG, '*'), recursive=True):
    for filename in glob.iglob(os.path.join(folder, '*'), recursive=True):
        assert os.path.isfile(filename), filename
        files_bb.append(filename)
print(len(files_bb))

448395


In [219]:
print('Define functions')

def split(path):
    complete_dir, filename = os.path.split(path)
    complete_dir, short_dir = os.path.split(complete_dir)
    return complete_dir, short_dir, filename

def read_content(path, datef='%Y-%m-%d'):
    complete_dir, short_dir, filename = split(path)
    with open(path, encoding='utf8') as file:
        try:
            content = file.read()
        except Exception as e:
            print('Failed reading', path)
            raise e
    publish_date = datetime.strptime(short_dir, datef)
    return publish_date, filename, content

Define functions


In [220]:
print('Reading all Bloomberg files')
df_bloomberg = pd.DataFrame([read_content(x) for x in files_bb], columns=['date', 'filename', 'content'])

Reading all Bloomberg files


In [226]:
df_bloomberg.to_csv('../data/preprocessed/news_bloomberg.csv')

In [223]:
print('Reading all Reuters files')
df_reuters = pd.DataFrame([read_content(x, '%Y%m%d') for x in files_re], columns=['date', 'filename', 'content'])

Reading all Reuters files


In [227]:
df_reuters.to_csv('../data/preprocessed/news_reuters.csv')

In [247]:
df_reuters['reuters'] = True
df_bloomberg['reuters'] = False

In [248]:
df_news = pd.concat([df_reuters, df_bloomberg])

In [249]:
df_news.to_csv('../data/preprocessed/news.csv')

In [9]:
import nyse

time: 1.06 s


In [10]:
nyse.load()

def find_all(text, substring):
    return [m.start() for m in re.finditer(substring, text)]

def count_occurrences(df_articles, start=0, end=None):
    end = end if end is not None else len(df_articles)
    df_occurrences = pd.DataFrame(0, index=df_articles.index[start:end], columns=nyse.securities['Ticker symbol'])
    for idx, row in tqdm(df_articles.iloc[start:end].iterrows(), total=end-start):
        for symbol in df_occurrences.columns:
            company = nyse.get_name(symbol)
            occurrences = find_all(row['content'], company)
            if len(occurrences):
                df_occurrences[symbol][idx] = len(occurrences)
    return df_occurrences

time: 5.07 s


### Occurrences Reuters

In [230]:
# df_occurrences_reuters = pd.DataFrame(0, index=df_reuters.index, columns=nyse.securities['Ticker symbol'])
df_occurrences_reuters = count_occurrences(df_reuters)

HBox(children=(IntProgress(value=0, max=100519), HTML(value='')))

time: 6h 51min


In [238]:
df_occurrences_reuters.sum().sum()

112560

time: 622 ms


In [239]:
df_occurrences_reuters.to_csv('reuters_occurrences.csv')

time: 58.3 s


### Occurrences Bloomberg

In [None]:
df_occurrences_bloomberg = count_occurrences(df_bloomberg, 0, 100000)

In [25]:
df_occurrences_bloomberg.to_csv('bloomberg_occurrences_1_100k.csv')

time: 41.9 s


In [10]:
df_occurrences_bloomberg2 = count_occurrences(df_bloomberg, 100000, 200000)

HBox(children=(IntProgress(value=0, max=100000), HTML(value='')))


time: 8h 8min 57s


In [11]:
df_occurrences_bloomberg2.to_csv('bloomberg_occurrences_2_100k.csv')

time: 44.9 s


In [104]:
df_occurrences_bloomberg3 = count_occurrences(df_bloomberg, 200000, 300000)

HBox(children=(IntProgress(value=0, max=100000), HTML(value='')))


time: 6h 30min 28s


In [105]:
df_occurrences_bloomberg3.to_csv('bloomberg_occurrences_3_100k.csv')

time: 3min 13s


In [106]:
df_occurrences_bloomberg4 = count_occurrences(df_bloomberg, 300000, 400000)

HBox(children=(IntProgress(value=0, max=100000), HTML(value='')))


time: 6h 29min 12s


In [107]:
df_occurrences_bloomberg4.to_csv('bloomberg_occurrences_4_100k.csv')

time: 3min 14s


In [109]:
df_occurrences_bloomberg5 = count_occurrences(df_bloomberg, 400000)

HBox(children=(IntProgress(value=0, max=48395), HTML(value='')))


time: 3h 6min 38s


In [110]:
df_occurrences_bloomberg5.to_csv('bloomberg_occurrences_5_50k.csv')

time: 21 s


In [42]:
df_occurrences_bloomberg = pd.concat([
    df_occurrences_bloomberg1, df_occurrences_bloomberg2,
    df_occurrences_bloomberg3, df_occurrences_bloomberg4,
    df_occurrences_bloomberg5])

In [46]:
df_occurrences_bloomberg.to_csv('bloomberg_occurrences_all.csv')

In [232]:
df_occurrences_reuters.sum(axis=1).idxmax()

61727

In [233]:
df_occurrences_bloomberg.sum(axis=1).idxmax()

316777

In [239]:
ao = df_occurrences_reuters.iloc[df_occurrences_reuters.sum(axis=1).idxmax()]

In [240]:
ao[ao != 0]

FB      26
MSFT     1
NWSA    44
NWS     44
YHOO     1
Name: 61727, dtype: int64

TODO: https://ndres.me/post/best-jupyter-notebook-extensions/

### Co-occurrences

In [16]:
import pandas as pd
import itertools

In [11]:
df_occurrences_bloomberg = pd.read_csv('../data/preprocessed/occurrences/bloomberg_occurrences_all.csv', index_col=0)
df_occurrences_bloomberg.sum().sum()

275184

In [17]:
corps = df_occurrences_bloomberg.columns
coocc_b = pd.DataFrame(0, index=corps, columns=corps)

for c1, c2 in tqdm(list(itertools.combinations(corps, 2))):
    amount = sum(df_occurrences_bloomberg[c1] & df_occurrences_bloomberg[c2])
    coocc_b.loc[c1].loc[c2] = amount
    coocc_b.loc[c2].loc[c1] = amount

In [204]:
coocc_b.to_csv('../data/preprocessed/occurrences/bloomberg_cooccurrences.csv')

In [173]:
df_occurrences_reuters = pd.read_csv('../data/preprocessed/occurrences/reuters_occurrences_all.csv', index_col=0)
df_occurrences_reuters.sum().sum()

112560

In [178]:
corps = df_occurrences_reuters.columns
coocc_r = pd.DataFrame(0, index=corps, columns=corps)

for c1, c2 in tqdm(list(itertools.combinations(corps, 2))):
    amount = sum(df_occurrences_reuters[c1] & df_occurrences_reuters[c2])
    coocc_r.loc[c1].loc[c2] = amount
    coocc_r.loc[c2].loc[c1] = amount

HBox(children=(IntProgress(value=0, max=127260), HTML(value='')))

In [202]:
coocc_r.to_csv('../data/preprocessed/occurrences/reuters_cooccurrences.csv')

In [203]:
coocc = coocc_b + coocc_r
coocc.to_csv('../data/preprocessed/occurrences/cooccurrences.csv')

### Inspect strongest relationships

In [197]:
def get_sorted_cooccurrences(coocc_matrix):
    rowmax = coocc_matrix.idxmax(axis=0)
    tuples = np.array([(x, rowmax[x], coocc_matrix[x][rowmax[x]]) for x in rowmax.index])
    tuples = sorted(tuples, key=lambda x: int(x[2]), reverse=True)
    final_coocc = pd.DataFrame([(x, nyse.get_name(x), y, nyse.get_name(y), z) for x, y, z in tuples], columns=['SymA', 'NameA', 'SymB', 'NameB', 'AmountArticles'])
    return final_coocc

In [200]:
# coocc_b = pd.read_csv('../data/preprocessed/occurrences/aggregated_bloomberg_occurrences.csv', index_col=0)
# get_sorted_cooccurrences(coocc_b)[:20]

# coocc_r = pd.read_csv('../data/preprocessed/occurrences/aggregated_reuters_occurrences.csv', index_col=0)
# get_sorted_cooccurrences(coocc_r)[:20]

# coocc = pd.read_csv('../data/preprocessed/occurrences/aggregated_occurrences.csv', index_col=0)
get_sorted_cooccurrences(coocc)[:20]

Unnamed: 0,SymA,NameA,SymB,NameB,AmountArticles
0,NWSA,News Corp.,NWS,News Corp.,12814
1,NWS,News Corp.,NWSA,News Corp.,12814
2,C,Citigroup Inc.,JPM,JPMorgan Chase & Co.,3793
3,JPM,JPMorgan Chase & Co.,C,Citigroup Inc.,3793
4,BAC,Bank of America Corp,JPM,JPMorgan Chase & Co.,3454
5,GS,Goldman Sachs Group,JPM,JPMorgan Chase & Co.,3385
6,F,Ford Motor,GM,General Motors,2837
7,GM,General Motors,F,Ford Motor,2837
8,MS,Morgan Stanley,GS,Goldman Sachs Group,2603
9,AAPL,Apple Inc.,MSFT,Microsoft Corp.,2001
