In [1]:
import numpy as np
import pandas as pd
import cython
import os
import re
import json
from bs4 import BeautifulSoup
from multiprocessing import Pool
from pandarallel import pandarallel

In [2]:
os.chdir('/mnt/d/workspace/8-2/Financial-Statements-Text-Analysis/')

In [3]:
# params
with open('config.json', 'r') as f:
    c = json.load(f)
input_dir = os.path.join(c['DATA_DIR'], '10k_clean')
# destination_dir = os.path.join(c['DATA_DIR'], '10k_clean')

# read processed 10-Ks in

In [4]:
metadata = pd.read_csv(os.path.join(c['DATA_DIR'], 'metadata.csv'))
metadata_legacy = pd.read_csv(os.path.join(c['DATA_DIR'], 'metadata_2017.csv'))

# only download the data from russell 3000 today
metadata = metadata_legacy[metadata_legacy['TICKER'].isin(metadata['ticker'])]

In [5]:
metadata['LOCAL_LINK'] = input_dir + '/' + metadata['TICKER'] + '/' + metadata['EDGAR_LINK'].str.split("/").str[-1]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  metadata['LOCAL_LINK'] = input_dir + '/' + metadata['TICKER'] + '/' + metadata['EDGAR_LINK'].str.split("/").str[-1]


In [6]:
%%timeit

for i in range(100):
    pd.read_csv(metadata.iloc[i]['LOCAL_LINK'])

652 ms ± 19.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [7]:
%%timeit 
links = [metadata.iloc[i]['LOCAL_LINK'] for i in range(100)]

with Pool(processes=4) as pool:
    pool.map(pd.read_csv, links)

348 ms ± 14.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [8]:
def read_csv_wrapper(i):
    try:
        row = metadata.iloc[i]
        path = row['LOCAL_LINK']
        ticker = row['TICKER']
        
        df = pd.read_csv(path)
        df['ticker'] = ticker
        df['path'] = path
        df['filing_date'] = row['DATE_FILED']
        return df
    except:
        # some were unable to read because the parse failed 
        return pd.DataFrame()

with Pool(processes=16) as pool:
    dfs = pool.map(read_csv_wrapper, range(len(metadata)))
    
df = pd.concat(dfs)
# filter out failed reads
df = df[~df['text'].isnull()]

# order the df
df = df.sort_values(['ticker', 'item', 'filing_date'])
df['index'] = np.arange(len(df))
df['lead_index'] = df.groupby(['ticker', 'item'])['index'].shift(-1)

# Text cleaning

In [9]:
%%timeit
df.head(1000)['text'].str.replace('\W', ' ', regex=True)\
    .str.lower()\
    .str.split()\
    .str.join(' ')

3.54 s ± 107 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [10]:
def clean_string(s):
    s = re.sub('\W', ' ', s)
    s = s.lower()
    s = re.sub(' +', ' ', s)
    return s

In [11]:
%%timeit
df.head(1000)['text'].apply(clean_string)

4.95 s ± 106 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [12]:
pandarallel.initialize(progress_bar=True, nb_workers=16, verbose=0)

In [13]:
%%timeit
df.head(1000)['text'].parallel_apply(clean_string)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=63), Label(value='0 / 63'))), HBox…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=63), Label(value='0 / 63'))), HBox…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=63), Label(value='0 / 63'))), HBox…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=63), Label(value='0 / 63'))), HBox…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=63), Label(value='0 / 63'))), HBox…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=63), Label(value='0 / 63'))), HBox…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=63), Label(value='0 / 63'))), HBox…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=63), Label(value='0 / 63'))), HBox…

2.34 s ± 106 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [14]:
df['text'] = df['text'].parallel_apply(clean_string)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=1556), Label(value='0 / 1556'))), …

# transform to tfidf

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [16]:
comparison_df = df[~df['lead_index'].isnull()].copy()
comparison_df['lead_index'] = comparison_df['lead_index'].astype(int)

In [17]:
vectorizer = TfidfVectorizer()

tfidf = vectorizer.fit_transform(comparison_df['text'])

# perform cosine distance computation

In [18]:
from sklearn.metrics.pairwise import cosine_similarity

In [19]:
%%timeit
cosine_similarity(tfidf[:1000], tfidf[:1000])

742 ms ± 30.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [20]:
%%timeit
cosine_similarity(tfidf[:2000], tfidf[:2000])

2.67 s ± 65.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [21]:
%%timeit
(tfidf[:999].multiply(tfidf[1:1000]).sum(axis=1) / \
     np.sqrt(tfidf[:999].multiply(tfidf[:999]).sum(axis=1)) / np.sqrt(tfidf[1:1000].multiply(tfidf[1:1000]).sum(axis=1)))

76 ms ± 3.31 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [22]:
%%timeit
(tfidf[:1999].multiply(tfidf[1:2000]).sum(axis=1) / \
     np.sqrt(tfidf[:1999].multiply(tfidf[:1999]).sum(axis=1)) / np.sqrt(tfidf[1:2000].multiply(tfidf[1:2000]).sum(axis=1)))

172 ms ± 4.07 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [23]:
%%timeit
(tfidf[:-1].multiply(tfidf[1:]).sum(axis=1) / \
     np.sqrt(tfidf[:-1].multiply(tfidf[:-1]).sum(axis=1)) / np.sqrt(tfidf[1:].multiply(tfidf[1:]).sum(axis=1)))

1.51 s ± 43.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [24]:
(tfidf[:-1].multiply(tfidf[1:]).sum(axis=1) / \
     np.sqrt(tfidf[:-1].multiply(tfidf[:-1]).sum(axis=1)) / np.sqrt(tfidf[1:].multiply(tfidf[1:]).sum(axis=1)))

matrix([[0.98227666],
        [0.99666753],
        [0.58170126],
        ...,
        [0.56405527],
        [0.93125178],
        [0.94108083]])

In [25]:
tfidf.shape

(18301, 88229)