In [1]:
import numpy as np
import pandas as pd
import cython
import os
import re
import json
from bs4 import BeautifulSoup
from multiprocessing import Pool
from pandarallel import pandarallel

In [2]:
os.chdir('/mnt/d/workspace/8-2/Financial-Statements-Text-Analysis/')

In [3]:
# params
with open('config.json', 'r') as f:
    c = json.load(f)
input_dir = os.path.join(c['DATA_DIR'], '10k_clean')
# destination_dir = os.path.join(c['DATA_DIR'], '10k_clean')

# read processed 10-Ks in

In [4]:
metadata = pd.read_csv(os.path.join(c['DATA_DIR'], 'metadata.csv'))
metadata_legacy = pd.read_csv(os.path.join(c['DATA_DIR'], 'metadata_2017.csv'))

# only download the data from russell 3000 today
metadata = metadata_legacy[metadata_legacy['TICKER'].isin(metadata['ticker'])]

In [5]:
metadata['LOCAL_LINK'] = input_dir + '/' + metadata['TICKER'] + '/' + metadata['EDGAR_LINK'].str.split("/").str[-1]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  metadata['LOCAL_LINK'] = input_dir + '/' + metadata['TICKER'] + '/' + metadata['EDGAR_LINK'].str.split("/").str[-1]


In [6]:
%%timeit

for i in range(100):
    pd.read_csv(metadata.iloc[i]['LOCAL_LINK'])

618 ms ± 20.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [7]:
%%timeit 
links = [metadata.iloc[i]['LOCAL_LINK'] for i in range(100)]

with Pool(processes=4) as pool:
    pool.map(pd.read_csv, links)

350 ms ± 16.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [8]:
def read_csv_wrapper(i):
    try:
        row = metadata.iloc[i]
        path = row['LOCAL_LINK']
        ticker = row['TICKER']
        
        df = pd.read_csv(path)
        df['ticker'] = ticker
        df['path'] = path
        df['filing_date'] = row['FILING_DATE']
        return df
    except:
        # some were unable to read because the parse failed 
        return pd.DataFrame()

with Pool(processes=16) as pool:
    dfs = pool.map(read_csv_wrapper, range(len(metadata)))
    
df = pd.concat(dfs)
df = df[~df['text'].isnull()]

KeyError: 'text'

# Text cleaning

In [None]:
%%timeit
df.head(1000)['text'].str.replace('\W', ' ', regex=True)\
    .str.lower()\
    .str.split()\
    .str.join(' ')

In [None]:
def clean_string(s):
    s = re.sub('\W', ' ', s)
    s = s.lower()
    s = re.sub(' +', ' ', s)
    return s

In [None]:
%%timeit
df.head(1000)['text'].apply(clean_string)

In [None]:
pandarallel.initialize(progress_bar=True, nb_workers=16, verbose=0)

In [None]:
%%timeit
df.head(1000)['text'].parallel_apply(clean_string)

In [None]:
df['text'] = df['text'].parallel_apply(clean_string)

# transform to tfidf

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [None]:
vectorizer = TfidfVectorizer()

tfidf = vectorizer.fit_transform(df['text'])

In [None]:
tfidf

# perform cosine distance computation

In [None]:
df.sort_values(['ticker', 'item'])