In [1]:
%matplotlib inline
%load_ext cython

In [8]:
%%cython

from pandas import DataFrame, read_sas, read_csv
import pandas as pd

import matplotlib
import matplotlib.pyplot as plt

import numpy as np
cimport numpy as np

from SECEdgar.crawler import SecCrawler

from bs4 import BeautifulSoup as bs

import time
from datetime import datetime as dt
from datetime import date

from collections import defaultdict

import os
import re
import lxml

In [50]:
# Read in SAS data set - takes a while...
%time data = read_sas("data/crsp_comp.sas7bdat")

CPU times: user 3min 2s, sys: 11.1 s, total: 3min 13s
Wall time: 3min 13s


In [51]:
# Read in the positive word list
%time pos_list = read_csv("data/pos_list.csv", header=None, names=['word'])
%time pos_roots = read_csv("data/pos_roots.csv")
pos_roots_dict = dict(zip(list(pos_roots.word), list(pos_roots.group)))

# Read in the negative word list
%time neg_list = read_csv("data/neg_list.csv", header=None, names=['word'])
%time neg_roots = read_csv("data/neg_roots.csv")
neg_roots_dict = dict(zip(list(neg_roots.word), list(neg_roots.group)))

# Turn them into a Series for easier lookups later on
pos_list = pos_list.iloc[:]
neg_list = neg_list.iloc[:]

CPU times: user 677 µs, sys: 15.5 ms, total: 16.2 ms
Wall time: 15.3 ms
CPU times: user 3.55 ms, sys: 24 ms, total: 27.5 ms
Wall time: 27.5 ms
CPU times: user 14.1 ms, sys: 70.2 ms, total: 84.3 ms
Wall time: 84.9 ms
CPU times: user 2.86 ms, sys: 150 ms, total: 153 ms
Wall time: 152 ms


In [52]:
# Sort the set by cusip, permno, cik, and then year (descending)
data.sort_values(['CUSIP', 'PERMNO', 'cik', 'year'], ascending=[True, True, True, False], inplace=True)

In [53]:
# Remove any duplicates where CUSIP, PERMNO, and CIK match
ciks = data.drop_duplicates(subset=['CUSIP', 'PERMNO', 'cik'])

# Only keep the cik and ticker column
ciks = ciks[['cik', 'tic']]

In [None]:
%%capture

# Iterate over each CIK and pull the relevant 10k filings
crawler = SecCrawler()
end_date = '20081231'
count = '20'

for index, row in ciks.iterrows():
    cik = row.iloc[0]
    tic = row.iloc[1]
    crawler.filing_10K(tic, cik, end_date, count)


In [57]:
%%cython
import cPickle as pickle

cdef void cache_objects(dict objs, int count, int batch):
    if count % batch == 0:
        print("Count: " + str(count))
        for name, obj in objs.iteritems():
            print("Saving the " + name + " object...")
            pickle.dump(obj, open("data/" + name + ".p", "wb"))
            


In [45]:
%prun -l 4 cache_objects({'test': processed, 'test2': DataFrame()}, 1, 1)

Count: 1
Saving the test object...
Saving the test2 object...
 

In [58]:
import cPickle as pickle

# Pull in one file to start working on the parsing algorithm
try:
    filings = pickle.load(open("data/filings.p", "rb"))
except:
    filings = DataFrame()
    
try:
    processed = pickle.load(open("data/processed.p", "rb"))    
except:
    processed = set()
    
# Batch save the filings info
count = 0
batch = 50
file_count = 0

# This is for testing
stop = 100000
skip_processed = True
process = True

folder = "SEC-Edgar-data"
for (dirpath, dirnames, filenames) in os.walk(folder):
    for filename in filenames:
        file_count += 1
        fn = os.sep.join([dirpath, filename])

        if filename.endswith('.txt'):
            # Skip this file if it exists in the filings object
            if skip_processed and 'path' in filings.columns and fn in filings.path.values:
            #if skip_processed and fn in processed:
                #print("File already processed: " + fn + ".")
                break

            count += 1
            if count > stop:
                break

            print("(" + str(count) + ") Processing " + fn)
            tic = fn.split('/')[1]
            cik = fn.split('/')[2]

            # Open the file, get all of the content, and then pull it into a parser
            fh = open(fn, 'r')
            contents = fh.read()

            # Clean up some of the text to fix malformed HTML before parsing it
            malformed_tags = ['ACCEPTANCE-DATETIME', 'TYPE', 'SEQUENCE', 'FILENAME', 'DESCRIPTION']
            for tag in malformed_tags:
                # Do a regex that replaces all of these malformed tags in the document
                regex = re.compile(r"(\n<%s>[^<]*?)\n" % re.escape(tag), re.I)
                contents = regex.sub(r"\1</%s>\n" % tag, contents)

            # Pull the 10-k into the parser
            document = bs(contents, 'lxml')

            # The document can either have a root node of sec-document or ims-document
            if document.find('sec-document') is not None:
                root = document.find('sec-document')
            elif document.find('ims-document') is not None: 
                root = document.find('ims-document')
            else:
                root = document.find('document')

            # Check if this is an amended 10-K and throw it out if so
            type_text = root.find('type')
            if type_text is None:
                # Couldn't find the type so we move it to the _error folder
                new_name = 'data/_error/' + tic + '-' + cik + '-' + filename

                # Close the file so that we can move it
                fh.close()
                os.rename(fn, new_name)
                print("Error finding type - moved file")
                break

            elif type_text.text == '10-K/A':
                # This is an amended 10-k, move it to the "data/_amended" folder
                new_name = 'data/_amended/' + tic + '-' + cik + '-' + filename

                # Close the file so that we can move it
                fh.close()
                os.rename(fn, new_name)
                print("Amended 10-K - moved file")
                break

            # Get the 'acceptance-datetime' metadata element
            acc_dt = root.find('acceptance-datetime')
            if acc_dt is None:
                header_text = None
                # If we didn't find an <acceptance-datetime /> element, find the date elsewhere
                if root.find('sec-header') is not None:
                    header_text = root.find('sec-header').text
                elif root.find('ims-header') is not None:
                    header_text = root.find('ims-header').text

                if header_text:
                    regex = re.compile(r".*\nFILED AS OF DATE:\s+?([\d]+?)\n.*", re.S)
                    filing_dt_text = re.sub(regex, r"\1", header_text)
                    filing_dt = dt.strptime(filing_dt_text, '%Y%m%d')
                    filing_ts = time.mktime(filing_dt.timetuple())
                else:
                    # We can't find the filing date for this file so throw it out
                    new_name = 'data/_outofrange/' + tic + '-' + cik + '-' + filename

                    # Close the file so that we can move it
                    fh.close()
                    os.rename(fn, new_name)
                    print("Bad filing date - moved file")
                    break
            else:
                # Get the filing date
                filing_dt_text = acc_dt.text.split('\n', 1)[0][:8]

            begin_dt = dt(1995, 1, 1)

            # If the filing date is not within our date range, then move it
            if begin_dt > filing_dt:
                # This file is outside of our date range so move it
                new_name = 'data/_outofrange/' + tic + '-' + cik + '-' + filename

                # Close the file so that we can move it
                fh.close()
                os.rename(fn, new_name)
                print("Out of date range - moved file.")
                break

            # Remove the exhibits
            #for doc in document.find('sec-document').findAll('document'):
                #[ex.extract() for ex in docume]

            # If we don't want to process the file, then we will quit here
            if not process:
                # Save this file as processed
                processed.add(fn)
                
                # Save certain objects so we don't have to process everything again
                objs = {'filings': filings, 'processed': processed}
                cache_objects(objs, count, batch)
                break

            # Grab the report (and throw out images, tables)
            report = root.find('text')

            # Remove some elements
            del_tags = ['img', 'hr', 'head']
            for tag in del_tags:
                [t.extract() for t in report.findAll(tag)]

            strip_tags = ['b', 'i', 'u', 'sup', 'em', 'strong', 'font', 'p', 'div', 'td', 'tr', 'table', 'body', 'html']
            for tag in strip_tags:
                [t.replaceWithChildren() for t in report.findAll(tag)]

            replace_tags = [{'br': '\n'}]
            for tag in replace_tags:
                tag, replace = tag.popitem()
                [t.replaceWith(replace) for t in report.findAll(tag)]

            # Now that everything is cleaned up, we can run the word processing algorithm
            pos_occurs = defaultdict(int)
            neg_occurs = defaultdict(int)
            negators = pd.Series(['not', 'no', 'never'])

            # We will tokenize the text and iterate through each word
            tokens = pd.Series(report.text.split())

            # First, filter out words that aren't in the 12dictionary word list

            # Now, process the text
            for index, token in tokens.iteritems():
                if token in pos_list.values:
                    # Check to see if there is a negator
                    negated = False
                    for word in tokens.iloc[(index - 3):(index + 3)]:
                        if word in negators.values:
                            #print("Found a negator: " + word + " - " + token)
                            negated = True

                    if not negated:
                        root = pos_roots_dict[token]
                        pos_occurs[root] += 1
                elif token in neg_list.values:
                    # Check to see if there is a negator
                    negated = False
                    for word in tokens.iloc[(index - 3):(index + 3)]:
                        if word in negators.values:
                            #print("Found a negator: " + word + " - " + token)
                            negated = True

                    if not negated:
                        root = neg_roots_dict[token]
                        neg_occurs[root] += 1

            # Add the info for this 10-K to the filings dataframe to keep track of it
            filings = filings.append(
                {'cik': cik,
                 'tic': tic,
                 'path': fn,
                 'file_name': filename,
                 'filing_date': filing_ts,
                 'pos_occurs': pos_occurs,
                 'neg_occurs': neg_occurs,
                 'mtime': time.time()
                }, ignore_index=True)

            # Save this file as processed
            processed.add(fn)

            # Save certain objects so we don't have to process everything again
            objs = {'filings': filings, 'processed': processed}
            cache_objects(objs, count, batch)
    
print(file_count)

(1) Processing SEC-Edgar-data/3MCLN./0000789547/10-K/0000948830-96-000019.txt
(2) Processing SEC-Edgar-data/3MCLN./0000789547/10-K/0000948830-97-000002.txt
(3) Processing SEC-Edgar-data/ITGB/0000320573/10-K/0000320573-96-000138.txt
(4) Processing SEC-Edgar-data/ITGB/0000320573/10-K/0000320573-05-000040.txt
(5) Processing SEC-Edgar-data/ITGB/0000320573/10-K/0000320573-04-000055.txt
(6) Processing SEC-Edgar-data/ITGB/0000320573/10-K/0000320573-01-500016.txt
(7) Processing SEC-Edgar-data/ITGB/0000320573/10-K/0000320573-00-000006.txt
(8) Processing SEC-Edgar-data/ITGB/0000320573/10-K/0000320573-95-000097.txt
(9) Processing SEC-Edgar-data/ITGB/0000320573/10-K/0000320573-00-000023.txt
(10) Processing SEC-Edgar-data/ITGB/0000320573/10-K/0000320573-98-000032.txt
(11) Processing SEC-Edgar-data/ITGB/0000320573/10-K/0000320573-02-000026.txt
(12) Processing SEC-Edgar-data/ITGB/0000320573/10-K/0000320573-06-000032.txt
(13) Processing SEC-Edgar-data/ITGB/0000320573/10-K/0000320573-98-000022.txt
(14)

KeyError: u'distinction'

In [None]:
print("Most Frequent Positive Words\n" +
       "============================")

for index, row in filings.iterrows():
    pos_sorted = pd.Series(data=row.pos_occurs).sort_values(ascending=False)
    print("\n" + row.path + "\n" +
           "=" * len(row.path))
    print(pos_sorted)

In [None]:
print("Most Frequent Negative Words\n" +
       "============================")

for index, row in filings.iterrows():
    neg_sorted = pd.Series(data=row.neg_occurs).sort_values(ascending=False)
    print("\n" + row.path + "\n" +
           "=" * len(row.path))
    print(neg_sorted)

In [None]:
objs = {'filings': filings, 'processed': processed}
for name, obj in objs.iteritems():
    print (name, obj)

In [60]:
pos_roots_dict['distinction']

KeyError: 'distinction'

In [61]:
'distinction' in pos_list.values

True