In [1]:
%matplotlib inline
%load_ext cython

  "Cython.Distutils.old_build_ext does not properly handle dependencies "


In [2]:
%%cython

from pandas import DataFrame, read_sas, read_csv
import pandas as pd

import matplotlib
import matplotlib.pyplot as plt

import numpy as np
cimport numpy as np

from SECEdgar.crawler import SecCrawler

from bs4 import BeautifulSoup as bs

import time
from datetime import datetime as dt
from datetime import date, timedelta

from collections import defaultdict

import os
import re
import lxml
import redis

In [3]:
# Read in SAS data set - takes a while...
%time data = read_sas("data/crsp_comp.sas7bdat")

CPU times: user 6min 1s, sys: 12.2 s, total: 6min 14s
Wall time: 6min 14s


In [4]:
# Read in the positive word list
%time pos_list = read_csv("data/pos_list.csv", header=None, names=['word'])
%time pos_roots = read_csv("data/pos_roots.csv")
pos_roots_dict = dict(zip(list(pos_roots.word), list(pos_roots.group)))

# Read in the negative word list
%time neg_list = read_csv("data/neg_list.csv", header=None, names=['word'])
%time neg_roots = read_csv("data/neg_roots.csv")
neg_roots_dict = dict(zip(list(neg_roots.word), list(neg_roots.group)))

# Turn them into a Series for easier lookups later on
pos_list = pos_list.iloc[:]
neg_list = neg_list.iloc[:]

CPU times: user 6.84 ms, sys: 7.56 ms, total: 14.4 ms
Wall time: 16.1 ms
CPU times: user 5.09 ms, sys: 15.9 ms, total: 21 ms
Wall time: 20.9 ms
CPU times: user 2.54 ms, sys: 70.7 ms, total: 73.2 ms
Wall time: 73.2 ms
CPU times: user 31.1 ms, sys: 110 ms, total: 141 ms
Wall time: 140 ms


In [5]:
# Sort the set by cusip, permno, cik, and then year (descending)
data.sort_values(['CUSIP', 'PERMNO', 'cik', 'year'], ascending=[True, True, True, False], inplace=True)

In [6]:
# Remove any duplicates where CUSIP, PERMNO, and CIK match
ciks = data.drop_duplicates(subset=['CUSIP', 'PERMNO', 'cik'])

# Only keep the cik and ticker column
ciks = ciks[['cik', 'tic']]

# Re-index the dataframe for better access
data.reset_index(inplace=True)

In [None]:
%%capture

# Iterate over each CIK and pull the relevant 10k filings
crawler = SecCrawler()
end_date = '20081231'
count = '20'

for index, row in ciks.iterrows():
    cik = row.iloc[0]
    tic = row.iloc[1]
    crawler.filing_10K(tic, cik, end_date, count)


In [79]:
#%%cython
import pickle as pickle

#cdef void cache_objects(dict objs, int count, int batch):
def cache_objects(objs, count, batch):
    if count % batch == 0:
        print("Count: " + str(count))
        for name, obj in objs.items():
            print("Saving the " + name + " object...")
            pickle.dump(obj, open("data/" + name + ".p", "wb"))


In [86]:
import pickle

# Pull in one file to start working on the parsing algorithm
if filings is None:    
    try:
        filings = pickle.load(open("data/filings.p", "rb"))
    except:
        filings = DataFrame()
    
if processed is None:
    try:
        processed = pickle.load(open("data/processed.p", "rb"))    
    except:
        processed = set()

# Batch save the filings info
count = 0
batch = 1000

# This is for testing
stop = 10
skip_processed = False
process = False

# Connect to redis
rds = redis.Redis()

folder = "SEC-Edgar-data"
for (dirpath, dirnames, filenames) in os.walk(folder, topdown=False):
    #print ("dirpath: " + dirpath + "; # files: " + str(len(filenames)))
    for filename in filenames:
        
        fn = os.sep.join([dirpath, filename])

        if filename.endswith('.txt'):
            count += 1
            if count > stop:
                break
            
            # Skip this file if it exists in the filings object
            #if skip_processed and 'path' in filings.columns and fn in filings.path.values:
            if skip_processed and fn in processed:
                print("File already processed: " + fn + ".")
                continue

            print("(" + str(count) + ") Processing " + fn)
            tic = fn.split('/')[1]
            cik = fn.split('/')[2]
            
            # Pull in 10-K from redis if it exists
            key = "report:" + cik + ":" + fn
            exists = rds.exists(key)
            
            # If the key exists, then we can skip to text processing
            if not exists or not skip_processed:
                # Open the file, get all of the content, and then pull it into a parser
                fh = open(fn, 'r')
                contents = fh.read()

                # Clean up some of the text to fix malformed HTML before parsing it
                malformed_tags = ['ACCEPTANCE-DATETIME', 'TYPE', 'SEQUENCE', 'FILENAME', 'DESCRIPTION']
                for tag in malformed_tags:
                    # Do a regex that replaces all of these malformed tags in the document
                    regex = re.compile(r"(\n<%s>[^<]*?)\n" % re.escape(tag), re.I)
                    contents = regex.sub(r"\1</%s>\n" % tag, contents)

                # Pull the 10-k into the parser
                document = bs(contents, 'lxml')

                # The document can either have a root node of sec-document or ims-document
                if document.find('sec-document') is not None:
                    root = document.find('sec-document')
                elif document.find('ims-document') is not None: 
                    root = document.find('ims-document')
                else:
                    root = document.find('document')

                # Check if this is an amended 10-K and throw it out if so
                type_text = root.find('type')
                if type_text is None:
                    # Couldn't find the type so we move it to the _error folder
                    new_name = 'data/_error/' + tic + '-' + cik + '-' + filename

                    # Close the file so that we can move it
                    fh.close()
                    os.rename(fn, new_name)
                    print("Error finding type - moved file")
                    continue

                elif type_text.text == '10-K/A':
                    # This is an amended 10-k, move it to the "data/_amended" folder
                    new_name = 'data/_amended/' + tic + '-' + cik + '-' + filename

                    # Close the file so that we can move it
                    fh.close()
                    os.rename(fn, new_name)
                    print("Amended 10-K - moved file")
                    continue

                # Get the 'acceptance-datetime' metadata element
                acc_dt = root.find('acceptance-datetime')
                if acc_dt is None:
                    header_text = None
                    # If we didn't find an <acceptance-datetime /> element, find the date elsewhere
                    if root.find('sec-header') is not None:
                        header_text = root.find('sec-header').text
                    elif root.find('ims-header') is not None:
                        header_text = root.find('ims-header').text

                    if header_text:
                        regex = re.compile(r".*\nFILED AS OF DATE:\s+?([\d]+?)\n.*", re.S)
                        filing_dt_text = re.sub(regex, r"\1", header_text)
                    else:
                        # We can't find the filing date for this file so throw it out
                        new_name = 'data/_error/' + tic + '-' + cik + '-' + filename

                        # Close the file so that we can move it
                        fh.close()
                        os.rename(fn, new_name)
                        print("Bad filing date - moved file")
                        continue
                else:
                    # Get the filing date
                    filing_dt_text = acc_dt.text.split('\n', 1)[0][:8]

                filing_dt = dt.strptime(filing_dt_text, '%Y%m%d')
                filing_ts = time.mktime(filing_dt.timetuple())
                begin_dt = dt(1995, 1, 1)

                # If the filing date is not within our date range, then move it
                if begin_dt > filing_dt:
                    # This file is outside of our date range so move it
                    new_name = 'data/_outofrange/' + tic + '-' + cik + '-' + filename

                    # Close the file so that we can move it
                    fh.close()
                    os.rename(fn, new_name)
                    print("Out of date range - moved file.")
                    continue
                    
                # See if we can find stock info for this company on the filing date of the 10-K
                try:
                    #pass
                    %time cik_df = data[(data['cik'] == bytes(cik, 'utf-8'))]
                    %time index = cik_df[(cik_df['date'] == filing_dt)].index[0]
                    #index = data[(data['cik'] == bytes(cik, 'utf-8')) & (data['date'] == filing_dt)].index[0]
                except:
                    # We don't have stock data for this company at this time period
                    new_name = 'data/_nostockdata/' + tic + '-' + cik + '-' + filename

                    # Close the file so that we can move it
                    fh.close()
                    #os.rename(fn, new_name)
                    print("No stock data found - moved file.")
                    #continue

                # Remove the exhibits
                [ex.extract() for ex in root.findAll('document')[1:]]

                # Grab the report (and throw out images, tables)
                report = root.find('text')

                # Remove some elements
                del_tags = ['img', 'hr', 'head']
                [t.extract() for t in report.findAll(del_tags)]

                strip_tags = ['b', 'i', 'u', 'sup', 'em', 'strong', 'font', 'p', 'div', 'td', 'tr', 'table', 'body', 'html', 'page', 'text']
                [t.replaceWithChildren() for t in report.findAll(strip_tags)]

                replace_tags = [{'br': '\n'}]
                for tag in replace_tags:
                    tag, replace = tag.popitem()
                    [t.replaceWith(replace) for t in report.findAll(tag)]

                # Save the text in redis
                report_hash = {
                    'cik': cik,
                    'tic': tic,
                    'path': fn,
                    'file_name': filename,
                    'filing_date': filing_ts,
                    'report': pickle.dumps(unicode(report)),
                    'index': index,
                    'mtime': time.time()
                }
                print("Saving to redis: " + key)
                rds.hmset(key, report_hash)
                
                # Close the file handle
                fh.close()

                # If we don't want to process the file, then we will quit here
                if not process:
                    # Save this file as processed
                    processed.add(fn)

                    # Save certain objects so we don't have to process everything again
                    objs = {'filings': filings, 'processed': processed}
                    cache_objects(objs, count, batch)
                    continue
            else:
                # Get the report out of redis
                print("Found in redis: " + key)
                report = pickle.loads(rds.hget(key, 'report'))
            
            # Now that everything is cleaned up, we can run the word processing algorithm
            pos_occurs = defaultdict(int)
            neg_occurs = defaultdict(int)
            negators = pd.Series(['not', 'no', 'never'])

            # We will tokenize the text and iterate through each word
            tokens = pd.Series(report.text.split())
            total_words = len(tokens)

            # First, filter out words that aren't in the 12dictionary word list

            # Now, process the text
            for index, token in tokens.iteritems():
                if token in pos_list.values:
                    # Check to see if there is a negator
                    negated = False
                    for word in tokens.iloc[(index - 3):(index + 3)]:
                        if word in negators.values:
                            #print("Found a negator: " + word + " - " + token)
                            negated = True

                    if not negated:
                        root = pos_roots_dict[token]
                        pos_occurs[root] += 1
                elif token in neg_list.values:
                    # Check to see if there is a negator
                    negated = False
                    for word in tokens.iloc[(index - 3):(index + 3)]:
                        if word in negators.values:
                            #print("Found a negator: " + word + " - " + token)
                            negated = True

                    if not negated:
                        root = neg_roots_dict[token]
                        neg_occurs[root] += 1

            # Save results of text processing to key in redis            
            report_hash = {
                'pos_occurs': pickle.dumps(pos_occurs),
                'neg_occurs': pickle.dumps(neg_occurs),
                'total_words': total_words,
                'mtime': time.time()
            }
            rds.hmset(key, report_hash)
            
            # Add the info for this 10-K to the filings dataframe to keep track of it
            report_hash['pos_occurs'] = pos_occurs
            report_hash['neg_occurs'] = neg_occurs
            filings = filings.append(report_hash, ignore_index=True)
            
            # Use the index we found earlier to ensure we have stock data and now we grab the four day window returns
            returns = data.ix[index:(index + 3)]
            returns = returns[['RET', 'vwretd']]

            # Calculate the abnormal return: r_i = M{t=0, 3} (ret_i,j) - M{t=0,3} (ret_vwi,t)
            ret = 1
            ret_vwi = 1
            for col, series in returns.iteritems():
                if col == 'RET':
                    for r in series:
                        ret *= r
                elif col == 'vwretd':
                    for r in series:
                        ret_vwi *= r
            ab_ret = ret - ret_vwi
            print("Abnormal return: " + str(ab_ret))
            
            # Now estimate the weights for the words
            
            
            # Normalize the weights of the words
            
            
            # Run the actual regression using the estimated weights for the words
            
            
            # Calculate the score of the document using the weights for each word given by the regression
            
            
            # Save this file as processed
            processed.add(fn)

            # Save certain objects so we don't have to process everything again
            objs = {'filings': filings, 'processed': processed}
            cache_objects(objs, count, batch)


(1) Processing SEC-Edgar-data/3MCLN./0000789547/10-K/0000948830-96-000019.txt
CPU times: user 3.8 s, sys: 7.9 ms, total: 3.81 s
Wall time: 3.8 s
No stock data found - moved file.


NameError: name 'unicode' is not defined

In [53]:
index = data[(data['cik'] == b'0000320193') & (data['date'] == filing_dt)].index[0]
returns = data.ix[index:(index + 3)]
returns = returns[['RET', 'vwretd']]

ret = 1
ret_vwi = 1

for col, series in returns.iteritems():
    if col == 'RET':
        for r in series:
            #print((ret,r))
            ret *= r
    elif col == 'vwretd':
        for r in series:
            #print((ret_vwi,r))
            ret_vwi *= r

ab_ret = ret - ret_vwi
print((ret, ret_vwi, ab_ret))


(-1.2158094036836717e-07, -7.2910937480136238e-10, -1.208518309935658e-07)


In [78]:
len(processed)

15

In [25]:
returns[['RET', 'vwretd']]

Unnamed: 0,RET,vwretd
1024959,-0.037838,0.015756
1024960,0.05618,0.003094
1024961,-0.010638,-0.003173
1024962,-0.005376,0.004714


In [None]:
import pickle

# Connect to redis
rds = redis.Redis()

# This will store the global positive and negative words occurrances
pos_occurs_all = defaultdict(int)
neg_occurs_all = defaultdict(int)

# Go through redis and grab every 10-k
keys = rds.keys("report:*")

print("Total keys: " + str(len(keys)))
for key in keys:
    report_pos_occurs = pickle.loads(rds.hget(key, 'pos_occurs'))
    report_neg_occurs = pickle.loads(rds.hget(key, 'neg_occurs'))
    
    for word, freq in report_pos_occurs.items():
        pos_occurs_all[word] += freq
        
    for word, freq in report_neg_occurs.items():
        neg_occurs_all[word] += freq

# Print out most frequent positive words
print("Most Frequent Positive Words\n" +
       "============================")

pos_sorted = pd.Series(data=pos_occurs_all).sort_values(ascending=False)
print(pos_sorted)

# Print out most frequent negative words
print("\n\nMost Frequent Negative Words\n" +
       "============================")

neg_sorted = pd.Series(data=neg_occurs_all).sort_values(ascending=False)
print(neg_sorted)

In [None]:
%prun -l 4 cache_objects({'test': processed, 'test2': DataFrame()}, 1, 1)

b'0000320193'

In [None]:
objs = {'filings': filings, 'processed': processed}
for name, obj in objs.iteritems():
    print (name, obj)