In [52]:
%matplotlib inline

from pandas import DataFrame, read_sas, read_csv
import pandas as pd

import matplotlib
import matplotlib.pyplot as plt

import numpy as np

from SECEdgar.crawler import SecCrawler

from bs4 import BeautifulSoup as bs

import time
from datetime import datetime as dt
from datetime import date

from collections import defaultdict

import os
import re
import lxml

In [2]:
# Read in SAS data set - takes a while...
%time data = read_sas("data/crsp_comp.sas7bdat")

Wall time: 5min 10s


In [98]:
# Read in the positive word list
%time pos_list = read_csv("data/pos_list.csv", header=None, names=['word'])
%time pos_roots = read_csv("data/pos_roots.csv")
pos_roots_dict = dict(zip(list(pos_roots.word), list(pos_roots.group)))

# Read in the negative word list
%time neg_list = read_csv("data/neg_list.csv", header=None, names=['word'])
%time neg_roots = read_csv("data/neg_roots.csv")
neg_roots_dict = dict(zip(list(neg_roots.word), list(neg_roots.group)))

# Turn them into a Series for easier lookups later on
pos_list = pos_list.iloc[:]
neg_list = neg_list.iloc[:]

Wall time: 5 ms
Wall time: 15 ms
Wall time: 8 ms
Wall time: 9 ms


In [104]:
# Number of distinct root groups in the negative roots table (missing values,
# if any, count as one group - same as taking len() of .unique())
neg_roots['group'].nunique(dropna=False)

718

In [46]:
# Show the column names of the main data set and of both word lists
for frame in (data, pos_list, neg_list):
    print(frame.columns.values)

[u'CUSIP' u'PERMNO' u'cik' u'date' u'year' u'fyear' u'tic' u'conm'
 u'mkvalt' u'at' u'intan' u'lt' u'book_market' u'tlta' u'cacl' u'nita'
 u'icf_na' u'rsiz' u'mkt_val' u'PRC' u'RET' u'ASKHI' u'BIDLO' u'VOL'
 u'RETX' u'vwretd' u'totval']
['word']
['word']


In [4]:
# Order the rows by the three identifiers (ascending) and, within each
# identifier combination, by year with the latest year first
sort_keys = ['CUSIP', 'PERMNO', 'cik', 'year']
sort_ascending = [True, True, True, False]
data.sort_values(by=sort_keys, ascending=sort_ascending, inplace=True)

In [5]:
# Keep the first row of every distinct (CUSIP, PERMNO, cik) combination and
# reduce the result to the CIK and ticker columns
id_columns = ['CUSIP', 'PERMNO', 'cik']
ciks = data.drop_duplicates(subset=id_columns)[['cik', 'tic']]

In [None]:
%%capture

# Iterate over each CIK and pull the relevant 10k filings
crawler = SecCrawler()
end_date = '20081231'
count = '20'

for index, row in ciks.iterrows():
    cik = row.iloc[0]
    tic = row.iloc[1]
    crawler.filing_10K(tic, cik, end_date, count)


In [116]:
# Pull in one file to start working on the parsing algorithm
filings = DataFrame()

# The path segments are read positionally: [1] ticker, [2] CIK, [4] file name
fn = 'SEC-Edgar-data/3ABHH/0001052489/10-K/0001193125-07-096277.txt'
path_parts = fn.split('/')
tic = path_parts[1]
cik = path_parts[2]
file_name = path_parts[4]

# Read the whole filing. The context manager guarantees the handle is closed
# once the read finishes; `fh` remains bound to the (closed) handle afterwards.
with open(fn, 'r') as fh:
    contents = fh.read()

# Clean up some of the text to fix malformed HTML before parsing it.
# These tags appear as "<TAG>value" on a line of their own with no closing tag.
malformed_tags = ['ACCEPTANCE-DATETIME', 'TYPE', 'SEQUENCE', 'FILENAME', 'DESCRIPTION']
for tag in malformed_tags:
    # Insert "</TAG>" directly after the value, i.e. BEFORE the line break
    # (the break is only checked with a lookahead). If the closing tag were
    # placed after the break, a malformed tag on the very next line would no
    # longer be preceded by "\n" and its own pass would silently skip it.
    regex = re.compile(r"(\n<%s>[^<\n]*)(?=\n)" % re.escape(tag), re.I)
    contents = regex.sub(r"\1</%s>" % tag, contents)

# Pull the 10-k into the parser
document = bs(contents, 'lxml')

# Check if this is an amended 10-K and throw it out if so.
# The parsed <type> text may carry trailing whitespace or a line break, and an
# unstripped value such as '10-K/A\n' would never equal '10-K/A'. Compare
# against the stripped first line only.
raw_type_text = document.find('sec-document').document.type.text
type_text = raw_type_text.strip().split('\n', 1)[0].strip()
if type_text == '10-K/A':
    # This is an amended 10-k, move it to the "SEC-Edgar-data/_amended" folder
    new_name = 'SEC-Edgar-data/_amended/' + tic + '-' + cik + '-' + file_name

    # Close the file so that we can move it (a no-op if it is already closed)
    fh.close()
    #os.rename(fn, new_name)
    print("TODO: Amended 10-K - move the file.")
    
# Work out the filing date as a 'YYYYMMDD' string
acc_dt = document.find('acceptance-datetime')
if acc_dt is not None:
    # Use the first line of the <acceptance-datetime /> element. Only its
    # leading 8 characters are kept so that a value carrying a time-of-day
    # suffix still parses with '%Y%m%d' below.
    filing_dt_text = acc_dt.text.strip().split('\n', 1)[0].strip()[:8]
else:
    # If we didn't find an <acceptance-datetime /> element, take the date from
    # the "FILED AS OF DATE:" line of the SEC header instead
    sec_header = document.find('sec-header')
    header_text = sec_header.text if sec_header is not None else ''
    date_match = re.search(r"\nFILED AS OF DATE:\s+(\d+)\n", header_text)
    if date_match is None:
        # Fail loudly instead of handing the whole header to strptime
        raise ValueError("Could not determine the filing date for " + fn)
    filing_dt_text = date_match.group(1)

begin_dt = dt(1995, 1, 1)
filing_dt = dt.strptime(filing_dt_text, '%Y%m%d')
# Seconds since the epoch; mktime interprets the date in the local time zone
filing_ts = time.mktime(filing_dt.timetuple())

# If the filing date is not within our date range, then move it
if filing_dt < begin_dt:
    # TODO: Move the file to another folder
    print("TODO: Out of date range - move the file to another folder")

# Add the info for this 10-K to the filings dataframe to keep track of it.
# DataFrame.append was removed in pandas 2.0, so build a one-row frame and
# concatenate it (or adopt it directly while `filings` is still empty).
filing_record = DataFrame([{'cik': cik, 'tic': tic, 'file_name': file_name, 'filing_date': filing_ts}])
if filings.empty:
    filings = filing_record
else:
    filings = pd.concat([filings, filing_record], ignore_index=True)

# TODO: remove the exhibits. For now only the first <document> inside
# <sec-document> is looked at.

# Grab the <text> element of that document - this is the report body
report = document.find('sec-document').document.find('text')

# Elements that are dropped together with everything inside them
del_tags = ['img', 'hr', 'head']
for tag in del_tags:
    for element in report.find_all(tag):
        element.extract()

# Elements that are unwrapped: the tag itself goes away, its children (and
# therefore its text, including table cell contents) stay in place
strip_tags = ['b', 'i', 'u', 'sup', 'em', 'strong', 'font', 'p', 'div', 'td', 'tr', 'table', 'body', 'html']
for tag in strip_tags:
    for element in report.find_all(tag):
        element.unwrap()

# Elements that are swapped for plain text (tag name -> replacement string)
replace_tags = {'br': '\n'}
for tag, replace in replace_tags.items():
    for element in report.find_all(tag):
        element.replace_with(replace)

# Now that everything is cleaned up, we can run the word processing algorithm.
# pos_occurs / neg_occurs map a root group to the number of non-negated hits.
pos_occurs = defaultdict(int)
neg_occurs = defaultdict(int)
negators = ['not', 'no', 'never']

# Build sets once so each membership test is a hash lookup instead of a scan
# over the whole word-list array for every token
pos_words = set(pos_list['word'])
neg_words = set(neg_list['word'])


def is_negated(tokens, index):
    """Return True if a negator occurs in the window around tokens[index].

    The window spans the 3 tokens before `index`, the token itself and the
    2 tokens after it. The start is clamped at 0: a negative slice start would
    wrap around to the end of the list and yield an empty or wrong window for
    the first three tokens.
    NOTE(review): the window is asymmetric (3 before, 2 after) - confirm that
    this is intended.
    """
    window = tokens[max(0, index - 3):index + 3]
    return any(word in negators for word in window)


# We will tokenize the text (whitespace split) and iterate through each word
tokens = report.text.split()
for index, token in enumerate(tokens):
    if token in pos_words:
        if not is_negated(tokens, index):
            # Fall back to the word itself if it has no root mapping
            pos_occurs[pos_roots_dict.get(token, token)] += 1
    elif token in neg_words:
        if not is_negated(tokens, index):
            neg_occurs[neg_roots_dict.get(token, token)] += 1

In [117]:
print("Most Frequent Positive Words")
# Root group -> count, most frequent first
pos_sorted = pd.Series(data=pos_occurs).sort_values(ascending=False)
# Series.iteritems() was removed in pandas 2.0; pairing the index with the
# values yields the same (key, count) tuples on every pandas version.
for key, freq in zip(pos_sorted.index, pos_sorted):
    print(key + ":\t" + str(freq))

Most Frequent Positive Words
beneficial:	43
outstanding:	10
opportunities:	8
good:	7
achieve:	5
reward:	3
succeed:	2
strength:	2
effective:	2
attain:	2
valuable:	1
satisfactorily:	1
highest:	1
gain:	1
exclusive:	1
enable:	1


In [118]:
print("Most Frequent Negative Words")
# Root group -> count, most frequent first
neg_sorted = pd.Series(data=neg_occurs).sort_values(ascending=False)
# Series.iteritems() was removed in pandas 2.0; pairing the index with the
# values yields the same (key, count) tuples on every pandas version.
for key, freq in zip(neg_sorted.index, neg_sorted):
    print(key + ":\t" + str(freq))

Most Frequent Negative Words
terminations:	36
resigns:	15
failures:	9
against:	7
lapsing:	6
forfeitures:	6
restating:	5
omitting:	5
breaching:	5
ceasing:	4
dispose:	4
adversity:	3
standstills:	3
disregards:	2
closures:	2
disclaims:	2
fraudulently:	2
inconsistently:	2
bankrupts:	2
oppositions:	2
convictions:	2
critically:	1
embezzling:	1
delinquents:	1
denying:	1
disclosing:	1
foregone:	1
limitations:	1
liquidators:	1
refusing:	1
absences:	1
