In [87]:
%matplotlib inline
%load_ext cython

The cython extension is already loaded. To reload it, use:
  %reload_ext cython


In [89]:
from pandas import DataFrame, read_sas, read_csv
import pandas as pd

import matplotlib
import matplotlib.pyplot as plt

import numpy as np

from SECEdgar.crawler import SecCrawler

from bs4 import BeautifulSoup as bs

import time
from datetime import datetime as dt
from datetime import date, timedelta

from collections import defaultdict

import os
import re
import lxml
import redis
import string
import pickle
import math
import zlib

import nltk
from nltk.corpus import stopwords

# Probe the NLTK English stopword corpus; if NLTK reports it missing
# (LookupError), download the 'stopwords' resource.
try: stopwords.words('english')
except LookupError: nltk.download('stopwords')
    
import statsmodels.api as sm
    
# Redis client created with the library's default connection settings; the
# cells below use it as a cache for loaded data and per-report results.
rds = redis.Redis()

In [91]:
# Read in SAS data set - takes a while so try to use redis...
try: data
except:
    if rds.exists('data:word-power'):
        %time data = pickle.loads(zlib.decompress(rds.get('data:word-power')))
    else:
        %time data = read_sas("data/crsp_comp.sas7bdat")

        # Trim the SAS data set
        data = data[['CUSIP','PERMNO','cik','date','PRC','RET','vwretd']]

        # Sort the set by cusip, permno, cik, and then year (descending)
        data.sort_values(['CUSIP', 'PERMNO', 'cik', 'date'], ascending=[True, True, True, False], inplace=True)

        # Re-index the dataframe for better access
        data.reset_index(inplace=True)

        rds.set('data:word-power', zlib.compress(pickle.dumps(data)))

# We only need certain columns from the data set and we must set the right index for performance.
# Only a NameError (df not yet defined in this kernel) triggers the rebuild.
try: df
except NameError:
    # Build the (cik, date)-indexed frame in one expression rather than calling
    # set_index(inplace=True) on a column slice of `data`.
    df = data[["cik", "date", "PRC", "RET", "vwretd"]].set_index(keys=['cik', 'date'])
        
def _load_sentiment_lists(polarity):
    """Load the word set, root set and word->root map for one polarity.

    Parameters
    ----------
    polarity : str
        'pos' or 'neg'; selects both the redis keys ('data:<polarity>-dict',
        'data:<polarity>-roots', 'data:<polarity>-roots-map') and the CSV files
        ('data/<polarity>_list.csv', 'data/<polarity>_roots.csv').

    Returns
    -------
    tuple
        (word_set, root_set, word_to_root_dict).  Values come from redis when
        all three keys exist; otherwise they are read from the CSV files and
        written to redis for later runs.
    """
    redis_keys = ['data:%s-dict' % polarity,
                  'data:%s-roots' % polarity,
                  'data:%s-roots-map' % polarity]

    if all(rds.exists(key) for key in redis_keys):
        # NOTE: pickle.loads executes arbitrary code; only safe against a trusted redis instance.
        return tuple(pickle.loads(rds.get(key)) for key in redis_keys)

    # Read in the word list(s)
    word_list = read_csv("data/%s_list.csv" % polarity, header=None, names=['word'])
    words = set(word_list['word'])
    roots_frame = read_csv("data/%s_roots.csv" % polarity)
    roots_map = dict(zip(list(roots_frame['word']), list(roots_frame['group'])))
    roots = set(roots_frame['group'].drop_duplicates())

    # Save this data to redis for later
    for key, value in zip(redis_keys, (words, roots, roots_map)):
        rds.set(key, pickle.dumps(value))

    return words, roots, roots_map


# Positive words (skipped when all three names already exist in the kernel)
try: pos_dict, pos_roots, pos_roots_map
except NameError:
    pos_dict, pos_roots, pos_roots_map = _load_sentiment_lists('pos')

# Negative words (skipped when all three names already exist in the kernel)
try: neg_dict, neg_roots, neg_roots_map
except NameError:
    neg_dict, neg_roots, neg_roots_map = _load_sentiment_lists('neg')

# 2of12inf dictionary
# NOTE: a copy cached in redis under 'data:2of12inf' by an earlier run may still
# contain the trailing '%' markers; delete that key to force a rebuild.
try: dict_2of12inf
except NameError:
    if rds.exists('data:2of12inf'):
        # NOTE: pickle.loads executes arbitrary code; only safe against a trusted redis instance.
        dict_2of12inf = pickle.loads(rds.get('data:2of12inf'))
    else:
        # Read in the 2of12inf.  dtype=str / na_filter=False keep every entry as
        # a literal string so words such as "null" are not parsed as missing values.
        words_2of12inf = read_csv("data/2of12inf.txt", header=None, names=['word'],
                                  dtype=str, na_filter=False)

        # Remove the trailing percent signs.  The previous version computed the
        # stripped words with DataFrame.apply but discarded the result, so the
        # set still contained entries ending in '%'.
        regex = re.compile(r'%$')
        dict_2of12inf = {regex.sub('', word) for word in words_2of12inf['word']}

        # Save this to redis for later
        rds.set('data:2of12inf', pickle.dumps(dict_2of12inf))

In [122]:
# Scratch check: does the (cik, date) index of `df` contain a row for this
# company on this filing date?
cik = "0000884219"
filing_dt = dt(1999, 3, 24)

# The lookups below use the UTF-8 byte form of the CIK as the first index level.
cik_df = df.ix[cik.encode('utf-8')]

try:
    index = df.index.get_loc((cik.encode('utf-8'), filing_dt))
except (IndexError, KeyError):
    print("Can't find stock data")

Can't find stock data


In [131]:
%%cython
# Scratch check: convert a bytes value holding a float-formatted number
# into a plain integer (bytes -> str -> float -> int) and display it.
index = b'1110776400.0'
index = int(float(index.decode('utf-8')))
index

In [None]:
%%capture

# Keep one row per distinct (CUSIP, PERMNO, cik) combination and reduce it to
# the two identifiers handed to the crawler: the cik and the ticker.
# NOTE(review): the fresh-load path of `data` keeps only
# CUSIP/PERMNO/cik/date/PRC/RET/vwretd, so a 'tic' column may be missing here - confirm.
ciks = data.drop_duplicates(subset=['CUSIP', 'PERMNO', 'cik'])[['cik', 'tic']]

# Crawler settings: both values are passed to filing_10K as strings.
crawler = SecCrawler()
end_date = '20081231'
count = '20'

# Pull the 10-K filings for every remaining company
for index, row in ciks.iterrows():
    cik, tic = row.iloc[0], row.iloc[1]
    crawler.filing_10K(tic, cik, end_date, count)


In [93]:
%%cython
from cpython cimport bool

from __main__ import rds

def check_redis(str cleaned_key, str processed_key, str report_key):
    """Report whether a filing has already been cleaned and/or processed.

    Parameters
    ----------
    cleaned_key, processed_key : str
        Redis string keys used as "done" markers for the two pipeline stages.
    report_key : str
        Redis hash key holding the report's fields.

    Returns
    -------
    tuple
        (cleaned, processed) booleans.

    Side effects
    ------------
    When neither marker exists but the report hash already carries
    'company_data' (and possibly 'hist_ret'), the missing marker keys are
    written using the hash's 'mtime' value.
    """
    cdef bool processed = False
    cdef bool cleaned = False

    if rds.exists(cleaned_key):
        # The cleaned marker only counts if the report hash really has company_data
        if rds.hexists(report_key, 'company_data'):
            cleaned = True
            if rds.exists(processed_key):
                processed = True
    elif rds.exists(processed_key):
        processed = True
    elif rds.exists(report_key) and rds.hexists(report_key, 'company_data'):
        # Temporary check to see if this file has been processed fully.
        # mtime is deliberately left untyped: hget returns bytes (or None), and
        # binding that to a `cdef str` variable raises TypeError under Python 3.
        mtime = rds.hget(report_key, 'mtime')

        # Save to proper place in redis
        cleaned = True
        rds.set(cleaned_key, mtime)
        if rds.hexists(report_key, 'hist_ret'):
            processed = True
            rds.set(processed_key, mtime)
    # else: no report hash, or one not cleaned with the new algorithm -> both False

    return (cleaned, processed)

In [94]:
%%cython
import os

def move_file(fh, str fn, str folder, str tic, str cik, str filename, str message):
    """Close a filing's handle and move the file to data/<folder>/.

    Parameters
    ----------
    fh : file object
        Handle for `fn`; closed before the rename (closing twice is harmless).
    fn : str
        Current path of the file.
    folder : str
        Sub-folder of 'data' to move the file into.
    tic, cik, filename : str
        Joined with '-' to form the new file name.
    message : str
        Printed after the move.
    """
    # Generate the new name of the file
    cdef str target_dir = os.path.join('data', folder)
    cdef str new_name = os.path.join(target_dir, tic + '-' + cik + '-' + filename)

    # Close the file so that we can move it
    fh.close()

    # Create the destination folder on first use; os.rename fails if it is missing
    os.makedirs(target_dir, exist_ok=True)
    os.rename(fn, new_name)
    print(message)

In [123]:
%%cython
import os
import re
import time
import string
import pickle
from datetime import datetime as dt

from bs4 import BeautifulSoup as bs

from __main__ import move_file, df, stopwords, rds, dict_2of12inf

from cpython cimport bool

# This function handles the cleaning of the 10-K
def clean(str fn):
    """Clean one raw 10-K filing and store the result in redis.

    The file is parsed, rejected filings are moved aside with move_file, and
    for accepted filings the filtered report text plus metadata is written to
    the redis hash "report:<cik>:<fn>" and the "cleaned:<cik>:<fn>" marker is set.

    Parameters
    ----------
    fn : str
        Path of the filing; components 1, 2 and 4 (split on os.sep) are used
        as ticker, cik and file name.

    Returns
    -------
    bool
        True when the filing was rejected and moved, False when it was cleaned
        and saved.  (Previously the success path returned None.)
    """
    cdef str s = os.sep
    cdef list path_parts = fn.split(s)
    cdef str tic = path_parts[1]
    cdef str cik = path_parts[2]
    cdef str filename = path_parts[4]
    cdef str report_key = "report:" + cik + ":" + fn
    cdef str cleaned_key = "cleaned:" + cik + ":" + fn
    cdef unicode contents
    cdef list malformed_tags = ['ACCEPTANCE-DATETIME', 'TYPE', 'SEQUENCE', 'FILENAME', 'DESCRIPTION']
    cdef str tag
    cdef int index = 0
    cdef str report
    cdef list tokens
    cdef list keep_tokens = []
    cdef set stopwords_set
    cdef str word
    cdef int total_words

    # Read the whole file and release the handle immediately so it cannot leak
    # if parsing raises.  move_file() closes `fh` again, which is a no-op.
    with open(fn, 'r') as fh:
        contents = fh.read()

    # Clean up some of the text to fix malformed HTML before parsing it:
    # these header tags have no closing tag, so add one at the end of their line
    for tag in malformed_tags:
        regex = re.compile(r"(\n<%s>[^<]*?)\n" % re.escape(tag), re.I)
        contents = regex.sub(r"\1</%s>\n" % tag, contents)

    # Pull the 10-k into the parser
    document = bs(contents, 'lxml')

    # The root node is the first of these elements that exists
    root = None
    for root_name in ('sec-document', 'ims-document', 'document'):
        root = document.find(root_name)
        if root is not None:
            break

    if root is None:
        # Root node error
        move_file(fh, fn, "_error", tic, cik, filename, "No root or erroneous root node - moved file")
        return True

    # Check if this is an amended 10-K and throw it out if so
    type_text = root.find('type')
    if type_text is None:
        move_file(fh, fn, "_error", tic, cik, filename, "Error finding type - moved file")
        return True
    if type_text.text == '10-K/A':
        move_file(fh, fn, "_amended", tic, cik, filename, "Amended 10-K - moved file")
        return True

    # Get the filing date, preferring the 'acceptance-datetime' metadata element
    filing_dt_text = None
    acc_dt = root.find('acceptance-datetime')
    if acc_dt is not None:
        # First eight characters of the first line: YYYYMMDD
        filing_dt_text = acc_dt.text.split('\n', 1)[0][:8]
    else:
        # Otherwise look for "FILED AS OF DATE:" in the sec/ims header
        header_text = None
        for header_name in ('sec-header', 'ims-header'):
            header = root.find(header_name)
            if header is not None:
                header_text = header.text
                break

        if header_text:
            # search() returns None when the line is absent; the old re.sub
            # approach handed the whole header to strptime in that case.
            # The leading greedy .* keeps the last occurrence, as before.
            match = re.search(r".*\nFILED AS OF DATE:\s+?([\d]+?)\n", header_text, re.S)
            if match is not None:
                filing_dt_text = match.group(1)

    filing_dt = None
    if filing_dt_text is not None:
        try:
            filing_dt = dt.strptime(filing_dt_text, '%Y%m%d')
        except ValueError:
            filing_dt = None
    if filing_dt is None:
        move_file(fh, fn, "_error", tic, cik, filename, "Bad filing date - moved file")
        return True

    filing_ts = time.mktime(filing_dt.timetuple())
    begin_dt = dt(1995, 1, 1)

    # If the filing date is not within our date range, then move it
    if begin_dt > filing_dt:
        move_file(fh, fn, "_outofrange", tic, cik, filename, "Out of date range - moved file.")
        return True

    # See if we can find stock info for this company on the filing date of the 10-K
    cik_bytes = bytes(cik, 'utf-8')
    try:
        cik_df = df.loc[cik_bytes]
        # Position of the filing date *within this company's frame*.  The stored
        # index is later used positionally on cik_df, so a position in the
        # global df (what was stored before) points at the wrong rows.
        # NOTE(review): assumes one row per (cik, date); duplicates would make
        # get_loc return a slice/mask - confirm against the data.
        index = cik_df.index.get_loc(filing_dt)
        price = cik_df.loc[filing_dt, 'PRC']
    except (IndexError, KeyError):
        # We couldn't find the cik or date for this 10-k
        move_file(fh, fn, "_nostockdata", tic, cik, filename, "No stock data found - moved file.")
        return True

    # Now, check if the price of the stock is less than $3.00
    if price < 3.0:
        move_file(fh, fn, "_nostockdata", tic, cik, filename, "Price less than $3.00 - moved file.")
        return True

    # Remove the exhibits (every nested <document> after the first)
    for ex in root.findAll('document')[1:]:
        ex.extract()

    # Grab the report
    text_node = root.find('text')
    if text_node is None:
        move_file(fh, fn, "_error", tic, cik, filename, "No text element - moved file")
        return True
    report = text_node.get_text()

    # We will tokenize the text and iterate through each word
    tokens = report.split()
    stopwords_set = set(stopwords.words('english'))
    punc_table = str.maketrans("", "", string.punctuation)

    # Filter out words
    for word in tokens:
        # Quick check to make sure we should keep filtering the word
        if len(word) != 1:
            # Strip punctuation from the word first and make it lowercase
            word = word.translate(punc_table).lower()

            # Add the word to the keep pile if it is not a stopword and if it is in 2of12inf dictionary
            if word not in stopwords_set and word in dict_2of12inf:
                keep_tokens.append(word)

    tokens = keep_tokens
    report = " ".join(tokens)
    total_words = len(tokens)

    # Gather info for report to save into redis
    report_hash = {
        'cik': cik,
        'tic': tic,
        'path': fn,
        'file_name': filename,
        'filing_date': filing_ts,
        'year': filing_dt.year,
        'report': report,
        'total_words': total_words,
        'company_data': pickle.dumps(cik_df),
        'index': index,
        'mtime': time.time()
    }

    # Save the stuff to redis
    print("Saving to redis: " + report_key)
    rds.hmset(report_key, report_hash)
    rds.set(cleaned_key, time.time())
    return False


In [159]:
%%cython
import os
import pickle
import math
import time
from datetime import datetime as dt

import pandas as pd
from pandas import DataFrame, Series

from collections import defaultdict

from __main__ import df, rds, pos_dict, pos_roots, pos_roots_map, neg_dict, neg_roots, neg_roots_map

from cpython cimport bool

# Words that cancel the sentiment of a nearby dictionary word.
# NOTE(review): clean() drops NLTK English stopwords before the report is
# stored; if that list contains 'no'/'not', only 'never' can reach this stage - verify.
_NEGATORS = frozenset(['not', 'no', 'never'])


def _is_negated(list tokens, int pos):
    """Return True if a negator occurs in tokens[pos - 3 : pos + 3].

    The lower bound is clamped at 0; an unclamped negative start would make the
    slice wrap around to the end of the list for the first three tokens.
    """
    cdef int lo = pos - 3
    if lo < 0:
        lo = 0
    for word in tokens[lo:(pos + 3)]:
        if word in _NEGATORS:
            return True
    return False


def _compound_return(returns):
    """Compound simple returns, prod(1 + r) - 1, skipping NaN entries."""
    cdef float total = 1.0
    for r in returns:
        if not math.isnan(r):
            total *= (r + 1.0)
    return total - 1.0


def process(str fn):
    """Count sentiment roots and compute returns for one cleaned 10-K.

    Reads the cleaned report from the redis hash "report:<cik>:<fn>", counts
    non-negated positive/negative dictionary words by root, computes the
    compounded return over the rows after the filing row and the abnormal
    return over the four-row window ending at the filing row, then writes the
    results back to the hash and sets the "processed:<cik>:<fn>" marker.

    Parameters
    ----------
    fn : str
        Path of the filing; component 2 (split on os.sep) is used as the cik.
    """
    cdef str s = os.sep
    cdef str cik = fn.split(s)[2]
    cdef str report_key = "report:" + cik + ":" + fn
    cdef str processed_key = "processed:" + cik + ":" + fn
    cdef str report
    cdef int index
    cdef int window_start
    cdef int i
    cdef str token
    cdef list tokens
    cdef dict report_hash = {}
    cdef float hist_ret
    cdef float ab_ret

    # Get the report out of redis.  hget returns bytes, so decode it; str() on
    # bytes yields the "b'...'" repr and corrupts the first and last tokens.
    report = rds.hget(report_key, 'report').decode('utf-8')
    filing_dt = dt.fromtimestamp(int(float(rds.hget(report_key, 'filing_date').decode('utf-8'))))
    # NOTE: pickle.loads executes arbitrary code; only safe against a trusted redis instance.
    cik_df = pickle.loads(rds.hget(report_key, 'company_data'))

    # Row position of the filing date inside this company's frame.  It is used
    # positionally below, so it must be relative to cik_df; fall back to the
    # value stored by clean() if the date cannot be located uniquely.
    try:
        index = cik_df.index.get_loc(filing_dt)
    except (KeyError, TypeError):
        index = int(rds.hget(report_key, 'index'))

    # Now that everything is cleaned up, we can run the word processing algorithm
    pos_occurs = defaultdict(int)
    neg_occurs = defaultdict(int)

    # We will tokenize the text and iterate through each word
    tokens = report.split()

    # Now, process the text: count each dictionary word under its root unless negated
    for i, token in enumerate(tokens):
        if token in pos_dict:
            if not _is_negated(tokens, i):
                pos_occurs[pos_roots_map[token]] += 1
        elif token in neg_dict:
            if not _is_negated(tokens, i):
                neg_occurs[neg_roots_map[token]] += 1

    # For the roots we didn't find, set frequency to zero
    for root_name in pos_roots:
        if root_name not in pos_occurs:
            pos_occurs[root_name] = 0
    for root_name in neg_roots:
        if root_name not in neg_occurs:
            neg_occurs[root_name] = 0

    # Calculate the historical return from the rows after the filing row.
    # The values are iterated directly: the old Series.iteritems() loop compared
    # index labels with 'RET', never matched, and always produced 0.
    hist_ret = _compound_return(cik_df['RET'].iloc[(index + 1):])

    # Four-row window ending at the filing row; clamp the start so it cannot go negative and wrap
    window_start = index - 3
    if window_start < 0:
        window_start = 0
    returns = cik_df[['RET', 'vwretd']].iloc[window_start:(index + 1)]

    # Calculate the abnormal return: r_i = M{t=0, 3} (ret_i,j) - M{t=0,3} (ret_vwi,t)
    ab_ret = _compound_return(returns['RET']) - _compound_return(returns['vwretd'])

    # Save results of text processing to key in redis
    report_hash['pos_occurs'] = pickle.dumps(pos_occurs)
    report_hash['neg_occurs'] = pickle.dumps(neg_occurs)
    report_hash['hist_ret'] = hist_ret
    report_hash['ab_ret'] = ab_ret
    report_hash['mtime'] = time.time()

    print("Saving to redis: " + report_key)
    rds.hmset(report_key, report_hash)
    rds.set(processed_key, time.time())

Saving to redis: report:0000884219:SEC-Edgar-data/VVI/0000884219/10-K/0000950124-05-001493.txt


Variable      Type    Data/Info
-------------------------------
F             int     0
a             int     1801
count         int     27646
end           int     2008
index         int     1110776400
start         int     1997
stop          int     100000
t             int     1997
total_words   int     11408
year          int     1996


In [163]:
%%cython
import os

from __main__ import check_redis, clean, process

from cpython cimport bool

# This is for testing
# count: 1-based counter shown in the log lines; stop: upper bound for count
cdef int count = 1
cdef int stop = 100000
cdef bool skip_cleaned = True      # skip cleaning when the report is already cleaned
cdef bool skip_processed = True    # skip processing when the report is already processed
cdef bool process_file = True      # False = clean only, never call process()

cdef str fn, s, tic, cik, cleaned_key, processed_key, report_key
cdef bool cleaned, processed, error
cdef bool done = False

cdef str dirpath
cdef list dirnames, filenames
cdef str folder = "SEC-Edgar-data"
for (dirpath, dirnames, filenames) in os.walk(folder, topdown=False):
    for filename in filenames:
        if not filename.endswith('.txt'):
            continue

        if count > stop:
            done = True
            break

        fn = os.sep.join([dirpath, filename])
        s = os.sep
        tic = fn.split(s)[1]
        cik = fn.split(s)[2]

        # Check redis to see if we have processed or cleaned the report already
        cleaned_key = "cleaned:" + cik + ":" + fn
        processed_key = "processed:" + cik + ":" + fn
        report_key = "report:" + cik + ":" + fn
        (cleaned, processed) = check_redis(cleaned_key, processed_key, report_key)

        # If the report has been cleaned or we don't want to clean it anyway, skip this step
        error = False
        if not cleaned or not skip_cleaned:
            print("(" + str(count) + ") Cleaning " + fn)
            # clean() returns a truthy value when the file was rejected and moved
            error = True if clean(fn) else False

            # Clean-only mode.  This used to test `process` (the imported
            # function, always truthy), so the branch could never run.
            if not process_file and not error:
                count += 1
                continue
        if error: continue

        # After possibly cleaning, check if we should process the file
        if (not processed or not skip_processed) and process_file:
            print("(" + str(count) + ") Processing " + fn)
            process(fn)

            count += 1

    # Leave the directory walk as well once the limit is hit; the inner
    # `break` alone only skipped to the next directory.
    if done:
        break

(1) Cleaning SEC-Edgar-data/VVI/0000884219/10-K/0000950153-08-000420.txt
Saving to redis: report:0000884219:SEC-Edgar-data/VVI/0000884219/10-K/0000950153-08-000420.txt
(1) Processing SEC-Edgar-data/VVI/0000884219/10-K/0000950153-08-000420.txt
test
(2) Cleaning SEC-Edgar-data/VVI/0000884219/10-K/0000950153-06-000542.txt
Saving to redis: report:0000884219:SEC-Edgar-data/VVI/0000884219/10-K/0000950153-06-000542.txt
(2) Processing SEC-Edgar-data/VVI/0000884219/10-K/0000950153-06-000542.txt
test
(3) Cleaning SEC-Edgar-data/VVI/0000884219/10-K/0000884219-94-000033.txt
Out of date range - moved file.
(3) Cleaning SEC-Edgar-data/VVI/0000884219/10-K/0000950153-07-000453.txt
Saving to redis: report:0000884219:SEC-Edgar-data/VVI/0000884219/10-K/0000950153-07-000453.txt
(3) Processing SEC-Edgar-data/VVI/0000884219/10-K/0000950153-07-000453.txt
test
(4) Cleaning SEC-Edgar-data/VVI/0000884219/10-K/0000884219-98-000013.txt
No stock data found - moved file.
(4) Cleaning SEC-Edgar-data/VVI/0000884219/1

KeyboardInterrupt: 

In [None]:
# Collect every fully processed report from redis, grouped by filing year.
# yearly_data maps year -> list of per-report dicts.
count = 0
stop = math.inf   # lower this to load only the first N processed reports
yearly_data = {}

rds = redis.Redis()
keys = rds.keys("report:*")
for key in keys:

    if count >= stop:
        break

    report_hash = rds.hgetall(key)
    try:
        # NOTE: pickle.loads executes arbitrary code; only safe against a trusted redis instance.
        year = int(report_hash[b'year'])
        record = {
            'pos_occurs': pickle.loads(report_hash[b'pos_occurs']),
            'neg_occurs': pickle.loads(report_hash[b'neg_occurs']),
            'total_words': int(report_hash[b'total_words']),
            'hist_ret': float(report_hash[b'hist_ret']),
            'ab_ret': float(report_hash[b'ab_ret'])
        }
    except KeyError:
        # A missing field means the report has not been fully processed yet
        continue
    except Exception as e:
        # Was `except e:`, which itself raises NameError when reached.
        # Report the malformed entry and move on to the next key.
        print(e)
        continue

    yearly_data.setdefault(year, []).append(record)
    count += 1


In [None]:
# Show how many processed reports were collected for each year, oldest first
for year, year_reports in sorted(yearly_data.items()):
    print(year, len(year_reports))

In [None]:
start = 1997
end = 2008


def _word_weights(occurs, total_words):
    """Return {root: occurrences / total_words} for one report."""
    return {word: freq / (total_words * 1.0) for word, freq in occurs.items()}


def _coeff_table(columns, params):
    """Pair regressor names with fitted coefficients.

    Returns (dict word -> coefficient, DataFrame with 'word' and 'weight' columns).
    """
    coeffs_dict = dict(zip(list(columns), params))
    coeffs = pd.DataFrame(list(coeffs_dict.items()), columns=['word', 'weight'])
    return coeffs_dict, coeffs


# Generate a rolling training model using data up until year T-1
for t in range(start, (end + 1)):
    # Collect plain rows and build each frame once; appending to a DataFrame
    # inside the loop copies the whole frame on every report.
    pos_rows, neg_rows, hist_rows, ab_rows = [], [], [], []

    # Iterate over each year before year T and build the training data set
    for year in range((start - 1), t):

        if year not in yearly_data:
            print("Year " + str(year) + " not found.")
            continue

        # Iterate through each 10-K info for the year and generate the rows for the regression
        for report in yearly_data[year]:
            a = report['total_words']
            if not a:
                # A report with no kept words has no defined term weights
                continue

            pos_rows.append(_word_weights(report['pos_occurs'], a))
            neg_rows.append(_word_weights(report['neg_occurs'], a))
            hist_rows.append({'hist_ret': report['hist_ret']})
            ab_rows.append({'ab_ret': report['ab_ret']})

    # A root missing from a report's dict means zero occurrences
    pos_word_weights = pd.DataFrame(pos_rows).fillna(0.0)
    neg_word_weights = pd.DataFrame(neg_rows).fillna(0.0)
    hist_returns = pd.DataFrame(hist_rows)
    ab_returns = pd.DataFrame(ab_rows)

    # Run the regressions for this T only when there is training data
    if hist_returns.empty or pos_word_weights.empty or neg_word_weights.empty:
        continue

    hist_returns_series = hist_returns['hist_ret']
    ab_returns_series = ab_returns['ab_ret']

    # Estimate the weights for the words using a regression
    pos_reg = sm.OLS(hist_returns_series, pos_word_weights)
    pos_model = pos_reg.fit()
    neg_reg = sm.OLS(hist_returns_series, neg_word_weights)
    neg_model = neg_reg.fit()

    # Map the words to their coefficients
    pos_coeffs_dict, pos_coeffs = _coeff_table(pos_word_weights.columns, pos_model.params)
    neg_coeffs_dict, neg_coeffs = _coeff_table(neg_word_weights.columns, neg_model.params)

    # Calculate the average word weight as well as the standard deviation
    pos_avg = pos_coeffs['weight'].mean()
    pos_std = pos_coeffs['weight'].std()
    neg_avg = neg_coeffs['weight'].mean()
    neg_std = neg_coeffs['weight'].std()

    # Normalize the weights of the words (z-score over all coefficients)
    pos_coeffs['norm_weight'] = (pos_coeffs['weight'] - pos_avg) / pos_std
    neg_coeffs['norm_weight'] = (neg_coeffs['weight'] - neg_avg) / neg_std

    # Scale every word column by that word's normalized weight.  mul() aligns
    # the word-indexed Series with the column labels; the previous per-column
    # apply multiplied by a one-element Series instead of a scalar and
    # reassigned columns while iterating over them.
    pos_word_weights = pos_word_weights.mul(
        pos_coeffs.set_index('word')['norm_weight'], axis='columns')
    neg_word_weights = neg_word_weights.mul(
        neg_coeffs.set_index('word')['norm_weight'], axis='columns')

    # Run the regression for abnormal (after filing) returns using the estimated weights for the words
    pos_ab_reg = sm.OLS(ab_returns_series, pos_word_weights)
    pos_ab_model = pos_ab_reg.fit()
    neg_ab_reg = sm.OLS(ab_returns_series, neg_word_weights)
    neg_ab_model = neg_ab_reg.fit()

    # Map the words to their coefficients
    pos_coeffs_dict, pos_coeffs = _coeff_table(pos_word_weights.columns, pos_ab_model.params)
    neg_coeffs_dict, neg_coeffs = _coeff_table(neg_word_weights.columns, neg_ab_model.params)

    # TODO: Calculate the score of each document using the weights for each word given by the regression


In [None]:
# This will store the global positive and negative words occurrences
pos_occurs_all = defaultdict(int)
neg_occurs_all = defaultdict(int)

# Go through redis and grab every 10-k
keys = rds.keys("report:*")

print("Total keys: " + str(len(keys)))
skipped = 0
for key in keys:
    raw_pos_occurs = rds.hget(key, 'pos_occurs')
    raw_neg_occurs = rds.hget(key, 'neg_occurs')

    # hget returns None for reports that have not been processed yet;
    # pickle.loads(None) would abort the whole loop.
    if raw_pos_occurs is None or raw_neg_occurs is None:
        skipped += 1
        continue

    # NOTE: pickle.loads executes arbitrary code; only safe against a trusted redis instance.
    report_pos_occurs = pickle.loads(raw_pos_occurs)
    report_neg_occurs = pickle.loads(raw_neg_occurs)

    for word, freq in report_pos_occurs.items():
        pos_occurs_all[word] += freq

    for word, freq in report_neg_occurs.items():
        neg_occurs_all[word] += freq

if skipped:
    print("Skipped unprocessed reports: " + str(skipped))

# Print out most frequent positive words
print("Most Frequent Positive Words\n" +
       "============================")

pos_sorted = pd.Series(data=pos_occurs_all).sort_values(ascending=False)
print(pos_sorted)

# Print out most frequent negative words
print("\n\nMost Frequent Negative Words\n" +
       "============================")

neg_sorted = pd.Series(data=neg_occurs_all).sort_values(ascending=False)
print(neg_sorted)