### Milestone 1: collect, parse and organize firm 10-K forms (text).
Given a list of company names, download all 10-K filings. Parse each 10-K filing and
keep only Item 1 (Business), Item 1A (Risk Factors), and Item 7 (Management’s
Discussion and Analysis of Financial Condition and Results of Operations), text only.
Save the data into a database.

In [23]:
import edgar
from unidecode import unidecode
import pandas as pd
import re

In [152]:
def clear_file(doc):
    """
    Flatten a single filing document into one line of transliterated text.

    @doc: one document as obtained from edgar.getDocuments
    @return: str(doc) with every run of whitespace collapsed to a single
             space (leading/trailing whitespace dropped), then passed
             through unidecode
    """
    words = str(doc).split()
    collapsed = " ".join(words)
    return unidecode(collapsed)

def get10KByNameAndCIK(companyName, CIKNumber, noOfDocuments=1):
    """
    Fetch a company's 10-K filings and return their cleaned text.

    @companyName: the name of the company
    @CIKNumber: the CIK number of the company; the caller must supply it
    @noOfDocuments: how many documents to request from edgar.getDocuments
    @return: a list of cleaned strings when edgar.getDocuments yields a list;
             otherwise the single cleaned string. Callers that always want
             a list must handle the non-list case themselves.
    """
    filings_tree = edgar.Company(companyName, CIKNumber).getAllFilings(filingType="10-K")
    fetched = edgar.getDocuments(filings_tree, noOfDocuments=noOfDocuments)

    # Anything that is not a list is treated as one document.
    if not isinstance(fetched, list):
        return clear_file(fetched)
    return list(map(clear_file, fetched))

def clear_cik_data(file_name):
    """
    Parse a name/CIK lookup file into (name, cik) pairs.

    Each usable line has the shape ``NAME:CIK:``. The name may itself contain
    colons, so the line is split from the right: the text after the last
    colon is discarded, the field before it is the CIK, and everything
    before that is the name.

    Lines that are not valid UTF-8 are reported and skipped. Blank lines are
    skipped silently; other lines with fewer than two colons are reported
    and skipped.

    Side effect: the parsed table is also written to "company_cik.csv" in
    the current working directory.

    @file_name: path of the lookup file, e.g. "cik-lookup-data.txt"
    @return: a list of (name, cik) tuples, both strings
    """
    table = []
    # Read as bytes and decode per line so one bad line does not abort the run.
    with open(file_name, "rb") as nameCIK_table:
        for raw_line in nameCIK_table:
            try:
                line = raw_line.decode('utf8')
            except UnicodeDecodeError:
                print("Cannot decode utf-8: ", str(raw_line))
                continue

            if not line.strip():
                continue

            # Split from the right so colons inside the name are preserved.
            parts = line.rsplit(":", 2)
            if len(parts) < 3:
                print("Skipping malformed line: ", line.rstrip())
                continue

            name, cik, _trailer = parts
            table.append((name, cik))

    df = pd.DataFrame(table)
    df.to_csv("company_cik.csv")
    return table

def clear_snp_500_cik(file_name):
    """
    Load the S&P 500 company CSV.

    @file_name: path of the CSV file, e.g. "snp500_cik_ticker.csv"
    @return: the pandas DataFrame produced by pd.read_csv, with all columns
             of the file unchanged
    """
    return pd.read_csv(file_name)

def getPartsFromParsedFile(file):
    """
    Placeholder: no parsing is implemented here.

    @file: ignored
    @return: always 0
    """
    return 0
    
    
# Goal: find the starting and ending offsets of ITEM 1, ITEM 1A, and ITEM 7.
# Heuristic:
# 1. Determine whether the filing's index (table of contents) is matched by the regex.
#    If it is, take the second mention of the item as the starting point and the
#    following mention of the next item as the stopping point.
#    If it is not, use the first mention.



# Section-heading patterns. Raw strings keep `\s` a regex escape rather than
# an (invalid) string escape. Each pattern needs a capital "I", any case for
# "tem", optional whitespace, the item number, then "." or whitespace.
_ITEM1_RE = re.compile(r'I[tT][Ee][Mm]\s*1[.\s]')
_ITEM1A_RE = re.compile(r'I[tT][Ee][Mm]\s*1A[.\s]')
_ITEM1B_RE = re.compile(r'I[tT][Ee][Mm]\s*1B[.\s]')
_ITEM7_RE = re.compile(r'I[tT][Ee][Mm]\s*7[.\s]')
_ITEM8_RE = re.compile(r'I[tT][Ee][Mm]\s*8[.\s]')

# An "Item 8" heading at or before this offset is taken to belong to an index.
_INDEX_CUTOFF = 10000


def _first_start_after(pattern, string, floor, label):
    """
    Return the start offset of the first match of `pattern` beyond `floor`.

    When such a match is found, `label`, the offset and `floor` are printed.
    If no match lies beyond `floor`, the start of the last match is returned;
    if there is no match at all, 0 is returned.
    """
    start = 0
    for match in pattern.finditer(string):
        start = match.start()
        if start > floor:
            print(label, start, floor)
            break
    return start


def segmentITEM1_1A_7(string):
    """
    Cut the Item 1, Item 1A and Item 7 sections out of a 10-K text.

    Heuristic: if the first "Item 8" heading starts at or before offset
    10000 (or there is no "Item 8" heading at all), the document is assumed
    to open with an index, so the second "Item 1" heading is used as the
    start of Item 1 (the first one, if there is only one). Otherwise the
    first "Item 1" heading is used. Each following section (1A, 1B, 7, 8)
    starts at its first heading located after the previous section's start.

    Offsets of the sections found in order are printed as they are located.

    @string: the full text of the filing
    @return: a tuple (item1, item1A, item7) of substrings of `string`:
             Item 1 runs up to Item 1A, Item 1A up to Item 1B, and Item 7
             up to Item 8. A slice is empty when its end offset does not lie
             after its start offset (e.g. a heading was not found).
    """
    first_item8 = _ITEM8_RE.search(string)
    index_matched = first_item8 is None or first_item8.start() <= _INDEX_CUTOFF

    # With an index present, skip its mention and take the second heading.
    wanted_mention = 2 if index_matched else 1
    item1_start = 0
    for mention, match in enumerate(_ITEM1_RE.finditer(string), start=1):
        item1_start = match.start()
        if mention == wanted_mention:
            break

    item1A_start = _first_start_after(_ITEM1A_RE, string, item1_start, "item1A, item1: ")
    item1B_start = _first_start_after(_ITEM1B_RE, string, item1A_start, "item1B, item1A: ")
    item7_start = _first_start_after(_ITEM7_RE, string, item1B_start, "item7, item1B: ")
    item8_start = _first_start_after(_ITEM8_RE, string, item7_start, "item8, item7: ")

    return (
        string[item1_start:item1A_start],
        string[item1A_start:item1B_start],
        string[item7_start:item8_start],
    )
        
    
        

def getRecentFileTable(company_list, n_file=100, max_n_company=10):
    """
    Download and segment the 10-K filings of the given companies.

    @company_list: the table of [[name, cik]...]
    @n_file: number of filings requested per company; a message is printed
             when a company yields a different number
    @max_n_company: stop after this many companies (None means no limit)
    @return: a list with one tuple per filing:
             (name, cik, file index, full text, item1, item1A, item7)

    A running count of processed companies is printed as progress output.
    """
    tenK_count = 0
    table = []

    for company_pairs in company_list:
        # limit the amount of company, only for testing
        if max_n_company is not None and tenK_count >= max_n_company:
            break

        name, cik = company_pairs[0], company_pairs[1]
        files = get10KByNameAndCIK(name, cik, n_file)
        # get10KByNameAndCIK hands back a bare string when it got a single
        # document; wrap it so len() and indexing work on documents rather
        # than on characters.
        if not isinstance(files, list):
            files = [files]

        if len(files) != n_file:
            print("error: ", company_pairs, " want: ", n_file, "files, actual have: ", len(files))

        tenK_count += 1
        print(tenK_count)
        for i, text in enumerate(files):
            item1, item1A, item7 = segmentITEM1_1A_7(text)
            table.append((name, cik, i, text, item1, item1A, item7))

    return table
            
        
    
    

In [153]:
# Load the company CSV and build [name, cik-as-string] pairs from its
# "Security" and "CIK" columns.
company_df = clear_snp_500_cik("snp500_cik_ticker.csv")
company_list_int = company_df[["Security", "CIK"]].values
company_list = [[security, str(cik)] for security, cik in company_list_int]

# Download and segment the filings for the listed companies.
table = getRecentFileTable(company_list)




error:  ['3M Company', '66740']  want:  100 files, actual have:  29
1
item1A, item1:  36704 7894
item1B, item1A:  48197 36704
item7, item1B:  53807 48197
item8, item7:  185503 53807
item1A, item1:  35370 7436
item1B, item1A:  47310 35370
item7, item1B:  54113 47310
item8, item7:  180456 54113
item1A, item1:  33108 5107
item1B, item1A:  44695 33108
item7, item1B:  51473 44695
item8, item7:  166665 51473
item1A, item1:  33808 4828
item1B, item1A:  43977 33808
item7, item1B:  50910 43977
item8, item7:  168161 50910
item1A, item1:  31863 4827
item1B, item1A:  41574 31863
item7, item1B:  48813 41574
item8, item7:  167934 48813
item1A, item1:  32379 4827
item1B, item1A:  41527 32379
item7, item1B:  48263 41527
item8, item7:  168690 48263
item1A, item1:  32652 4988
item1B, item1A:  41467 32652
item7, item1B:  48887 41467
item8, item7:  167424 48887
item1A, item1:  31338 4985
item1B, item1A:  38606 31338
item7, item1B:  46204 38606
item8, item7:  156016 46204
item1A, item1:  27281 5013
item1B,

ConnectionError: HTTPSConnectionPool(host='www.sec.gov', port=443): Max retries exceeded with url: /Archives/edgar/data/815094/000119312515204302/d863915d10k.htm (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x7fdd498761d0>: Failed to establish a new connection: [Errno -3] Temporary failure in name resolution',))

In [141]:
# Wrap the rows of `table` in a DataFrame; no column names are given, so
# pandas labels the columns with integers. The bare expression on the last
# line displays the frame as the cell output.
before_extract_df = pd.DataFrame(table)
before_extract_df

Unnamed: 0,0,1,2,3,4,5,6
0,3M Company,66740,0,10-K 1 mmm-20171231x10k.htm 10-K mmm_Current_F...,Item 1. Business. 3M Company was incorporated ...,Item 1A. Risk Factors. Provided below is a cau...,Item 7. Management's Discussion and Analysis o...
1,3M Company,66740,1,10-K 1 mmm-20161231x10k.htm 10-K mmm_Current_F...,Item 1. Business. 3M Company was incorporated ...,Item 1A. Risk Factors. Provided below is a cau...,Item 7. Management's Discussion and Analysis o...
2,Abbott Laboratories,1800,0,10-K 1 a2234264z10-k.htm 10-K UNITED STATES SE...,ITEM 1. BUSINESS GENERAL DEVELOPMENT OF BUSINE...,ITEM 1A. RISK FACTORS In addition to the other...,ITEM 7. MANAGEMENT'S DISCUSSION AND ANALYSIS O...
3,Abbott Laboratories,1800,1,10-K 1 a2230875z10-k.htm 10-K UNITED STATES SE...,ITEM 1. BUSINESS GENERAL DEVELOPMENT OF BUSINE...,ITEM 1A. RISK FACTORS In addition to the other...,ITEM 7. MANAGEMENT'S DISCUSSION AND ANALYSIS O...
4,AbbVie Inc.,1551152,0,10-K 1 abbv-20171231x10k.htm 10-K Document UNI...,ITEM 1. BUSINESS OverviewAbbVie(1) is a global...,ITEM 1A. RISK FACTORS You should carefully con...,ITEM 7. MANAGEMENT'S DISCUSSION AND ANALYSIS O...
5,AbbVie Inc.,1551152,1,10-K 1 abbv-12312016x10k.htm 10-K Document UNI...,ITEM 1. BUSINESS OverviewAbbVie(1) is a global...,ITEM 1A. RISK FACTORS You should carefully con...,ITEM 7. MANAGEMENT'S DISCUSSION AND ANALYSIS O...
6,ABIOMED Inc,815094,0,10-K 1 abmd-10k_20180331.htm 10-K abmd-10k_201...,ITEM 1. BUSINESS Overview We are a leading pro...,ITEM 1A. RISK FACTORS Investing in our common ...,ITEM 7. MANAGEMENT'S DISCUSSION AND ANALYSIS O...
7,ABIOMED Inc,815094,1,"10-K 1 abmd-10k_20170331.htm ABIOMED, INC. FOR...",ITEM 1. BUSINESS Overview We are a leading pro...,ITEM 1A. RISK FACTORS Investing in our common ...,ITEM 7. MANAGEMENT'S DISCUSSION AND ANALYSIS O...
8,Accenture plc,1467373,0,10-K 1 acn831201710k.htm 10-K Document Table o...,ITEM 1. BUSINESSOverview Accenture is one of t...,ITEM 1A. RISK FACTORS In addition to the other...,ITEM 7. MANAGEMENT'S DISCUSSION AND ANALYSIS O...
9,Accenture plc,1467373,1,10-K 1 acn831201610k.htm 10-K Document Table o...,ITEM 1. BUSINESSOverview Accenture is one of t...,ITEM 1A. RISK FACTORS In addition to the other...,ITEM 7. MANAGEMENT'S DISCUSSION AND ANALYSIS O...
