In this notebook, I examine the XBRL tags that firms use to identify the "Significant Accounting Policies" note in their 10-K filings. Most firms simply use the "SignificantAccountingPoliciesTextBlock" tag, but many do not. I use the "Financial Statements and Notes Datasets" from EDGAR, together with some heuristic searches, to identify the relevant tags.

In [9]:
import os
import urllib
import urllib.request
import zipfile

import pandas as pd

In [10]:
# Root directory of the local EDGAR "Financial Statements and Notes" data; the
# zips/, unzipped/ and SAP/ subdirectories used below all live under it.
EXT_DIR = "/media/reggie/reg_ext/EDGAR/FSANDS/"

In [16]:
def unzip_fsands(year, quarter):
    """Unpack one quarterly archive into its own directory.

    Reads ``EXT_DIR/zips/<year>q<quarter>.zip`` and extracts every member into
    ``EXT_DIR/unzipped/<year>q<quarter>``, creating that directory first when
    it does not exist yet. The destination directory is printed.
    """
    period = "{}q{}".format(year, quarter)
    archive_path = os.path.join(EXT_DIR, 'zips', period + '.zip')
    target_dir = os.path.join(EXT_DIR, 'unzipped', period)
    print(target_dir)

    if not os.path.isdir(target_dir):
        os.makedirs(target_dir)

    with zipfile.ZipFile(archive_path, 'r') as archive:
        archive.extractall(target_dir)
def extract_fsands(filepath, expected_fields=20):
    """Split a tab-separated file into well-formed and malformed rows.

    Every line of ``filepath`` (read as latin-1) is split on tabs. Lines with
    exactly ``expected_fields`` columns are written to ``<base>-clean.tsv``;
    all other lines are written to ``<base>-bad.tsv``, where ``<base>`` is
    ``filepath`` with ``".tsv"`` removed. The input path and the final row
    counts are printed.

    Parameters
    ----------
    filepath : str
        Path of the tab-separated file to filter.
    expected_fields : int, default 20
        Number of tab-separated columns a well-formed row must have.
    """
    print(filepath)

    base = filepath.replace(".tsv", "")
    n_good = 0
    n_bad = 0

    # Stream the file instead of loading it twice into memory. The outputs are
    # written as UTF-8 explicitly (pandas.read_csv's default) rather than in
    # whatever the platform's locale encoding happens to be.
    with open(filepath, "r", encoding='latin1') as infile, \
            open(base + "-clean.tsv", "w", encoding="utf-8") as good_out, \
            open(base + "-bad.tsv", "w", encoding="utf-8") as bad_out:
        for raw_line in infile:
            # Remove only the line terminator: a plain strip() would also eat
            # leading/trailing tabs, dropping empty first/last fields and
            # misclassifying an otherwise valid row as bad.
            fields = raw_line.rstrip("\r\n").split("\t")
            if len(fields) == expected_fields:
                good_out.write("\t".join(fields) + "\n")
                n_good += 1
            else:
                bad_out.write("\t".join(fields) + "\n")
                n_bad += 1

    print(n_good, " goods and ", n_bad, "bads in ", filepath)
    
def get_sap(txt_submissions, form_type, year, quarter,
            tag="SignificantAccountingPoliciesTextBlock"):
    """Extract one XBRL text-block tag for a given form type and save it.

    Rows of ``txt_submissions`` whose ``tag`` column equals ``tag`` are joined
    (on ``adsh``) with submission metadata from
    ``EXT_DIR/unzipped/<year>q<quarter>/sub.tsv`` and written, tab-separated,
    to ``EXT_DIR/SAP/<year>q<quarter>.tsv``.

    Parameters
    ----------
    txt_submissions : str
        Path of the tab-separated text-block file to read.
    form_type : str
        Substring looked for in the ``form`` column of ``sub.tsv``. It is a
        plain substring match, so ``"10-K"`` also keeps ``"10-K/A"``.
    year, quarter
        Identify the quarter whose ``sub.tsv`` is read and name the output.
    tag : str, default "SignificantAccountingPoliciesTextBlock"
        Tag value to keep.
    """
    period = "{}q{}".format(year, quarter)
    txt = pd.read_csv(txt_submissions, sep="\t", low_memory=False)

    ## Merge CIK numbers
    subs = pd.read_csv(EXT_DIR+"/unzipped/{}q{}/sub.tsv".format(year, quarter), sep="\t")
    # astype(str) keeps missing form values from breaking the match (they
    # become "nan"); regex=False makes form_type a literal substring.
    is_form = subs["form"].astype(str).str.contains(form_type, regex=False)
    ciks = subs.loc[is_form, ["adsh","cik","sic","form","period","fye","fp","filed"]]

    # Name the join key explicitly instead of relying on whichever column
    # names the two frames happen to share.
    sap = txt[txt["tag"] == tag].merge(ciks, on="adsh")

    # to_csv does not create missing directories.
    out_dir = os.path.join(EXT_DIR, "SAP")
    os.makedirs(out_dir, exist_ok=True)
    sap.to_csv(os.path.join(out_dir, period + ".tsv"), sep="\t", index=False)

In [17]:
# Process one quarter end to end: unpack the archive, split txt.tsv into
# clean and bad rows, then pull the policy text blocks from the clean rows.
year = 2017
quarter = 2
unzip_fsands(year, quarter)

txt_relpath = "unzipped/{}q{}/txt.tsv".format(year, quarter)
txt_submissions = os.path.join(EXT_DIR, txt_relpath)
extract_fsands(txt_submissions)

# extract_fsands writes the well-formed rows next to the input as *-clean.tsv.
clean_submissions = txt_submissions.replace(".tsv", "-clean.tsv")
get_sap(clean_submissions, "10-K", year, quarter)

/media/reggie/reg_ext/EDGAR/FSANDS/unzipped/2017q2
/media/reggie/reg_ext/EDGAR/FSANDS/unzipped/2017q2/txt.tsv
483814  goods and  272 bads in  /media/reggie/reg_ext/EDGAR/FSANDS/unzipped/2017q2/txt.tsv


In [18]:
# Read back the extract written for 2017 Q2 and take a first look at it.
sap_path = EXT_DIR + "/SAP/2017q2.tsv"
sap = pd.read_csv(sap_path, sep="\t", low_memory=False)
print(sap.shape)
sap.head()

(629, 27)


Unnamed: 0,adsh,tag,version,ddate,qtrs,iprx,lang,dcml,durp,datp,...,footlen,context,value,cik,sic,form,period,fye,fp,filed
0,0001437749-17-005927,SignificantAccountingPoliciesTextBlock,us-gaap/2016,20161231,4,0,en-US,32767,-0.054794,-2.0,...,0,d_2015-12-29_2017-01-02,Note 2Significant Accounting Policies Principl...,1526796,5812.0,10-K,20161231,1231.0,FY,20170403
1,0001493152-17-003990,SignificantAccountingPoliciesTextBlock,us-gaap/2017,20161231,4,0,en-US,32767,0.0,0.0,...,0,From2016-01-01to2016-12-31,NOTE 2 SUMMARY OF SIGNIFICANT ACCOUNTING POLI...,1444144,8000.0,10-K,20161231,1231.0,FY,20170417
2,0001213900-17-006991,SignificantAccountingPoliciesTextBlock,us-gaap/2016,20170331,4,0,en-US,32767,0.010959,0.0,...,0,Context_FYE_01_Apr_2016T00_00_00_TO_31_Mar_201...,3. Summary of Significant Accounting Policies ...,1681941,5411.0,10-K,20170331,331.0,FY,20170629
3,0001615774-17-001500,SignificantAccountingPoliciesTextBlock,us-gaap/2016,20161231,4,0,en-US,32767,0.0,0.0,...,0,From2016-01-01to2016-12-31,NOTE 2 - Summary of Significant Accounting Pol...,1537663,2834.0,10-K,20161231,1231.0,FY,20170405
4,0001096906-17-000396,SignificantAccountingPoliciesTextBlock,us-gaap/2017,20161231,4,0,en-US,32767,0.0,0.0,...,0,Y16,1. SUMMARY OF SIGNIFICANT ACCOUNTING POLICIES ...,1131089,3949.0,10-K/A,20161231,1231.0,FY,20170526


In [24]:
# Submission metadata is read from all-sub.tsv; an earlier version of this
# cell read a single quarter instead (EXT_DIR + "/unzipped/2017q2/sub.tsv").
all_sub_path = EXT_DIR + "/all-sub.tsv"
subs = pd.read_csv(all_sub_path, sep="\t", low_memory=False)
subs.head()

Unnamed: 0,adsh,cik,name,sic,countryba,stprba,cityba,zipba,bas1,bas2,...,accepted,prevrpt,detail,instance,nciks,aciks,pubfloatusd,floatdate,floataxis,floatmems
0,0000006207-13-000031,6207,AMREP CORP.,6500.0,US,NJ,PRINCETON,8540,300 ALEXANDER PARK,SUITE 204,...,2013-07-16 08:16:00.0,0,1,axr-20130430.xml,1,,18482700.0,20121031.0,,
1,0000008670-13-000015,8670,AUTOMATIC DATA PROCESSING INC,7374.0,US,NJ,ROSELAND,7068,ONE ADP BOULVARD,,...,2013-08-19 12:00:00.0,0,1,adp-20130630.xml,1,,27560940000.0,20130630.0,,
2,0000008858-13-000011,8858,AVNET INC,5065.0,US,AZ,PHOENIX,85034,2211 SOUTH 47TH STREET,,...,2013-08-09 07:00:00.0,0,1,avt-20130629.xml,1,,4058518000.0,20121228.0,,
3,0000014195-13-000016,14195,BRIGGS & STRATTON CORP,3510.0,US,WI,WAUWATOSA,53222,12301 W WIRTH ST,,...,2013-08-27 14:59:00.0,0,1,bgg-20130630.xml,1,,770100000.0,20121228.0,,
4,0000016732-13-000023,16732,CAMPBELL SOUP CO,2000.0,US,NJ,CAMDEN,8103,CAMPBELL PL,,...,2013-09-26 07:43:00.0,0,1,cpb-20130728.xml,1,,6774117000.0,20130127.0,,


In [60]:
# Build the EDGAR download URL of every submission's XBRL instance document:
#   <linkbase><cik>/<accession number without dashes>/<instance file name>
linkbase = "https://www.sec.gov/Archives/edgar/data/"

# The archive path is keyed by the filer's CIK, so take it from the `cik`
# column. The leading segment of an accession number identifies whoever
# submitted the filing (often a filing agent) and need not match the filer.
urls = [
    linkbase + str(int(cik)) + "/" + adsh.replace("-", "") + "/" + instance
    for cik, adsh, instance in zip(subs["cik"], subs["adsh"], subs["instance"])
]

# The download loop reads row["url"], so attach the URLs to the frame instead
# of leaving them in a free-standing list.
subs["url"] = urls

In [80]:
# Download each submission's XBRL instance document into
# /home/reggie/EDGAR/xbrl-2/<year>/<quarter>/<adsh>.xml, where year and quarter
# come from the filing date. Files that are already present are skipped, so
# the cell can be re-run to resume.
# NOTE(review): sec.gov may reject requests that do not declare a User-Agent;
# confirm urlretrieve's default header is accepted before a bulk run.
n_subs = subs.shape[0]
for i, row in subs.iterrows():
    file_date = str(row['filed'])          # YYYYMMDD
    year = int(file_date[:4])
    month = int(file_date[4:6])
    quarter = (month - 1) // 3 + 1         # months 1-3 -> 1, ..., 10-12 -> 4
    adsh = row["adsh"]
    url = row["url"]

    DOWNLOAD_DIR = "/home/reggie/EDGAR/xbrl-2/{}/{}".format(year, quarter)
    os.makedirs(DOWNLOAD_DIR, exist_ok=True)

    target_filepath = os.path.join(DOWNLOAD_DIR, adsh + ".xml")

    if os.path.isfile(target_filepath):
        print("File {} already exists...".format(target_filepath))
        continue

    print("Downloading {} of {}, file: {} ...".format(i, n_subs, target_filepath))
    # Download to a temporary name and rename only on success. Otherwise an
    # interrupted transfer (e.g. KeyboardInterrupt) leaves a truncated file
    # that the isfile() check above would accept as complete on the next run.
    partial_filepath = target_filepath + ".part"
    try:
        urllib.request.urlretrieve(url, partial_filepath)
        os.replace(partial_filepath, target_filepath)
    except BaseException:
        if os.path.exists(partial_filepath):
            os.remove(partial_filepath)
        raise

File /home/reggie/EDGAR/xbrl-2/2013/3/0000006207-13-000031.xml already exists...
File /home/reggie/EDGAR/xbrl-2/2013/3/0000008670-13-000015.xml already exists...
File /home/reggie/EDGAR/xbrl-2/2013/3/0000008858-13-000011.xml already exists...
File /home/reggie/EDGAR/xbrl-2/2013/3/0000014195-13-000016.xml already exists...
File /home/reggie/EDGAR/xbrl-2/2013/3/0000016732-13-000023.xml already exists...
File /home/reggie/EDGAR/xbrl-2/2013/3/0000018498-13-000039.xml already exists...
File /home/reggie/EDGAR/xbrl-2/2013/3/0000023666-13-000014.xml already exists...
File /home/reggie/EDGAR/xbrl-2/2013/3/0000039047-13-000065.xml already exists...
File /home/reggie/EDGAR/xbrl-2/2013/3/0000039648-13-000016.xml already exists...
File /home/reggie/EDGAR/xbrl-2/2013/3/0000046640-13-000018.xml already exists...
File /home/reggie/EDGAR/xbrl-2/2013/3/0000055242-13-000017.xml already exists...
File /home/reggie/EDGAR/xbrl-2/2013/3/0000055772-13-000016.xml already exists...
File /home/reggie/EDGAR/xbrl

KeyboardInterrupt: 