In [7]:
import os
import pandas as pd
import zipfile

In [8]:
# Root directory of the FSANDS data set; the functions below read and write
# beneath its "zips", "unzipped" and "SAP" subdirectories.
# Set the FSANDS_EXT_DIR environment variable to point the notebook at a
# different location without editing this cell; the default is unchanged.
EXT_DIR = os.environ.get("FSANDS_EXT_DIR", "/media/reggie/reg_ext/EDGAR/FSANDS")

In [9]:
def unzip_fsands(year, quarter):
    """Extract one quarterly zip archive into its own directory under EXT_DIR.

    Reads ``<EXT_DIR>/zips/<year>q<quarter>.zip`` and extracts every member
    into ``<EXT_DIR>/unzipped/<year>q<quarter>``, creating that directory
    first if needed. The target directory path is printed before extraction.

    Args:
        year: Year part of the archive name (converted with ``str``).
        quarter: Quarter part of the archive name (converted with ``str``).

    Raises:
        FileNotFoundError: If the zip archive does not exist.
        zipfile.BadZipFile: If the archive is not a valid zip file.
    """
    period = str(year) + 'q' + str(quarter)
    target_filename = os.path.join(EXT_DIR, 'zips', period + '.zip')
    target_dir = os.path.join(EXT_DIR, 'unzipped', period)
    print(target_dir)
    # exist_ok=True replaces the separate isdir() check, which could race
    # with another process creating the directory in between.
    os.makedirs(target_dir, exist_ok=True)
    with zipfile.ZipFile(target_filename, 'r') as zip_ref:
        zip_ref.extractall(target_dir)

In [10]:
def extract_fsands(filepath, expected_fields=20):
    """Split a tab-separated file into well-formed and malformed rows.

    Every line of ``filepath`` (decoded as latin1) is split on tabs. Lines
    with exactly ``expected_fields`` fields are written to
    ``<stem>-clean.tsv``; all other lines are written to ``<stem>-bad.tsv``,
    where ``<stem>`` is ``filepath`` without a trailing ``.tsv``. The input
    path and the final good/bad counts are printed.

    Args:
        filepath: Path of the tab-separated input file.
        expected_fields: Number of tab-separated fields a well-formed line
            must have. Defaults to 20.

    Raises:
        OSError: If the input cannot be opened or an output cannot be written.
    """
    print(filepath)

    # Remove only a trailing ".tsv": str.replace would also rewrite ".tsv"
    # appearing elsewhere in the path (for example in a directory name).
    suffix = ".tsv"
    stem = filepath[:-len(suffix)] if filepath.endswith(suffix) else filepath

    good_count = 0
    bad_count = 0
    # Stream line by line instead of holding the whole file in memory twice.
    # Outputs are written as UTF-8 explicitly so the result does not depend on
    # the platform's locale encoding and can be read back by tools that
    # default to UTF-8.
    with open(filepath, "r", encoding='latin1') as infile, \
            open(stem + "-clean.tsv", "w", encoding="utf-8") as clean_file, \
            open(stem + "-bad.tsv", "w", encoding="utf-8") as bad_file:
        for raw_line in infile:
            # Drop only the line terminator. A bare strip() would also remove
            # leading/trailing tabs, so a row whose first or last field is
            # empty would lose fields and be misclassified as bad.
            fields = raw_line.rstrip("\n").split("\t")
            if len(fields) == expected_fields:
                clean_file.write("\t".join(fields) + "\n")
                good_count += 1
            else:
                bad_file.write("\t".join(fields) + "\n")
                bad_count += 1

    print(good_count, " goods and ", bad_count, "bads in ", filepath)

In [16]:
def get_sap(txt_submissions, form_type, year, quarter):
    """Write the significant-accounting-policies rows for one quarter.

    Loads ``txt_submissions``, keeps the rows whose ``tag`` equals
    ``SignificantAccountingPoliciesTextBlock``, joins them on ``adsh`` with
    the submissions in ``<EXT_DIR>/unzipped/<year>q<quarter>/sub.tsv`` whose
    ``form`` contains ``form_type``, and writes the result to
    ``<EXT_DIR>/SAP/<year>q<quarter>.tsv`` (tab-separated, no index).

    Args:
        txt_submissions: Path of the tab-separated text-submissions file.
        form_type: Substring that a submission's ``form`` must contain
            (plain substring match, so "10-K" also matches "10-K/A").
        year: Year used to locate ``sub.tsv`` and name the output file.
        quarter: Quarter used to locate ``sub.tsv`` and name the output file.

    Raises:
        FileNotFoundError: If either input file does not exist.
        KeyError: If an expected column is missing from an input file.
    """
    period = "{}q{}".format(year, quarter)
    txt = pd.read_csv(txt_submissions, sep="\t", low_memory=False)

    ## Merge CIK numbers
    subs = pd.read_csv(os.path.join(EXT_DIR, "unzipped", period, "sub.tsv"), sep="\t")
    # Literal substring test on the stringified form (NaN becomes "nan"),
    # equivalent to str(value).find(form_type) > -1 but vectorised.
    form_matches = subs["form"].astype(str).str.contains(form_type, regex=False)
    ciks = subs.loc[form_matches, ["adsh","cik","sic","form","period","fye","fp","filed"]]

    sap = txt[txt["tag"] == "SignificantAccountingPoliciesTextBlock"]
    # Name the join key explicitly rather than relying on whichever column
    # names happen to be shared between the two frames.
    sap = sap.merge(ciks, on="adsh")

    # Create the output directory on first use; otherwise to_csv raises and
    # no output is produced for the quarter.
    out_dir = os.path.join(EXT_DIR, "SAP")
    os.makedirs(out_dir, exist_ok=True)
    sap.to_csv(os.path.join(out_dir, period + ".tsv"), sep="\t", index=False)

In [17]:
# Unzip, clean and extract the SAP rows for every quarter of each listed year.
for year in [2010]:
    for quarter in range(1,5):
        unzip_fsands(year, quarter)
        txt_submissions = os.path.join(EXT_DIR, "unzipped/{}q{}/txt.tsv".format(year,quarter))
        try:
            extract_fsands(txt_submissions)
            get_sap(txt_submissions.replace(".tsv","-clean.tsv"), "10-K",year, quarter)
        except Exception as exc:
            # Processing stays best-effort per quarter, but the failure is
            # reported instead of vanishing. Catching Exception rather than
            # using a bare except lets KeyboardInterrupt stop the run.
            print("Skipping {}q{}: {!r}".format(year, quarter, exc))
            continue

/media/reggie/reg_ext/EDGAR/FSANDS/unzipped/2010q1
/media/reggie/reg_ext/EDGAR/FSANDS/unzipped/2010q1/txt.tsv
15580  goods and  685 bads in  /media/reggie/reg_ext/EDGAR/FSANDS/unzipped/2010q1/txt.tsv
/media/reggie/reg_ext/EDGAR/FSANDS/unzipped/2010q2
/media/reggie/reg_ext/EDGAR/FSANDS/unzipped/2010q2/txt.tsv
13869  goods and  687 bads in  /media/reggie/reg_ext/EDGAR/FSANDS/unzipped/2010q2/txt.tsv
/media/reggie/reg_ext/EDGAR/FSANDS/unzipped/2010q3
/media/reggie/reg_ext/EDGAR/FSANDS/unzipped/2010q3/txt.tsv
49341  goods and  1880 bads in  /media/reggie/reg_ext/EDGAR/FSANDS/unzipped/2010q3/txt.tsv
/media/reggie/reg_ext/EDGAR/FSANDS/unzipped/2010q4
/media/reggie/reg_ext/EDGAR/FSANDS/unzipped/2010q4/txt.tsv
54195  goods and  1993 bads in  /media/reggie/reg_ext/EDGAR/FSANDS/unzipped/2010q4/txt.tsv


In [19]:
# Load the 2010 Q1 file from the SAP directory (the location get_sap writes to)
# and inspect it.
sap = pd.read_csv(EXT_DIR+"/SAP/2010q1.tsv", sep="\t", low_memory=False)
print(sap.shape)  # (number of rows, number of columns)
sap.head()  # last expression in the cell, so the first five rows are displayed

(277, 27)


Unnamed: 0,adsh,tag,version,ddate,qtrs,iprx,lang,dcml,durp,datp,...,footlen,context,value,cik,sic,form,period,fye,fp,filed
0,0000029915-10-000024,SignificantAccountingPoliciesTextBlock,us-gaap/2009,20091231,4,0,en-US,32767,0.010959,0.0,...,0,c00004,NOTE A  SUMMARY OF SIGNIFICANT ACCOUNTING POL...,29915,2821,10-K,20091231,1231,FY,20100219
1,0000040545-10-000010,SignificantAccountingPoliciesTextBlock,us-gaap/2009,20091231,4,0,en-US,32767,0.010959,0.0,...,0,FROM_Jan01_2009_TO_Dec31_2009,Notes to Consolidated Financial Statements Not...,40545,3600,10-K,20091231,1231,FY,20100219
2,0000060667-10-000064,SignificantAccountingPoliciesTextBlock,us-gaap/2009,20100131,4,0,en-US,32767,0.021918,2.0,...,0,FROM_Jan31_2009_TO_Jan29_2010,NOTE 1: Summary of Significant Accounting Poli...,60667,5211,10-K/A,20100131,131,FY,20100330
3,0000065984-10-000041,SignificantAccountingPoliciesTextBlock,us-gaap/2009,20091231,4,0,en-US,32767,0.010959,0.0,...,0,c00047,NOTE 1. SUMMARY OF SIGNIFICANT ACCOUNTING POLI...,65984,4911,10-K/A,20091231,1231,FY,20100226
4,0000067716-10-000030,SignificantAccountingPoliciesTextBlock,us-gaap/2009,20091231,4,0,en-US,32767,0.010959,0.0,...,0,c00007,Note 1  Summary of Significant Accounting Pol...,67716,1400,10-K,20091231,1231,FY,20100217


In [22]:
# Print the "txtlen" value of every row of sap, one per line.
# NOTE(review): iterating sap["txtlen"] directly would avoid iterrows(), but
# i and row stay defined after this cell and later cells may rely on them —
# confirm before changing.
for i, row in sap.iterrows():
    print(row["txtlen"])

13496
45981
28921
34895
37297
48811
17289
35829
37532
16846
12323
18520
28104
37099
26809
35498
24943
19112
25788
23884
28363
38648
18871
34143
28094
67136
28710
43714
12632
21165
41010
8829
32107
11385
41504
19448
21995
50088
21401
27089
20754
18432
17037
21876
17256
20656
14729
32531
30232
21988
64968
33757
29874
45219
54498
16393
24421
33168
14913
41010
21692
15677
28358
45686
37578
26368
21521
46595
23830
26648
38727
37117
15910
209
64815
65898
23438
29550
11896
36657
73845
13524
28824
135122
17156
32064
27571
23992
45940
32927
38946
49985
20697
25723
21638
26182
54364
35041
42775
16548
23539
21570
33428
29219
44651
17105
24223
25930
18867
16504
33873
36650
41654
38498
45098
35670
56722
28
33178
13890
22643
13478
22680
66865
30143
15971
57654
39025
29806
18500
18167
22711
43122
71535
24747
16901
24042
50119
31576
19962
25691
43292
17399
46173
28175
39376
24883
9701
34609
30303
33912
7540
33643
35101
32026
11475
27812
19512
10677
42301
17542
29996
29651
30367
30983
16787
25720
25359