EDGAR Files are stored at https://www.sec.gov/Archives/edgar/full-index/

The SEC index files on the EDGAR server are organized as follows:

Year > Quarter > File type (for example, `2011/QTR4/master.gz`).


In [1]:
import gzip
import os
import socket
import urllib
import urllib.error
import urllib.request

import matplotlib
import pandas as pd
%matplotlib inline
EXT_DIR = "/home/reggie/Dropbox/Research/edgar_download/sec-downloads"
EXT_DIR = "/media/reggie/reg_ext/EDGAR/"

In [None]:
## A general file download utility function
def downloadfile( sourceurl, targetfname ):
    """Download ``sourceurl`` into ``targetfname`` unless a local copy exists.

    Parameters
    ----------
    sourceurl : str
        URL to fetch.
    targetfname : str
        Path of the local file to create.

    Returns
    -------
    bool
        True when ``targetfname`` already existed or was downloaded and
        written; False when the download failed (the reason is printed and
        no file is created).
    """
    if os.path.isfile( targetfname ):
        print( "Local copy already exists" )
        return True

    print( "Downloading:", sourceurl )
    try:
        # The whole payload is read before anything is written, so a failed
        # transfer never leaves a partial file behind.
        with urllib.request.urlopen( sourceurl ) as response:
            payload = response.read()
    except urllib.error.HTTPError as e:
        # HTTPError is a subclass of URLError, so it must be handled first.
        print( "HTTP Error:", e.code )
        return False
    except urllib.error.URLError as e:
        print( "URL Error:", e.reason )
        return False
    except ( socket.timeout, TimeoutError ) as e:
        # Timeout exceptions carry no ``reason`` attribute; print the
        # exception itself instead.
        print( "Timeout Error:", e )
        return False

    with open( targetfname, 'wb' ) as output:
        output.write( payload )
    return True

In [30]:
def download_master(year, quarter, storage_path):
    """Download the gzipped EDGAR master index for one quarter.

    Parameters
    ----------
    year : int or str
        Year component of the index URL.
    quarter : int or str
        Quarter component of the index URL (used as ``QTR<quarter>``).
    storage_path : str or os.PathLike
        Destination of the downloaded ``master.gz``.

    Returns
    -------
    int
        0 when a non-empty file was stored at ``storage_path``,
        -1 when the server returned an empty body.

    Raises
    ------
    Whatever ``urllib.request.urlretrieve`` raises (for example
    ``urllib.error.HTTPError``); any partial download is removed first.
    """
    download_url = "https://www.sec.gov/Archives/edgar/full-index/%s/QTR%s/master.gz" % (year, quarter)
    # NOTE(review): sec.gov may reject requests that do not declare a
    # User-Agent header; urlretrieve sends the urllib default - confirm.

    storage_path = os.fspath(storage_path)
    target_dir = os.path.dirname(storage_path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    # Download next to the final location and rename only on success, so an
    # interrupted transfer is never mistaken for a finished download.
    partial_path = storage_path + ".part"
    try:
        urllib.request.urlretrieve(download_url, partial_path)
    except BaseException:
        if os.path.exists(partial_path):
            os.remove(partial_path)
        raise

    if os.path.getsize(partial_path) == 0:
        os.remove(partial_path)
        return -1

    os.replace(partial_path, storage_path)
    return 0

def unzip_master(storage_path, unzipped_storage_path, header_lines=11):
    """Decompress a gzipped master index, dropping its leading header lines.

    Parameters
    ----------
    storage_path : str
        Path of the gzipped input file.
    unzipped_storage_path : str
        Path of the uncompressed output file (overwritten if present).
    header_lines : int, optional
        Number of leading lines (metadata) to skip. Defaults to 11, the
        value previously hard-coded here.

    Returns
    -------
    int
        Total number of lines read from the gzip file, header included.
    """
    line_count = 0  # stays 0 when the archive holds no lines at all
    with gzip.open(storage_path, 'rb') as infile, open(unzipped_storage_path, 'wb') as outfile:
        for line_count, line in enumerate(infile, start=1):
            if line_count > header_lines:  ## skip header information (metadata)
                outfile.write(line)
    return line_count

def master_to_csv(unzipped_storage_path, csv_filepath):
    """Write the 10-K rows of a header-stripped master index to a CSV file.

    The input is read as a pipe-delimited, latin1-encoded table without a
    header row and its five columns are named ``cik``, ``company_name``,
    ``form_type``, ``date_filed`` and ``edgar_url``. A row is kept when its
    form type contains the substring "10-K", so variants such as "10-K405"
    or "10-K/A" are kept too. The result is saved without the row index.

    Returns None.
    """
    index_frame = pd.read_csv(unzipped_storage_path, sep="|", encoding="latin1", header=None)
    index_frame.columns = ["cik", "company_name", "form_type", "date_filed","edgar_url"]

    # str() guards against form types that were parsed as numbers or NaN
    mentions_10k = index_frame["form_type"].map(lambda form: "10-K" in str(form))
    index_frame[mentions_10k].to_csv(csv_filepath, index=False)
    return None

In [None]:
## Download raw gzip file from SEC server
#download_master(year, quarter, storage_path)

## Unzip form file
#unzip_master(storage_path, unzipped_storage_path)

## Convert master file (pipe-delimited) to csv
#master_to_csv(unzipped_storage_path, csv_storage_path)

In [31]:
def run(year, quarter):
    """Build the 10-K CSV for one quarter: download, unzip, then filter.

    Every stage is skipped (with a message) when its output file already
    exists under ``EXT_DIR``, so the function can be re-run safely.

    Parameters
    ----------
    year : int
        Year of the master index to process.
    quarter : int
        Quarter of the master index to process.

    Returns
    -------
    None
    """
    ## Location where raw gzip file of submissions (master) from SEC server will be stored
    storage_path = os.path.join(EXT_DIR, "master/zipped/master-%s-QTR%s.gz" % (year, quarter))

    ## Location where unzipped "master" file will be stored
    unzipped_storage_path = os.path.join(EXT_DIR, "master/unzipped/master-%s-QTR%s" % (year, quarter))

    ## Location where CSV file of records of 10-K submissions will be stored
    csv_storage_path =  os.path.join(EXT_DIR, "master/csv/master-{}-QTR{}-10K.csv".format(year, quarter))

    ## Make sure the three output folders exist before any stage writes to them
    for output_path in (storage_path, unzipped_storage_path, csv_storage_path):
        os.makedirs(os.path.dirname(output_path), exist_ok=True)

    if not os.path.isfile(storage_path):
        ## Download raw gzip file from SEC server
        if download_master(year, quarter, storage_path) != 0:
            ## Nothing to unzip or convert without the raw index
            print("Download failed for", year, "QTR", quarter)
            return None
    else:
        print(storage_path, " already exists")

    if not os.path.isfile(unzipped_storage_path):
        ## Unzip "master" file
        unzip_master(storage_path, unzipped_storage_path)
    else:
        print(unzipped_storage_path, " already exists")

    if not os.path.isfile(csv_storage_path):
        ## Convert "master" (pipe-delimited file) to csv
        master_to_csv(unzipped_storage_path, csv_storage_path)
    else:
        print(csv_storage_path, " already exists")

    return None


In [29]:
## Inspect one unzipped master index (2011 Q4) to see how it parses
year, quarter = 2011, 4

## Raw gzip file of submissions (master) as downloaded from the SEC server
storage_path = os.path.join(
    EXT_DIR, "master/zipped/master-%s-QTR%s.gz" % (year, quarter)
)

## Unzipped copy of the same "master" file
unzipped_storage_path = os.path.join(
    EXT_DIR, "master/unzipped/master-%s-QTR%s" % (year, quarter)
)

## CSV file holding the quarter's 10-K submission records
csv_storage_path = os.path.join(
    EXT_DIR, "master/csv/master-{}-QTR{}-10K.csv".format(year, quarter)
)

## The file has no header row, so the columns come back numbered 0-4
x = pd.read_csv(
    unzipped_storage_path,
    sep="|",
    encoding="latin1",
    header=None,
)
x

Unnamed: 0,0,1,2,3,4
0,1000032,BINCH JAMES G,4,2011-12-02,edgar/data/1000032/0001181431-11-058482.txt
1,1000045,NICHOLAS FINANCIAL INC,10-Q,2011-11-09,edgar/data/1000045/0001193125-11-303673.txt
2,1000045,NICHOLAS FINANCIAL INC,4,2011-11-14,edgar/data/1000045/0001000045-11-000011.txt
3,1000045,NICHOLAS FINANCIAL INC,8-K,2011-10-27,edgar/data/1000045/0001193125-11-283263.txt
4,1000069,"EMPIRIC FUNDS, INC",24F-2NT,2011-12-14,edgar/data/1000069/0000894189-11-005614.txt
...,...,...,...,...,...
202624,9984,BARNES GROUP INC,4,2011-12-13,edgar/data/9984/0000009984-11-000114.txt
202625,9984,BARNES GROUP INC,4,2011-12-13,edgar/data/9984/0000009984-11-000115.txt
202626,9984,BARNES GROUP INC,8-K,2011-10-28,edgar/data/9984/0000009984-11-000101.txt
202627,9984,BARNES GROUP INC,8-K,2011-11-17,edgar/data/9984/0000009984-11-000110.txt


In [32]:
## Process every quarter from 1995 Q1 through 2018 Q4, echoing each one when done
for year in range(1995, 2019):
    for quarter in (1, 2, 3, 4):
        run(year, quarter)
        print(year, quarter)

/media/reggie/reg_ext/EDGAR/master/zipped/master-1995-QTR1.gz  already exists
/media/reggie/reg_ext/EDGAR/master/unzipped/master-1995-QTR1  already exists
/media/reggie/reg_ext/EDGAR/master/csv/master-1995-QTR1-10K.csv  already exists
1995 1
/media/reggie/reg_ext/EDGAR/master/zipped/master-1995-QTR2.gz  already exists
/media/reggie/reg_ext/EDGAR/master/unzipped/master-1995-QTR2  already exists
/media/reggie/reg_ext/EDGAR/master/csv/master-1995-QTR2-10K.csv  already exists
1995 2
/media/reggie/reg_ext/EDGAR/master/zipped/master-1995-QTR3.gz  already exists
/media/reggie/reg_ext/EDGAR/master/unzipped/master-1995-QTR3  already exists
/media/reggie/reg_ext/EDGAR/master/csv/master-1995-QTR3-10K.csv  already exists
1995 3
/media/reggie/reg_ext/EDGAR/master/zipped/master-1995-QTR4.gz  already exists
/media/reggie/reg_ext/EDGAR/master/unzipped/master-1995-QTR4  already exists
/media/reggie/reg_ext/EDGAR/master/csv/master-1995-QTR4-10K.csv  already exists
1995 4
/media/reggie/reg_ext/EDGAR/mast

2011 4
2012 1
2012 2
2012 3
2012 4
2013 1
2013 2
2013 3
2013 4
2014 1
2014 2
2014 3
2014 4
2015 1
2015 2
2015 3
2015 4
2016 1
2016 2
2016 3
2016 4
2017 1
2017 2
2017 3
2017 4
2018 1
2018 2
2018 3
2018 4


In [2]:
## Filings by quarter
## Combine every per-quarter 10-K CSV under EXT_DIR/master/csv into one frame.
csv_dir = os.path.join(EXT_DIR, "master/csv")

## Read all files first and concatenate once: DataFrame.append was removed in
## pandas 2.0 and growing a frame inside a loop copies it on every iteration.
## Sorting makes the row order reproducible (os.listdir order is arbitrary).
quarterly_frames = [
    pd.read_csv(os.path.join(csv_dir, csv_name))
    for csv_name in sorted(os.listdir(csv_dir))
    if csv_name.endswith(".csv")
]

if quarterly_frames:
    ## Row labels restart at 0 for every file, as they did with append()
    filings = pd.concat(quarterly_frames)
else:
    ## No CSVs yet: keep an empty frame with the expected columns
    filings = pd.DataFrame(columns=["cik", "company_name", "form_type", "date_filed","edgar_url"])

print(filings.shape)
filings.head()

(289399, 5)


Unnamed: 0,cik,company_name,form_type,date_filed,edgar_url
0,100030,TRW INC,10-K405,1995-03-24,edgar/data/100030/0000950152-95-000415.txt
1,100122,TUCSON ELECTRIC POWER CO,10-K,1995-03-09,edgar/data/100122/0000100122-95-000007.txt
2,10012,BARNETT BANKS INC,10-K/A,1995-03-22,edgar/data/10012/0000912057-95-001566.txt
3,10012,BARNETT BANKS INC,10-K,1995-02-03,edgar/data/10012/0000912057-95-000305.txt
4,100166,TULTEX CORP,10-K,1995-03-31,edgar/data/100166/0000100166-95-000031.txt


In [3]:
## Turn each archive-relative path into an absolute EDGAR URL
edgar_url_base = "https://www.sec.gov/Archives/"
filings["edgar_url_full"] = [
    edgar_url_base + str(relative_path)
    for relative_path in filings["edgar_url"]
]
filings.head()

Unnamed: 0,cik,company_name,form_type,date_filed,edgar_url,edgar_url_full
0,100030,TRW INC,10-K405,1995-03-24,edgar/data/100030/0000950152-95-000415.txt,https://www.sec.gov/Archives/edgar/data/100030...
1,100122,TUCSON ELECTRIC POWER CO,10-K,1995-03-09,edgar/data/100122/0000100122-95-000007.txt,https://www.sec.gov/Archives/edgar/data/100122...
2,10012,BARNETT BANKS INC,10-K/A,1995-03-22,edgar/data/10012/0000912057-95-001566.txt,https://www.sec.gov/Archives/edgar/data/10012/...
3,10012,BARNETT BANKS INC,10-K,1995-02-03,edgar/data/10012/0000912057-95-000305.txt,https://www.sec.gov/Archives/edgar/data/10012/...
4,100166,TULTEX CORP,10-K,1995-03-31,edgar/data/100166/0000100166-95-000031.txt,https://www.sec.gov/Archives/edgar/data/100166...


In [16]:
# Save the combined 10-K filings table to disk. to_csv is called without
# index=False, so the row labels are written as an unnamed first column.
# NOTE(review): the path is relative to the notebook's working directory and
# ../data is presumably expected to exist already - confirm.
filings.to_csv("../data/master-filings-10k.csv")

In [15]:
## Fetch the first filing in the table as a smoke test of the full URLs;
## the local file name is the last path component of its EDGAR path
first_filing = filings.iloc[0]
storage_path_10k = first_filing["edgar_url"].rsplit("/", 1)[-1]
req = urllib.request.urlretrieve(
    first_filing["edgar_url_full"],
    "../temp/" + storage_path_10k,
)