In [1]:
import feedparser
import os.path
import sys, getopt
import time
import socket
from urllib.request import urlopen
from urllib.error import URLError, HTTPError
import xml.etree.ElementTree as ET
import zipfile
import zlib

In [2]:
# Base directory prefix under which the downloaded XBRL archives are stored.
# NOTE(review): the second assignment overrides the first, so only the /home
# path takes effect; the /media path is presumably an alternative (external
# drive) location kept for manual switching - confirm before removing it.
EXT_DIR = "/media/reggie/reg_ext/EDGAR/xbrl/"
EXT_DIR = "/home/reggie/EDGAR/xbrl/"

## Download XBRL Filings
We first download EDGAR's monthly XBRL RSS feed: an XML file that lists every XBRL filing submitted to EDGAR in a given month. The files of each matching filing are then downloaded from the locations given in that feed.

In [3]:
## A general file download utility function
def downloadfile( sourceurl, targetfname ):
    """Download ``sourceurl`` to the local file ``targetfname``.

    An existing ``targetfname`` is treated as a cached copy and nothing is
    downloaded. Network failures are reported on stdout rather than raised.

    Parameters:
        sourceurl:   URL to fetch.
        targetfname: path of the local file to create.

    Returns:
        True if the file already existed or was downloaded and written,
        False if the download failed.

    Raises:
        OSError: if the downloaded data cannot be written to disk.
    """
    if os.path.isfile( targetfname ):
        print( "Local copy already exists" )
        return True

    print( "Downloading:", sourceurl )
    try:
        with urlopen( sourceurl ) as response:
            payload = response.read()
    except HTTPError as e:
        # HTTPError is a subclass of URLError, so it has to be handled first.
        print( "HTTP Error:", e.code )
        return False
    except URLError as e:
        print( "URL Error:", e.reason )
        return False
    except TimeoutError as e:
        # TimeoutError has no ``reason`` attribute; print the exception itself.
        print( "Timeout Error:", e )
        return False
    except socket.timeout:
        # Only distinct from TimeoutError on older Python versions.
        print( "Socket Timeout Error" )
        return False

    # Write to a scratch name and rename afterwards: the existence of
    # ``targetfname`` is what marks a download as complete, so a half-written
    # file must never appear under that name.
    partial_fname = targetfname + ".part"
    try:
        with open( partial_fname, 'wb' ) as output:
            output.write( payload )
        os.replace( partial_fname, targetfname )
    finally:
        # After a successful rename this is a no-op.
        if os.path.exists( partial_fname ):
            os.remove( partial_fname )
    return True

In [4]:
def download_xbrl_zipfile(item, enclosures, target_dir):
    """Fetch the ZIP enclosure of a feed entry into ``target_dir``.

    The URL is taken from the "href" of the first element of ``enclosures``;
    the local file is named ``<cik>-<last URL path segment>`` and is appended
    directly to ``target_dir`` (no separator is inserted). The download is
    attempted up to three times, printing the countdown value after every
    failed attempt.
    """
    sourceurl = enclosures[0][ "href" ]
    cik = item[ "edgar_ciknumber" ]
    zip_name = sourceurl.rsplit('/', 1)[-1]
    targetfname = target_dir + cik + '-' + zip_name

    for attempts_left in range(3, 0, -1):
        if downloadfile( sourceurl, targetfname ):
            return
        print( "Retrying:", attempts_left )

In [5]:
def download_xbrl_manually(root, item, target_dir, itemIndex):
    """Download a filing's individual XBRL files and bundle them into a ZIP.

    Used for feed entries without a ZIP enclosure. Every ``edgar:xbrlFile``
    of the entry whose URL ends in ``.xml`` or ``.xsd`` is downloaded (up to
    three attempts each) and stored in
    ``<target_dir><cik>-<link basename>-xbrl.zip``. Nothing is done when that
    archive already exists.

    The archive is built under a scratch name and only renamed to its final
    name once every file has been added, so a failed run never leaves an
    incomplete archive that a later run would mistake for a finished one.

    Parameters:
        root:       ElementTree root of the RSS feed, or None if the feed
                    could not be parsed (the filing is then skipped).
        item:       feed entry providing "link" and "edgar_ciknumber".
        target_dir: directory prefix for the archive; file names are appended
                    to it directly, so it must end with a path separator.
        itemIndex:  position of the entry among the ``item`` elements of
                    ``root``.
    """
    import tempfile  # local import: not part of the notebook's import cell

    linkname = item[ "link" ].split('/')[-1]
    linkbase = os.path.splitext(linkname)[0]
    cik = item[ "edgar_ciknumber" ]
    zipfname = target_dir+cik+'-'+linkbase+"-xbrl.zip"
    if os.path.isfile( zipfname ):
        return
    if root is None:
        print( "Cannot download XBRL files manually: RSS feed XML was not parsed" )
        return

    edgarNamespace = {'edgar': 'http://www.sec.gov/Archives/edgar'}
    currentItem = list(root.iter( "item" ))[itemIndex]
    xbrlFiling = currentItem.find( "edgar:xbrlFiling", edgarNamespace )
    xbrlFilesItem = xbrlFiling.find( "edgar:xbrlFiles", edgarNamespace )
    xbrlFiles = xbrlFilesItem.findall( "edgar:xbrlFile", edgarNamespace )
    candidate_urls = [ xf.get( "{http://www.sec.gov/Archives/edgar}url" ) for xf in xbrlFiles ]
    xbrl_urls = [ url for url in candidate_urls if url and url.endswith( (".xml",".xsd") ) ]

    if target_dir:
        os.makedirs( target_dir, exist_ok=True )
    partial_zip = zipfname + ".part"
    archived = False
    try:
        # The scratch directory is removed automatically, even on errors.
        with tempfile.TemporaryDirectory( dir=target_dir or None ) as tempdir, \
                zipfile.ZipFile( partial_zip, "w" ) as zf:
            for xfurl in xbrl_urls:
                member_name = xfurl.split('/')[-1]
                targetfname = os.path.join( tempdir, member_name )
                good_read = False
                for attempts_left in range(3, 0, -1):
                    good_read = downloadfile( xfurl, targetfname )
                    if good_read:
                        break
                    print( "Retrying:", attempts_left )
                if not good_read:
                    # There is no file to add; abandon the whole archive so
                    # that a later run starts this filing from scratch.
                    print( "Giving up on", xfurl, "- archive not created" )
                    break
                zf.write( targetfname, member_name, zipfile.ZIP_DEFLATED )
                os.remove( targetfname )
            else:
                archived = True
        if archived:
            os.replace( partial_zip, zipfname )
    finally:
        # After a successful rename the scratch archive no longer exists.
        if os.path.exists( partial_zip ):
            os.remove( partial_zip )

In [6]:
def sec_download(year, month, form_type="10-K"):
    """Download the XBRL filings of one form type listed in a monthly EDGAR feed.

    Fetches the XBRL RSS feed for ``year``/``month``, prints a line for every
    entry, and downloads each entry whose summary contains ``form_type`` into
    ``EXT_DIR/<year>/<month>/``. Entries with a ZIP enclosure are downloaded
    directly; for the others the individual XBRL files are downloaded and
    zipped locally.

    Parameters:
        year:      calendar year of the feed.
        month:     calendar month of the feed (zero-padded to two digits).
        form_type: text searched for in each entry's summary. This is a
                   substring match, so "10-K" also selects e.g. "10-K/A".

    Feed download errors are reported on stdout and end the call early,
    in the same way ``downloadfile`` reports them.
    """
    month_str = str(month).zfill(2)
    edgarFilingsFeed = 'http://www.sec.gov/Archives/edgar/monthly/xbrlrss-' + str(year) + '-' + month_str + '.xml'

    # The download helpers append file names to target_dir by plain string
    # concatenation, so the trailing separator (empty last component) matters.
    target_dir = os.path.join( EXT_DIR, str(year), month_str, '' )
    os.makedirs( target_dir, exist_ok=True )

    try:
        with urlopen( edgarFilingsFeed ) as feedFile:
            feedData = feedFile.read()
    except HTTPError as e:
        print( "HTTP Error:", e.code )
        return
    except URLError as e:
        print( "URL Error:", e.reason )
        return

    # we have to unfortunately use both feedparser (for normal cases) and ET for old-style RSS feeds,
    # because feedparser cannot handle the case where multiple xbrlFiles are referenced without enclosure
    root = None
    try:
        root = ET.fromstring(feedData)
    except ET.ParseError as perr:
        print( "XML Parser Error:", perr )
    feed = feedparser.parse( feedData )
    print( feed[ "channel" ][ "title" ] )

    # Process RSS feed and walk through all items contained.
    # itemIndex locates the same entry among the <item> elements of root.
    for itemIndex, item in enumerate( feed.entries ):
        try:
            print( item[ "summary" ], item[ "title" ], item[ "published" ] )

            # Identify ZIP file enclosure, if available
            enclosures = [ l for l in item[ "links" ] if l[ "rel" ] == "enclosure" ]
            if form_type in item[ "summary" ]:
                if enclosures:
                    # ZIP file enclosure exists, so we can just download the ZIP file
                    download_xbrl_zipfile(item, enclosures, target_dir)
                elif root is None:
                    # Without the parsed XML the individual file URLs are unknown.
                    print( "Skipping manual download: feed XML could not be parsed" )
                else:
                    # We need to manually download all XBRL files here and ZIP them ourselves...
                    download_xbrl_manually(root, item, target_dir, itemIndex)
        except KeyError as e:
            print( "Key Error:", e )
        finally:
            print( "----------" )

In [9]:
# EDGAR publishes one XBRL RSS feed per calendar month and sec_download()
# takes a month number, so iterate over all twelve months of each year.
for year in range(2009,2011):
    for month in range(1,13):
        sec_download(year, month, "10-K")

All XBRL Data Submitted to the SEC for 2009-01
8-K BOWNE & CO INC (0000013610) (Filer) Fri, 30 Jan 2009 12:31:08 EST
----------
8-K AMERICAN INTERNATIONAL GROUP INC (0000005272) (Filer) Fri, 23 Jan 2009 15:06:42 EST
----------
8-K/A AMERICAN ELECTRIC POWER CO INC (0000004904) (Filer) Thu, 22 Jan 2009 18:15:07 EST
----------
8-K FORD MOTOR CO (0000037996) (Filer) Fri, 16 Jan 2009 14:00:48 EST
----------
8-K PITNEY BOWES INC /DE/ (0000078814) (Filer) Mon, 12 Jan 2009 18:17:12 EST
----------
8-K Philip Morris International Inc. (0001413329) (Filer) Fri, 09 Jan 2009 12:06:59 EST
----------
8-K NORTHROP GRUMMAN CORP /DE/ (0001133421) (Filer) Fri, 09 Jan 2009 11:59:42 EST
----------
All XBRL Data Submitted to the SEC for 2009-02
8-K SANDRIDGE ENERGY INC (0001349436) (Filer) Fri, 27 Feb 2009 17:25:38 EST
----------
8-K OMNICOM GROUP INC (0000029989) (Filer) Fri, 27 Feb 2009 17:09:09 EST
----------
8-K RR Donnelley & Sons Co (0000029669) (Filer) Fri, 27 Feb 2009 17:00:42 EST
----------
8-K XER

Downloading: http://www.sec.gov/Archives/edgar/data/101829/000119312509024624/utx-20081231_pre.xml
----------
8-K GENERAL MILLS INC (0000040704) (Filer) Tue, 10 Feb 2009 17:18:26 EST
----------
8-K FLEXTRONICS INTERNATIONAL LTD. (0000866374) (Filer) Tue, 10 Feb 2009 17:04:26 EST
----------
8-K ADOBE SYSTEMS INC (0000796343) (Filer) Mon, 09 Feb 2009 14:56:49 EST
----------
8-K AUTOMATIC DATA PROCESSING INC (0000008670) (Filer) Mon, 09 Feb 2009 11:29:22 EST
----------
8-K AGL RESOURCES INC (0001004155) (Filer) Thu, 05 Feb 2009 16:29:06 EST
----------
All XBRL Data Submitted to the SEC for 2009-03
8-K INTERNATIONAL PAPER CO /NEW/ (0000051434) (Filer) Mon, 30 Mar 2009 16:23:39 EDT
----------
8-K AIR PRODUCTS & CHEMICALS INC /DE/ (0000002969) (Filer) Mon, 30 Mar 2009 12:19:58 EDT
----------
8-K AES CORP (0000874761) (Filer) Fri, 27 Mar 2009 17:01:26 EDT
----------
8-K INTERNATIONAL BUSINESS MACHINES CORP (0000051143) (Filer) Fri, 27 Mar 2009 12:38:29 EDT
----------
8-K Western Union CO (000

----------
10-Q APPLE INC (0000320193) (Filer) Mon, 25 Jan 2010 16:23:57 EST
----------
10-K ADOBE SYSTEMS INC (0000796343) (Filer) Fri, 22 Jan 2010 12:14:18 EST
Downloading: http://www.sec.gov/Archives/edgar/data/796343/000079634310000003/0000796343-10-000003-xbrl.zip
----------
10-K/A BECTON DICKINSON & CO (0000010795) (Filer) Thu, 21 Jan 2010 17:05:45 EST
Downloading: http://www.sec.gov/Archives/edgar/data/10795/000095012310004150/0000950123-10-004150-xbrl.zip
----------
10-Q SUPERVALU INC (0000095521) (Filer) Wed, 13 Jan 2010 18:27:34 EST
----------
10-Q MICRON TECHNOLOGY INC (0000723125) (Filer) Tue, 12 Jan 2010 14:40:38 EST
----------
10-Q MONSANTO CO /NEW/ (0001110783) (Filer) Fri, 08 Jan 2010 16:16:25 EST
----------
10-Q BEST BUY CO INC (0000764478) (Filer) Thu, 07 Jan 2010 16:47:21 EST
----------
10-Q APOLLO GROUP INC (0000929887) (Filer) Thu, 07 Jan 2010 16:08:53 EST
----------
10-Q NIKE INC (0000320187) (Filer) Wed, 06 Jan 2010 17:17:10 EST
----------
10-Q BED BATH & BEYOND 

----------
10-K EATON CORP (0000031277) (Filer) Fri, 26 Feb 2010 16:27:49 EST
Downloading: http://www.sec.gov/Archives/edgar/data/31277/000095012310018180/0000950123-10-018180-xbrl.zip
----------
10-K ROPER INDUSTRIES INC (0000882835) (Filer) Fri, 26 Feb 2010 16:27:31 EST
Downloading: http://www.sec.gov/Archives/edgar/data/882835/000088283510000009/0000882835-10-000009-xbrl.zip
----------
10-K EXPEDITORS INTERNATIONAL OF WASHINGTON INC (0000746515) (Filer) Fri, 26 Feb 2010 16:26:57 EST
Downloading: http://www.sec.gov/Archives/edgar/data/746515/000119312510042974/0001193125-10-042974-xbrl.zip
----------
10-K PIONEER NATURAL RESOURCES CO (0001038357) (Filer) Fri, 26 Feb 2010 16:26:04 EST
Downloading: http://www.sec.gov/Archives/edgar/data/1038357/000119312510042964/0001193125-10-042964-xbrl.zip
----------
10-K SLM CORP (0001032033) (Filer) Fri, 26 Feb 2010 16:26:01 EST
Downloading: http://www.sec.gov/Archives/edgar/data/1032033/000095012310018176/0000950123-10-018176-xbrl.zip
----------


----------
10-K Invesco Ltd. (0000914208) (Filer) Fri, 26 Feb 2010 14:16:44 EST
Downloading: http://www.sec.gov/Archives/edgar/data/914208/000095012310017949/0000950123-10-017949-xbrl.zip
----------
10-K CIMAREX ENERGY CO (0001168054) (Filer) Fri, 26 Feb 2010 14:07:29 EST
Downloading: http://www.sec.gov/Archives/edgar/data/1168054/000104746910001512/0001047469-10-001512-xbrl.zip
----------
10-K SPRINT NEXTEL CORP (0000101830) (Filer) Fri, 26 Feb 2010 14:05:08 EST
Downloading: http://www.sec.gov/Archives/edgar/data/101830/000119312510042491/0001193125-10-042491-xbrl.zip
----------
10-K NEWFIELD EXPLORATION CO /DE/ (0000912750) (Filer) Fri, 26 Feb 2010 13:51:10 EST
Downloading: http://www.sec.gov/Archives/edgar/data/912750/000095012310017899/0000950123-10-017899-xbrl.zip
----------
10-K PFIZER INC (0000078003) (Filer) Fri, 26 Feb 2010 13:43:14 EST
Downloading: http://www.sec.gov/Archives/edgar/data/78003/000119312510042425/0001193125-10-042425-xbrl.zip
----------
10-K WELLS FARGO & CO/MN

----------
10-K ASSURANT INC (0001267238) (Filer) Thu, 25 Feb 2010 17:27:16 EST
Downloading: http://www.sec.gov/Archives/edgar/data/1267238/000119312510040943/0001193125-10-040943-xbrl.zip
----------
10-K SOUTHWESTERN ENERGY CO (0000007332) (Filer) Thu, 25 Feb 2010 17:26:00 EST
Downloading: http://www.sec.gov/Archives/edgar/data/7332/000000733210000005/0000007332-10-000005-xbrl.zip
----------
10-K ZIMMER HOLDINGS INC (0001136869) (Filer) Thu, 25 Feb 2010 17:24:52 EST
Downloading: http://www.sec.gov/Archives/edgar/data/1136869/000095012310017177/0000950123-10-017177-xbrl.zip
----------
10-K STARWOOD HOTEL & RESORTS WORLDWIDE INC (0000316206) (Filer) Thu, 25 Feb 2010 17:21:35 EST
Downloading: http://www.sec.gov/Archives/edgar/data/316206/000095012310017170/0000950123-10-017170-xbrl.zip
----------
10-K AT&T INC. (0000732717) (Filer) Thu, 25 Feb 2010 17:19:20 EST
Downloading: http://www.sec.gov/Archives/edgar/data/732717/000073271710000013/0000732717-10-000013-xbrl.zip
----------
10-K COLG

----------
10-K EQUITY RESIDENTIAL (0000906107) (Filer) Thu, 25 Feb 2010 12:18:37 EST
Downloading: http://www.sec.gov/Archives/edgar/data/906107/000119312510040142/0001193125-10-040142-xbrl.zip
----------
10-K TEXTRON INC (0000217346) (Filer) Thu, 25 Feb 2010 12:02:30 EST
Downloading: http://www.sec.gov/Archives/edgar/data/217346/000095012310016801/0000950123-10-016801-xbrl.zip
----------
10-K KRAFT FOODS INC (0001103982) (Filer) Thu, 25 Feb 2010 11:55:07 EST
Downloading: http://www.sec.gov/Archives/edgar/data/1103982/000119312510040106/0001193125-10-040106-xbrl.zip
----------
10-K AMETEK INC/ (0001037868) (Filer) Thu, 25 Feb 2010 11:30:13 EST
Downloading: http://www.sec.gov/Archives/edgar/data/1037868/000095012310016787/0000950123-10-016787-xbrl.zip
----------
10-K SIMON PROPERTY GROUP INC /DE/ (0001063761) (Filer) Thu, 25 Feb 2010 11:12:02 EST
Downloading: http://www.sec.gov/Archives/edgar/data/1063761/000104746910001324/0001047469-10-001324-xbrl.zip
----------
10-K GRAINGER W W INC 

----------
10-K DTE ENERGY CO (0000936340) (Filer) Tue, 23 Feb 2010 17:23:23 EST
Downloading: http://www.sec.gov/Archives/edgar/data/936340/000095012310015829/0000950123-10-015829-xbrl.zip
----------
10-K NRG ENERGY, INC. (0001013871) (Filer) Tue, 23 Feb 2010 17:17:24 EST
Downloading: http://www.sec.gov/Archives/edgar/data/1013871/000095012310015824/0000950123-10-015824-xbrl.zip
----------
10-K PAPA JOHNS INTERNATIONAL INC (0000901491) (Filer) Tue, 23 Feb 2010 17:16:07 EST
Downloading: http://www.sec.gov/Archives/edgar/data/901491/000110465910008891/0001104659-10-008891-xbrl.zip
----------
10-K MEDCO HEALTH SOLUTIONS INC (0001170650) (Filer) Tue, 23 Feb 2010 17:15:01 EST
Downloading: http://www.sec.gov/Archives/edgar/data/1170650/000095012310015821/0000950123-10-015821-xbrl.zip
----------
10-K AMEDISYS INC (0000896262) (Filer) Tue, 23 Feb 2010 16:35:37 EST
Downloading: http://www.sec.gov/Archives/edgar/data/896262/000119312510037640/0001193125-10-037640-xbrl.zip
----------
10-K HARTFOR

----------
10-K VENTAS INC (0000740260) (Filer) Fri, 19 Feb 2010 16:32:37 EST
Downloading: http://www.sec.gov/Archives/edgar/data/740260/000119312510035246/0001193125-10-035246-xbrl.zip
----------
10-K ABBOTT LABORATORIES (0000001800) (Filer) Fri, 19 Feb 2010 16:05:52 EST
Downloading: http://www.sec.gov/Archives/edgar/data/1800/000104746910001018/0001047469-10-001018-xbrl.zip
----------
10-K BRISTOL MYERS SQUIBB CO (0000014272) (Filer) Fri, 19 Feb 2010 15:38:21 EST
Downloading: http://www.sec.gov/Archives/edgar/data/14272/000119312510035167/0001193125-10-035167-xbrl.zip
----------
10-K Cooper Industries plc (0001141982) (Filer) Fri, 19 Feb 2010 15:33:58 EST
Downloading: http://www.sec.gov/Archives/edgar/data/1141982/000095012310014528/0000950123-10-014528-xbrl.zip
----------
10-K PATTERSON UTI ENERGY INC (0000889900) (Filer) Fri, 19 Feb 2010 15:09:48 EST
Downloading: http://www.sec.gov/Archives/edgar/data/889900/000095012310014515/0000950123-10-014515-xbrl.zip
----------
10-K ICU MEDIC

Downloading: http://www.sec.gov/Archives/edgar/data/62996/000095012310013437/mas-20091231.xsd
Downloading: http://www.sec.gov/Archives/edgar/data/62996/000095012310013437/mas-20091231_cal.xml
Downloading: http://www.sec.gov/Archives/edgar/data/62996/000095012310013437/mas-20091231_lab.xml
Downloading: http://www.sec.gov/Archives/edgar/data/62996/000095012310013437/mas-20091231_pre.xml
Downloading: http://www.sec.gov/Archives/edgar/data/62996/000095012310013437/mas-20091231_def.xml
----------
10-K QWEST COMMUNICATIONS INTERNATIONAL INC (0001037949) (Filer) Tue, 16 Feb 2010 17:06:09 EST
Downloading: http://www.sec.gov/Archives/edgar/data/1037949/000119312510032428/0001193125-10-032428-xbrl.zip
----------
10-K MOTOROLA INC (0000068505) (Filer) Tue, 16 Feb 2010 16:45:44 EST
Downloading: http://www.sec.gov/Archives/edgar/data/68505/000104746910000905/0001047469-10-000905-xbrl.zip
----------
10-K GOODRICH CORP (0000042542) (Filer) Tue, 16 Feb 2010 16:10:01 EST
Downloading: http://www.sec.gov

----------
N-CSR PROFESSIONALLY MANAGED PORTFOLIOS (0000811030) (Filer) Fri, 05 Feb 2010 15:27:05 EST
----------
10-K EXELON GENERATION CO LLC (0001168165) (Filer) Fri, 05 Feb 2010 15:16:34 EST
Downloading: http://www.sec.gov/Archives/edgar/data/1168165/000119312510023280/0001193125-10-023280-xbrl.zip
----------
10-Q PRECISION CASTPARTS CORP (0000079958) (Filer) Fri, 05 Feb 2010 15:00:19 EST
----------
10-Q SPDR GOLD TRUST (0001222333) (Filer) Fri, 05 Feb 2010 14:59:58 EST
----------
10-Q AMERISOURCEBERGEN CORP (0001140859) (Filer) Fri, 05 Feb 2010 14:55:07 EST
----------
10-K SCHLUMBERGER LTD /NV/ (0000087347) (Filer) Fri, 05 Feb 2010 12:31:59 EST
Downloading: http://www.sec.gov/Archives/edgar/data/87347/000119312510023041/0001193125-10-023041-xbrl.zip
----------
10-K PRICE T ROWE GROUP INC (0001113169) (Filer) Fri, 05 Feb 2010 07:27:34 EST
Downloading: http://www.sec.gov/Archives/edgar/data/1113169/000095012310009025/0000950123-10-009025-xbrl.zip
----------
10-Q PARKER HANNIFIN CORP 

All XBRL Data Submitted to the SEC for 2010-04
10-Q/A MASSEY ENERGY CO (0000037748) (Filer) Fri, 30 Apr 2010 17:14:25 EDT
----------
10-Q KINDER MORGAN ENERGY PARTNERS L P (0000888228) (Filer) Fri, 30 Apr 2010 17:03:02 EDT
----------
10-Q XCEL ENERGY INC (0000072903) (Filer) Fri, 30 Apr 2010 16:49:26 EDT
----------
10-Q AVNET INC (0000008858) (Filer) Fri, 30 Apr 2010 16:41:54 EDT
----------
10-Q Covidien plc (0001385187) (Filer) Fri, 30 Apr 2010 16:36:51 EDT
----------
10-Q SAFEWAY INC (0000086144) (Filer) Fri, 30 Apr 2010 16:35:27 EDT
----------
10-Q AMERICAN ELECTRIC POWER CO INC (0000004904) (Filer) Fri, 30 Apr 2010 16:15:15 EDT
----------
10-Q VENTAS INC (0000740260) (Filer) Fri, 30 Apr 2010 16:04:57 EDT
----------
10-Q GENWORTH FINANCIAL INC (0001276520) (Filer) Fri, 30 Apr 2010 16:04:11 EDT
----------
10-Q CORNING INC /NY (0000024741) (Filer) Fri, 30 Apr 2010 16:00:53 EDT
----------
10-Q PROCTER & GAMBLE CO (0000080424) (Filer) Fri, 30 Apr 2010 16:00:50 EDT
----------
10-Q TEXAS 