In [1]:
import re
import requests
import unicodedata # normalize the text
from bs4 import BeautifulSoup

In [2]:
# The text from each page of the document will be messy, so let's normalize it.
# We can't rely on the unicodedata lib alone: there are also Windows-1252 characters that cause issues.
# This helper, adapted from Stack Overflow, normalizes the remaining portions of the data.
def restore_windows_1252_characters(restore_string):
    """Replace C1 control characters with their Windows-1252 equivalents.

    Each character in the range U+0080-U+009F is re-read as the single byte
    with the same value and decoded as Windows-1252 (e.g. U+0093 becomes the
    left curly quote U+201C). Code points for which Windows-1252 defines no
    character (0x81, 0x8D, 0x8F, 0x90, 0x9D) are removed.

    Args:
        restore_string: The text to clean up.

    Returns:
        A copy of ``restore_string`` with every C1 control character either
        replaced or dropped; all other characters are left unchanged.
    """
    def to_windows_1252(match):
        try:
            return bytes([ord(match.group(0))]).decode('windows-1252')
        except UnicodeDecodeError:
            # no char at corresponding code point, remove it
            return ''

    # The C1 block runs through U+009F; stopping at U+0099 would leave
    # 0x9A-0x9F (which includes U+0153 'oe' ligature and U+0178) unrestored.
    return re.sub(r'[\u0080-\u009f]', to_windows_1252, restore_string)

In [3]:
# define a url
new_html_file = r"https://www.sec.gov/Archives/edgar/data/1166036/000110465904027382/0001104659-04-027382.txt"

# SEC's EDGAR fair-access guidance asks automated clients to declare a
# User-Agent (name + contact e-mail); requests without one may be refused.
# The value below is a placeholder -- replace it with your own details.
sec_request_headers = {"User-Agent": "Sample Company Name AdminContact@example.com"}

# grab response; fail fast on an HTTP error status so that an error page
# is never parsed as if it were the filing
response = requests.get(new_html_file, headers=sec_request_headers)
response.raise_for_status()

# parse the response
soup = BeautifulSoup(response.content, 'lxml')

In [4]:
# soup

In [5]:
# unique key for this filing (its accession number)
accession_number = '0001104659-04-027382'

# master dictionary housing all filings: one entry per accession number,
# holding an (initially empty) sec-header section and a placeholder
# for the filing documents
master_filings_dict = {
    accession_number: {
        'sec_header_content': {},
        'filing_documents': None,
    },
}

In [6]:
# locate the first <sec-header> element in the parsed filing and display it
sec_header_tag = soup.find(name='sec-header')
sec_header_tag

<sec-header>0001104659-04-027382.hdr.sgml : 20040913
<acceptance-datetime>20040913074905
ACCESSION NUMBER:		0001104659-04-027382
CONFORMED SUBMISSION TYPE:	8-K/A
PUBLIC DOCUMENT COUNT:		7
CONFORMED PERIOD OF REPORT:	20040730
ITEM INFORMATION:		Completion of Acquisition or Disposition of Assets
ITEM INFORMATION:		Financial Statements and Exhibits
FILED AS OF DATE:		20040913
DATE AS OF CHANGE:		20040913

FILER:

	COMPANY DATA:	
		COMPANY CONFORMED NAME:			MARKWEST ENERGY PARTNERS L P
		CENTRAL INDEX KEY:			0001166036
		STANDARD INDUSTRIAL CLASSIFICATION:	CRUDE PETROLEUM &amp; NATURAL GAS [1311]
		IRS NUMBER:				270005456
		FISCAL YEAR END:			1231

	FILING VALUES:
		FORM TYPE:		8-K/A
		SEC ACT:		1934 Act
		SEC FILE NUMBER:	001-31239
		FILM NUMBER:		041026639

	BUSINESS ADDRESS:	
		STREET 1:		155 INVERNESS DR WEST
		STREET 2:		STE 200
		CITY:			ENGLEWOOD
		STATE:			CO
		ZIP:			80112
		BUSINESS PHONE:		303-925-9275

	MAIL ADDRESS:	
		STREET 1:		155 INVERNESS DR WEST
		STREET 2:		STE 200
		C

In [7]:
# show the header as plain text: markup is dropped, tabs and newlines are kept
sec_header_tag.get_text()

'0001104659-04-027382.hdr.sgml : 20040913\n20040913074905\nACCESSION NUMBER:\t\t0001104659-04-027382\nCONFORMED SUBMISSION TYPE:\t8-K/A\nPUBLIC DOCUMENT COUNT:\t\t7\nCONFORMED PERIOD OF REPORT:\t20040730\nITEM INFORMATION:\t\tCompletion of Acquisition or Disposition of Assets\nITEM INFORMATION:\t\tFinancial Statements and Exhibits\nFILED AS OF DATE:\t\t20040913\nDATE AS OF CHANGE:\t\t20040913\n\nFILER:\n\n\tCOMPANY DATA:\t\n\t\tCOMPANY CONFORMED NAME:\t\t\tMARKWEST ENERGY PARTNERS L P\n\t\tCENTRAL INDEX KEY:\t\t\t0001166036\n\t\tSTANDARD INDUSTRIAL CLASSIFICATION:\tCRUDE PETROLEUM & NATURAL GAS [1311]\n\t\tIRS NUMBER:\t\t\t\t270005456\n\t\tFISCAL YEAR END:\t\t\t1231\n\n\tFILING VALUES:\n\t\tFORM TYPE:\t\t8-K/A\n\t\tSEC ACT:\t\t1934 Act\n\t\tSEC FILE NUMBER:\t001-31239\n\t\tFILM NUMBER:\t\t041026639\n\n\tBUSINESS ADDRESS:\t\n\t\tSTREET 1:\t\t155 INVERNESS DR WEST\n\t\tSTREET 2:\t\tSTE 200\n\t\tCITY:\t\t\tENGLEWOOD\n\t\tSTATE:\t\t\tCO\n\t\tZIP:\t\t\t80112\n\t\tBUSINESS PHONE:\t\t303-925-

In [8]:
# keep the <sec-header> tag itself under this filing's header section
master_filings_dict[accession_number]['sec_header_content'].update(
    {'sec_header_code': sec_header_tag}
)

In [9]:
# inspect the nested dict: 'sec_header_code' now holds the <sec-header> tag
master_filings_dict

{'0001104659-04-027382': {'sec_header_content': {'sec_header_code': <sec-header>0001104659-04-027382.hdr.sgml : 20040913
   <acceptance-datetime>20040913074905
   ACCESSION NUMBER:		0001104659-04-027382
   CONFORMED SUBMISSION TYPE:	8-K/A
   PUBLIC DOCUMENT COUNT:		7
   CONFORMED PERIOD OF REPORT:	20040730
   ITEM INFORMATION:		Completion of Acquisition or Disposition of Assets
   ITEM INFORMATION:		Financial Statements and Exhibits
   FILED AS OF DATE:		20040913
   DATE AS OF CHANGE:		20040913
   
   FILER:
   
   	COMPANY DATA:	
   		COMPANY CONFORMED NAME:			MARKWEST ENERGY PARTNERS L P
   		CENTRAL INDEX KEY:			0001166036
   		STANDARD INDUSTRIAL CLASSIFICATION:	CRUDE PETROLEUM &amp; NATURAL GAS [1311]
   		IRS NUMBER:				270005456
   		FISCAL YEAR END:			1231
   
   	FILING VALUES:
   		FORM TYPE:		8-K/A
   		SEC ACT:		1934 Act
   		SEC FILE NUMBER:	001-31239
   		FILM NUMBER:		041026639
   
   	BUSINESS ADDRESS:	
   		STREET 1:		155 INVERNESS DR WEST
   		STREET 2:		STE 200
   		

In [10]:
# Parse the documents
# find all document tags and loop through results

def _direct_text(parent_tag, default=''):
    """Return the stripped string that is a direct child of ``parent_tag``.

    Falls back to ``default`` when the tag is missing (``None``) or has no
    direct string child, so an absent optional header field does not raise
    AttributeError.
    """
    if parent_tag is None:
        return default
    own_string = parent_tag.find(string=True, recursive=False)
    return default if own_string is None else own_string.strip()


# init master document dict
master_document_dict = {}

for filing_document in soup.find_all('document'):
    # the document type is the dictionary key below, so it stays strict:
    # a document without a <type> still fails loudly here
    # (`string=` is the current name of the deprecated `text=` argument)
    document_id = filing_document.type.find(string=True, recursive=False).strip()

    # remaining header fields are tolerated when absent (empty string)
    document_sequence = _direct_text(filing_document.sequence)
    document_filename = _direct_text(filing_document.filename)
    document_description = _direct_text(filing_document.description)

    # insert key and the different parts of the document
    # NOTE: keyed by document type, so two documents of the same type
    # would overwrite each other
    master_document_dict[document_id] = {
        'document_sequence': document_sequence,
        'document_filename': document_filename,
        'document_description': document_description,
    }

    # add document content
    # extract() detaches the tag from the soup and returns that same tag,
    # so the stored value and `filing_document` are one and the same object
    master_document_dict[document_id]['document_code'] = filing_document.extract()

    # get all text in document
    # NOTE(review): because of the aliasing above, pulling <text> out here
    # also removes it from the stored 'document_code' -- confirm that is intended
    filing_doc_text = filing_document.find('text').extract()

    # get all thematic breaks, aka end of page lining
    all_thematic_breaks = filing_doc_text.find_all('hr', {'width' : '100%'})
    

In [11]:
# inspect the per-document metadata collected by the loop, keyed by document type
master_document_dict

{'8-K/A': {'document_sequence': '1',
  'document_filename': 'a04-10341_18ka.htm',
  'document_description': '8-K/A',
  'document_code': <document>
  <type>8-K/A
  <sequence>1
  <filename>a04-10341_18ka.htm
  <description>8-K/A
  
  </description></filename></sequence></type></document>},
 'EX-2.1': {'document_sequence': '2',
  'document_filename': 'a04-10341_1ex2d1.htm',
  'document_description': 'EX-2.1',
  'document_code': <document>
  <type>EX-2.1
  <sequence>2
  <filename>a04-10341_1ex2d1.htm
  <description>EX-2.1
  
  </description></filename></sequence></type></document>},
 'EX-4.1': {'document_sequence': '3',
  'document_filename': 'a04-10341_1ex4d1.htm',
  'document_description': 'EX-4.1',
  'document_code': <document>
  <type>EX-4.1
  <sequence>3
  <filename>a04-10341_1ex4d1.htm
  <description>EX-4.1
  
  </description></filename></sequence></type></document>},
 'EX-4.2': {'document_sequence': '4',
  'document_filename': 'a04-10341_1ex4d2.htm',
  'document_description': 'EX-4.