In [33]:
import re
import requests
import unicodedata # normalize the text
from bs4 import BeautifulSoup

In [34]:
# The text from each page of the document will be disgusting. Lets normalize the text
# cant rely on only unicodedata lib to help, there is also windows_1252_characters that cause issues
# from stack overflow, we have a way to help normalize the remaining portions of data
def restore_windows_1252_characters(restore_string):
    # Replace c1 control chars in unicode string s by chars at 
    # corresponding code points in windows 1252 where possible
    def to_windows_1252(match):
        try:
            return bytes([ord(match.group(0))]).decode('windows-1252')
        except UnicodeDecodeError:
            # no char at corresponding code point, remove it
            return ''
    return re.sub(r'[\u0080-\u0099]', to_windows_1252, restore_string)

In [35]:
# define a url
new_html_file = r"https://www.sec.gov/Archives/edgar/data/1166036/000110465904027382/0001104659-04-027382.txt"

# grab response
response = requests.get(new_html_file)

# parse the response
soup = BeautifulSoup(response.content, 'lxml')

In [36]:
# soup

In [37]:
# define master dictionary to house all filings
master_filings_dict = {}

# define unique key for each filing
accession_number = '0001104659-04-027382'

# add key to dict and add new level
master_filings_dict[accession_number] = {}

# add next levels 
# filing documents and sec filing headers
master_filings_dict[accession_number]['sec_header_content'] = {}
master_filings_dict[accession_number]['filing_documents'] = None

In [38]:
# grab sec-header document
sec_header_tag = soup.find('sec-header')
sec_header_tag

In [39]:
sec_header_tag.get_text()

AttributeError: 'NoneType' object has no attribute 'get_text'

In [40]:
# store sec header content inside the dict
master_filings_dict[accession_number]['sec_header_content']['sec_header_code'] = sec_header_tag

In [41]:
master_filings_dict

{'0001104659-04-027382': {'sec_header_content': {'sec_header_code': None},
  'filing_documents': None}}

In [42]:
# Parse the documents
# find all document tags and loop through results
# init master document dict
master_document_dict = {}

for filing_document in soup.find_all('document'):
    document_id = filing_document.type.find(text=True, recursive = False).strip()    
#     print(document_id)
    document_sequence = filing_document.sequence.find(text=True, recursive = False).strip()    
    document_filename = filing_document.filename.find(text=True, recursive = False).strip()    
    document_description = filing_document.description.find(text=True, recursive = False).strip()
    
    # insert key
    master_document_dict[document_id] = {}
    
    # add different parts of document
    master_document_dict[document_id]['document_sequence'] = document_sequence
    master_document_dict[document_id]['document_filename'] = document_filename
    master_document_dict[document_id]['document_description'] = document_description
    
    # add document content
    master_document_dict[document_id]['document_code'] = filing_document.extract()
    
    # get all text in document
    filing_doc_text = filing_document.find('text').extract()
    
    # get all thematic breaks, aka end of page lining
    all_thematic_breaks = filing_doc_text.find_all('hr', {'width' : '100%'})
    

In [43]:
master_document_dict

{}