In [13]:
import re
import requests
import unicodedata
from bs4 import BeautifulSoup

In [14]:
def restore_windows_1252_characters(restore_string):
    """
        Replace C1 control characters in the Unicode string s by the
        characters at the corresponding code points in Windows-1252,
        where possible.
    """

    def to_windows_1252(match):
        try:
            return bytes([ord(match.group(0))]).decode('windows-1252')
        except UnicodeDecodeError:
            # No character at the corresponding code point: remove it.
            return ''
        
    return re.sub(r'[\u0080-\u0099]', to_windows_1252, restore_string)

In [15]:
# define the url to specific html_text file
uri = r"https://www.sec.gov/Archives/edgar/data/19617/0000019617-20-000257.txt"

# grab the response
response = requests.get(uri)

# pass it through the parser, in this case let's just use lxml because the tags seem to follow xml.
soup = BeautifulSoup(response.content, 'lxml')

In [16]:
# define a dictionary that will house all filings.
master_filings_dict = {}

# let's use the accession number as the key. This 
accession_number = '0001104659-04-027382'

# add a new level to our master_filing_dict, this will also be a dictionary.
master_filings_dict[accession_number] = {}

# this dictionary will contain two keys, the sec header content, and a documents key.
master_filings_dict[accession_number]['sec_header_content'] = {}
master_filings_dict[accession_number]['filing_documents'] = None

In [17]:
# grab the sec-header tag, so we can store it in the master filing dictionary.
sec_header_tag = soup.find('sec-header')

# store the tag in the dictionary just as is.
master_filings_dict[accession_number]['sec_header_content']['sec_header_code'] = sec_header_tag

# display the sec header tag, so you can see how it looks.
display(sec_header_tag)

<sec-header>0000019617-20-000257.hdr.sgml : 20200225
<acceptance-datetime>20200225163102
ACCESSION NUMBER:		0000019617-20-000257
CONFORMED SUBMISSION TYPE:	10-K
PUBLIC DOCUMENT COUNT:		240
CONFORMED PERIOD OF REPORT:	20191231
FILED AS OF DATE:		20200225
DATE AS OF CHANGE:		20200225

FILER:

	COMPANY DATA:	
		COMPANY CONFORMED NAME:			JPMORGAN CHASE &amp; CO
		CENTRAL INDEX KEY:			0000019617
		STANDARD INDUSTRIAL CLASSIFICATION:	NATIONAL COMMERCIAL BANKS [6021]
		IRS NUMBER:				132624428
		STATE OF INCORPORATION:			DE
		FISCAL YEAR END:			1231

	FILING VALUES:
		FORM TYPE:		10-K
		SEC ACT:		1934 Act
		SEC FILE NUMBER:	001-05805
		FILM NUMBER:		20651128

	BUSINESS ADDRESS:	
		STREET 1:		383 MADISON AVENUE
		CITY:			NEW YORK
		STATE:			NY
		ZIP:			10017
		BUSINESS PHONE:		2122706000

	MAIL ADDRESS:	
		STREET 1:		383 MADISON AVENUE
		CITY:			NEW YORK
		STATE:			NY
		ZIP:			10017

	FORMER COMPANY:	
		FORMER CONFORMED NAME:	J P MORGAN CHASE &amp; CO
		DATE OF NAME CHANGE:	20010102

	FORMER C

In [18]:
# initialize master document dictionary
master_document_dict={}

# Loop through each document in the filing
for filing_document in soup.find_all('document'):
    # document id
    document_id = filing_document.type.find(text=True, recursive=False).strip()
    if document_id!='10-K':
        continue
    # document sequence
    document_sequence = filing_document.sequence.find(text=True, recursive=False).strip()
    
    # document filename
    document_filename = filing_document.filename.find(text=True, recursive=False).strip()
    
    # document description
    document_description = filing_document.description.find(text=True, recursive=False).strip()
    
    # insert the key
    master_document_dict[document_id] = {}
    
    # add the differnt parts of the document
    master_document_dict[document_id]['document_sequence'] = document_sequence
    master_document_dict[document_id]['document_filename'] = document_filename
    master_document_dict[document_id]['document_description'] = document_description
    
    # add document content
    master_document_dict[document_id]['document_code'] = filing_document.extract()
    
    # get all the text in document
    filing_doc_text = filing_document.find('text').extract()
    
    # get all thematic breaks (page breaks)
    all_thematic_breaks = filing_doc_text.find_all('hr',{'style':'page-break-after:always'})
    
    # convert all the breaks into a string
    all_thematic_breaks = [str(thematic_break) for thematic_break in all_thematic_breaks]
    
    # prep the document for being split
    filing_doc_string = str(filing_doc_text)
    
    if len(all_thematic_breaks) > 0:
        
        # creates our pattern
        regex_delimited_pattern = '|'.join(map(re.escape, all_thematic_breaks))
        
        # split the document along the thematic breaks
        split_filing_string = re.split(regex_delimited_pattern, filing_doc_string)
        
        # store the document in the dictionary
        master_document_dict[document_id]['pages_code'] = split_filing_string
    
    elif len(all_thematic_breaks)==0:
        # store the document in the dictionary
        master_document_dict[document_id]['pages_code'] = [filing_doc_string]
        
    # display some information to the user.
    print('-'*80)
    print('The document {} was parsed.'.format(document_id))
    print('There was {} thematic breaks(s) found.'.format(len(all_thematic_breaks)))
    

# store the documents in the master_filing_dictionary.
master_filings_dict[accession_number]['filing_documents'] = master_document_dict

print('-'*80)
print('All the documents for filing {} were parsed and stored.'.format(accession_number))

--------------------------------------------------------------------------------
The document 10-K was parsed.
There was 312 thematic breaks(s) found.
--------------------------------------------------------------------------------
All the documents for filing 0001104659-04-027382 were parsed and stored.


In [None]:
# first grab all documents
filing_documents = master_filings_dict[accession_number]['filing_documents']

# loop through each document
for document_id in filing_documents:
    
    # display some info to give status updates.
    print('-'*80)
    print('Pulling document {} for text normilzation.'.format(document_id))
    
    # grab all the pages for each document
    document_pages = filing_documents[document_id]['pages_code']
    
    # pages length
    pages_length = len(document_pages)
    
    # initialize some dictionaries
    repaired_pages = {}
    
    normalized_text = {}
    
    for index, page in enumerate(document_pages):
        
        # pass it through the parser to repair it
        page_soup = BeautifulSoup(page, "html5")
        
        # grab the text from each page
        page_text = page_soup.html.body.get_text(' ', strip=True)
        
        # normalize the text        
        page_text_norm = restore_windows_1252_characters(unicodedata.normalize('NFKD', page_text))
        
        page_text_norm = page_text_norm.replace('  ',' ').replace('\n',' ')
        
        page_number = index + 1
        
        # add the normalized text to the dictionary
        normalized_text[page_number] = page_text_norm
        
        # add the reapired html code to the dictionary
        repaired_pages[page_number] = page_soup
        
         # display a status to the user
        print('Page {} of {} from document {} has had their text normalized.'.format(index + 1, pages_length, document_id))
        
    # add normalized text dictionary to the master filing dicitonary
    filing_documents[document_id]['page_normalized_text'] = normalized_text
    
    # add the repaired html code back to the document dictionary
    filing_documents[document_id]['pages_code'] = repaired_pages
    
    # define the generated page numbers
    gen_page_numbers = list(repaired_pages.keys())
    
    # add the page numbers we have.
    filing_documents[document_id]['pages_numbers_generated'] = gen_page_numbers    
    
    # display a status to the user.
    print('All the pages from document {} have been normalized.'.format(document_id))