In [64]:
'''Form a list of strings from HTML text-content and structure

Functional style solution to simplify the HTML
DOM tree of any chunk of HTML. Higher-order Functions,
Lambdas, Closure, and List Comprehensions are used.

This algorithm focuses on extracting text content
while retaining basic structural relationships
from the HTML as semantic structural units
of natural language.

IN:  HTML Chunk or Whole document
OUT: List of strings, one per HTML line, each formatted as "tags, attrs, text"
'''
from bs4 import BeautifulSoup
import pprint

def get(file):
    '''Read a text/HTML file into memory line by line.

    IN:  file - path of the file to open (text mode)
    OUT: Python list of strings, one per line, line endings kept
    '''
    with open(file) as handle:
        # Iterating a text file yields exactly the lines readlines() would.
        return list(handle)

def soup_lines(lines):
    '''Wrap every HTML line in its own BeautifulSoup object.

    IN:  Iterable of HTML strings (one per line)
    OUT: Generator yielding one BeautifulSoup instance per line, in input
         order. It is lazy and can be consumed only once. Empty input
         gives an empty generator instead of None, so callers can always
         iterate the result.
    '''
    # No enclosing for-loop: the generator expression already visits every
    # line. The former loop returned on its first pass and fell through to an
    # implicit None when `lines` was empty.
    return (BeautifulSoup(line, 'html.parser') for line in lines)

def read_tags(soup):
    '''RETURNS: list with the name of every tag from soup.find_all()'''
    return [node.name for node in soup.find_all()]


def read_attrs(soup):
    '''RETURNS: list with the attrs of every tag from soup.find_all()'''
    return [node.attrs for node in soup.find_all()]


def read_text(soup):
    '''RETURNS: list with the text of every tag from soup.find_all()'''
    return [node.text for node in soup.find_all()]

def stringify(str_list):
    '''RETURNS: the strings of str_list concatenated with no separator'''
    return ''.join(str_list)
#is_char = lambda char: char == 
def remove_char(rm_char, line_string):
    '''Strip every occurrence of one character from a string.

    IN:  rm_char     - the single character to drop
         line_string - the text to clean
    OUT: New string holding the characters of line_string that are not
         equal to rm_char, in their original order.

    The comparison is made character by character, so a multi-character
    rm_char never matches and the text comes back unchanged.
    '''
    # One join over a filtered generator replaces the manual loop that
    # grew the result with repeated `+=` (and appended '' on a match).
    return ''.join(char for char in line_string if char != rm_char)

def join_parts(tag, attr, text):
    '''Combine one line's tags, attributes and text into a single string.

    IN:  tag  - iterable of tag-name strings (joined with ", ")
         attr - any object; rendered with its default formatting
         text - iterable of text strings (joined with no separator)
    OUT: String of the form "<tags>, <attr>, <text>"
    '''
    tag_names = ', '.join(tag)
    content = ''.join(text)
    return '{}, {}, {}'.format(tag_names, attr, content)
                
# NOTE: Need to manually preselect the section of HTML in the document
# This is done by another class in the codebase but here it is expecting
# to receive the body only or just the div that holds the product info.
def destructure_html(path='proj_files/asus_mb16ac.html'):
    '''Reduce an HTML file to one "tags, attrs, text" string per line.

    IN:  path - HTML file to read; defaults to the sample file this
                function was originally hard-wired to
    OUT: List of strings in document order, one per input line, built by
         join_parts() from that line's tag names, tag attributes and text.

    Side effect: the resulting list is pretty-printed to stdout.
    '''
    # Split the noisy HTML chunk into lines, keeping document order.
    html_lines = get(path)
    # Remove tabs from every line.
    lines = [remove_char('\t', line) for line in html_lines]
    # One BeautifulSoup instance per line. `or ()` tolerates a None result
    # for an empty file so the function returns [] instead of raising.
    soups = soup_lines(lines) or ()
    # Parse each line once and read tags, attributes and text from the same
    # soup, rather than re-parsing the whole file for each of the three.
    quiet_lines = [
        join_parts(read_tags(soup), read_attrs(soup), read_text(soup))
        for soup in soups
    ]

    pp = pprint.PrettyPrinter(indent=0)
    pp.pprint(quiet_lines)
    return quiet_lines

# Run the extraction for this cell: destructure_html() pretty-prints the
# resulting list of strings and also returns it.
destructure_html()

["div, [{'id': 'specifications', 'class': [], 'style': 'display: block;'}], \n",
"aside, [{'class': ['product-aside', 'row']}], \n",
"div, [{'id': 'aside-button', 'class': ['span12']}], \n",
"a, [{'href': 'javascript:void(0)', 'class': ['print-btn', 'pull-right'], "
"'onclick': 'open_printer()'}], Print",
', [], ',
', [], ',
"div, [{'id': 'model_number', 'class': ['hide']}], \n",
', [], ',
"select, [{'name': 'modelnumber'}], \n",
'option, [{}], N50-AS031C',
'option, [{}], N50-123456',
'option, [{}], N50-234567',
', [], ',
', [], ',
"div, [{'id': 'spec-area', 'class': ['row']}], \n",
"ul, [{'class': ['product-spec']}], \n",
"li, span, div, [{}, {'class': ['spec-item']}, {'class': ['spec-data']}], "
'Display\n'
'Display\n',
'strong, br, [{}, {}], Panel Size: ',
'strong, br, [{}, {}], Panel Type :',
'strong, br, [{}, {}], True Resolution : ',
'strong, br, [{}, {}], Display Viewing Area(HxV) : ',
'strong, br, [{}, {}], Display Surface',
'strong, br, [{}, {}], Pixel Pitch : ',
'strong, br, 

["div, [{'id': 'specifications', 'class': [], 'style': 'display: block;'}], \n",
 "aside, [{'class': ['product-aside', 'row']}], \n",
 "div, [{'id': 'aside-button', 'class': ['span12']}], \n",
 "a, [{'href': 'javascript:void(0)', 'class': ['print-btn', 'pull-right'], 'onclick': 'open_printer()'}], Print",
 ', [], ',
 ', [], ',
 "div, [{'id': 'model_number', 'class': ['hide']}], \n",
 ', [], ',
 "select, [{'name': 'modelnumber'}], \n",
 'option, [{}], N50-AS031C',
 'option, [{}], N50-123456',
 'option, [{}], N50-234567',
 ', [], ',
 ', [], ',
 "div, [{'id': 'spec-area', 'class': ['row']}], \n",
 "ul, [{'class': ['product-spec']}], \n",
 "li, span, div, [{}, {'class': ['spec-item']}, {'class': ['spec-data']}], Display\nDisplay\n",
 'strong, br, [{}, {}], Panel Size: ',
 'strong, br, [{}, {}], Panel Type :',
 'strong, br, [{}, {}], True Resolution : ',
 'strong, br, [{}, {}], Display Viewing Area(HxV) : ',
 'strong, br, [{}, {}], Display Surface',
 'strong, br, [{}, {}], Pixel Pitch : ',
