In [44]:
from bs4 import BeautifulSoup
import pandas as pd
import re
import pprint

In [116]:
# Show up to 100 characters per cell so long text snippets stay readable in tables.
pd.options.display.max_colwidth = 100

In [133]:
def get_body_main(tag:str, id:str, html_input='index.html', is_string=False):
    """Extract one element of an HTML document and describe it as a dictionary.

    All ``<script>`` and ``<style>`` elements are removed before the element
    is looked up, so their contents never appear in the returned text.

    Parameters
    ----------
    tag : str
        Name of the element to extract (for example ``'div'``).
    id : str
        Value of the ``id`` attribute the element must carry.
    html_input : str
        Path of the HTML file to read, or the markup itself when
        ``is_string`` is true.
    is_string : bool
        When true, ``html_input`` is parsed directly instead of being opened
        as a file.

    Returns
    -------
    dict
        ``main_tag`` and ``main_div_id`` echo the arguments;
        ``html_body_string`` is the element's markup, ``html_lines`` the same
        markup split into lines, ``body_text`` its text content and
        ``page_title`` the text of the document's ``<title>`` (``None`` when
        the document has no title).

    Raises
    ------
    ValueError
        If no ``tag`` element with the requested ``id`` exists.
    """
    if is_string:
        # The markup was passed in directly; previously this branch left the
        # document empty and nothing could ever be found.
        html_string = html_input
    else:
        with open(html_input, 'r') as input_file:
            html_string = input_file.read()

    soup = BeautifulSoup(html_string, 'html.parser')
    # Scripts and styles carry no visible content; drop them before extracting text.
    for unwanted in soup.find_all(['script', 'style']):
        unwanted.extract()

    # Read the title from the whole document before narrowing to the element.
    title_tag = soup.find('title')
    page_title = title_tag.text if title_tag is not None else None

    main_element = soup.find(tag, id=id)
    if main_element is None:
        raise ValueError(f'No <{tag}> element with id {id!r} found in the document')

    main_html = str(main_element)
    return {
        'main_tag': tag,
        'main_div_id': id,
        'html_body_string': main_html,
        'html_lines': main_html.splitlines(),
        'body_text': main_element.get_text(),
        'page_title': page_title,
    }

In [134]:
def remove_chars(sentence,chars):
    """Return ``sentence`` without any character that is contained in ``chars``."""
    kept = []
    for character in sentence:
        if character in chars:
            continue
        kept.append(character)
    return ''.join(kept)

def replace_chars(text, c):
    """Return ``text`` with every occurrence of ``c`` swapped for a single space."""
    blank = ' '
    replaced = text.replace(c, blank)
    return replaced

def get_input(file):
    """Read the file at path ``file`` and return its lines without newline characters."""
    stripped_lines = []
    with open(file, 'r') as html_file:
        for raw_line in html_file.readlines():
            stripped_lines.append(raw_line.replace('\n', ''))
    return stripped_lines
    
def soup_lines(lines):
    """Lazily parse each entry of ``lines`` into its own BeautifulSoup document.

    Parameters
    ----------
    lines : iterable of str
        Markup fragments, one per entry.

    Returns
    -------
    generator
        Yields one ``BeautifulSoup`` object (built with ``'html.parser'``)
        per entry, in order. An empty input produces an empty generator, so
        callers can always iterate over the result.
    """
    # No wrapping loop: the former ``for ...: return`` returned None for empty
    # input and consumed the first item of one-shot iterators.
    return (BeautifulSoup(text, 'html.parser') for text in lines)

def read_tags(soup):
    """Return the name of the first tag found in ``soup``, or ``None`` if it has no tags."""
    first_tag = next(iter(soup.find_all()), None)
    if first_tag is None:
        return None
    return first_tag.name

def read_attrs(soup):
    """Return the ``class``, ``id`` and ``name`` attributes of the first tag in ``soup``.

    The first tag is used so that the attributes describe the same element
    whose name ``read_tags`` reports. Previously every tag overwrote the
    result, so the values of the last tag were returned.

    Returns
    -------
    dict
        Keys ``'class'``, ``'id'`` and ``'name'``. An attribute the tag does
        not carry is ``None``. When ``soup`` contains no tag at all, every
        value is the empty string.
    """
    first_tag = next(iter(soup.find_all()), None)
    if first_tag is None:
        return {'class': '', 'id': '', 'name': ''}
    return {key: first_tag.attrs.get(key) for key in ('class', 'id', 'name')}

def has_all_attrs(attrs):
    """Return ``True`` when at least one of ``class``, ``id`` or ``name`` is truthy.

    Despite the name, this is an "any" check: it is ``False`` only when all
    three entries of ``attrs`` are empty or ``None``.
    """
    return bool(attrs['class'] or attrs['id'] or attrs['name'])

def if_not_none(item, attribute):
    """Look up ``attribute`` in ``item``, mapping missing or empty-string values to ``None``.

    A falsy ``item`` (``None``, empty mapping) also yields ``None``.
    """
    if not item:
        return None
    value = item.get(attribute)
    if value == '':
        return None
    return value
 
def prepare_data_dictionary(tags, attrs, text):
    """Assemble per-line tag names, attributes and text into column lists.

    Parameters
    ----------
    tags : list
        Tag name per line; falsy entries become ``None``.
    attrs : list
        Per-line attribute mapping (or a falsy value) from which the ``id``,
        ``class`` and ``name`` entries are read via ``if_not_none``.
    text : list
        Text per line; it is stripped, and blank or missing text becomes
        ``None``.

    Returns
    -------
    dict
        Keys ``'LABEL'`` (an empty-string placeholder), ``'TXT'``, ``'TAG'``,
        ``'ID'``, ``'CLASS'`` and ``'NAME'``, in that order.

    Notes
    -----
    New lists are built; the caller's ``tags`` and ``text`` are left untouched.
    """
    clean_tags = [tag if tag else None for tag in tags]
    # ``strip() or None`` turns whitespace-only text into None with a single strip.
    clean_text = [(entry.strip() or None) if entry else None for entry in text]
    tag_id = [if_not_none(a, 'id') for a in attrs]
    tag_class = [if_not_none(a, 'class') for a in attrs]
    tag_name = [if_not_none(a, 'name') for a in attrs]
    return {
        'LABEL': '',
        'TXT': clean_text,
        'TAG': clean_tags,
        'ID': tag_id,
        'CLASS': tag_class,
        'NAME': tag_name,
    }

#read_text = lambda soup: [tag.text for tag in soup.find_all()]

def get_data(html_lines):
    """Turn lines of markup into the column dictionary built by ``prepare_data_dictionary``.

    Each line is parsed once, and the tag name, attributes and text are all
    read from that same parsed line. Tab characters are removed from the
    extracted text only, never from the markup itself, where a tab may
    separate a tag name from its attributes.

    Parameters
    ----------
    html_lines : iterable of str
        Markup, one fragment per line. An empty input yields empty columns.

    Returns
    -------
    dict
        The result of ``prepare_data_dictionary(tags, attrs, text)``.
    """
    source_lines = list(html_lines)
    # Guard the empty case here so it never depends on what soup_lines returns for it.
    soups = list(soup_lines(source_lines)) if source_lines else []
    tags = [read_tags(soup) for soup in soups]
    attrs = [read_attrs(soup) for soup in soups]
    text = [soup.text.replace('\t', '') for soup in soups]
    return prepare_data_dictionary(tags, attrs, text)

In [150]:
# Parse the product page and flatten its main container into per-line records.
product = get_body_main('div','productInfoContainer','html-test-data/megabuy.html')
html_lines = product['html_lines']
example_data = get_data(html_lines)

# Keep rows with at least two non-null cells. LABEL is always filled with a
# placeholder, so this keeps rows where at least one real column has a value.
# `how` is not passed: pandas does not allow `how` and `thresh` together.
adf = pd.DataFrame(example_data).dropna(axis=0, thresh=2).fillna('')

# Build a readable label from tag name and id, dropping brackets and turning
# '-' / '_' separators into spaces.
adf['LABEL'] = (
    (adf['TAG'] + ' ' + adf['ID'])
    .apply(lambda s: remove_chars(s, '[]'))
    .apply(lambda t: replace_chars(t, '-'))
    .apply(lambda t: replace_chars(t, '_'))
)

text_label = adf.loc[:, ['LABEL', 'TXT']]
with_text = text_label[text_label['TXT'] != '']

# Persist every labelled row (including those without text).
text_label.to_csv('megabuy_output.csv')

with_text.head(50)

Unnamed: 0,LABEL,TXT
2,div,"Asus ZenScreen MB16AC 39.6 cm (15.6"") Full HD LCD Monitor - 16:9 - Dark Grey - 1920 x 1080 - 220..."
5,li,Description
6,li,Specifications
10,div,"Extend your vision. Free your world. ASUS ZenScreen™ MB16AC is designed to keep you productive, ..."
14,div,IPS Panel
15,p,A technology for LCD screens designed to solve the main limitations of previous LCD screens incl...
24,td,Accessories
25,td,Asus Smart Case DisplayPort trade; over USB-C trade; cable (USB-C to USB-C) USB Type-C to A adap...
28,td,Aspect Ratio
29,td,Wide Screen 15.6 quot;(39.6cm) 16:9


In [163]:
# Report the size of the corpus as rows x columns.
row_count, column_count = full_corpus.shape
print(f'Input data as {row_count} rows and {column_count} columns')

Input data as 169 rows and 3 columns


In [164]:
# Count the rows whose tag is a <div> or an <img>.
tag_column = full_corpus['HTML Tag']
num_divs = int((tag_column == 'div').sum())
num_imgs = int((tag_column == 'img').sum())
total_rows = len(full_corpus)
print(f'Out of {total_rows} rows, has {num_divs} div tags')
print(f'There are  {num_imgs} images')

Out of 169 rows, has 16 div tags
There are  0 images


In [165]:
# Count the missing values in each of the three corpus columns.
missing_counts = {
    column: full_corpus[column].isnull().sum()
    for column in ('HTML Tag', 'HTML Attributes', 'Text Content')
}
tag_none = missing_counts['HTML Tag']
atr_none = missing_counts['HTML Attributes']
txt_none = missing_counts['Text Content']
print(f'Number with empty tags is {tag_none}')
print(f'Number with empty attributes is {atr_none}')
print(f'Number with empty text is {txt_none}')

Number with empty tags is 68
Number with empty attributes is 136
Number with empty text is 94
