In [2]:
%pip install pandas
%pip install py7zr


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m23.2.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.
Collecting py7zr
  Downloading py7zr-0.20.5-py3-none-any.whl (66 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.4/66.4 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting texttable (from py7zr)
  Downloading texttable-1.6.7-py2.py3-none-any.whl (10 kB)
Collecting pycryptodomex>=3.6.6 (from py7zr)
  Downloading pycryptodomex-3.18.0-cp35-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m37.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hCollecting pyzstd>=0.14.4 (from py7zr)
  Downloading pyzstd-0.15.9-cp310-cp310-manylinux_2_17_x86_64.manylinux20

# HTML To Unstructured XML

This notebook will demonstrate a rudimentary process to convert raw HTML downloaded from the internet into a structured XML file tagged with keywords that describe it best, using Machine Learning.

It only takes into account the HTML body, the title tag, and the alt-tags of media. In particular, *JavaScript* cannot be executed, so pages that load their content dynamically from a server cannot be processed.

## Input data

Our dataset consists of tens of thousands of website homepages, taken from a subset of the Majestic Million database and gathered over the span of two months, together with over a million classifications obtained from Wikidata for tagging each element of the XML. Some websites could not be downloaded because of HTTP errors. Webpages that do not have a sufficient number of tokens were removed.

The classifications are stored as a JSON file with keys corresponding to the IDs of the types, in the 'wd:' or Wikidata namespace, and values being the titles of those IDs.

The websites are compressed into a tarred XZ archive, which must be decompressed manually and placed into the `data/sites` folder.

First we shall load the classification data:

In [1]:
import os

# Switch the kernel's working directory to the sibling `data` folder.
# The path is relative, so the result depends on the directory the kernel is
# in when this cell runs; running the cell a second time resolves `../data`
# against the already-changed directory.
os.chdir('../data')

In [3]:
import py7zr

# Unpack the site archive into the current working directory.
try:
    with py7zr.SevenZipFile("sites.7z", mode='r') as z:
        z.extractall()
finally:
    # Always step back up one directory, even when the archive is missing or
    # extraction fails; otherwise the kernel stays in the wrong directory and
    # the relative paths used by later cells no longer resolve.
    os.chdir('..')


FileNotFoundError: [Errno 2] No such file or directory: 'sites.7z'

In [4]:
import pandas as pd
import json

# Load the Wikidata classifications: a JSON object whose keys are entity IDs
# and whose values are the titles of those IDs.
# The encoding is set explicitly so that titles containing non-ASCII
# characters are decoded the same way on every platform instead of depending
# on the locale's default encoding.
with open('data/entities.json', encoding='utf-8') as f:
    entities = json.load(f)

# One row per entity ID (used as the index), with its title in `value`.
df = pd.DataFrame.from_dict(entities, orient='index', columns=['value'])


## Cleaning the input data

We need to ensure that styles, JavaScript, and other kinds of behavior tags are removed from the input HTML pages, because they currently cannot be processed.

In [1]:
from copy import copy
import xml.etree.ElementTree as ET
from bs4 import BeautifulSoup, Tag, Comment
import re

def convert_to_element(tag):
    """
    Convert a BeautifulSoup HTML tag into an ``xml.etree.ElementTree`` element.

    The tag's name and attributes are copied, child tags are converted
    recursively, and text nodes are kept in document order: text that
    appears before the first child tag becomes the element's ``text``, and
    text that follows a child tag becomes that child's ``tail``.

    Parameters
    ----------
    tag : bs4.Tag or str
        The tag to convert. Any value that is not a ``Tag`` is used directly
        as the name of a new, empty element.

    Returns
    -------
    xml.etree.ElementTree.Element
        The converted element.
    """
    if not isinstance(tag, Tag):
        return ET.Element(tag)

    # BeautifulSoup stores multi-valued attributes (such as ``class``) as
    # lists, whereas ElementTree can only serialise string values.
    attributes = {
        name: " ".join(value) if isinstance(value, (list, tuple)) else str(value)
        for name, value in tag.attrs.items()
    }
    element = ET.Element(tag.name, attributes)

    last_child = None
    for child in tag.contents:
        if isinstance(child, Tag):
            last_child = convert_to_element(child)
            element.append(last_child)
        elif last_child is None:
            # Text before the first child element: accumulate rather than
            # overwrite so that no text node is lost.
            element.text = (element.text or "") + str(child)
        else:
            # Text after a child element belongs to that child's tail.
            last_child.tail = (last_child.tail or "") + str(child)
    return element

def extract_body(html):
    """
    Extract a cleaned copy of the HTML body together with site-name keywords.

    The page is parsed with BeautifulSoup's ``html.parser`` and reduced as
    follows:

    * runs of whitespace are collapsed to a single space and whitespace-only
      text nodes are dropped;
    * ``script``, ``noscript``, ``style``, ``svg``, ``form`` and ``input``
      elements, and all comments, are removed;
    * every attribute except a non-empty ``alt``, ``title``, ``src`` or
      ``href`` is deleted;
    * elements left with neither content nor attributes are removed;
    * an element whose only child is another element is replaced by that
      child (the wrapper's own attributes are discarded with it);
    * the page ``<title>``, when present, is inserted as the first child of
      the body.

    A page without a ``<body>`` yields an empty ``<body>`` element (plus the
    title, if any) instead of raising.

    Parameters
    ----------
    html : str
        Raw HTML source of the page.

    Returns
    -------
    tuple[str, set[str]]
        The serialised, cleaned ``<body>`` element, and the keywords derived
        from the first ``<meta property="og:site_name">`` tag: the site name
        itself and, when it contains a dot, the name without its last
        dot-separated part (e.g. ``example.com`` also yields ``example``).
        The set is empty when the tag or its ``content`` is missing or empty.
    """
    # Collapse whitespace runs to ONE space. Replacing them with nothing
    # fuses neighbouring words and breaks tags whose attributes start on an
    # indented new line (``<a\n   href=...>`` would become ``<ahref=...>``).
    html = re.sub(r"\s{2,}", " ", html)
    soup = BeautifulSoup(html, 'html.parser')

    def remove_whitespace_nodes(node):
        # Iterate over a snapshot: extract() mutates node.contents, and
        # looping over the live list skips the sibling after each removal.
        for child in list(node.contents):
            if isinstance(child, str):
                if not child.strip():
                    child.extract()
            else:
                remove_whitespace_nodes(child)

    remove_whitespace_nodes(soup)

    # Drop JavaScript, CSS, SVG drawings (complex paths we are not
    # interested in) and form controls.
    removed_tags = ['script', 'noscript', 'style', 'svg', 'form', 'input']
    for element in soup.find_all(removed_tags):
        element.extract()

    # Drop comments.
    for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
        comment.extract()

    # Work on a detached copy of the body so the parsed document itself stays
    # available for the title and meta lookups below.
    source_body = soup.body
    body = copy(source_body) if source_body is not None else soup.new_tag('body')

    # Keep only the attributes that carry content; class, id, style, data-*,
    # aria-* and the like are noise for the later processing steps.
    kept_attributes = ("alt", "title", "src", "href")
    for tag in [body, *body.find_all(True)]:
        for attr in list(tag.attrs):
            value = tag[attr]
            if attr not in kept_attributes:
                del tag[attr]
            elif value is None or (isinstance(value, str) and not value.strip()):
                # Also remove kept attributes that have no value.
                del tag[attr]

    def remove_empty_nodes(tag):
        # Children are cleaned first so a parent that only held empty
        # elements becomes empty itself and is removed as well.
        for child in list(tag.contents):
            if isinstance(child, Tag):
                if child.contents:
                    remove_empty_nodes(child)
                if not child.contents and not child.attrs:
                    child.extract()

    remove_empty_nodes(body)

    def collapse_wrappers(element):
        # Bottom-up: replace every element whose single child is a tag by
        # that child. The body element itself is never replaced.
        for child in list(element.contents):
            if isinstance(child, Tag):
                collapse_wrappers(child)
                if len(child.contents) == 1 and isinstance(child.contents[0], Tag):
                    only_child = child.contents[0]
                    child.replace_with(only_child)

    collapse_wrappers(body)

    # Add the title; pages without a <title> simply get none.
    page_title = soup.title
    if page_title is not None:
        title = soup.new_tag('title')
        title.string = page_title.text
        body.insert(0, title)

    # Take note of the site name because eventually it will be filtered out.
    # Only the first <meta property="og:site_name" ...> is considered.
    keywords = set()
    meta_tag = soup.find('meta', attrs={'property': 'og:site_name'})
    site_name = meta_tag.get('content') if meta_tag is not None else None
    if site_name:
        keywords.add(site_name)

        # If there is a ".com" or similar at the end of the name, also keep
        # the name without it. Names without a dot must not contribute an
        # empty keyword.
        stem, dot, _ = site_name.rpartition('.')
        if dot and stem:
            keywords.add(stem)

    # Serialise the cleaned body.
    xml = str(body)

    return xml, keywords

We are going to clean each of the webpages we have in the `data/sites` directory, one at a time.