In [None]:
%pip install pandas
%pip install numpy
%pip install nltk
%pip install scikit-learn
%pip install beautifulsoup4
%pip install py7zr
%pip install lxml
%pip install tqdm


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m23.2.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m23.2.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m23.2.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m23.2.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


# HTML To Structured XML

This notebook will demonstrate a rudimentary process to convert raw HTML downloaded from the internet into a structured XML file tagged with keywords that describe it best, using Machine Learning.

It only takes into account the HTML body, the title tag, and the alt-tags of media. In particular, *Javascript* cannot be executed, so pages which get their content from a server cannot be processed.

## Input data

Our dataset consists of thousands of website homepages as input data, from a subset of the Majestic Million database, gathered in the span of two months, and over a million classifications obtained from Wikidata for tagging each element of the XML. Some websites could not be downloaded because of HTTP errors. Webpages that do not have a sufficient amount of tokens were removed.

The classifications are stored as a JSON file with keys corresponding to the IDs of the types, in the 'wd:' or Wikidata namespace, and values being the titles of those IDs.

The websites are compressed into a tarred XZ archive, which must be decompressed manually and placed into the `data/sites` folder.

First we shall load the classification data:

In [1]:
import os

# Move into the data directory so that the archive opened in the next cell
# ("sites.7z") is found, and extracted, relative to it.
# NOTE(review): assumes the kernel starts in a sibling directory of data/ - confirm.
os.chdir('../data')

In [4]:
import py7zr

# Unpack every member of the archive into the current working directory
# (no target path is passed to extractall()).
with py7zr.SevenZipFile("sites.7z", mode='r') as z:
    z.extractall()

If you have already extracted the data in a previous run, just skip to this block.

In [1]:
import os

# Step up one directory so the relative "data/..." paths used by later cells resolve.
# NOTE(review): this is only correct if the working directory is currently the
# data folder (i.e. the os.chdir('../data') cell was run); confirm before
# running this cell on its own.
os.chdir('..')

## Semantic types database

Next we will load the dataset containing the list of Wikidata types, in the `wd:` namespace, and their corresponding titles. This will be used to map each identified keyword to a semantic type.

In [2]:
import pandas as pd
import json

# Load the type mapping from disk; `entities` is the dictionary that the
# similarity search later in the notebook is built from.
with open('data/entities.json') as f:
    entities = json.load(f)

# Tabular view of the same mapping: one row per key, single column named 'value'.
df = pd.DataFrame.from_dict(entities, orient='index', columns=['value']) #FIXME It seems that this is unused


## Cleaning the input data

We need to ensure that styles, JavaScript, and other kinds of behavior tags are removed from the input HTML pages, because they currently cannot be processed.

In [3]:
import warnings
from bs4 import MarkupResemblesLocatorWarning, XMLParsedAsHTMLWarning

# Suppress these two BeautifulSoup warning categories for the rest of the session.
warnings.filterwarnings("ignore", category=MarkupResemblesLocatorWarning)
warnings.filterwarnings("ignore", category=XMLParsedAsHTMLWarning)

In [4]:
from copy import copy
import xml.etree.ElementTree as ET
from bs4 import BeautifulSoup, Tag, Comment
import re

def convert_to_element(tag):
    """
    Method used to convert Beautifulsoup HTML tags into XML elements.

    Parameters
    ----------
    tag : bs4.Tag or str
        The node to convert. Anything that is not a ``Tag`` is used directly
        as the name of a new, empty element.

    Returns
    -------
    xml.etree.ElementTree.Element
        The element mirroring ``tag``, including all nested tags and text.
    """
    if not isinstance(tag, Tag):
        return ET.Element(tag)

    # BeautifulSoup may hand back list values for multi-valued attributes;
    # ElementTree can only serialise string attribute values.
    attributes = {
        name: ' '.join(value) if isinstance(value, (list, tuple)) else str(value)
        for name, value in tag.attrs.items()
    }
    element = ET.Element(tag.name, attributes)

    last_child = None
    for child in tag.contents:
        if isinstance(child, Tag):
            last_child = convert_to_element(child)
            element.append(last_child)
        elif last_child is None:
            # Text before the first child element belongs to the element itself.
            element.text = (element.text or '') + str(child)
        else:
            # Text following a child element is that child's tail; assigning it
            # to element.text would overwrite the leading text.
            last_child.tail = (last_child.tail or '') + str(child)
    return element

def detect_language(soup):
    """
    Return the value of the ``lang`` attribute of the document's <html> tag.

    Falls back to ``'en-us'`` when there is no <html> tag or when the
    attribute is missing or empty.
    """
    html_tag = soup.find('html')
    if html_tag:
        declared = html_tag.get('lang')
        if declared:
            return declared
    return 'en-us'

def extract_body(html):
    """
    Extracts the body of the HTML, stripping out extraneous visual elements.
    The keywords (names of the site) are also obtained.

    Parameters
    ----------
    html : str
        Raw markup of one page.

    Returns
    -------
    tuple
        ``(markup, keywords)`` where ``markup`` is the serialised, cleaned
        <body> (with the page <title> inserted as its first child) and
        ``keywords`` is a set of site-name tokens taken from the
        ``og:site_name`` meta tag. ``(None, set())`` is returned for pages
        that are not declared as English or that have no <body>.
    """
    # Raw string: "\s" in a normal string literal is an invalid escape sequence.
    html = re.sub(r"\s{2,}", " ", html)
    soup = BeautifulSoup(html, 'html.parser')

    # Do not process non-english sites
    lang = detect_language(soup).lower()
    if lang != "en" and not lang.startswith(("en-", "en_")):
        return None, set()

    # Remove whitespace nodes
    def remove_whitespace_nodes(node):
        if isinstance(node, str):
            return
        # Iterate over a snapshot: extract() shrinks node.contents, and looping
        # over the live list would skip the sibling after each removed node.
        for child in list(node.contents):
            if isinstance(child, str) and len(child.strip()) == 0:
                child.extract()
            else:
                remove_whitespace_nodes(child)

    remove_whitespace_nodes(soup)

    # Remove JavaScript (script/noscript) and CSS (style), SVG images (complex
    # drawing paths we are not interested in) and, for the same reason, form
    # and input elements.
    for unwanted in soup.find_all(['script', 'noscript', 'style', 'svg', 'form', 'input']):
        unwanted.extract()

    # Remove comments
    for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
        comment.extract()

    # Extract body (a detached copy, so later edits do not touch the soup)
    body = copy(soup.body)
    if not body:
        return None, set()

    # Keep only the attributes that carry readable content; this drops class,
    # data-*, aria-*, style, id and other noisy attributes.
    def remove_class_attributes(tag):
        for attr in list(tag.attrs):
            value = tag[attr]
            if attr not in ("alt", "title", "src", "href"):
                del tag[attr]
            # Also remove empty values
            elif isinstance(value, str) and value.strip() == "":
                del tag[attr]

        for child in tag.children:
            if isinstance(child, Tag):
                remove_class_attributes(child)

    remove_class_attributes(body)

    # Remove empty nodes (no content and no remaining attributes)
    def remove_empty_nodes(tag):
        # Snapshot for the same reason as in remove_whitespace_nodes.
        for child in list(tag.contents):
            if isinstance(child, Tag):
                if len(child.contents) > 0:
                    remove_empty_nodes(child)
                if len(child.contents) == 0 and len(child.attrs.keys()) == 0:
                    child.extract()

    remove_empty_nodes(body)

    # Replace an element that only wraps a single tag with that tag.
    def collapse_element(element):
        if len(element.contents) == 1 and isinstance(element.contents[0], Tag):
            element.replace_with(element.contents[0])

    # Recursively collapse elements, deepest first
    def process_element(element):
        for child in list(element.children):
            if isinstance(child, Tag):
                process_element(child)
                collapse_element(child)

    process_element(body)

    def get_alphanumeric_tokens(input_string):
        # Lower-cased runs of word characters
        return [token.lower() for token in re.findall(r'\b\w+\b', input_string)]

    # Add the title
    if soup.title:
        title = soup.new_tag('title')
        title.string = soup.title.text
        body.insert(0, title)

    # Take note of the site name because eventually they will be filtered out.
    # Find the first occurrence of <meta property="og:site_name" ...>
    meta_tag = soup.find('meta', attrs={'property': 'og:site_name'})

    keywords = set()
    if meta_tag:
        # Get the value of the "content" attribute
        keyword = meta_tag.get('content')
        if keyword:
            keywords.update(get_alphanumeric_tokens(keyword))

            # See if there's a .com or similar at the end of the content
            # and get rid of it, to use as a new keyword. Names without a
            # dot yield an empty prefix, which is not a useful keyword.
            key = keyword.rpartition('.')[0]
            if key:
                keywords.add(key)

    # Serialise the cleaned body
    xml = str(body)

    return xml, keywords

We are going to clean each of the webpages we have in the `data/sites` directory, one at a time.

In [5]:
from lxml import etree

def list_html_files(directory_path):
    """
    Return the names of the entries in ``directory_path`` whose name ends in
    ``.html`` (compared case-insensitively). Names only, not full paths.
    """
    return [
        entry
        for entry in os.listdir(directory_path)
        if entry.lower().endswith('.html')
    ]

# File names (without directory) of every page in data/sites; process_html()
# prepends the directory again when it opens a file.
html_files = list_html_files("data/sites")

def process_html(html_file):
    """
    Clean one downloaded page and normalise it through lxml's HTML parser.

    Parameters
    ----------
    html_file : str
        File name (inside ``data/sites``) of the page to process.

    Returns
    -------
    tuple
        ``(markup, keywords)`` where ``markup`` is the serialised document and
        ``keywords`` is the set of site-name keywords (the file name without
        its ``.html`` suffix plus whatever ``extract_body`` found).
        ``(None, None)`` when the page was rejected by ``extract_body`` or was
        too deeply nested to process.
    """
    with open(f"data/sites/{html_file}", errors='ignore') as f:
        html = f.read()

    try:
        xml_output, site_keywords = extract_body(html)

        if xml_output:
            # Merge instead of overwriting, so the keyword derived from the
            # file name (name minus ".html") is not lost.
            keywords = {html_file[:-5]} | set(site_keywords)
            dom = etree.fromstring(xml_output, etree.HTMLParser())
            return (etree.tostring(dom, encoding="unicode", pretty_print=False), keywords)
    except RecursionError:
        # Crazy or maliciously crafted website that makes us crash, skip it
        pass

    return None, None


# Data Tokenization

To tokenize the XML content into words, we extract the text content from the DOM nodes (parsed here with `lxml`) and then tokenize that text using appropriate techniques.

## Punkt tokenizer

The Punkt tokenizer is a pre-trained unsupervised machine learning tokenizer available in the NLTK (Natural Language Toolkit) library. It is designed specifically for tokenizing natural language text, capable of handling various punctuation patterns and ambiguous word boundaries. It uses a combination of unsupervised and supervised learning techniques to determine sentence boundaries and word tokenization, and has been trained on large corpora and can handle a wide range of languages and text genres.

Punkt is a widely used and reliable tool for tokenizing natural language text in many NLP (Natural Language Processing) tasks, including text analysis, information retrieval, and machine learning algorithms that require tokenized input, but it must be downloaded first:

In [6]:
import nltk
# Fetch the NLTK resources needed below: the Punkt tokenizer data described
# above and the stop-word lists read via nltk.corpus.stopwords.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/codespace/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [7]:
from lxml import etree
import re

def extract_text_from_tags(dom):
    """
    Placeholder so that DOMElementInfo.try_extract has a name to call.

    It only parses ``dom`` (raising if the markup cannot be parsed) and
    returns None; the same name is defined again, with a full implementation,
    further down the notebook.
    """
    # This method is a stub. It is defined in the next cell.
    etree.fromstring(dom)

class DOMElementInfo:
    """
    Plain-Python mirror of an lxml element tree used by the pipeline stages.

    Attributes
    ----------
    element_name : str
        Tag name of the mirrored element ("" for an empty instance).
    attributes : dict
        Copy of the element's attributes.
    user_data : str or list
        Initially the element serialised as markup; the pipeline stages
        replace it with lists (tokens, scored terms, identifiers).
    children : list of DOMElementInfo
        Mirrors of the child elements, in document order.
    """

    # Upper bound on single-character repairs attempted by try_extract.
    MAX_REPAIR_ATTEMPTS = 100

    def __init__(self, element=None):
        if isinstance(element, etree._Element):
            self.element_name = element.tag
            self.attributes = dict(element.attrib.items())
            # with_tail=False: including the text that follows the element
            # would yield "<b>x</b> tail", which cannot be parsed back as a
            # single document later on.
            self.user_data = etree.tostring(element, encoding="unicode",
                                            pretty_print=False, with_tail=False)
            # Comments and processing instructions have a non-string tag and
            # cannot be rebuilt by to_xml(), so only real elements are kept.
            self.children = [DOMElementInfo(child) for child in element
                             if isinstance(child.tag, str)]
        else:
            self.element_name = ""
            self.attributes = {}
            self.user_data = []
            self.children = []

    def to_xml(self):
        """
        Rebuild an lxml element tree from this node and its descendants.

        Non-empty ``user_data`` is assumed to be a list of strings; its unique,
        non-empty entries are written to the ``templaterules-keywords``
        attribute, comma separated, in first-seen order.
        """
        element = etree.Element(self.element_name)
        for attr, value in self.attributes.items():
            element.set(attr, value)
        # Write user data - assume list
        if self.user_data:
            # dict.fromkeys de-duplicates while keeping a stable order; a set
            # would make the attribute value differ from run to run.
            unique_keywords = dict.fromkeys(keyword for keyword in self.user_data if keyword)
            element.set("templaterules-keywords", ','.join(unique_keywords))
        for child_info in self.children:
            element.append(child_info.to_xml())
        return element

    @staticmethod
    def remove_xml_declaration(input_string):
        """Return ``input_string`` with every ``<?xml ... >`` substring removed."""
        pattern = r"<\?xml.*?\>"
        return re.sub(pattern, '', input_string)

    @staticmethod
    def try_extract(dom):
        """
        Repair ``dom`` until ``extract_text_from_tags`` can parse it.

        Each time parsing fails with an ``XMLSyntaxError`` whose message names
        a line and column, the character at that position is removed and the
        parse is retried.

        Returns the (possibly repaired) markup string.

        Raises
        ------
        RuntimeError
            After more than ``MAX_REPAIR_ATTEMPTS`` attempts.
        lxml.etree.XMLSyntaxError
            When the error message carries no usable position.
        """
        attempts = 0
        while True:
            attempts += 1
            if attempts > DOMElementInfo.MAX_REPAIR_ATTEMPTS:
                # Give up
                raise RuntimeError("This DOM is impossible to resolve")
            try:
                extract_text_from_tags(dom)
                return dom
            except etree.XMLSyntaxError as e:
                line, column = DOMElementInfo.extract_line_column_from_xml_error(str(e))
                if not line or not column:
                    raise
                dom = DOMElementInfo.remove_char_at_position(dom, line, column)

    @staticmethod
    def extract_line_column_from_xml_error(error_message):
        """
        Parse ``"line N, column M"`` out of an error message.

        Returns ``(N, M)`` as integers, or ``(None, None)`` when the message
        does not contain that pattern.
        """
        match = re.search(r"line (\d+), column (\d+)", error_message)
        if match is None:
            return None, None
        return int(match.group(1)), int(match.group(2))

    @staticmethod
    def remove_char_at_position(text, line_number, column_number):
        """
        Return ``text`` without the character at the given 1-based line and
        column. Positions outside the text leave it unchanged.
        """
        lines = text.split('\n')

        if line_number <= 0 or line_number > len(lines):
            return text

        line_index = line_number - 1

        if column_number <= 0 or column_number > len(lines[line_index]):
            return text

        # Offset of the target character in the original string: the length of
        # every preceding line plus one newline each, plus the column offset.
        char_index_to_remove = sum(len(lines[i]) + 1 for i in range(line_index)) + column_number - 1

        return text[:char_index_to_remove] + text[char_index_to_remove + 1:]



In [8]:
import re

from nltk.corpus import stopwords
from lxml import etree

def extract_text_from_tags(dom):
    """
    Collect the text of a serialised element, grouped by emphasis level.

    The direct children of the root are inspected: <title> starts at level 0,
    <h1>..<h6> at levels 1..6 and every other tag at the "other" level. The
    text of each level is concatenated and the levels are joined in the order
    title, h1, ..., h6, other.

    Parameters
    ----------
    dom : str or bytes
        Markup of a single element, parsed with ``etree.fromstring``.

    Returns
    -------
    str
        The collected text, levels separated by single spaces.

    Raises
    ------
    lxml.etree.XMLSyntaxError
        If ``dom`` cannot be parsed.
    """
    root = etree.fromstring(dom)

    # Slots 0..6 hold title and h1..h6 text; the last slot holds everything else.
    other_slot = 7
    buckets = [[] for _ in range(other_slot + 1)]

    def add_text(value, level):
        if value and isinstance(value, str):
            slot = level if 0 <= level < other_slot else other_slot
            buckets[slot].append(value.strip())

    def extract_text(node, header_level=-1):
        # Comments and processing instructions have a non-string tag.
        if not isinstance(node.tag, str):
            return

        add_text(node.text, header_level)

        # NOTE(review): the level grows by one per nesting depth, so nested
        # text moves towards the lower-priority slots - confirm this is intended.
        for child in node:
            extract_text(child, header_level + 1)
            # Text that follows a child belongs to this node's content; it was
            # previously dropped because only .text was read.
            add_text(child.tail, header_level)

    start_level = {'title': 0, 'h1': 1, 'h2': 2, 'h3': 3, 'h4': 4, 'h5': 5, 'h6': 6}

    # The root's own leading text was previously ignored, which left elements
    # containing nothing but text without any extracted content.
    add_text(root.text, -1)
    for child in root:
        # Guard against comment/PI nodes, whose .tag has no .lower().
        if isinstance(child.tag, str):
            extract_text(child, start_level.get(child.tag.lower(), -1))
        add_text(child.tail, -1)

    return ' '.join(' '.join(parts).strip() for parts in buckets)


# NLTK's English stop-word list as a set; tokenize_dom() drops any token whose
# lower-cased form is in it.
stopwords_list = set(stopwords.words('english'))


def tokenize_dom(dom, keywords):
    """
    Turn the markup of one element into a filtered list of word tokens.

    Parameters
    ----------
    dom : str
        Serialised element, passed to ``extract_text_from_tags``.
    keywords : iterable of str
        Site-name keywords to drop from the result (matched case-insensitively).

    Returns
    -------
    list of str
        Tokens in text order. Tokens written entirely in upper case keep their
        case, all others are lower-cased. An empty list is returned when the
        markup cannot be parsed.
    """
    try:
        text = extract_text_from_tags(dom)
    except etree.XMLSyntaxError:
        return []
    except AttributeError:
        # Because cython-function-or-method in element.tag nonsense
        return []

    # Compare case-insensitively: ALL CAPS tokens keep their case below, so a
    # site name written in capitals would otherwise slip past the filter.
    blocked = {str(keyword).lower() for keyword in keywords}

    tokens = []
    for token in nltk.word_tokenize(text):
        # Keep only tokens made of letters, digits and hyphens
        if not re.match(r'^[-a-zA-Z0-9]+$', token):
            continue
        # Drop pure numbers and one-character tokens
        if token.isdigit() or len(token) <= 1:
            continue
        lowered = token.lower()
        # Drop common stopwords and the website name(s)
        if lowered in stopwords_list or lowered in blocked:
            continue
        # Convert titlecase tokens to lowercase, except for ALL CAPS tokens
        tokens.append(token if token.isupper() else lowered)

    return tokens


# Token weights

### This has been removed from the calculation because it kept producing repetitive n-grams as the most common tokens.

Next we are going to prepare the vectorizers by inserting more tokens based on how close to the beginning of the list the token is. Because of the way the list is ordered, the title will be more emphasized than the headings, which will be successively more emphasized than the paragraphs and other elements.

We will use an exponential distribution with $\lambda = t_I$ with $t_I$ representing the number of tokens at the beginning of the list that are interesting (and to be boosted). $t_I$ will be hardcoded to $\min(\frac{t}{10}, 25)$ in all training data, where $t$ is the total number of tokens. This is to accommodate small webpages with small token lists close to $25$.

Consequently we are going to use as our horizontal scaling (expanding specifically) factor $\frac{1}{\lambda^{2}}$.

In [9]:
import numpy as np

def flatten(l):
    """Concatenate the items of every sub-iterable of ``l`` into one new list."""
    merged = []
    for sublist in l:
        merged.extend(sublist)
    return merged

def weight_tokens(tokenized_text):
    """
    Repeat each token according to an exponentially decaying weight.

    The token at 1-based position ``x`` is emitted
    ``int(lam * exp(-lam / interesting_tokens**2 * x) + 1)`` times, where
    ``lam == interesting_tokens == min(len(tokenized_text) / 10, 25)``.
    Returns a new list; the input is not modified.
    """
    interesting_tokens = min(len(tokenized_text) / 10, 25)
    lam = interesting_tokens

    weighted = []
    for position, token in enumerate(tokenized_text, start=1):
        # Fractional repetitions are not possible, so the count is truncated
        # to an integer.
        repeats = int(lam * np.exp(-lam * 1/(interesting_tokens**2)*position) + 1)
        weighted.extend([token] * repeats)
    return weighted


## Token vectorizers

At this point, we have a choice between two machine learning algorithms for classifying the data - we can use Bag of Words, which just counts the frequency of the terms in the document, or we can use TF/IDF, which stands for *Term Frequency times the Inverse Document Frequency* (the "IDF" being the *inverse* of the number of documents in which the term appears).

TF/IDF is more readily compared between different documents and so this is the metric that we will settle with. We will consider only terms with a frequency of 0.1 or greater.

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

def train_tokens_model(weighted_text, threshold=0.1, top_n=3):
    """
    Score the 1- to 3-grams of a token list with TF-IDF and return the best ones.

    The vectorizer is fitted on a single document made by joining
    ``weighted_text`` with spaces.

    Parameters
    ----------
    weighted_text : iterable of str
        Tokens of one element.
    threshold : float, optional
        Minimum TF-IDF score a term needs to be considered (default 0.1).
    top_n : int, optional
        Maximum number of terms returned (default 3).

    Returns
    -------
    list of tuple
        Up to ``top_n`` ``(term, score)`` pairs, highest score first.

    Notes
    -----
    The callers in this notebook catch ``ValueError`` around this function,
    presumably for inputs the vectorizer cannot fit (e.g. no tokens at all).
    """
    # Calculate TF-IDF
    tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,3))
    tfidf = tfidf_vectorizer.fit_transform([" ".join(weighted_text)])

    # Pair every term with its score in the (only) document
    feature_names = tfidf_vectorizer.get_feature_names_out()
    scores = tfidf.toarray()[0]
    candidates = [(term, score) for term, score in zip(feature_names, scores) if score >= threshold]

    # Highest scores first
    return sorted(candidates, key=lambda pair: pair[1], reverse=True)[:top_n]



# Semantic type inference

Using the semantic type Wikidata we have imported, now we can turn each keyword into a semantic identifier representing a structured type as defined in [XML Schema](https://www.w3.org/XML/Schema).

In [11]:
from scipy.sparse import csr_matrix
from scipy import sparse
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

def collect_non_zero_indices(vector1, vector2):
    """
    Select the entries of both vectors at the positions where ``vector1`` is
    non-zero.

    Returns ``(indices, vector1[indices], vector2[indices])`` where ``indices``
    is the first index array reported by ``np.nonzero(vector1)``.
    """
    selected = np.nonzero(vector1)[0]
    return selected, vector1[selected], vector2[selected]

def find_most_similar_identifier_preprocess(dictionary):
    """
    Vectorise the dictionary's values once so lookups can reuse the result.

    Returns ``(vectors, vectorizer, norm_vectors)``: the count matrix of the
    lower-cased values (one row per dictionary entry, in dictionary order),
    the fitted ``CountVectorizer`` and a sparse column holding the norm of
    each row.
    """
    lowered_titles = [title.lower() for title in dictionary.values()]

    # Fit the vocabulary and count the words of every title in one go
    vectorizer = CountVectorizer()
    vectors = vectorizer.fit_transform(lowered_titles)

    # One norm per row, stored as a sparse column vector
    row_norms = sparse.linalg.norm(vectors, axis=1)
    norm_vectors = csr_matrix(row_norms).transpose()

    return vectors, vectorizer, norm_vectors

def find_most_similar_identifier(input_word, vectors, vectorizer, norm_vectors, dictionary):
    """
    Find the dictionary key whose title is most similar to ``input_word``.

    Similarity is the cosine between the count vector of ``input_word`` and
    the count vector of each title, evaluated only for titles that share at
    least one vocabulary word with the input.

    Parameters
    ----------
    input_word : str
        Keyword (possibly several words) to look up.
    vectors, vectorizer, norm_vectors
        The values returned by ``find_most_similar_identifier_preprocess``
        for the same ``dictionary``.
    dictionary : dict
        Mapping whose keys are the identifiers; its iteration order must match
        the row order of ``vectors``.

    Returns
    -------
    str
        The best-matching key, or ``""`` when nothing in the dictionary shares
        a word with the input or the best key is not of the form ``Q<digits>``.
    """
    from itertools import islice

    # Transform the input word using the pre-fitted vectorizer
    input_vector = vectorizer.transform([input_word])

    dot = csr_matrix.dot(vectors, input_vector.T)

    if not dot.nnz:
        return "" # Nothing in the dictionary is similar to it.

    non_zero_indices, dot, norm_vectors = collect_non_zero_indices(dot, norm_vectors)

    norm_input = csr_matrix(sparse.linalg.norm(input_vector, axis=1)).transpose()

    # Calculate cosine similarity scores between input_vector and phrase vectors
    similarity_scores = csr_matrix(dot / (norm_vectors * norm_input))

    # Find the index of the most similar phrase
    most_similar_index = int(non_zero_indices[csr_matrix.argmax(similarity_scores)])

    # Walk to the key at that position instead of copying every key into a
    # list on each call.
    most_similar_identifier = next(islice(dictionary, most_similar_index, most_similar_index + 1))

    # In case the dictionary give us a non-identifier
    if not re.match(r'^Q\d+$', most_similar_identifier):
        return ''

    return most_similar_identifier



# Scalability

If we run each processing stage one at a time with all of the input elements at once, it is highly likely the kernel will fail before that particular stage is complete. During tests, we found that the dataset of around 15,000 web pages would consume about 8 gigabytes of memory when fully tokenized. Production datasets will almost certainly be an order of magnitude larger than this figure, so it requires an alternate approach to processing.

Instead of running each processing stage one at a time, we can split the input data into batches of reasonable sizes and run the entire pipeline on the batches one at a time, beginning and ending with disk I/O access.

In [24]:
def process_html_with_progress(html_file):
    """
    Call ``process_html`` and turn its recoverable failures into an empty result.

    Returns whatever ``process_html`` returns, or ``(None, None)`` when it
    raises ``RuntimeError`` or ``ValueError``. A pair is returned in every
    case because the caller unpacks the result into two names.
    """
    try:
        return process_html(html_file)
    except (RuntimeError, ValueError):
        return None, None


In [25]:
def gather_element_info(element):
    """
    Build a ``DOMElementInfo`` tree from a serialised document.

    The XML declaration is stripped, the markup is repaired with
    ``DOMElementInfo.try_extract`` and the result is parsed and mirrored.
    """
    without_declaration = DOMElementInfo.remove_xml_declaration(element)
    repaired_markup = DOMElementInfo.try_extract(without_declaration)
    return DOMElementInfo(etree.fromstring(repaired_markup))

In [26]:
from copy import copy

def tokenize_dom_with_progress(dom_info, keywords):
    """
    Return a copy of the element tree in which every non-empty ``user_data``
    has been replaced by the token list produced by ``tokenize_dom``.

    The input tree is left untouched. Elements whose tokenization raises
    ``RuntimeError`` get an empty token list.
    """
    def process_element_with_progress(element_info):
        element_info = copy(element_info)
        try:
            if element_info.user_data:
                element_info.user_data = tokenize_dom(element_info.user_data, keywords)
        except RuntimeError:
            element_info.user_data = []

        # copy() is shallow, so the copy still shares the original's children
        # list; build a new list instead of assigning into the shared one.
        element_info.children = [process_element_with_progress(child)
                                 for child in element_info.children]
        return element_info

    # Process DOM elements recursively
    return process_element_with_progress(dom_info)

In [27]:
from copy import copy

def weight_tokens_with_progress(tokenized_text):
    """
    Return a copy of the element tree in which every ``user_data`` token list
    has been passed through ``weight_tokens``. The input tree is left untouched.
    """
    def weight_tokens_with_progress_recursive(element_info):
        element_info = copy(element_info)
        element_info.user_data = weight_tokens(element_info.user_data)

        # copy() is shallow: assign a fresh children list so the original
        # tree's list is not overwritten element by element.
        element_info.children = [weight_tokens_with_progress_recursive(child)
                                 for child in element_info.children]
        return element_info

    # Process elements recursively
    return weight_tokens_with_progress_recursive(tokenized_text)

In [28]:
from copy import copy

def train_tokens_model_with_progress(weighted_text):
    """
    Return a copy of the element tree in which every ``user_data`` token list
    has been replaced by the result of ``train_tokens_model``.

    Elements for which the model raises ``ValueError`` get an empty list.
    The input tree is left untouched.
    """
    def train_tokens_model_with_progress_recursive(element_info):
        element_info = copy(element_info)
        try:
            element_info.user_data = train_tokens_model(element_info.user_data)
        except ValueError:
            element_info.user_data = []

        # copy() is shallow: assign a fresh children list so the original
        # tree's list is not overwritten element by element.
        element_info.children = [train_tokens_model_with_progress_recursive(child)
                                 for child in element_info.children]
        return element_info

    # Process elements recursively
    return train_tokens_model_with_progress_recursive(weighted_text)


In [29]:
from copy import copy

def find_most_similar_identifier_with_progress(top_tokens_tfidf_data, vectors, vectorizer, norm_vectors, entities):
    """
    Return a copy of the element tree in which every scored term in
    ``user_data`` has been replaced by its most similar identifier.

    Each ``user_data`` entry is expected to be indexable, with the term at
    position 0. Elements for which the lookup raises ``ValueError`` get an
    empty list. The input tree is left untouched.
    """
    def find_most_similar_identifier_with_progress_recursive(element_info):
        element_info = copy(element_info)
        try:
            element_info.user_data = [
                find_most_similar_identifier(scored_term[0], vectors, vectorizer, norm_vectors, entities)
                for scored_term in element_info.user_data
            ]
        except ValueError:
            element_info.user_data = []

        # copy() is shallow: assign a fresh children list so the original
        # tree's list is not overwritten element by element.
        element_info.children = [find_most_similar_identifier_with_progress_recursive(child)
                                 for child in element_info.children]
        return element_info

    # Process elements recursively
    return find_most_similar_identifier_with_progress_recursive(top_tokens_tfidf_data)


In [30]:
import concurrent.futures
import multiprocessing
from tqdm import tqdm
import re

# Number of worker threads handed to the ThreadPoolExecutor below.
num_cores = multiprocessing.cpu_count()

# tqdm bar layout: percentage with three decimals, the bar, counts and timing.
progress_format = "{percentage:.3f}% |{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}{postfix}]"


def pipeline_with_progress(html_file):
    """
    Run the whole pipeline for one page and write the result to ``outputs/``.

    Stages: clean the HTML, mirror it as a ``DOMElementInfo`` tree, tokenize
    every element, pick the top TF-IDF terms and map them to identifiers.
    The tagged tree is written to ``outputs/<name>.xml``.

    Relies on the module-level names ``progress_bar``, ``vectors``,
    ``vectorizer``, ``norm_vectors`` and ``entities`` being set by the driver
    cell. The progress bar is advanced exactly once per call.

    Returns the resulting lxml element, or None when the page was skipped.
    """
    import os

    try:
        processed = process_html_with_progress(html_file)
        if not processed:
            return None
        site_html, keywords = processed
        if not site_html:
            return None

        try:
            dom_info = gather_element_info(site_html)
        except (RuntimeError, etree.XMLSyntaxError):
            # Markup that could not be repaired into something parseable
            return None
        if not dom_info:
            return None

        tokenized_text = tokenize_dom_with_progress(dom_info, keywords)
        if not tokenized_text:
            return None

        # Token weighting (weight_tokens_with_progress) is intentionally not applied.
        top_tokens_tfidf_data = train_tokens_model_with_progress(tokenized_text)
        if not top_tokens_tfidf_data:
            return None

        most_similar_identifiers = find_most_similar_identifier_with_progress(
            top_tokens_tfidf_data, vectors, vectorizer, norm_vectors, entities)
        if not most_similar_identifiers:
            return None

        try:
            result = most_similar_identifiers.to_xml()
            # splitext also handles upper-case ".HTML" names, which
            # list_html_files accepts.
            xml_file = os.path.splitext(html_file)[0] + '.xml'
            os.makedirs("outputs", exist_ok=True)
            with open(f"outputs/{xml_file}", "w", encoding="utf-8") as f:
                f.write(etree.tostring(result, encoding="unicode", pretty_print=False))
        except TypeError as e:
            print(f"Skipping {html_file}: could not build the output XML ({e})")
            result = None
        return result
    finally:
        # One tick per page, whichever way the function is left
        progress_bar.update(1)

def divide_into_batches(lst, batch_size):
    """
    Split ``lst`` into consecutive slices of at most ``batch_size`` items.

    The last slice may be shorter; an empty input yields an empty list.
    """
    batches = []
    for start in range(0, len(lst), batch_size):
        batches.append(lst[start:start + batch_size])
    return batches

# Feel free to change this depending on your memory size.
batch_size = 64

# Number of pages handled by this demonstration run; set to None to process
# every page. Defined once so the batches and the progress total always agree.
max_files = 5
files_to_process = html_files[:max_files]

# Divide the work into batches to be processed at once.
# Maximum memory usage is the amount the largest dataset in the batch uses.
data_batches = divide_into_batches(files_to_process, batch_size)

# Use tqdm to create a progress bar for the loop
with tqdm(total=len(files_to_process), bar_format=progress_format) as progress_bar:
    vectors, vectorizer, norm_vectors = find_most_similar_identifier_preprocess(entities)
    # One pool of worker threads (one per core) is reused for every batch
    with concurrent.futures.ThreadPoolExecutor(max_workers=num_cores) as executor:
        for batch in data_batches:
            # list() waits for the whole batch before the next one starts;
            # only the most recent batch's results are kept in memory.
            results = list(executor.map(pipeline_with_progress, batch))


100.000% |██████████| 5/5 [01:20<00:00, 16.13s/it]
