In [1]:
import os
from bs4 import BeautifulSoup
from markdownify import markdownify as md

from llama_parse import LlamaParse
from llama_index.core import SimpleDirectoryReader
import time
import nest_asyncio
from dotenv import load_dotenv


[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/isaiaszc/pathway/pathway-
[nltk_data]     indexer/.venv/lib/python3.12/site-
[nltk_data]     packages/llama_index/core/_static/nltk_cache...
[nltk_data]   Package punkt_tab is already up-to-date!


In [2]:
# Presumably needed so LlamaParse's async calls can run inside the notebook's
# already-running event loop — TODO confirm against the llama_parse docs.
nest_asyncio.apply()
# Read variables from a local .env file into the process environment
# (presumably this is where the LlamaParse API key comes from — verify).
# Kept as the last expression so the cell displays its return value.
load_dotenv()


True

In [3]:
# Input and output locations for this crawl snapshot. Both values keep their
# trailing slash because later cells build file paths by string concatenation.
_SNAPSHOT_ROOT = "../data/data_16_09_24/"
html_dir = f"{_SNAPSHOT_ROOT}crawl/html/"
outpath = f"{_SNAPSHOT_ROOT}out/from_html/"


## Helper functions


In [4]:
def clean_html(soup):
    """Strip non-content markup from a parsed page and surface its title.

    The title string is read first because the ``head`` element (which holds
    the title) is among the tags removed afterwards. When the page has a
    ``main`` or ``body`` container and a title string, the title is inserted
    as an ``h1`` at the very start of that container.

    Args:
        soup: Parsed document (a BeautifulSoup object). It is modified in place.

    Returns:
        The ``main`` element if present, otherwise ``body``; if neither
        exists, the whole ``soup`` is returned as a fallback.
    """
    # Capture the title text before the head element is destroyed below.
    title_node = soup.title
    page_title = title_node.string if title_node else None

    # Drop everything that carries no readable content.
    unwanted = [
        "head",
        "style",
        "script",
        "img",
        "svg",
        "meta",
        "link",
        "iframe",
        "noscript",
    ]
    for node in soup(unwanted):
        node.decompose()

    # Prefer the main container; fall back to body.
    container = soup.main or soup.body
    if not container:
        return soup

    if page_title:
        # Prepend the page title as a top-level heading.
        heading = soup.new_tag("h1")
        heading.string = page_title
        container.insert(0, heading)

    return container


In [5]:
def convert_html_to_markdown(html_content):
    """Turn raw HTML into Markdown text.

    The HTML is parsed with the ``html.parser`` backend, reduced to its
    content container by ``clean_html``, serialised back to a string and
    handed to markdownify.

    Args:
        html_content: The HTML document to convert.

    Returns:
        The Markdown produced by markdownify for the cleaned document.
    """
    parsed = BeautifulSoup(html_content, "html.parser")
    cleaned = clean_html(parsed)
    return md(str(cleaned))


# Conversion process


In [6]:
# Collect every crawled HTML page in html_dir, in sorted order so runs are
# reproducible. Each entry is a dict so later cells can attach per-file
# metadata ("size", "llama_error") to it.
# Only *.html entries are kept: anything else (hidden files, subdirectories,
# checkpoints) would break the ".html" -> ".txt"/".md" renaming downstream.
filenames = [
    {"name": name}
    for name in sorted(os.listdir(html_dir))
    if name.endswith(".html")
]
print("len:", len(filenames))


len: 247


In [7]:
# Convert every crawled HTML page to Markdown and store it as a .txt file in
# outpath (the LlamaParse step reads those .txt files afterwards).
# The converted length is recorded on each entry as "size" so the LlamaParse
# step can detect truncated output.

# Make sure the destination exists; open(..., "w") does not create directories.
os.makedirs(outpath, exist_ok=True)

for filename in filenames:
    # Explicit UTF-8 so the result does not depend on the platform's default
    # encoding.
    with open(html_dir + filename["name"], "r", encoding="utf-8") as html_file:
        html_content = html_file.read()

    print(filename["name"])
    # Convert the HTML content to Markdown
    markdown_content = convert_html_to_markdown(html_content)

    filename["size"] = len(markdown_content)

    # Save the Markdown content to a file
    txt_path = outpath + filename["name"].replace(".html", ".txt")
    with open(txt_path, "w", encoding="utf-8") as txt_file:
        txt_file.write(markdown_content)


105adb7a.html
1376e9a9.html
141279b1.html
144bfd06.html
14627860.html
15c11982.html
16ebd247.html
179960e.html
1abc5b56.html
1cb73477.html
1e298fa4.html
1e599075.html
1f430f8f.html
20147eec.html
2047a48a.html
215c6547.html
2212bf68.html
267d52f4.html
27c0f092.html
2800d21b.html
2a538add.html
2d33fb5d.html
2dfe0bae.html
31ec975c.html
320acbd2.html
320f97b2.html
324adb56.html
34407a89.html
34f469b2.html
368e3bc1.html
37753287.html
389b94fc.html
3ac3f114.html
3dfa6235.html
3ec4ba7a.html
4208553.html
45762537.html
470a7e1f.html
49b390b3.html
4b0388c9.html
4bffd9bb.html
4fa96053.html
4fc47ef8.html
4fd94cb4.html
50baadf.html
50c7a511.html
50d50453.html
511f8bf1.html
5131b4dd.html
5323b8b5.html
533483d3.html
53edec91.html
54668001.html
54c54d4b.html
552a812e.html
5542b9ec.html
570af0fb.html
5740941c.html
5897ad76.html
5a02ac08.html
5c128f58.html
5d877af6.html
5dad35ba.html
5e9f6e2b.html
60307bad.html
613127fa.html
613dffc5.html
617e58ab.html
641bb45.html
667369ce.html
67e14179.html
67f6ff37.h

KeyboardInterrupt: 

# Convert with LlamaParse


In [8]:
# Instruction text sent to LlamaParse. The adjacent string literals are joined
# by implicit concatenation, so each sentence keeps its trailing space.
_PARSING_INSTRUCTION = (
    "Convert the provided text into accurate and well-structured Markdown format, strictly preserving the original structure. "
    "Use headers from H1 to H3 only where they naturally occur in the text, and do not create additional headers or modify existing ones. "
    "Do not split the text into multiple sections or alter the sequence of content. "
    "Detect bold, large, or all-uppercase text as headers only if they represent a natural section break in the original text. "
    "Preserve all links, ensuring that they remain correctly formatted and in their original place in the text. "
    "Maintain bullet points and numbered lists with proper indentation to reflect any nested lists, ensuring list numbers remain in sequence. "
    "If the text is not a header, ensure that bold and italic text is properly formatted using double **asterisks** for bold and single *asterisks* for italic. "
    "Detect and correctly format blockquotes using the '>' symbol for any quoted text, but do not reformat text that is already in correct Markdown format. "
    "Respect the original line breaks and text flow, avoiding unnecessary splits, merges, or reordering of content. "
    "If any tables are detected, parse them as a title (bold header) followed by list items, but do not reformat existing Markdown tables. "
    "Merge identical headers only if they represent the same section and their content is identical, ensuring no changes to the order of the text. "
    "Do not enclose fragments of code/Markdown or any other content in triple backticks unless they are explicitly formatted as code blocks in the original text. "
    "Ensure that the final output is a clean, concise Markdown document that closely reflects the original text's intent and structure, without adding or omitting any content."
)

# LlamaParse client configured to return Markdown
# ("markdown" and "text" are the available result types).
parser = LlamaParse(
    result_type="markdown",
    parsing_instruction=_PARSING_INSTRUCTION,
)


In [9]:
# Hand .txt inputs to the LlamaParse parser when SimpleDirectoryReader loads them.
file_extractor = {".txt": parser}


def parse_txt_to_md(file):
    """Run one converted page through LlamaParse and save the result as .md.

    The input is the ``.txt`` file in ``outpath`` that corresponds to ``file``;
    the output is written next to it with an ``.md`` extension. Each returned
    document is written followed by a blank line.

    Args:
        file: Name of the original HTML file (e.g. ``"105adb7a.html"``).

    Returns:
        Total number of characters across all parsed documents, which the
        caller compares with the pre-parse size to spot truncated results.
    """
    load_file = outpath + file.replace(".html", ".txt")

    documents = SimpleDirectoryReader(
        input_files=[load_file], file_extractor=file_extractor
    ).load_data()

    size = sum(len(doc.text) for doc in documents)

    out_name = f"{outpath}{file.replace('.html', '.md')}"
    # Explicit UTF-8: parsed pages may contain characters that the platform's
    # default encoding cannot represent.
    with open(out_name, "w", encoding="utf-8") as f:
        for doc in documents:
            f.write(doc.text)
            f.write("\n\n")
    print(out_name, "saved.")

    return size


In [None]:
# Run LlamaParse over every converted page. A parse is retried when it raises
# or when its output is much shorter than the Markdown it was given (a sign of
# truncation). Files that never produce an acceptable result are flagged with
# "llama_error" so the next cell can report them.
MAX_ATTEMPTS = 3  # one initial parse plus up to two retries
MAX_SIZE_LOSS = 2000  # characters the parsed output may lose before it counts as truncated
RETRY_DELAY_S = 2  # pause between attempts, in seconds

for file in filenames:
    name = file["name"]
    expected_size = file.get("size")
    if expected_size is None:
        # The HTML conversion cell never reached this file (e.g. it was
        # interrupted), so there is no .txt to parse and no size to compare.
        print(f"Skipping {name}: no converted .txt found, re-run the HTML conversion cell.")
        continue

    parsed_ok = False
    for attempt in range(1, MAX_ATTEMPTS + 1):
        tries_left = MAX_ATTEMPTS - attempt
        try:
            size = parse_txt_to_md(name)
        except Exception as e:
            # Keep going with the remaining files; the failure is recorded below.
            print(f"Error processing file {name}: {e!r} - {tries_left} tries left.")
        else:
            print(f"size parsed: {size}, file size: {expected_size}, diff: {expected_size - size}")
            if size >= expected_size - MAX_SIZE_LOSS:
                parsed_ok = True
                break
            print(f"Parsed output for {name} looks truncated - {tries_left} tries left.")
        if tries_left:
            time.sleep(RETRY_DELAY_S)

    # Always written (True or False) so a re-run clears a stale flag.
    file["llama_error"] = not parsed_ok

    # delete the txt file
    loaded_file = outpath + name.replace(".html", ".txt")
    os.remove(loaded_file)
    

In [None]:
# Report how many files were flagged with "llama_error" by the parsing cell.
error_files = list(filter(lambda entry: entry.get("llama_error"), filenames))
print("Error files:", len(error_files))


# Validation


In [None]:
# Verify that every file from the origin has a matching .md in the output.

origin_files = [entry["name"] for entry in filenames]
output_files = os.listdir(outpath)

# Pair each source name with the .md name it should have produced.
expected_md_names = [name.replace(".html", ".md") for name in origin_files]
missing_files = [
    name
    for name, md_name in zip(origin_files, expected_md_names)
    if md_name not in output_files
]

print("Missing files:", len(missing_files))
