In [1]:
import os
from bs4 import BeautifulSoup
from markdownify import markdownify as md

from llama_parse import LlamaParse
from llama_index.core import SimpleDirectoryReader
import time
import nest_asyncio
from dotenv import load_dotenv


In [2]:
# Patch asyncio so event loops can be nested. Presumably needed because the
# notebook kernel already runs a loop while LlamaParse issues async calls
# -- TODO confirm.
nest_asyncio.apply()
# Load variables from a .env file into the process environment (presumably
# this is where the LlamaParse API key comes from -- TODO confirm).
load_dotenv()


True

In [3]:
# Directory holding the crawled HTML pages that the conversion loop reads.
# Keep the trailing slash: file names are appended by plain string concatenation.
html_dir = "../data/crawl/html/"
# Directory receiving the intermediate .txt files and the final .md files.
# Keep the trailing slash here as well, for the same reason.
outpath = "../data/out_sep_4/from_html/"


## Helper functions


In [4]:
def clean_html(soup):
    """Strip non-content markup and return the page's main content node.

    The page title is read first (it lives inside ``<head>``, which is removed
    below) and, when available, re-inserted as an ``<h1>`` at the top of the
    returned container.

    Args:
        soup: Parsed document (a BeautifulSoup object). It is modified in place.

    Returns:
        ``soup.main`` if present, otherwise ``soup.body``; when neither exists
        the (cleaned) ``soup`` itself is returned.
    """
    noise_tags = [
        "head",
        "style",
        "script",
        "img",
        "svg",
        "meta",
        "link",
        "iframe",
        "noscript",
    ]

    # Grab the title before <head> is destroyed.
    page_title = None
    if soup.title:
        page_title = soup.title.string

    # Drop everything that carries no readable content.
    for element in soup(noise_tags):
        element.decompose()

    # Prefer <main>; fall back to <body>, then to the whole document.
    container = soup.main or soup.body
    if not container:
        return soup

    if page_title:
        heading = soup.new_tag("h1")
        heading.string = page_title
        container.insert(0, heading)

    return container


In [5]:
def convert_html_to_markdown(html_content):
    """Turn a raw HTML string into Markdown.

    The HTML is parsed with the built-in ``html.parser`` backend, reduced to
    its main content by ``clean_html`` and then converted with markdownify.

    Args:
        html_content: HTML source of a single page.

    Returns:
        The Markdown rendering of the cleaned page.
    """
    parsed_page = BeautifulSoup(html_content, "html.parser")
    cleaned_page = clean_html(parsed_page)
    return md(str(cleaned_page))


# Conversion process


In [6]:
# Build the work list: one dict per crawled HTML page, sorted by file name.
# Only "*.html" entries are kept, because the later steps derive the output
# names with .replace(".html", ...); stray entries returned by os.listdir
# (hidden files, sub-directories, ...) would otherwise break them.
filenames = [
    {"name": name}
    for name in sorted(os.listdir(html_dir))
    if name.endswith(".html")
]
print("len:", len(filenames))


len: 188


In [22]:
# Convert every crawled HTML page to Markdown text, remember the size of the
# result and store it as an intermediate .txt file in `outpath`.
os.makedirs(outpath, exist_ok=True)  # the output directory may not exist on a first run

for filename in filenames:
    # Explicit UTF-8 so the result does not depend on the platform's locale
    # encoding (assumes the crawl stored the pages as UTF-8 -- TODO confirm).
    with open(html_dir + filename["name"], "r", encoding="utf-8") as src:
        html_content = src.read()

    print(filename["name"])
    # Convert the HTML content to Markdown
    markdown_content = convert_html_to_markdown(html_content)

    # Recorded so the LlamaParse step can detect truncated output later on.
    filename["size"] = len(markdown_content)

    # Save the Markdown content to a file
    txt_path = outpath + filename["name"].replace(".html", ".txt")
    with open(txt_path, "w", encoding="utf-8") as dst:
        dst.write(markdown_content)


-Admission-Requirements.html
-After-PathwayConnect.html
-Answers-to-Your-Questions-about-Ecclesiastical-Endorsement.html
-Application-Process.html
-Assistance-for-Students-with-Disabilities.html
-BYU-Idaho-Course-Exceptions.html
-BYU-Pathway-Support.html
-BYU-Pathway-Worldwide-Website.html
-Common-Misconceptions-about-Choosing-Certificates.html
-Communication-Resources.html
-Confidentiality-of-Student-Records.html
-Course-Auditing.html
-Course-Registration-Process.html
-Course-Retakes-and-Returning-Students.html
-Courses--Sequences.html
-Credits--Courses.html
-Email.html
-English-Placement-Assessment.html
-Facilitating-the-Gathering.html
-Gathering-Calendar.html
-Gathering-Location.html
-Gathering-Schedule.html
-Gathering-Standards.html
-Gatherings.html
-Grading-System--GPA.html
-Honor-Code.html
-Institutes-of-Religion--PEF-Self-Reliance-Centers.html
-Leadership-and-Roles.html
-Local-Adaption.html
-Local-Options.html
-Marketing-Materials.html
-Non-Participating-Students.html
-Online-Ce

# Convert with LlamaParse


In [9]:
# Prompt sent to LlamaParse: keep structure, links, lists and emphasis of the
# input intact and avoid inventing or dropping content.
PARSING_INSTRUCTION = (
    "Convert the provided text into accurate and well-structured Markdown format, strictly preserving the original structure. "
    "Use headers from H1 to H3 only where they naturally occur in the text, and do not create additional headers or modify existing ones. "
    "Do not split the text into multiple sections or alter the sequence of content. "
    "Detect bold, large, or all-uppercase text as headers only if they represent a natural section break in the original text. "
    "Preserve all links, ensuring that they remain correctly formatted and in their original place in the text. "
    "Maintain bullet points and numbered lists with proper indentation to reflect any nested lists, ensuring list numbers remain in sequence. "
    "If the text is not a header, ensure that bold and italic text is properly formatted using double **asterisks** for bold and single *asterisks* for italic. "
    "Detect and correctly format blockquotes using the '>' symbol for any quoted text, but do not reformat text that is already in correct Markdown format. "
    "Respect the original line breaks and text flow, avoiding unnecessary splits, merges, or reordering of content. "
    "If any tables are detected, parse them as a title (bold header) followed by list items, but do not reformat existing Markdown tables. "
    "Merge identical headers only if they represent the same section and their content is identical, ensuring no changes to the order of the text. "
    "Do not enclose fragments of code/Markdown or any other content in triple backticks unless they are explicitly formatted as code blocks in the original text. "
    "Ensure that the final output is a clean, concise Markdown document that closely reflects the original text's intent and structure, without adding or omitting any content."
)

# LlamaParse client used for all files; "text" is the other available result type.
parser = LlamaParse(
    parsing_instruction=PARSING_INSTRUCTION,
    result_type="markdown",
)


In [10]:
# Extension -> parser mapping handed to SimpleDirectoryReader below, so that
# every ".txt" input is processed by the LlamaParse `parser`.
file_extractor = {".txt": parser}


def parse_txt_to_md(file):
    """Run one intermediate ``.txt`` file through LlamaParse and save it as ``.md``.

    Args:
        file: Name of the original HTML file (e.g. ``"page.html"``). The
            matching ``.txt`` file is read from ``outpath`` and the ``.md``
            result is written next to it.

    Returns:
        Total number of characters across all parsed documents.
    """
    load_file = outpath + file.replace(".html", ".txt")
    out_name = outpath + file.replace(".html", ".md")

    documents = SimpleDirectoryReader(
        input_files=[load_file], file_extractor=file_extractor
    ).load_data()

    # Explicit UTF-8: with the locale default, any character outside the
    # platform code page would make the write fail.
    with open(out_name, "w", encoding="utf-8") as f:
        for doc in documents:
            f.write(doc.text)
            f.write("\n\n")
    print(out_name, "saved.")

    return sum(len(doc.text) for doc in documents)


In [23]:
# Parse every intermediate .txt file with LlamaParse, retrying when the call
# fails or when the result is suspiciously shorter than the input.
MAX_RETRIES = 2  # extra attempts after the first one
SIZE_TOLERANCE = 2000  # parsed output may be at most this many characters shorter
RETRY_DELAY_SECONDS = 2  # pause between attempts

for file in filenames:
    name = file["name"]
    expected_size = file["size"]
    parsed_ok = False

    for attempt in range(1 + MAX_RETRIES):
        if attempt:
            time.sleep(RETRY_DELAY_SECONDS)
        tries_left = MAX_RETRIES - attempt

        try:
            size = parse_txt_to_md(name)
        except Exception as exc:
            # Best effort: report the failure and move on to the next attempt
            # instead of aborting the whole run.
            print(f"Error processing file {name} - {tries_left} tries left. ({exc!r})")
            continue

        print(
            f"size parsed: {size}, file size: {expected_size}, "
            f"diff: {expected_size - size}"
        )
        if size >= expected_size - SIZE_TOLERANCE:
            parsed_ok = True
            break
        print(f"Parsed output too small for file {name} - {tries_left} tries left.")

    # The flag reflects the actual outcome; it is also reset on a re-run.
    file["llama_error"] = not parsed_ok

    if parsed_ok:
        # delete the txt file (kept on failure so the file can be retried)
        os.remove(outpath + name.replace(".html", ".txt"))
    

Started parsing the file under job_id 0571821b-ac69-47e1-9996-b44b1669de3f
../data/out_sep_4/from_html/-Admission-Requirements.md saved.
size parsed: 5033, file size: 5813
Started parsing the file under job_id 99172dce-d3b0-441e-83f6-8d2ae97adfbe
../data/out_sep_4/from_html/-After-PathwayConnect.md saved.
size parsed: 302, file size: 405
Started parsing the file under job_id 8a193777-a946-4eb3-b839-5bd1bc7e48c5
../data/out_sep_4/from_html/-Answers-to-Your-Questions-about-Ecclesiastical-Endorsement.md saved.
size parsed: 4975, file size: 5227
Started parsing the file under job_id 1fb28de8-8f4c-44f0-af33-a80979a76003
../data/out_sep_4/from_html/-Application-Process.md saved.
size parsed: 3512, file size: 4080
Started parsing the file under job_id 30a162f9-d29a-4bec-9273-8009c1248e28
../data/out_sep_4/from_html/-Assistance-for-Students-with-Disabilities.md saved.
size parsed: 2674, file size: 2857
Started parsing the file under job_id 8c6560ed-82cf-447e-8e43-5cce43d2ba7a
../data/out_sep_4

In [24]:
# Count the files whose LlamaParse run was flagged as failed.
error_files = list(filter(lambda entry: entry.get("llama_error"), filenames))
print("Error files:", len(error_files))


Error files: 1
