In [11]:
import os
from bs4 import BeautifulSoup
from markdownify import markdownify as md

from llama_parse import LlamaParse
from llama_index.core import SimpleDirectoryReader
import time
import nest_asyncio
from dotenv import load_dotenv


In [12]:
# NOTE(review): presumably needed because the notebook kernel already runs an
# asyncio event loop and the LlamaParse calls below are async under the hood;
# nest_asyncio lets them run anyway — confirm.
nest_asyncio.apply()
# Read settings from a local .env file into the environment.
# NOTE(review): presumably this is where the LlamaParse API key comes from — confirm.
load_dotenv()


True

In [13]:
# Folder the source HTML pages are read from.
html_dir = "../data/html/"
# Folder the intermediate .txt files and the final .md files are written to.
# Keep the trailing "/" on both: code in this notebook builds file paths by
# appending file names directly to these strings.
outpath = "../data/out_2_9/"


## Helper functions


In [14]:
def clean_html(soup):
    """Strip non-content markup and return the node holding the page content.

    The soup is modified in place. If the page has a title, it is re-inserted
    as an ``<h1>`` at the top of the returned content node.

    Args:
        soup: Parsed document (a BeautifulSoup object).

    Returns:
        The ``<main>`` element if present, otherwise ``<body>``; if neither
        exists, the soup itself.
    """
    # Grab the title first: it lives in <head>, which is removed just below.
    title_tag = soup.title
    heading_text = title_tag.string if title_tag else None

    unwanted = [
        "head", "style", "script", "img", "svg", "meta", "link", "iframe", "noscript"
    ]
    for node in soup(unwanted):
        node.decompose()

    container = soup.main or soup.body
    if not container:
        # No recognizable content wrapper: hand back the whole document.
        return soup

    if heading_text:
        heading = soup.new_tag("h1")
        heading.string = heading_text
        container.insert(0, heading)

    return container


In [15]:
def convert_html_to_markdown(html_content):
    """Turn a raw HTML string into Markdown text.

    The HTML is parsed with BeautifulSoup's "html.parser", reduced to its
    content node by ``clean_html``, and then handed to markdownify.

    Args:
        html_content: The HTML document as a string.

    Returns:
        The Markdown produced by markdownify for the cleaned content.
    """
    parsed = BeautifulSoup(html_content, "html.parser")
    cleaned = clean_html(parsed)
    return md(str(cleaned))


# Conversion process


In [16]:
# Use the first 20 HTML pages (in alphabetical order) from html_dir as a test batch.
files = os.listdir(html_dir)
files.sort()
# Keep only .html entries: the later steps derive output names by swapping the
# ".html" suffix, so hidden files or sub-folders would break them.
filenames = [name for name in files if name.endswith(".html")][:20]
print(filenames)


['-Administering-the-Assessment.html', '-Admission-Decisions.html', '-Admission-Requirements.html', '-Admission-to-a-Program-Type.html', '-After-PathwayConnect.html', '-Answers-to-Your-Questions-about-Ecclesiastical-Endorsement.html', '-Application-Deadline.html', '-Application-Process.html', '-Assistance-for-Students-with-Disabilities.html', '-Attendance.html', '-BYU-Idaho-Course-Exceptions.html', '-BYU-Pathway-Area-Managers.html', '-BYU-Pathway-Support.html', '-BYU-Pathway-Worldwide-Communications.html', '-BYU-Pathway-Worldwide-Website.html', '-Blog-Articles.html', '-CES-Honor-Code.html', '-Calculating-Tuition.html', '-Certificates--Degrees.html', '-Cheating.html']


In [17]:
# Convert every selected HTML page to Markdown and stage it in `outpath`.
# The staged files use a .txt extension, which is the extension the LlamaParse
# step further down is registered for.
os.makedirs(outpath, exist_ok=True)  # the output folder may not exist on a fresh checkout

for filename in filenames:
    # Read and write as UTF-8 explicitly so the result does not depend on the
    # operating system's default encoding.
    with open(os.path.join(html_dir, filename), "r", encoding="utf-8") as file:
        html_content = file.read()

    print(filename)
    # Convert the HTML content to Markdown
    markdown_content = convert_html_to_markdown(html_content)

    # Save the Markdown content next to the other staged files
    txt_name = filename.replace(".html", ".txt")
    with open(os.path.join(outpath, txt_name), "w", encoding="utf-8") as file:
        file.write(markdown_content)


-Administering-the-Assessment.html
-Admission-Decisions.html
-Admission-Requirements.html
-Admission-to-a-Program-Type.html
-After-PathwayConnect.html
-Answers-to-Your-Questions-about-Ecclesiastical-Endorsement.html
-Application-Deadline.html
-Application-Process.html
-Assistance-for-Students-with-Disabilities.html
-Attendance.html
-BYU-Idaho-Course-Exceptions.html
-BYU-Pathway-Area-Managers.html
-BYU-Pathway-Support.html
-BYU-Pathway-Worldwide-Communications.html
-BYU-Pathway-Worldwide-Website.html
-Blog-Articles.html
-CES-Honor-Code.html
-Calculating-Tuition.html
-Certificates--Degrees.html
-Cheating.html


# Convert with LlamaParse


In [18]:
# The prompt given to LlamaParse, one rule per entry. Every entry except the
# last already ends with a space, so the entries are joined with no separator.
parsing_rules = [
    "Convert the provided text into accurate and well-structured Markdown format, strictly preserving the original structure. ",
    "Use headers from H1 to H3 only where they naturally occur in the text, and do not create additional headers or modify existing ones. ",
    "Do not split the text into multiple sections or alter the sequence of content. ",
    "Detect bold, large, or all-uppercase text as headers only if they represent a natural section break in the original text. ",
    "Preserve all links, ensuring that they remain correctly formatted and in their original place in the text. ",
    "Maintain bullet points and numbered lists with proper indentation to reflect any nested lists, ensuring list numbers remain in sequence. ",
    "If the text is not a header, ensure that bold and italic text is properly formatted using double **asterisks** for bold and single *asterisks* for italic. ",
    "Detect and correctly format blockquotes using the '>' symbol for any quoted text, but do not reformat text that is already in correct Markdown format. ",
    "Respect the original line breaks and text flow, avoiding unnecessary splits, merges, or reordering of content. ",
    "If any tables are detected, parse them as a title (bold header) followed by list items, but do not reformat existing Markdown tables. ",
    "Merge identical headers only if they represent the same section and their content is identical, ensuring no changes to the order of the text. ",
    "Do not enclose fragments of code/Markdown or any other content in triple backticks unless they are explicitly formatted as code blocks in the original text. ",
    "Ensure that the final output is a clean, concise Markdown document that closely reflects the original text's intent and structure, without adding or omitting any content.",
]

# Parser that returns Markdown and is steered by the rules above.
parser = LlamaParse(
    parsing_instruction="".join(parsing_rules),
    result_type="markdown",  # "markdown" and "text" are available
)


In [19]:
# Route files with a ".txt" extension to the LlamaParse parser; this mapping is
# passed to SimpleDirectoryReader as its file_extractor in parse_txt_to_md.
file_extractor = {".txt": parser}


def parse_txt_to_md(file):
    """Run one staged ``.txt`` file through LlamaParse and save it as ``.md``.

    Args:
        file: Name of the original HTML file (for example ``"page.html"``).
            The matching ``.txt`` file is read from ``outpath`` and the
            ``.md`` result is written to the same folder.

    Returns:
        Total number of characters across all parsed documents. ``0`` means
        nothing was parsed; in that case no ``.md`` file is written and the
        ``.txt`` input is left in place so the file can be retried.
    """
    load_file = outpath + file.replace(".html", ".txt")

    documents = SimpleDirectoryReader(
        input_files=[load_file], file_extractor=file_extractor
    ).load_data()

    size = sum(len(doc.text) for doc in documents)
    if size == 0:
        # Without this check an empty result would be saved as an empty .md
        # and the only copy of the input would then be deleted.
        print(load_file, "produced no content; keeping it for a retry.")
        return size

    out_name = f"{outpath}{file.replace('.html', '.md')}"
    # Write as UTF-8 explicitly so non-ASCII text does not fail on systems
    # whose default encoding is something else.
    with open(out_name, "w", encoding="utf-8") as f:
        for doc in documents:
            f.write(doc.text)
            f.write("\n\n")
    print(out_name, "saved.")

    # The staged .txt is no longer needed once the .md exists.
    os.remove(load_file)
    return size


In [20]:
# Send each staged file through LlamaParse, one at a time.
# TODO: retry a file when the parsed output is much smaller than its input;
# the size returned by parse_txt_to_md is kept for that purpose.
for file in filenames:
    try:
        time.sleep(2)  # pause between requests (presumably to go easy on the API)
        size = parse_txt_to_md(file)
    except Exception as e:
        # Show the cause instead of hiding it, then continue with the next file.
        print(f"Error processing file {file}: {e!r}")
        time.sleep(2)


Started parsing the file under job_id 5809450c-6fe0-4aef-896c-2d1e9138d670
../data/out_2_9/-Administering-the-Assessment.md saved.
Started parsing the file under job_id b3a9c9b5-0a30-4a8c-a8e7-0422e709ab2f
../data/out_2_9/-Admission-Decisions.md saved.
Started parsing the file under job_id a0d882ac-22ef-4e52-8955-e8ea533ed528
../data/out_2_9/-Admission-Requirements.md saved.
Started parsing the file under job_id cc2bc196-6b06-4da3-8e87-1517a888015f
../data/out_2_9/-Admission-to-a-Program-Type.md saved.
Started parsing the file under job_id 57c4f8cd-c3b7-4d66-b74f-df5f38aff83a
../data/out_2_9/-After-PathwayConnect.md saved.
Started parsing the file under job_id bb3be3ad-4b35-4878-8799-261c7e314364
../data/out_2_9/-Answers-to-Your-Questions-about-Ecclesiastical-Endorsement.md saved.
Started parsing the file under job_id af065834-7a05-4215-84ed-df039015fdfe
../data/out_2_9/-Application-Deadline.md saved.
Started parsing the file under job_id a13f3284-0a0b-48bd-ab7d-4328f488bbd0
../data/ou