In [45]:
import os
from bs4 import BeautifulSoup
from markdownify import markdownify as md
import re

from llama_parse import LlamaParse
from llama_index.core import SimpleDirectoryReader
import time
import nest_asyncio
from dotenv import load_dotenv


In [46]:
# Patch asyncio so that an event loop can be re-entered from inside the notebook
# kernel (presumably required by LlamaParse's async calls further down — TODO confirm).
nest_asyncio.apply()
# Load environment variables from a local .env file
# (presumably the LlamaParse API key — TODO confirm).
load_dotenv()


True

In [47]:
# Directory containing the crawled .html pages to convert.
# Both paths keep their trailing "/" because later cells build file paths by
# plain string concatenation (e.g. html_dir + name).
html_dir = "../data/data_test_sep_27/crawl/html/"
# Directory where the intermediate .txt files and the final .md files are written.
outpath = "../data/data_test_sep_27/out/from_html/"


## Helper functions


In [48]:
def clean_html(soup):
    """Strip non-content markup from a parsed page and return its content container.

    The soup is modified in place: boilerplate tags and navigation/accessibility
    elements are destroyed. When the page has a ``<title>`` with a plain string,
    a new ``<title>`` element holding that text is inserted as the first child
    of the returned container so the caller can recover it.

    Args:
        soup: Parsed document exposing the BeautifulSoup API used below
            (tag lookup by call, ``select``, ``new_tag``, ``main``/``body``).

    Returns:
        ``soup.main`` if present, otherwise ``soup.body``; the soup itself when
        neither exists.
    """
    # Read the title before anything is removed: <head> (which holds it) goes next.
    title_text = soup.title.string if soup.title else None

    # Tags that never carry readable page content.
    unwanted_tags = [
        "head",
        "style",
        "script",
        "img",
        "svg",
        "meta",
        "link",
        "iframe",
        "noscript",
        "footer",
    ]
    for element in soup(unwanted_tags):
        element.decompose()

    # CSS selectors for menus, navigation chrome and screen-reader-only helpers.
    unwanted_selectors = [
        '[aria-label="Search Filter"]',
        '[aria-label*="Menu"]',
        '[aria-label*="menu"]',
        '[class*="menu"]',
        '[class*="Menu"]',
        ".sr-only",
        ".navbar",
        ".breadcrumb",
        ".btn-toolbar",
        ".skip-link",
    ]
    for css_selector in unwanted_selectors:
        for element in soup.select(css_selector):
            element.decompose()

    # Prefer <main>; fall back to <body>, and to the whole soup as a last resort.
    content = soup.main or soup.body
    if not content:
        return soup

    if title_text:
        # Carry the title along as the first child of the content container.
        title_header = soup.new_tag("title")
        title_header.string = title_text
        content.insert(0, title_header)

    return content


In [49]:
def convert_html_to_markdown(html_content):
    """Convert a raw HTML page into Markdown and extract its title.

    The page is parsed with BeautifulSoup, reduced with ``clean_html`` and then
    converted with markdownify using ATX (``#``) headings.

    Args:
        html_content: The HTML document as a string.

    Returns:
        A ``(markdown_content, title_text)`` tuple. ``title_text`` is ``""``
        when the cleaned content does not start with a ``<title>`` element.
    """
    # Parse and clean the HTML content
    soup = BeautifulSoup(html_content, "html.parser")
    soup = clean_html(soup)

    # clean_html places the page title, when there is one, in a <title> element
    # at position 0. The container may also be empty (empty file, empty body),
    # so do not index into it blindly.
    first_child = soup.contents[0] if soup.contents else None

    title_text = ""
    # getattr keeps this safe if the first child is not a tag.
    if getattr(first_child, "name", None) == "title":
        title_text = first_child.text
        if title_text:
            # The title is returned separately, so keep it out of the Markdown body.
            first_child.decompose()

    # Convert the cleaned HTML to Markdown
    markdown_content = md(str(soup), heading_style="ATX")

    # Collapse runs of three or more newlines into a single blank line.
    markdown_content = re.sub(r"\n{3,}", "\n\n", markdown_content)

    return markdown_content, title_text


In [50]:
def clean_title(title):
    """Normalize a page title to a single line with single spaces.

    Line breaks and any other runs of whitespace become one space, and leading
    and trailing whitespace is dropped.

    Args:
        title: The raw title string.

    Returns:
        The normalized title; ``""`` when the input holds only whitespace.
    """
    # split() with no arguments breaks on every whitespace run (newlines
    # included) and ignores leading/trailing whitespace.
    words = title.split()
    return " ".join(words)


# Convert process


In [51]:
# List every entry of html_dir in sorted order and wrap each name in a dict,
# so later cells can attach per-file information such as "size" and "title_tag".
filenames = [{"name": entry_name} for entry_name in sorted(os.listdir(html_dir))]
print("len:", len(filenames))


len: 268


In [52]:
# Convert every crawled page to Markdown, record its size and title on the
# entry, and save the Markdown as a .txt file in outpath.
for entry in filenames:
    with open(html_dir + entry["name"], "r") as html_file:
        page_html = html_file.read()

    page_markdown, raw_title = convert_html_to_markdown(page_html)

    # Kept on the entry: "size" is compared against the parsed output later on,
    # "title_tag" is prepended to the final .md files.
    entry["size"] = len(page_markdown)
    entry["title_tag"] = clean_title(raw_title)

    txt_path = outpath + entry["name"].replace(".html", ".txt")
    with open(txt_path, "w") as txt_file:
        txt_file.write(page_markdown)


In [53]:
# Collect the source names of all pages whose converted .txt has no visible text.
empty_txts = []

for entry in filenames:
    txt_name = entry["name"].replace(".html", ".txt")
    with open(outpath + txt_name, "r") as txt_file:
        txt_body = txt_file.read()

    # A file is considered empty when it holds nothing but newlines and spaces.
    if txt_body.strip("\n "):
        continue

    empty_txts.append(entry["name"])
    print("empty:", txt_name)


In [38]:
# Show what is left of each page that produced an empty .txt once it is cleaned.
for empty_name in empty_txts:
    with open(html_dir + empty_name, "r") as html_file:
        raw_html = html_file.read()

    # Same parse + clean steps as the conversion, without the Markdown step.
    cleaned = clean_html(BeautifulSoup(raw_html, "html.parser"))

    print("filename:", empty_name)
    print("soup:", cleaned)
    # Two blank lines separate consecutive pages in the output.
    print()
    print()


filename: 320acbd2.html
soup: <body><title>Rise 360</title><!-- Google Tag Manager--><!-- End Google Tag Manager--><div id="app"></div></body>


filename: 4208553.html
soup: <body class="cb remove-segoe-ui-symbol" data-bind="defineGlobals: ServerData, bodyCssClass" style="display: block;"><title>Sign in to your account</title>

<div><!-- -->
<!-- -->
<div data-bind="if: activeDialog"></div>
<form action="https://login.microsoftonline.com/61e6eeb3-5fd7-4aaa-ae3c-61e8deb09b79/login" autocomplete="off" class="provide-min-height" data-bind="visible: !isLoginPageHidden(), autoSubmit: forceSubmit, attr: { action: postUrl }, ariaHidden: !!activeDialog(), css: { 'provide-min-height': svr.fUseMinHeight }" id="i0281" method="post" name="f1" novalidate="novalidate" spellcheck="false" target="_top">
<!-- ko withProperties: { '$loginPage': $data } -->
<div class="login-paginated-page" data-bind="component: { name: 'master-page',
        publicMethods: masterPageMethods,
        params: {
          

# Convert with LlamaParse


In [39]:
# LlamaParse client used to re-format the converted .txt files as Markdown.
# The parsing instruction is a natural-language prompt: it asks for a faithful
# conversion (no added or dropped content, no reordering), H1-H3 headers only,
# preserved links and lists, and tables rendered as a bold title plus list items.
parser = LlamaParse(
    result_type="markdown",  # "markdown" and "text" are available
    parsing_instruction=(
        "Convert the provided text into accurate and well-structured Markdown format, strictly preserving the original structure. "
        "Use headers from H1 to H3 only where they naturally occur in the text, and do not create additional headers or modify existing ones. "
        "Do not split the text into multiple sections or alter the sequence of content. "
        "Detect bold, large, or all-uppercase text as headers only if they represent a natural section break in the original text. "
        "Preserve all links, ensuring that they remain correctly formatted and in their original place in the text. "
        "Maintain bullet points and numbered lists with proper indentation to reflect any nested lists, ensuring list numbers remain in sequence. "
        "If the text is not a header, ensure that bold and italic text is properly formatted using double **asterisks** for bold and single *asterisks* for italic. "
        "Detect and correctly format blockquotes using the '>' symbol for any quoted text, but do not reformat text that is already in correct Markdown format. "
        "Respect the original line breaks and text flow, avoiding unnecessary splits, merges, or reordering of content. "
        "If any tables are detected, parse them as a title (bold header) followed by list items, but do not reformat existing Markdown tables. "
        "Merge identical headers only if they represent the same section and their content is identical, ensuring no changes to the order of the text. "
        "Do not enclose fragments of code/Markdown or any other content in triple backticks unless they are explicitly formatted as code blocks in the original text. "
        "Ensure that the final output is a clean, concise Markdown document that closely reflects the original text's intent and structure, without adding or omitting any content."
    ),
)


In [40]:
# Route every .txt file through the LlamaParse parser when it is loaded.
file_extractor = {".txt": parser}


def parse_txt_to_md(file):
    """Parse the converted .txt of one page with LlamaParse and save it as .md.

    Args:
        file: Source file name containing ".html". The matching ".txt" is read
            from ``outpath`` and the ".md" result is written to ``outpath``.

    Returns:
        The total number of characters across all parsed documents.
    """
    txt_path = outpath + file.replace(".html", ".txt")
    md_path = outpath + file.replace(".html", ".md")

    reader = SimpleDirectoryReader(
        input_files=[txt_path], file_extractor=file_extractor
    )
    documents = reader.load_data()

    # Measured on the parsed text only, without the separators written below.
    size = sum(len(doc.text) for doc in documents)

    with open(md_path, "w") as md_file:
        for doc in documents:
            # Each parsed document is followed by one blank line.
            md_file.write(doc.text + "\n\n")
    print(md_path, "saved.")

    return size


In [41]:
# Parse every converted .txt with LlamaParse. A result is accepted when it is
# not more than MAX_SIZE_LOSS characters shorter than the source text; otherwise
# (or when the parse raises) the file is retried, and flagged with
# file["llama_error"] = True if no attempt produced an acceptable result.
MAX_RETRIES = 2  # extra attempts after the first parse
MAX_SIZE_LOSS = 2000  # tolerated shrinkage, in characters, of the parsed output
RETRY_DELAY_S = 2  # pause between attempts, in seconds

for file in filenames:
    tries_left = MAX_RETRIES + 1
    parsed_ok = False

    while tries_left > 0 and not parsed_ok:
        tries_left -= 1
        try:
            size = parse_txt_to_md(file["name"])
        except Exception as exc:
            # Every attempt is guarded, the first one included, so a single
            # failing file cannot abort the whole run.
            print(f"Error processing file {file['name']} - {tries_left} tries left. ({exc!r})")
        else:
            print(f"size parsed: {size}, file size: {file['size']}, diff: {file['size'] - size}")
            # Re-checked after every attempt, so a retry that is still
            # truncated does not count as a success.
            parsed_ok = size >= file["size"] - MAX_SIZE_LOSS

        if not parsed_ok and tries_left > 0:
            time.sleep(RETRY_DELAY_S)

    if not parsed_ok:
        print(f"Error processing file {file['name']} - {tries_left} tries left.")
        file["llama_error"] = True

    # delete the txt file
    loaded_file = outpath + file["name"].replace(".html", ".txt")
    os.remove(loaded_file)
    

Started parsing the file under job_id 720031ca-aff2-42a4-a5ac-f89ca1e2a35e
../data/data_test_27/out/from_html/105adb7a.md saved.
size parsed: 1498, file size: 1521, diff: 23
Started parsing the file under job_id bec89233-f849-4ed3-8a67-92cefe256090
../data/data_test_27/out/from_html/141279b1.md saved.
size parsed: 2670, file size: 2728, diff: 58
Started parsing the file under job_id 685dcab4-92a4-4bd3-bdf1-4c980794d6f6
../data/data_test_27/out/from_html/15c11982.md saved.
size parsed: 5220, file size: 5300, diff: 80
Started parsing the file under job_id 0769016d-5cb0-4d3f-b59d-90b53acf4829
../data/data_test_27/out/from_html/15ffa1be.md saved.
size parsed: 1779, file size: 1772, diff: -7
Started parsing the file under job_id 1dacc175-7a81-45af-b1e6-45229a35d060
../data/data_test_27/out/from_html/16ebd247.md saved.
size parsed: 1047, file size: 1123, diff: 76
Started parsing the file under job_id a28223b1-595a-4443-ba63-d2b68370675b
../data/data_test_27/out/from_html/179960e.md saved.
si

In [42]:
# Report how many entries the LlamaParse loop flagged with "llama_error".
error_files = [entry for entry in filenames if entry.get("llama_error")]
print("Error files:", len(error_files))


Error files: 0


In [44]:
# Collect the source names of all pages whose parsed .md has no visible text.
empty_mds = []

for entry in filenames:
    md_name = entry["name"].replace(".html", ".md")
    with open(outpath + md_name, "r") as md_file:
        md_body = md_file.read()

    # A file is considered empty when it holds nothing but newlines and spaces.
    if md_body.strip("\n "):
        continue

    empty_mds.append(entry["name"])
    print("empty:", md_name)


# Validation


In [None]:
# Verify that every source page has a matching .md file in the output directory.
origin_files = [entry["name"] for entry in filenames]
output_files = os.listdir(outpath)

missing_files = []
for html_name in origin_files:
    if html_name.replace(".html", ".md") not in output_files:
        missing_files.append(html_name)

print("Missing files:", len(missing_files))


# Add Metadata to HTML md


In [None]:
# List the entries currently in the output directory and print them for inspection.
# The list is reused by the cell that prepends titles.
output_files = os.listdir(outpath)
print(output_files)


In [None]:
# Number of entries found in the output directory.
len(output_files)


Prepend the page title to every output file whose source page has one, so that it can later be added to the metadata.


In [22]:
# Prepend a "title: <title>" line to every output file whose source page had a title.

# Map each expected .md name to its title once, instead of rescanning
# filenames for every output file.
title_by_md = {}
for info in filenames:
    # setdefault keeps the first entry for a name, like taking the first match of a scan.
    title_by_md.setdefault(info["name"].replace(".html", ".md"), info["title_tag"])

for file in output_files:
    title = title_by_md.get(file)
    if not title:
        # Not produced from a crawled page, or the page had no title.
        continue

    title_line = "title: " + title + "\n"

    # Read the current content so the title can be inserted in front of it.
    with open(outpath + file, "r") as f:
        content = f.read()

    # Re-running the cell must not stack a second title line on top of the first.
    if content.startswith(title_line):
        continue

    # Save the file with the title added.
    with open(outpath + file, "w") as f:
        f.write(title_line + content)


In [2]:
import os
from utils.parser import (
    associate_markdown_with_metadata,
    attach_metadata_to_markdown_directories,
)


[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/isaiaszc/pathway/pathway-
[nltk_data]     indexer/.venv/lib/python3.12/site-
[nltk_data]     packages/llama_index/core/_static/nltk_cache...
[nltk_data]   Package punkt_tab is already up-to-date!


In [3]:
# Location passed to the metadata helper together with "all_links.csv"
# (presumably the directory holding that CSV — TODO confirm).
# NOTE(review): this path belongs to the data_16_09_24 run, while html_dir and
# outpath are configured for data_test_sep_27 — confirm the index matches the
# files in outpath.
all_links_path = "../data/data_16_09_24/index"

# Build the metadata lookup for the markdown files in outpath.
# The helper comes from utils.parser; its matching rules are not visible here.
metadata_dict = associate_markdown_with_metadata(
    all_links_path, outpath, "all_links.csv"
)


../data/data_16_09_24/out/from_html/f2b9de8e.md
../data/data_16_09_24/out/from_html/42096f87.md
../data/data_16_09_24/out/from_html/bccf0caa.md
../data/data_16_09_24/out/from_html/50c7a511.md
../data/data_16_09_24/out/from_html/c9608a27.md
../data/data_16_09_24/out/from_html/570af0fb.md
../data/data_16_09_24/out/from_html/a91a42ec.md
../data/data_16_09_24/out/from_html/b5f7bbcc.md

¡TITLE FOUND!

../data/data_16_09_24/out/from_html/1e1f7b87.md
../data/data_16_09_24/out/from_html/641bb45.md

¡TITLE FOUND!

../data/data_16_09_24/out/from_html/a054b968.md
../data/data_16_09_24/out/from_html/778c50c4.md

¡TITLE FOUND!

../data/data_16_09_24/out/from_html/f408480.md

¡TITLE FOUND!

../data/data_16_09_24/out/from_html/667369ce.md

¡TITLE FOUND!

../data/data_16_09_24/out/from_html/6ddc1724.md

¡TITLE FOUND!

../data/data_16_09_24/out/from_html/34f469b2.md

¡TITLE FOUND!

../data/data_16_09_24/out/from_html/91d1baa2.md
../data/data_16_09_24/out/from_html/bee7233.md

¡TITLE FOUND!

../data/dat

In [4]:
# Display the whole mapping for a manual spot-check.
metadata_dict


{'../data/data_16_09_24/out/from_html/f2b9de8e.md': {'url': 'https://pathway-missionary.powerappsportals.com/knowledgebase/article/KA-01173/en-us',
  'heading': 'PathwayConnect (PC) | PathwayConnect (PC)',
  'subheading': 'Course Retakes | Course Retakes',
  'title': 'Does a Student Have to Retake a Course for Which they Already Have a Passing Grade? | What to do If a student has already taken this course with a B or higher?'},
 '../data/data_16_09_24/out/from_html/42096f87.md': {'url': 'https://faq.whatsapp.com/859240711908360/?cms_platform=web&helpref=platform_switcher',
  'heading': 'WhatsApp',
  'subheading': 'Desktop',
  'title': 'How to Edit Your Profile on a Computer'},
 '../data/data_16_09_24/out/from_html/bccf0caa.md': {'url': 'https://www.byupathway.edu/policies/catalog/7-01-institute-religion-course-requirements',
  'heading': 'Graduation from (BYU-Idaho and Ensign College) | Institute/Religion | Institute/Religion',
  'subheading': 'Requirements | Missing | Missing',
  'tit

In [5]:
# Number of markdown files that received a metadata entry.
len(metadata_dict)


246

In [6]:
# Attach the collected metadata to the markdown files under outpath.
# Helper from utils.parser; judging by its log output it skips files that
# have no entry in metadata_dict.
attach_metadata_to_markdown_directories(outpath, metadata_dict)


Metadata attached to ../data/data_16_09_24/out/from_html/f2b9de8e.md
Metadata attached to ../data/data_16_09_24/out/from_html/42096f87.md
Metadata attached to ../data/data_16_09_24/out/from_html/bccf0caa.md
No metadata found for ../data/data_16_09_24/out/from_html/cacd36b8.md. Skipping.
Metadata attached to ../data/data_16_09_24/out/from_html/50c7a511.md
Metadata attached to ../data/data_16_09_24/out/from_html/c9608a27.md
Metadata attached to ../data/data_16_09_24/out/from_html/570af0fb.md
Metadata attached to ../data/data_16_09_24/out/from_html/a91a42ec.md
Metadata attached to ../data/data_16_09_24/out/from_html/b5f7bbcc.md
Metadata attached to ../data/data_16_09_24/out/from_html/1e1f7b87.md
Metadata attached to ../data/data_16_09_24/out/from_html/641bb45.md
Metadata attached to ../data/data_16_09_24/out/from_html/a054b968.md
Metadata attached to ../data/data_16_09_24/out/from_html/778c50c4.md
Metadata attached to ../data/data_16_09_24/out/from_html/f408480.md
Metadata attached to ../