#### Index

In [None]:
import zlib
from pathlib import Path

import pandas as pd
import requests
from playwright.async_api import Error as PlaywrightError
from playwright.async_api import async_playwright

from pathway_indexer.parser import parse_files_to_md

In [None]:
playwright = await async_playwright().start()
browser = await playwright.chromium.launch(headless=False)
page = await browser.new_page()


In [None]:
def generate_hash_filename(url):
    """Return the CRC-32 checksum of *url* as a lowercase hex string.

    The URL is encoded with the ``str.encode()`` default (UTF-8) before
    hashing. The hex digits are not zero-padded, so the result may be
    shorter than eight characters.
    """
    checksum = zlib.crc32(url.encode())
    return format(checksum, "x")


In [None]:
# Landing page of the knowledge base; article hrefs are appended to it.
url = "https://help.byupathway.edu/knowledgebase/"
# Root folder for this run's output (article index CSV and crawled HTML).
save_path = "../../data/data_30_10"


In [None]:
await page.goto(url)
await page.wait_for_load_state()


In [None]:
while True:
    try:
        await page.click("text=Show More...")
        button = page.query_selector("#pagingButton")
    except:
        break


In [None]:
# get the element with id articleList
articles = await page.query_selector("#articleList")
# get all the links inside it, the link is the a tag, the title in the h5 tag inside the a tag and the descripcion in the p tag inside the a tag
links = await articles.query_selector_all("a")
data = []
for link in links:
    title = await link.query_selector("h5")
    description = await link.query_selector("p")
    filename = generate_hash_filename(url + await link.get_attribute("href"))
    data.append(
        {
            "URL": url + await link.get_attribute("href"),
            "Section": await title.inner_text(),
            "Subsection": await description.inner_text(),
            "Title": await title.inner_text(),
            "filename": filename,
        }
    )

df = pd.DataFrame(data)
df


#### Crawler

In [None]:
# Save the article index as a CSV file. The output folder is created first
# so a fresh save_path does not make to_csv fail.
Path(save_path).mkdir(parents=True, exist_ok=True)
df.to_csv(f"{save_path}/articles.csv", index=False)


In [None]:
for index, row in df.iterrows():
    await page.goto(row["URL"])
    await page.wait_for_load_state()
    content = await page.query_selector(".wrapper-body")
    post_content = await content.inner_html()
    if post_content:
        with open(f"{save_path}/crawl/{row["filename"]}.html", "w") as f:
            f.write(post_content)
    

In [None]:
await browser.close()

#### Parser

In [None]:
# Run the project parser over the crawled HTML files, passing the article
# index CSV as metadata. All paths derive from save_path so this step always
# reads what the crawler wrote.
# NOTE(review): parse_files_to_md lives in pathway_indexer.parser and is not
# visible here; presumably it converts the files to Markdown — confirm.
parse_files_to_md(
    input_directory=f"{save_path}/crawl/",
    out_folder=f"{save_path}/out/",
    metadata_csv=f"{save_path}/articles.csv",
)