#### Index


In [None]:
import os
import zlib

import pandas as pd
import requests
from playwright.async_api import TimeoutError as PlaywrightTimeoutError
from playwright.async_api import async_playwright

from pathway_indexer.parser import parse_files_to_md


[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/isaiaszc/pathway/pathway-
[nltk_data]     indexer/.venv/lib/python3.12/site-
[nltk_data]     packages/llama_index/core/_static/nltk_cache...
[nltk_data]   Package punkt_tab is already up-to-date!


In [None]:
# Start the Playwright driver and open one Chromium page; the cells below
# reuse `page` for all navigation and scraping and `browser` for shutdown.
playwright = await async_playwright().start()
# headless=False launches Chromium with a visible window.
browser = await playwright.chromium.launch(headless=False)
page = await browser.new_page()


In [None]:
def generate_hash_filename(url):
    """Return the CRC32 checksum of ``url`` as a lowercase hex string.

    The URL is encoded to bytes with ``str.encode()`` before hashing. The
    result carries no ``0x`` prefix and is not zero-padded.
    """
    checksum = zlib.crc32(url.encode())
    return format(checksum, "x")


In [None]:
# Root folder for everything this notebook writes (a relative path).
save_path = "../../data/data_30_10"
# Knowledge-base index page; also the prefix prepended to each article href.
url = "https://help.byupathway.edu/knowledgebase/"


In [None]:
# Make sure the output layout exists under save_path:
# crawl/, out/ and out/from_html/ (existing folders are left alone).
for subfolder in ("crawl", "out", "out/from_html"):
    os.makedirs(f"{save_path}/{subfolder}", exist_ok=True)


In [None]:
# Open the knowledge-base index page and wait for its load state before
# interacting with it.
await page.goto(url)
await page.wait_for_load_state()


In [None]:
while True:
    try:
        await page.click("text=Show More...")
        button = page.query_selector("#pagingButton")
    except:
        break


  button = page.query_selector("#pagingButton")


In [None]:
# get the element with id articleList
articles = await page.query_selector("#articleList")
# get all the links inside it, the link is the a tag, the title in the h5 tag inside the a tag and the descripcion in the p tag inside the a tag
links = await articles.query_selector_all("a")
data = []
for link in links:
    title = await link.query_selector("h5")
    description = await link.query_selector("p")
    filename = generate_hash_filename(url + await link.get_attribute("href"))
    data.append(
        {
            "URL": url + await link.get_attribute("href"),
            "Section": await title.inner_text(),
            "Subsection": await description.inner_text(),
            "Title": await title.inner_text(),
            "filename": filename,
        }
    )

df = pd.DataFrame(data)
df


Unnamed: 0,URL,Section,Subsection,Title,filename
0,https://help.byupathway.edu/knowledgebase/arti...,How long does it take to get a decision,How long will it take to get my admission deci...,How long does it take to get a decision,200631eb
1,https://help.byupathway.edu/knowledgebase/arti...,Academic Misconduct Policies for PathwayConnec...,What are the policies for academic misconduct ...,Academic Misconduct Policies for PathwayConnec...,69b665df
2,https://help.byupathway.edu/knowledgebase/arti...,PC - Disruptive Behavior,.,PC - Disruptive Behavior,c08db428
3,https://help.byupathway.edu/knowledgebase/arti...,Institute/Religion Course Information,What are the requirements for transferring Ins...,Institute/Religion Course Information,e49f35bd
4,https://help.byupathway.edu/knowledgebase/arti...,Autodrop for EnglishConnect,What is Autodrop for EnglishConnect?,Autodrop for EnglishConnect,e55d5f8a
...,...,...,...,...,...
125,https://help.byupathway.edu/knowledgebase/arti...,Admission Requirements,Is there anything that could stop me from join...,Admission Requirements,2b532722
126,https://help.byupathway.edu/knowledgebase/arti...,Submitting Transfer Credits,Where do I go to submit my transfer credits?,Submitting Transfer Credits,c49f2c39
127,https://help.byupathway.edu/knowledgebase/arti...,Inviting Friends to Pathway,What are some ways I can encourage someone I k...,Inviting Friends to Pathway,ba2e5084
128,https://help.byupathway.edu/knowledgebase/arti...,Viewing Account Holds,How do I view the holds on my account?,Viewing Account Holds,34e7b84d


#### Crawler


In [None]:
# Write the article index to CSV under save_path.
# all_links.csv is a temporary second copy of the same data.
for csv_name in ("articles.csv", "all_links.csv"):
    df.to_csv(f"{save_path}/{csv_name}", index=False)


In [10]:
for index, row in df.iterrows():
    await page.goto(row["URL"])
    await page.wait_for_load_state()
    content = await page.query_selector(".wrapper-body")
    post_content = await content.inner_html()
    if post_content:
        with open(f"{save_path}/crawl/{row["filename"]}.html", "w") as f:
            f.write(post_content)
    

In [None]:
await browser.close()


#### Parser


In [None]:
# Run the parser over the crawled HTML files.
# Every location is derived from save_path so this cell stays in sync with
# the crawl cells if the output folder changes.
exclude_path = f"{save_path}/excluded_domains.txt"

parse_files_to_md(
    input_directory=f"{save_path}/crawl/",
    out_folder=f"{save_path}/out/",
    # NOTE(review): relative path kept as-is; presumably resolved against
    # out_folder inside the parser - confirm.
    metadata_csv="../articles.csv",
    excluded_domains_path=exclude_path,
)


Starting file processing...
Processing file: ../../data/data_30_10/crawl/5a645d39.html
Converted HTML to TXT and saved to: ../../data/data_30_10/out/from_html/5a645d39.txt
Started parsing the file under job_id 469dd1e7-6fa0-45dd-92c0-4e45e2baf771
Parsed TXT to MD and saved to: ../../data/data_30_10/out/from_html/5a645d39.md
Processing file: ../../data/data_30_10/crawl/123fa75.html
Converted HTML to TXT and saved to: ../../data/data_30_10/out/from_html/123fa75.txt
Started parsing the file under job_id 4b4bc1e5-42f5-4e01-8255-6fdc4342ccd6
Parsed TXT to MD and saved to: ../../data/data_30_10/out/from_html/123fa75.md
Processing file: ../../data/data_30_10/crawl/c36d0c83.html
Converted HTML to TXT and saved to: ../../data/data_30_10/out/from_html/c36d0c83.txt
Started parsing the file under job_id a0b657c8-6306-4d2e-a96a-0aeeef8eeb5f
Parsed TXT to MD and saved to: ../../data/data_30_10/out/from_html/c36d0c83.md
Processing file: ../../data/data_30_10/crawl/7b39ca29.html
Converted HTML to TXT 

In [None]:
# Check that every indexed article produced a Markdown file and collect the
# URLs of the ones that did not.
files = os.listdir(f"{save_path}/out/from_html")
existing_files = set(files)  # set gives constant-time membership checks
files_not_created = [
    article_url
    for article_url, name in zip(df["URL"], df["filename"])
    if f"{name}.md" not in existing_files
]

# Show the index rows whose Markdown output is missing.
df[df["URL"].isin(files_not_created)]


Unnamed: 0,URL,Section,Subsection,Title,filename
