In [36]:
import zlib
from pathlib import Path

import pandas as pd
import requests
from playwright.async_api import async_playwright

from pathway_indexer.parser import parse_files_to_md

In [37]:
# Start the Playwright driver. Top-level `await` is used directly in the cell,
# so this must run in a kernel that supports it (it is not valid in a plain script).
playwright = await async_playwright().start()
# Launch Chromium with a visible window (headless=False).
browser = await playwright.chromium.launch(headless=False)
# Single tab that the scraping cells below navigate and query.
page = await browser.new_page()
# NOTE(review): none of the visible cells close `browser` or stop `playwright` —
# confirm that cleanup happens elsewhere.


In [38]:
def generate_hash_filename(url):
    """Return the CRC-32 checksum of ``url`` as a lowercase hexadecimal string.

    The URL is encoded with the default string encoding before hashing. The
    result is not zero-padded, so it may be shorter than eight characters.
    """
    checksum = zlib.crc32(url.encode())
    return format(checksum, "x")


In [48]:
# Output directory for this crawl; the relative path is resolved against the
# kernel's working directory.
save_path = "../../data/data_01_11"
# Listing page to scrape (per its query string: type=Degree, page=1).
url = "https://catalog.byupathway.edu/programs?type=Degree&page=1&pq="
# Site root that each program card's href is appended to when building the
# URL stored in the results table.
url_to_save = 'https://catalog.byupathway.edu'

In [49]:
# Open the program listing page in the shared tab.
await page.goto(url)
# Wait for the page's load state (default arguments) before querying the DOM.
await page.wait_for_load_state()


In [51]:
# get the element with id articleList
articles = await page.query_selector("#programs-list")
# get all the links inside it with data-test="programCard"
links = await articles.query_selector_all('a[data-test="programCard"]')
data = []
for link in links:
    title = await link.query_selector("h3.media-title")
    filename = generate_hash_filename(url + await link.get_attribute("href"))
    data.append(
        {
            "URL": url_to_save + await link.get_attribute("href"),
            "Title": await title.inner_text(),
            "filename": filename,
        }
    )

df = pd.DataFrame(data)
df

Unnamed: 0,URL,Title,filename
0,https://catalog.byupathway.edu/programs/P601,Applied Business Management (Associate),d4043968
1,https://catalog.byupathway.edu/programs/P901,Applied Business Management (Bachelor),df587e55
2,https://catalog.byupathway.edu/programs/P603,Applied Health (Associate),3a0a5844
3,https://catalog.byupathway.edu/programs/P903,Applied Health (Bachelor),31561f79
4,https://catalog.byupathway.edu/programs/AAS-CO...,Communication (Associate),4089c18c
5,https://catalog.byupathway.edu/programs/BAS-CO...,Communication (Bachelor),df3bc6cc
6,https://catalog.byupathway.edu/programs/P604,Family and Human Services (Associate),a46ecde7
7,https://catalog.byupathway.edu/programs/P904,Family and Human Services (Bachelor),af328ada
8,https://catalog.byupathway.edu/programs/P606,Family History Research (Associate),4a60accb
9,https://catalog.byupathway.edu/programs/AAS-IT...,Information Technology (Associate),3908419c


In [52]:
# Persist the program index (URL, Title, filename) as articles.csv.
csv_path = Path(save_path) / "articles.csv"
# Create the output directory first so a fresh checkout does not fail with
# "non-existent directory" when writing.
csv_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(csv_path, index=False)

In [53]:
for index, row in df.iterrows():
    await page.goto(row["URL"])
    await page.wait_for_load_state()
    content = await page.query_selector('[role="main"]#main-content')
    if content:
        post_content = await content.inner_html()
        if post_content:
            with open(f"{save_path}/crawl/{row['filename']}.html", "w") as f:
                f.write(post_content)