In [1]:
import pandas as pd
import requests
from playwright.async_api import async_playwright
import hashlib
import zlib


In [2]:
def generate_content_hash(content):
    """Return the SHA-256 hex digest of ``content``.

    Args:
        content: Payload to hash, either bytes-like or ``str``. A ``str`` is
            encoded as UTF-8 before hashing.

    Returns:
        The 64-character lowercase hexadecimal digest.
    """
    # hashlib only accepts bytes-like input; encode text so callers holding a
    # decoded document do not have to remember to do it themselves.
    if isinstance(content, str):
        content = content.encode("utf-8")
    return hashlib.sha256(content).hexdigest()


In [3]:
def generate_hash_filename(url):
    """Derive a short file stem from ``url``.

    The stem is the CRC-32 checksum of the UTF-8 encoded URL, rendered as
    lowercase hexadecimal without zero padding (so it may be shorter than
    eight characters).
    """
    checksum = zlib.crc32(url.encode())
    return format(checksum, "x")


In [4]:
# whatsapp function
async def get_whatsapp_content(url):
    """Open ``url`` in a visible Chromium window and return the post's inner HTML.

    Args:
        url: Page to load.

    Returns:
        The inner HTML of the element matched by the hard-coded XPath, or
        ``None`` (after printing an error line) when no element matches.
    """
    post_xpath = "/html/body/div[1]/div/div/div/div[2]/div/div/div[1]/div[1]/div[2]/div[2]/div/div/div[1]/div/div/div/div/div/div/div"

    print(url)
    # The context manager stops the Playwright driver on exit, including when
    # navigation raises.
    async with async_playwright() as playwright:
        browser = await playwright.chromium.launch(headless=False)
        try:
            page = await browser.new_page()
            await page.goto(url)
            await page.wait_for_load_state()
            post = await page.query_selector(f"xpath={post_xpath}")
            # query_selector yields None when nothing matches, so check before
            # touching the handle.
            if post is None:
                print(f"Error with {url}")
                return None
            return await post.inner_html()
        finally:
            await browser.close()


In [5]:
async def fetch_content_with_playwright(url, filepath):
    """Load ``url`` in headless Chromium and write the page HTML to ``filepath``.

    The file is written as UTF-8 text, replacing any existing file at that path.
    """
    async with async_playwright() as playwright:
        chromium = await playwright.chromium.launch()
        tab = await chromium.new_page()
        await tab.goto(url)
        rendered_html = await tab.content()
        with open(filepath, "w", encoding="utf-8") as html_file:
            html_file.write(rendered_html)
        await chromium.close()


In [6]:
import re
import os
import time
import asyncio
import nest_asyncio

# Patch asyncio so run_until_complete() can be called while an event loop is
# already running (presumably needed because the notebook kernel runs its own
# loop — confirm against the nest_asyncio documentation).
nest_asyncio.apply()


async def crawl_csv(df, output_file="../data/data_16_09_24/output_data.csv"):
    """Download every URL listed in ``df`` and log the outcome of each fetch.

    Each row is read by position: URL, heading, subheading, title and the
    file stem to save under. HTML responses are written to ``<base_dir>/html``,
    PDFs to ``<base_dir>/pdf`` and any other content type to
    ``<base_dir>/others``. Rows whose HTML or PDF file already exists are
    skipped and produce no log row.

    A 403 response falls back to fetching the page through a Playwright-driven
    browser (no content hash is recorded for it). Other request failures are
    attempted up to three times before an error row is logged.

    Args:
        df: DataFrame whose first five columns are, in order, URL, heading,
            subheading, title and filename stem.
        output_file: Path of the CSV log. Rows are appended when the file
            already has content; otherwise it is created with a header row.
    """
    base_dir = "../data/data_16_09_24/crawl/"
    # Seconds to wait on a silent server; without it a single unresponsive
    # host would block the whole crawl indefinitely.
    request_timeout = 30

    # Create the output directories if they don't exist
    for subdir in ("html", "pdf", "others"):
        os.makedirs(os.path.join(base_dir, subdir), exist_ok=True)

    output_data = []

    def media_subtype(content_type):
        """Return the subtype of a Content-Type value ("text/html; x=y" -> "html")."""
        subtype = content_type.split(";")[0].strip().split("/")[-1]
        # Fall back to a generic label when the header was missing or empty.
        return subtype or "bin"

    async def process_row(row):
        """Fetch one row's URL, save the payload and append a log row."""
        # .iloc keeps the access positional; plain row[0] on a label-indexed
        # Series is deprecated in pandas.
        url, heading, sub_heading, title, filename = (row.iloc[i] for i in range(5))

        def log_result(location, kind, content_hash):
            """Append one output row for this URL."""
            output_data.append(
                [heading, sub_heading, title, url, location, kind, content_hash]
            )

        # Determine the filepaths
        html_filepath = os.path.join(base_dir, "html", f"{filename}.html")
        pdf_filepath = os.path.join(base_dir, "pdf", f"{filename}.pdf")

        # Skip fetching if the file already exists
        if os.path.exists(html_filepath) or os.path.exists(pdf_filepath):
            print(f"File already exists for {title}. Skipping fetch.")
            return

        retry_attempts = 3

        print("Working on ", url)
        while retry_attempts > 0:
            try:
                # time.sleep and requests.get are blocking calls, so they hold
                # the event loop; the pause therefore spaces out requests
                # across all rows, not just this one.
                time.sleep(3)
                response = requests.get(url, timeout=request_timeout)
                response.raise_for_status()  # http errors
                # The header may be absent; use "" so the checks below still work.
                content_type = response.headers.get("content-type") or ""

                if "whatsapp" in url:
                    whatsapp_html = await get_whatsapp_content(url)
                    if whatsapp_html is None:
                        # Nothing to save; log the failure instead of crashing
                        # on f.write(None).
                        log_result("WhatsApp post element not found", "Error", None)
                        break
                    filepath = html_filepath
                    with open(filepath, "w", encoding="utf-8") as f:
                        f.write(whatsapp_html)
                    content = whatsapp_html.encode("utf-8")
                elif "text/html" in content_type:
                    content = response.text.encode("utf-8")
                    filepath = html_filepath
                    with open(filepath, "w", encoding="utf-8") as f:
                        f.write(response.text)
                elif "application/pdf" in content_type:
                    content = response.content
                    filepath = pdf_filepath
                    with open(filepath, "wb") as f:
                        f.write(content)
                else:
                    # Handle other content types by saving with the subtype as extension
                    filepath = os.path.join(
                        base_dir, "others", f"{filename}.{media_subtype(content_type)}"
                    )
                    content = response.content
                    with open(filepath, "wb") as f:
                        f.write(content)

                log_result(
                    filepath,
                    media_subtype(content_type),
                    generate_content_hash(content),
                )
                break  # Exit retry loop after successful fetch

            except requests.exceptions.HTTPError as http_err:
                # Read the status from the exception's own response rather than
                # from a local that may not have been bound.
                status_code = getattr(http_err.response, "status_code", None)
                if status_code == 403:
                    print(
                        f"Access forbidden for {url}: {http_err}. Using Playwright to fetch HTML."
                    )
                    await fetch_content_with_playwright(url, html_filepath)
                    log_result(html_filepath, "text/html", None)
                    break  # Don't retry if it's a 403 error

                print(f"HTTP error occurred for {url}: {http_err}")
                retry_attempts -= 1
                if retry_attempts > 0:
                    print("Retrying in 10 seconds...")
                    time.sleep(10)
                else:
                    # The status code goes in the "Content Type" column so
                    # failed rows can be filtered by it later.
                    log_result(str(http_err), str(status_code), None)

            except requests.exceptions.RequestException as err:
                print(f"Error occurred for {url}: {err}")
                retry_attempts -= 1
                if retry_attempts > 0:
                    print("Retrying in 10 seconds...")
                    time.sleep(10)
                else:
                    log_result(str(err), "Error", None)

    # Create a list of tasks for asyncio to run
    tasks = [process_row(row) for _, row in df.iterrows()]

    # Run the tasks asynchronously
    await asyncio.gather(*tasks)

    # Create a DataFrame from the output data
    output_df = pd.DataFrame(
        output_data,
        columns=[
            "Heading",
            "Subheading",
            "Title",
            "URL",
            "Filepath",
            "Content Type",
            "Content Hash",
        ],
    )

    # Append to an existing log so earlier rows and the header survive;
    # write the header only when starting a new (or empty) file.
    has_existing_rows = (
        os.path.exists(output_file) and os.path.getsize(output_file) > 0
    )
    output_df.to_csv(
        output_file,
        mode="a" if has_existing_rows else "w",
        header=not has_existing_rows,
        index=False,
    )

    print(f"Processing completed. Output saved to {output_file}")


In [7]:
# Directory holding the index CSVs. Keep the trailing slash: file paths are
# built from this value by plain string concatenation.
index_path = "../data/data_16_09_24/index/"


In [8]:
# Read the three index tables (acm_site, missionary, stdhndbk) and stack them
# into a single frame with a fresh 0..n-1 index.
df, df2, df3 = (
    pd.read_csv(f"{index_path}{table}.csv")
    for table in ("acm_site", "missionary", "stdhndbk")
)

df = pd.concat([df, df2, df3], ignore_index=True)
df.head()


Unnamed: 0,Section,Subsection,Title,URL
0,Area Coordination (For ACMs Only),ACC Council,ACM Council Agendas,https://office365lds.sharepoint.com/sites/BYU-...
1,Block,,International Area Transitioning to Block a Guide,https://missionaries.prod.byu-pathway.psdops.c...
2,Block,,PathwayConnect,https://missionaries.prod.byu-pathway.psdops.c...
3,Block,,PC 103 Block Pilot Spring 2022,https://missionaries.prod.byu-pathway.psdops.c...
4,Missionary Processes,Vetting ACMs,ACM Vetting Recommendation Instructions,https://missionaries.prod.byu-pathway.psdops.c...


In [9]:
# Fill empty cells with the placeholder "Missing".
df = df.fillna("Missing")

# Collapse to one row per URL; each of the other columns becomes the list of
# values seen for that URL.
df_merged = (
    df.groupby("URL")
    .agg(dict.fromkeys(["Section", "Subsection", "Title"], list))
    .reset_index()
)

df_merged.head()


Unnamed: 0,URL,Section,Subsection,Title
0,http://path.churchofjesuschrist.org/,[PATH],[PATH for EC3 Gathering Missionaries],[Link to the PATH Website]
1,https://byui-ilearn.screenstepslive.com/m/7669...,[Canvas – Student Questions],[Getting Started],[Canvas Student Orientation]
2,https://calendar.ensign.edu/academic-calendars,[Calendars],[Academic Calendars],[Ensign College Academic Calendars]
3,https://churchofjesuschrist.sharepoint.com/sit...,[Gatherings],[PC102],[Forming Team Project Groups]
4,https://churchofjesuschrist.sharepoint.com/sit...,[English Connect 1 & 2],[EnglishConnect Global Launch],[EnglishConnect Global Launch]


In [10]:
# Add a final "filename" column: the hash-based file stem for each URL.
df_merged["filename"] = df_merged["URL"].map(generate_hash_filename)
df_merged.head()


Unnamed: 0,URL,Section,Subsection,Title,filename
0,http://path.churchofjesuschrist.org/,[PATH],[PATH for EC3 Gathering Missionaries],[Link to the PATH Website],74fd16dd
1,https://byui-ilearn.screenstepslive.com/m/7669...,[Canvas – Student Questions],[Getting Started],[Canvas Student Orientation],bf2cffa7
2,https://calendar.ensign.edu/academic-calendars,[Calendars],[Academic Calendars],[Ensign College Academic Calendars],5dad35ba
3,https://churchofjesuschrist.sharepoint.com/sit...,[Gatherings],[PC102],[Forming Team Project Groups],72713f0a
4,https://churchofjesuschrist.sharepoint.com/sit...,[English Connect 1 & 2],[EnglishConnect Global Launch],[EnglishConnect Global Launch],f83bf75a


In [11]:
# Write the merged per-URL table to "all_links.csv" in the index directory
# (without the DataFrame index); main() reads this file back.
df_merged.to_csv(f"{index_path}all_links.csv", index=False)


In [12]:
async def main():
    """Load the merged link index from ``all_links.csv`` and crawl every row."""
    all_links = pd.read_csv(f"{index_path}all_links.csv")
    # No filtering is applied here: the full table is handed to the crawler.
    await crawl_csv(all_links)


In [13]:
if __name__ == "__main__":
    # Drive the crawl to completion on the current event loop.
    asyncio.get_event_loop().run_until_complete(main())


  url = row[0]
  heading = row[1]
  sub_heading = row[2]
  title = row[3]
  filename = row[4]


Working on  http://path.churchofjesuschrist.org/
Working on  https://byui-ilearn.screenstepslive.com/m/76692/l/865828-canvas-student-orientation
Working on  https://calendar.ensign.edu/academic-calendars
Working on  https://churchofjesuschrist.sharepoint.com/sites/BYUPW-MissionaryServices/SitePages/SHEP--Forming%20Team%20Project%20Groups.aspx
Access forbidden for https://churchofjesuschrist.sharepoint.com/sites/BYUPW-MissionaryServices/SitePages/SHEP--Forming%20Team%20Project%20Groups.aspx: 403 Client Error: Forbidden for url: https://churchofjesuschrist.sharepoint.com/sites/BYUPW-MissionaryServices/SitePages/SHEP--Forming%20Team%20Project%20Groups.aspx. Using Playwright to fetch HTML.
Working on  https://churchofjesuschrist.sharepoint.com/sites/EnglishConnect/
Access forbidden for https://churchofjesuschrist.sharepoint.com/sites/EnglishConnect/: 403 Client Error: Forbidden for url: https://churchofjesuschrist.sharepoint.com/sites/EnglishConnect/. Using Playwright to fetch HTML.
Workin

In [14]:
# Load the crawl log written by crawl_csv and preview the first rows.
dfout = pd.read_csv("../data/data_16_09_24/output_data.csv")
dfout.head()


Unnamed: 0,Heading,Subheading,Title,URL,Filepath,Content Type,Content Hash
0,['Canvas – Student Questions'],['Getting Started'],['Canvas Student Orientation'],https://byui-ilearn.screenstepslive.com/m/7669...,../data/data_16_09_24/crawl/html/bf2cffa7.html,html,1795398bf45f9f98531bc95bb20caac734be73eaeaa2db...
1,['Canvas – Student Questions'],['Troubleshooting'],['Which Browsers Does Canvas Support'],https://community.canvaslms.com/t5/Canvas-Basi...,../data/data_16_09_24/crawl/html/8dc6d2f8.html,html,d63f1fc59922b50bd147367da56dfd038813adf3be2a73...
2,['Chat Bots'],['Missing'],['Companion.byupathway.org'],https://companion.byupathway.org/,../data/data_16_09_24/crawl/html/7e0351e.html,html,ee9abbbc1e1e79b72fd896b889b9cc7cfaa2ee4787143f...
3,['Gatherings'],['Lead Student'],['How to get class members to be lead and obse...,https://content.byui.edu/file/c529b6cc-a70b-4a...,../data/data_16_09_24/crawl/pdf/481ad292.pdf,pdf,92878893b8ef58f2822b998485ab85e038130560aa7af6...
4,['Certificates & Degrees'],['Application Process'],['How do students access the BYUI application?'],https://degreeapplication.byupathway.edu/,HTTPSConnectionPool(host='degreeapplication.by...,Error,


In [15]:
# Reload the crawl log. NOTE: this rebinds `df`, which earlier cells used for
# the concatenated index tables.
df = pd.read_csv("../data/data_16_09_24/output_data.csv")

# Disabled step, kept for reference: drop rows whose URL contains '#'
# df_filtered = df[~df['URL'].str.contains('#')]

# # ...and save the filtered DataFrame back to CSV
# df_filtered.to_csv('output_data.csv', index=False)


In [16]:
# Failed fetches store the HTTP status code in the "Content Type" column;
# keep the rows recorded as 403 or 404.
df_error = dfout[dfout["Content Type"].isin(["403", "404"])]
df_error


Unnamed: 0,Heading,Subheading,Title,URL,Filepath,Content Type,Content Hash


In [17]:
# Save the 403/404 rows; mode="w" replaces any previous error file.
df_error.to_csv("../data/data_16_09_24/error_file.csv", mode="w", index=False)


In [18]:
# Count the entries in each crawl output directory (html, pdf, others) and
# report the per-directory counts followed by the overall total.
import os

crawl_dir = "../data/data_16_09_24/crawl"
file_counts = {}
for kind in ("html", "pdf", "others"):
    file_counts[kind] = len(os.listdir(f"{crawl_dir}/{kind}"))
    print(f"len of files in {kind}: {file_counts[kind]}")

print(f"Total number of files: {sum(file_counts.values())}")


len of files in html: 246
len of files in pdf: 294
len of files in others: 1
Total number of files: 541
