In [1]:
# %pip install requests bs4 pandas


In [2]:
import pandas as pd
import requests


In [3]:
# df = pd.read_csv('../CSVs/stdhndbk.csv')
# df2 = pd.read_csv('../CSVs/missionary.csv')
# df3 = pd.read_csv('acmsite.csv')
# # df2.head(3)
# df3.head(3)


In [4]:
import hashlib


def generate_content_hash(content):
    """Generate a SHA-256 hash of the content."""
    return hashlib.sha256(content).hexdigest()


In [5]:
# %pip install playwright


In [6]:
from playwright.async_api import async_playwright


async def fetch_content_with_playwright(url, filepath):
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        page = await browser.new_page()
        await page.goto(url)
        content = await page.content()
        with open(filepath, "w", encoding="utf-8") as f:
            f.write(content)
        await browser.close()


In [7]:
# %pip install nest_asyncio


In [8]:
import re
import os
import time
import asyncio
import nest_asyncio

nest_asyncio.apply()


async def crawl_csv(input_file, output_file="../data/data_09_12_24/output_data.csv"):
    """Takes CSV file in the format Heading, Subheading, Title, URL and processes each URL."""

    # Read the input CSV file
    df = pd.read_csv(input_file)

    # Define a base directory within the user's space
    base_dir = "../data/data_09_12_24/crawl/"

    # Create directories if they don't exist
    os.makedirs(os.path.join(base_dir, "html"), exist_ok=True)
    os.makedirs(os.path.join(base_dir, "pdf"), exist_ok=True)
    os.makedirs(os.path.join(base_dir, "others"), exist_ok=True)

    output_data = []

    async def process_row(row):
        heading = row[0]
        sub_heading = row[1]
        title = row[2]
        url = row[3]

        # Edit the title to become filename
        filename = title.replace(" ", "-")
        filename = re.sub(r"[^a-zA-Z0-9-]", "", filename)

        # Determine the filepaths
        html_filepath = os.path.join(base_dir, "html", f"{filename}.html")
        pdf_filepath = os.path.join(base_dir, "pdf", f"{filename}.pdf")

        # Skip fetching if the file already exists
        if os.path.exists(html_filepath) or os.path.exists(pdf_filepath):
            print(f"File already exists for {title}. Skipping fetch.")
            return

        retry_attempts = 3

        while retry_attempts > 0:
            try:
                time.sleep(3)
                response = requests.get(url)
                response.raise_for_status()  # http errors
                content_type = response.headers.get("content-type")

                if "text/html" in content_type:
                    content = response.text.encode("utf-8")
                    filepath = html_filepath
                    with open(filepath, "w", encoding="utf-8") as f:
                        f.write(response.text)

                elif "application/pdf" in content_type:
                    content = response.content
                    filepath = pdf_filepath
                    with open(filepath, "wb") as f:
                        f.write(response.content)

                else:
                    # Handle other content types by saving with the correct extension
                    file_extension = content_type.split("/")[-1].split(";")[0]
                    filepath = os.path.join(
                        base_dir, "others", f"{filename}.{file_extension}"
                    )
                    content = response.content
                    with open(filepath, "wb") as f:
                        f.write(response.content)

                # Create content hash
                content_hash = generate_content_hash(content)

                # Append to the output list
                output_data.append(
                    [
                        heading,
                        sub_heading,
                        title,
                        url,
                        filepath,
                        content_type.split("/")[1].split(";")[0],
                        content_hash,
                    ]
                )
                break  # Exit retry loop after successful fetch

            except requests.exceptions.HTTPError as http_err:
                if response.status_code == 403:
                    print(
                        f"Access forbidden for {url}: {http_err}. Using Playwright to fetch HTML."
                    )
                    html_filepath = os.path.join(base_dir, "html", f"{filename}.html")
                    await fetch_content_with_playwright(url, html_filepath)
                    output_data.append(
                        [
                            heading,
                            sub_heading,
                            title,
                            url,
                            html_filepath,
                            "text/html",
                            None,
                        ]
                    )
                    break  # Don't retry if it's a 403 error
                else:
                    print(f"HTTP error occurred for {url}: {http_err}")
                    retry_attempts -= 1
                    if retry_attempts > 0:
                        print(f"Retrying in 10 seconds...")
                        time.sleep(10)
                    else:
                        output_data.append(
                            [
                                heading,
                                sub_heading,
                                title,
                                url,
                                str(http_err),
                                str(response.status_code),
                                None,
                            ]
                        )

            except requests.exceptions.RequestException as err:
                print(f"Error occurred for {url}: {err}")
                retry_attempts -= 1
                if retry_attempts > 0:
                    print(f"Retrying in 10 seconds...")
                    time.sleep(10)
                else:
                    output_data.append(
                        [heading, sub_heading, title, url, str(err), "Error", None]
                    )

    # Create a list of tasks for asyncio to run
    tasks = [process_row(row) for _, row in df.iterrows()]

    # Run the tasks asynchronously
    await asyncio.gather(*tasks)

    # Create a DataFrame from the output data
    output_df = pd.DataFrame(
        output_data,
        columns=[
            "Heading",
            "Subheading",
            "Title",
            "URL",
            "Filepath",
            "Content Type",
            "Content Hash",
        ],
    )

    # Append to the existing CSV file or create a new one if it doesn't exist
    if os.path.exists(output_file):
        output_df.to_csv(output_file, mode="a", header=False, index=False)
    else:
        output_df.to_csv(output_file, index=False)

    print(f"Processing completed. Output saved to {output_file}")


In [9]:
index_path = "../data/data_09_12_24/index/"


In [10]:
async def main():
    for filename in os.listdir(index_path):
        print(filename)
        if filename.endswith(".csv"):
            print(f"Now handling {filename}!")
            await crawl_csv(index_path + filename)


In [None]:
if __name__ == "__main__":
    loop = asyncio.get_event_loop()
    loop.run_until_complete(main())


In [None]:
dfout = pd.read_csv("../data/data_09_12_24/output_data.csv")
dfout.head()


In [13]:
df = pd.read_csv("../data/data_09_12_24/output_data.csv")

# Filter out rows with '#' in the URL
# df_filtered = df[~df['URL'].str.contains('#')]

# # Save the filtered DataFrame back to CSV
# df_filtered.to_csv('output_data.csv', index=False)


In [None]:
df_error = dfout[(dfout["Content Type"] == "403") | (dfout["Content Type"] == "404")]
df_error


In [15]:
df_error.to_csv("../data/data_09_12_24/error_file.csv", mode="w", index=False)


In [None]:
# read the len of files in the directory: ../data/data_09_12_24/crawl/html
import os

path = "../data/data_09_12_24/crawl/html"
html_files = os.listdir(path)
html_total = str(len(html_files))
print("len of files in html: " + html_total)

# read the len of files in the directory: ../data/data_09_12_24/crawl/pdf
path = "../data/data_09_12_24/crawl/pdf"
pdf_files = os.listdir(path)
pdf_total = str(len(pdf_files))
print("len of files in pdf: " + pdf_total)

# read the len of files in the directory: ../data/data_09_12_24/crawl/others
path = "../data/data_09_12_24/crawl/others"
other_files = os.listdir(path)
other_totals = str(len(other_files))
print("len of files in others: " + other_totals)

total = int(html_total) + int(pdf_total) + int(other_totals)
print("Total number of files: " + str(total))


# Crawl whatsapp links


whatsapp links has some problems with the crawler above, so I will use a different approach to get the links


In [10]:
%load_ext autoreload
%autoreload 2
%load_ext dotenv
%dotenv


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
The dotenv extension is already loaded. To reload it, use:
  %reload_ext dotenv


In [11]:
import os
import time
from urllib.parse import urljoin, urlparse
import re
import pandas as pd

from playwright.async_api import async_playwright


In [12]:
# get the files "acm_site.csv" and "missionary.csv" and merge them into a single dataframe
acm_site = pd.read_csv("../data/data_09_12_24/index/acm.csv")
missionary = pd.read_csv("../data/data_09_12_24/index/missionary.csv")
stdhndbk = pd.read_csv("../data/data_09_12_24/index/stdhndbk.csv")

df = pd.concat([acm_site, missionary, stdhndbk], ignore_index=True)

# sort by title
df = df.sort_values(by="Title")


In [13]:
# get olny the URL from whatsapp
df_links_whatsapp = df[df["URL"].str.contains("whatsapp")]
df_links_whatsapp


Unnamed: 0,Section,Subsection,Title,URL
509,WhatsApp,Android,Adding and Removing Group Members on Android,https://faq.whatsapp.com/841426356990637/?cms_...
521,WhatsApp,Desktop,Adding and Removing Group Members on a Computer,https://faq.whatsapp.com/841426356990637/?help...
530,WhatsApp,iPhone,Adding and Removing Group Members on iPhone,https://faq.whatsapp.com/841426356990637/?cms_...
503,WhatsApp,,Avoiding WhatsApp cancelling my account for spam?,https://faq.whatsapp.com/361005896189245?helpr...
517,WhatsApp,Communication,Can't Send or Receive Messages,https://faq.whatsapp.com/5155925751185676/?hel...
510,WhatsApp,Android,Downloading WhatsApp on Android,https://faq.whatsapp.com/807139050546238/?help...
522,WhatsApp,Desktop,Downloading WhatsApp on a Computer,https://faq.whatsapp.com/1513589699119080/?hel...
531,WhatsApp,iPhone,Downloading WhatsApp on iPhone,https://faq.whatsapp.com/807139050546238/?cms_...
505,WhatsApp,,How do I add students to a WhatsApp group?,https://faq.whatsapp.com/361005896189245?helpr...
506,WhatsApp,,How do I create a WhatsApp group?,https://faq.whatsapp.com/3242937609289432/?hel...


### create the Playwright


In [14]:
playwright = await async_playwright().start()
browser = await playwright.chromium.launch(headless=False)
page = await browser.new_page()


In [15]:
save_path = "../data/data_09_12_24/crawl/html/"

post_xpath = "/html/body/div[1]/div/div/div/div[2]/div/div/div[1]/div[1]/div[2]/div[2]/div/div/div[1]/div/div/div/div/div/div/div"


In [16]:
# loop through the whatsapp links
for idx, row in df_links_whatsapp.iterrows():
    url = row["URL"]
    print(url)
    await page.goto(url)
    await page.wait_for_load_state()
    post = await page.query_selector(f"xpath={post_xpath}")
    post_content = await post.inner_html()
    if post:
        df_links_whatsapp.at[idx, "content"] = post_content
    else:
        print(f"Error with {url}")
    # break


https://faq.whatsapp.com/841426356990637/?cms_platform=android&helpref=platform_switcher


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_links_whatsapp.at[idx, "content"] = post_content


https://faq.whatsapp.com/841426356990637/?helpref=hc_fnav&cms_platform=web
https://faq.whatsapp.com/841426356990637/?cms_platform=iphone&helpref=platform_switcher
https://faq.whatsapp.com/361005896189245?helpref=faq_content
https://faq.whatsapp.com/5155925751185676/?helpref=hc_fnav
https://faq.whatsapp.com/807139050546238/?helpref=hc_fnav&cms_platform=android
https://faq.whatsapp.com/1513589699119080/?helpref=hc_fnav&cms_platform=windows-desktop
https://faq.whatsapp.com/807139050546238/?cms_platform=iphone&helpref=platform_switcher
https://faq.whatsapp.com/361005896189245?helpref=faq_content
https://faq.whatsapp.com/3242937609289432/?helpref=uf_share
https://faq.whatsapp.com/859240711908360/?cms_platform=android&helpref=platform_switcher
https://faq.whatsapp.com/859240711908360/?cms_platform=web&helpref=platform_switcher
https://faq.whatsapp.com/859240711908360/?cms_platform=iphone&helpref=platform_switcher
https://faq.whatsapp.com/3240283752856917/?cms_platform=android&helpref=platfor

In [17]:
# save the content to a file

for idx, row in df_links_whatsapp.iterrows():
    # Edit the title to become filename
    filename = row["Title"].replace(" ", "-")
    filename = re.sub(r"[^a-zA-Z-]", "", filename)
    filename = filename + ".html"
    with open(f"{save_path}{filename}", "w") as f:
        f.write(row["content"])


In [18]:
# close the browser
await browser.close()
