In [None]:
#%pip install requests bs4 pandas

In [1]:
import pandas as pd
import requests

In [2]:
df = pd.read_csv('stdhndbk.csv')
df2 = pd.read_csv('missionary.csv')
# df2.head(3)
df.head(3)

Unnamed: 0,Section,Subsection,Title,URL
0,1. Background and Foundation,,"1.1 The History of PathwayConnect, Online Lear...",https://www.byupathway.edu/policies/handbook/1...
1,1. Background and Foundation,,1.2 Institutes of Religion & PEF/ Self-Relianc...,https://www.byupathway.edu/policies/handbook/1...
2,1. Background and Foundation,,1.3 Program Objectives,https://www.byupathway.edu/policies/handbook/1...


In [3]:
import hashlib
def generate_content_hash(content):
    '''Generate a SHA-256 hash of the content.'''
    return hashlib.sha256(content).hexdigest()

In [None]:
%pip install playwright

In [4]:

from playwright.async_api import async_playwright
async def fetch_content_with_playwright(url, filepath):
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        page = await browser.new_page()
        await page.goto(url)
        content = await page.content()
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(content)
        await browser.close()

In [None]:
%pip install nest_asyncio

In [10]:
import re
import os
import time
import asyncio
import nest_asyncio

nest_asyncio.apply()
async def crawl_csv(input_file, output_file='./output_data.csv'):
    '''Takes CSV file in the format Heading, Subheading, Title, URL and processes each URL.'''
    
    # Read the input CSV file
    df = pd.read_csv(input_file)
    
    # Define a base directory within the user's space
    base_dir = './data/'
    
    # Create directories if they don't exist
    os.makedirs(os.path.join(base_dir, 'html'), exist_ok=True)
    os.makedirs(os.path.join(base_dir, 'pdf'), exist_ok=True)
    os.makedirs(os.path.join(base_dir, 'others'), exist_ok=True)
    
    output_data = []
    
    async def process_row(row):
        heading = row[0]
        sub_heading = row[1]
        title = row[2]
        url = row[3]
        
        # Edit the title to become filename
        filename = title.replace(' ', '-')
        filename = re.sub(r'[^a-zA-Z-]', '', filename)
        
        # Determine the filepaths
        html_filepath = os.path.join(base_dir, 'html', f'{filename}.html')
        pdf_filepath = os.path.join(base_dir, 'pdf', f'{filename}.pdf')
        
        # Skip fetching if the file already exists
        if os.path.exists(html_filepath) or os.path.exists(pdf_filepath):
            print(f"File already exists for {title}. Skipping fetch.")
            return
        
        retry_attempts = 3
        
        while retry_attempts > 0:
            try:
                time.sleep(3)  
                response = requests.get(url)
                response.raise_for_status()  # http errors
                content_type = response.headers.get('content-type')
                
                if 'text/html' in content_type:
                    content = response.text.encode('utf-8')
                    filepath = html_filepath
                    with open(filepath, 'w', encoding='utf-8') as f:
                        f.write(response.text)
                        
                elif 'application/pdf' in content_type:
                    content = response.content
                    filepath = pdf_filepath
                    with open(filepath, 'wb') as f:
                        f.write(response.content)
                        
                else:
                    # Handle other content types by saving with the correct extension
                    file_extension = content_type.split('/')[-1].split(';')[0]
                    filepath = os.path.join(base_dir, 'others', f'{filename}.{file_extension}')
                    content = response.content
                    with open(filepath, 'wb') as f:
                        f.write(response.content)
                
                # Create content hash
                content_hash = generate_content_hash(content)
                
                # Append to the output list
                output_data.append([heading, sub_heading, title, url, filepath, content_type.split('/')[1].split(';')[0], content_hash])
                break  # Exit retry loop after successful fetch
            
            except requests.exceptions.HTTPError as http_err:
                if response.status_code == 403:
                    print(f"Access forbidden for {url}: {http_err}. Using Playwright to fetch HTML.")
                    html_filepath = os.path.join(base_dir, 'html', f'{filename}.html')
                    await fetch_content_with_playwright(url, html_filepath)
                    output_data.append([heading, sub_heading, title, url, html_filepath, 'text/html', None])
                    break  # Don't retry if it's a 403 error
                else:
                    print(f"HTTP error occurred for {url}: {http_err}")
                    retry_attempts -= 1
                    if retry_attempts > 0:
                        print(f"Retrying in 10 seconds...")
                        time.sleep(10)
                    else:
                        output_data.append([heading, sub_heading, title, url, str(http_err), str(response.status_code), None])
            
            except requests.exceptions.RequestException as err:
                print(f"Error occurred for {url}: {err}")
                retry_attempts -= 1
                if retry_attempts > 0:
                    print(f"Retrying in 10 seconds...")
                    time.sleep(10)
                else:
                    output_data.append([heading, sub_heading, title, url, str(err), 'Error', None])

    # Create a list of tasks for asyncio to run
    tasks = [process_row(row) for _, row in df.iterrows()]
    
    # Run the tasks asynchronously
    await asyncio.gather(*tasks)

    # Create a DataFrame from the output data
    output_df = pd.DataFrame(output_data, columns=['Heading', 'Subheading', 'Title', 'URL', 'Filepath', 'Content Type', 'Content Hash'])
    
    # Append to the existing CSV file or create a new one if it doesn't exist
    if os.path.exists(output_file):
        output_df.to_csv(output_file, mode='a', header=False, index=False)
    else:
        output_df.to_csv(output_file, index=False)

    print(f"Processing completed. Output saved to {output_file}")

In [25]:
async def main():
  for filename in os.listdir('.'):
    if filename.endswith('.csv'):
      print(f'Now handling {filename}!')
      await crawl_csv(filename)
      

In [26]:
if __name__ == "__main__":
    loop = asyncio.get_event_loop()
    loop.run_until_complete(main())

Now handling missionary.csv!


  heading = row[0]
  sub_heading = row[1]
  title = row[2]
  url = row[3]


Access forbidden for https://www.byui.edu/learning-model/: 403 Client Error: Forbidden for url: https://www.byui.edu/learning-model/. Using Playwright to fetch HTML.
File already exists for BYU-Pathway Worldwide History. Skipping fetch.
Access forbidden for https://www.byui.edu/academic-calendar/: 403 Client Error: Forbidden for url: https://www.byui.edu/academic-calendar/. Using Playwright to fetch HTML.
Access forbidden for https://www.byui.edu/transfer-evaluation/course-transfer-tool: 403 Client Error: Forbidden for url: https://www.byui.edu/transfer-evaluation/course-transfer-tool. Using Playwright to fetch HTML.
Access forbidden for https://office365lds.sharepoint.com/sites/EnglishConnect: 403 Client Error: Forbidden for url: https://office365lds.sharepoint.com/sites/EnglishConnect. Using Playwright to fetch HTML.
Access forbidden for https://office365lds.sharepoint.com/sites/BYUPW-MissionaryServices/SitePages/SHEP--Improving-Your-Role-in-the-gathering.aspx: 403 Client Error: Forb

  heading = row[0]
  sub_heading = row[1]
  title = row[2]
  url = row[3]


Processing completed. Output saved to ./output_data.csv
Now handling acc_site.csv!


  heading = row[0]
  sub_heading = row[1]
  title = row[2]
  url = row[3]


File already exists for Student Issue Escalation Process. Skipping fetch.
File already exists for Email Data Privacy. Skipping fetch.
File already exists for Updating Group Information. Skipping fetch.
Access forbidden for https://office365lds.sharepoint.com/sites/BYU-PathwayTechnicalSupport/SitePages/System-Status.aspx?xsdata=MDV8MDJ8ZWR3YXJkc3JpY2tpZUBieXVwYXRod2F5Lm9yZ3w1YWFlYjFhNDllOTY0MzZkNTE3MzA4ZGM2Zjg3OGY2OHw2MWU2ZWViMzVmZDc0YWFhYWUzYzYxZThkZWIwOWI3OXwwfDB8NjM4NTA3ODc1MjE5Nzg2NDg1fFVua25vd258VFdGcGJHWnNiM2Q4ZXlKV0lqb2lNQzR3TGpBd01EQWlMQ0pRSWpvaVYybHVNeklpTENKQlRpSTZJazFoYVd3aUxDSlhWQ0k2TW4wPXwwfHx8&sdata=QnlHMGZ5SXd5enNYRUM3bktoa0lNZ0Z3VE84Yk9EK0hhYmhLakM5aXVhcz0%3D&CT=1715195813843&OR=OWA-NT-Mail&CID=a0a1c942-2326-f38f-5387-ba456eb48a6a&clickParams=eyJYLUFwcE5hbWUiOiJNaWNyb3NvZnQgT3V0bG9vayBXZWIgQXBwIiwiWC1BcHBWZXJzaW9uIjoiMjAyNDA0MTkwMDcuMzUiLCJPUyI6Ik1hYyBPUyBYIHVuZGVmaW5lZCJ9: 403 Client Error: Forbidden for url: https://office365lds.sharepoint.com/sites/BYU-PathwayTechnica

In [27]:
dfout = pd.read_csv('output_data.csv')
dfout.head()

Unnamed: 0,Heading,Subheading,Title,URL,Filepath,Content Type,Content Hash
0,BYU-Pathway Support Knowledge Articles,,Using BYU-Pathway Support Knowledge Articles,https://missionaries.prod.byu-pathway.psdops.c...,./data/pdf/Using-BYU-Pathway-Support-Knowledge...,pdf,195352aea2bda5641631964b1231d27b198c0519948458...
1,BYU-Pathway Worldwide,BYU-Pathway Worldwide Overview,Service Missionaries Flyer,https://missionaries.prod.byu-pathway.psdops.c...,./data/pdf/Service-Missionaries-Flyer.pdf,pdf,8dae12cfc07f3ed267dd3e50b9ff19593c4a2924d6ad24...
2,BYU-Pathway Worldwide,BYU-Pathway Worldwide Overview,5 things to know about BYU-Pathway Worldwide,https://www.byupathway.edu/articles/feature/5-...,./data/html/-things-to-know-about-BYU-Pathway-...,html,b1ef011ab925d213007ff7bb49e338937c168a1671cdfa...
3,BYU-Pathway Worldwide,BYU-Pathway Worldwide Overview,BYU-Pathway Student Honor Code,https://www.byupathway.edu/student-wellness/st...,./data/html/BYU-Pathway-Student-Honor-Code.html,html,5e67fe69b5dab7665f8db3c8672bd182a0720ec7510362...
4,BYU-Pathway Worldwide,BYU-Pathway Worldwide Overview,BYU-Pathway Worldwide History,https://missionaries.prod.byu-pathway.psdops.c...,./data/pdf/BYU-Pathway-Worldwide-History.pdf,pdf,6d7250be622c3655dd9c59301e778be3126e659f0fc926...


In [20]:
df = pd.read_csv('output_data.csv')

# Filter out rows with '#' in the URL
# df_filtered = df[~df['URL'].str.contains('#')]

# # Save the filtered DataFrame back to CSV
# df_filtered.to_csv('output_data.csv', index=False)

In [28]:
df_error = dfout[(dfout['Content Type'] == '403') | (dfout['Content Type'] == '404')]
df_error

Unnamed: 0,Heading,Subheading,Title,URL,Filepath,Content Type,Content Hash
190,PathwayConnect (PC),PathwayConnect General Information,Students Visiting Another Gathering,https://missionaries.prod.byu-pathway.psdops.c...,404 Client Error: Not Found for url: https://m...,404,


In [29]:
df_error.to_csv('error_file.csv', mode='w', index=False)