In [1]:
%pip install requests bs4 pandas

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import requests

In [3]:
df = pd.read_csv('stdhndbk.csv')
df2 = pd.read_csv('missionary.csv')
# df2.head(3)
df.head(3)

Unnamed: 0,Section,Subsection,Title,URL
0,1. Background and Foundation,,"1.1 The History of PathwayConnect, Online Lear...",https://www.byupathway.edu/policies/handbook/1...
1,1. Background and Foundation,,1.2 Institutes of Religion & PEF/ Self-Relianc...,https://www.byupathway.edu/policies/handbook/1...
2,1. Background and Foundation,,1.3 Program Objectives,https://www.byupathway.edu/policies/handbook/1...


In [4]:
import hashlib
def generate_content_hash(content):
    '''Generate a SHA-256 hash of the content.'''
    return hashlib.sha256(content).hexdigest()

In [17]:
import re
import os
import time

def crawl_csv(input_file, output_file='./output_data.csv'):
    '''Takes CSV file in the format Heading, Subheading, Title, URL and processes each URL.'''
    
    # Read the input CSV file
    df = pd.read_csv(input_file)
    
    # Define a base directory within the user's space
    base_dir = './data/'
    
    # Create directories if they don't exist
    os.makedirs(os.path.join(base_dir, 'html'), exist_ok=True)
    os.makedirs(os.path.join(base_dir, 'pdf'), exist_ok=True)
    os.makedirs(os.path.join(base_dir, 'others'), exist_ok=True)
    
    output_data = []
    
    # Loop through each row in the input CSV
    for index, row in df.iterrows():
        heading = row[0]
        sub_heading = row[1]
        title = row[2]
        url = row[3]
        
        # Edit the title to become filename
        filename = title.replace(' ', '-')
        filename = re.sub(r'[^a-zA-Z-]', '', filename)
        
        # Determine the filepaths
        html_filepath = os.path.join(base_dir, 'html', f'{filename}.html')
        pdf_filepath = os.path.join(base_dir, 'pdf', f'{filename}.pdf')
        
        # Skip fetching if the file already exists
        if os.path.exists(html_filepath) or os.path.exists(pdf_filepath):
            print(f"File already exists for {title}. Skipping fetch.")
            continue
        
        retry_attempts = 3
        
        while retry_attempts > 0:
            try:
                time.sleep(3)  # Pause for 3 seconds before each request
                response = requests.get(url)
                response.raise_for_status()  # Ensure we catch HTTP errors
                content_type = response.headers.get('content-type')
                
                if 'text/html' in content_type:
                    content = response.text.encode('utf-8')
                    filepath = html_filepath
                    with open(filepath, 'w', encoding='utf-8') as f:
                        f.write(response.text)
                        
                elif 'application/pdf' in content_type:
                    content = response.content
                    filepath = pdf_filepath
                    with open(filepath, 'wb') as f:
                        f.write(response.content)
                        
                else:
                    # Handle other content types by saving with the correct extension
                    file_extension = content_type.split('/')[-1].split(';')[0]
                    filepath = os.path.join(base_dir, 'others', f'{filename}.{file_extension}')
                    content = response.content
                    with open(filepath, 'wb') as f:
                        f.write(response.content)
                
                # Create content hash
                content_hash = generate_content_hash(content)
                
                # Append to the output list
                output_data.append([heading, sub_heading, title, url, filepath, content_type.split('/')[1].split(';')[0], content_hash])
                break  # Exit retry loop after successful fetch
            
            except requests.exceptions.HTTPError as http_err:
                if response.status_code == 403:
                    print(f"Access forbidden for {url}: {http_err}")
                    output_data.append([heading, sub_heading, title, url, str(http_err), str(response.status_code), None])
                    break  # Don't retry if it's a 403 error
                else:
                    print(f"HTTP error occurred for {url}: {http_err}")
                    retry_attempts -= 1
                    if retry_attempts > 0:
                        print(f"Retrying in 10 seconds...")
                        time.sleep(3)
                    else:
                        output_data.append([heading, sub_heading, title, url, str(http_err), str(response.status_code), None])
            
            except requests.exceptions.RequestException as err:
                print(f"Error occurred for {url}: {err}")
                retry_attempts -= 1
                if retry_attempts > 0:
                    print(f"Retrying in 10 seconds...")
                    time.sleep(3)
                else:
                    output_data.append([heading, sub_heading, title, url, str(err), 'Error', None])

    # Create a DataFrame from the output data
    output_df = pd.DataFrame(output_data, columns=['Heading', 'Subheading', 'Title', 'URL', 'Filepath', 'Content Type', 'Content Hash'])
    
    # Append to the existing CSV file or create a new one if it doesn't exist
    if os.path.exists(output_file):
        output_df.to_csv(output_file, mode='a', header=False, index=False)
    else:
        output_df.to_csv(output_file, index=False)

    print(f"Processing completed. Output saved to {output_file}")

In [18]:
for filename in os.listdir('.'):
  if filename.endswith('.csv'):
    print(f'Now handling {filename}!')
    crawl_csv(filename)

Now handling missionary.csv!
File already exists for Apply For and Check Status of HJG Scholarship.pdf. Skipping fetch.
File already exists for Area Manager Assignments.pdf. Skipping fetch.
File already exists for Area Structure and Communication.pdf. Skipping fetch.
File already exists for BYU-Pathway History-Full.pdf. Skipping fetch.
File already exists for BYU-Pathway History-Short.pdf. Skipping fetch.
File already exists for BYU-Pathway Worldwide History.pdf. Skipping fetch.
File already exists for BYU-PW Strategy.pdf. Skipping fetch.
File already exists for Certificate-First Approach.pdf. Skipping fetch.
File already exists for Check Student History for Past Grades.pdf. Skipping fetch.
File already exists for CHINESE CSM Application Oct2021.pdf. Skipping fetch.
File already exists for Church HR Annual Workforce Training.pdf. Skipping fetch.
File already exists for Create a Relationship with Your Mentor.pdf. Skipping fetch.
File already exists for Defining Characteristics.pdf. Skip

  heading = row[0]
  sub_heading = row[1]
  title = row[2]
  url = row[3]


File already exists for Policy - Missionary Dress & Grooming policy.pdf. Skipping fetch.
File already exists for Policy-PATH_Naming Convention for Groups.pdf. Skipping fetch.
File already exists for PORTUGUESE CSM Application Oct2021.pdf. Skipping fetch.
File already exists for Restrictions for Applying to BYUPW.pdf. Skipping fetch.
File already exists for RUSSIAN CSM Application Oct2021.pdf. Skipping fetch.
File already exists for Scholarship Information.pdf. Skipping fetch.
File already exists for Service Missionaries-2.pdf. Skipping fetch.
File already exists for SPANISH CSM Application Oct2021.pdf. Skipping fetch.
File already exists for TAGALOG CSM Application Oct2021.pdf. Skipping fetch.
File already exists for Three-Year Degree.pdf. Skipping fetch.
File already exists for Updating Group Information.pdf. Skipping fetch.
File already exists for Different Places for my Courses. Skipping fetch.
File already exists for Quick Start:  Using Canvas on a Web Browser. Skipping fetch.
File

  heading = row[0]
  sub_heading = row[1]
  title = row[2]
  url = row[3]


Processing completed. Output saved to ./output_data.csv
Now handling acc_site.csv!


  heading = row[0]
  sub_heading = row[1]
  title = row[2]
  url = row[3]


Error occurred for https://missionaries.prod.byu-pathway.psdops.com/International-Area-Transitioning-to-Block-a-Guide: HTTPSConnectionPool(host='missionaries.prod.byu-pathway.psdops.com', port=443): Max retries exceeded with url: /International-Area-Transitioning-to-Block-a-Guide (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x7fc5fb395340>: Failed to resolve 'missionaries.prod.byu-pathway.psdops.com' ([Errno -3] Temporary failure in name resolution)"))
Retrying in 10 seconds...
Error occurred for https://missionaries.prod.byu-pathway.psdops.com/International-Area-Transitioning-to-Block-a-Guide: HTTPSConnectionPool(host='missionaries.prod.byu-pathway.psdops.com', port=443): Max retries exceeded with url: /International-Area-Transitioning-to-Block-a-Guide (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x7fc5fb360590>: Failed to resolve 'missionaries.prod.byu-pathway.psdops.com' ([Errno -3] Temporary failure in name resolutio

In [20]:
dfout = pd.read_csv('output_data.csv')
dfout.head()

Unnamed: 0,Heading,Subheading,Title,URL,Filepath,Content Type,Content Hash
0,BYU-Pathway Worldwide,BYU-Pathway Worldwide Overview,Peer Mentor Group Assignments.xlsx,https://missionaries.prod.byu-pathway.psdops.c...,./data/others/Peer-Mentor-Group-Assignmentsxls...,vnd.openxmlformats-officedocument.spreadsheetm...,8f72a5ed90d49870067947277a4f27f4672d28174b93f6...
1,Canvas – Student Questions,Grades and Feedback,How do I check what-if grades?,https://resourcecenter.byupathway.edu/faqs/can...,./data/html/How-do-I-check-what-if-grades.html,html,3db709ca388ae284866c20ad9148d60bbc178ed7a833d4...
2,EnglishConnect 3 (EC3),EnglishConnect 3,English Connect 3 Video July 27.mp4,https://missionaries.prod.byu-pathway.psdops.c...,./data/others/English-Connect--Video-July-mp.mp4,mp4,30098f82520c7cf569be75cd6f8673b933203e5a13713d...
3,Friends of the Church,,Give a BOM to friends of the church,https://www.churchofjesuschrist.org/study/manu...,./data/html/Give-a-BOM-to-friends-of-the-churc...,html,03266ecf7acb69ffd8dfe2e9b23d6b68b3455083cf2809...
4,Friends of the Church,,Help a Friend Connect with the Missionaries,https://www.churchofjesuschrist.org/serve/shar...,./data/html/Help-a-Friend-Connect-with-the-Mis...,html,3bcca84c1482b8e5e837581ea06ae9090c56d516309687...


In [24]:
df_error = dfout[(dfout['Content Type'] == '403') | (dfout['Content Type'] == '404')]
df_error

Unnamed: 0,Heading,Subheading,Title,URL,Filepath,Content Type,Content Hash
11,Gatherings,Other Gathering Resources,Improving Your Role in the Gathering,https://office365lds.sharepoint.com/sites/BYUP...,403 Client Error: Forbidden for url: https://o...,403,
44,Software Systems,Troubleshooting,Check to See if a BYU-Pathway Worldwide System...,https://office365lds.sharepoint.com/sites/BYU-...,403 Client Error: Forbidden for url: https://o...,403,
73,PathwayConnect (PC),PathwayConnect General Information,Students Visiting Another Gathering.pdf,https://missionaries.prod.byu-pathway.psdops.c...,404 Client Error: Not Found for url: https://m...,404,
289,Missionary Software & Uses,Troubleshooting,Check to See if a BYU-Pathway Worldwide System...,https://office365lds.sharepoint.com/sites/BYU-...,403 Client Error: Forbidden for url: https://o...,403,
