In [1]:
%pip install requests bs4 pandas

Note: you may need to restart the kernel to use updated packages.


In [1]:
import pandas as pd
import requests

In [6]:
df = pd.read_csv('stdhndbk.csv')
df2 = pd.read_csv('missionary.csv')
df2.head(3)
df.head(3)

Unnamed: 0,Section,Subsection,Title,URL
0,1. Background and Foundation,,"1.1 The History of PathwayConnect, Online Lear...",https://www.byupathway.edu/policies/handbook/1...
1,1. Background and Foundation,,1.2 Institutes of Religion & PEF/ Self-Relianc...,https://www.byupathway.edu/policies/handbook/1...
2,1. Background and Foundation,,1.3 Program Objectives,https://www.byupathway.edu/policies/handbook/1...


In [14]:
import hashlib
def generate_content_hash(content):
    '''Generate a SHA-256 hash of the content.'''
    return hashlib.sha256(content).hexdigest()

In [22]:
import requests
import re
import os

def crawl_csv(input_file, output_file='./output_data.csv'):
    '''Takes CSV file in the format Heading, Subheading, Title, URL and processes each URL.'''
    
    # Read the input CSV file
    df = pd.read_csv(input_file)
    
    # Define a base directory within the user's space
    base_dir = './data/'
    
    # mkdirs if they don't exist
    os.makedirs(os.path.join(base_dir, 'html'), exist_ok=True)
    os.makedirs(os.path.join(base_dir, 'pdf'), exist_ok=True)
    os.makedirs(os.path.join(base_dir, 'others'), exist_ok=True)
    
  
    output_data = []
    
    # loop through each row in the input CSV
    for index, row in df.iterrows():
        heading = row[0]
        sub_heading = row[1]
        title = row[2]
        url = row[3]
        
        try:
            response = requests.get(url)
            response.raise_for_status()  # check for http errors
            
            # Edit the title to become filename
            filename = title.replace(' ', '-')
            filename = re.sub(r'[^a-zA-Z-]', '', filename)
            content_type = response.headers.get('content-type')
            
            if 'text/html' in content_type:
                filepath = os.path.join(base_dir, 'html', f'{filename}.html')
                content = response.text.encode('utf-8')
                with open(filepath, 'w', encoding='utf-8') as f:
                    f.write(response.text)
                    
            elif 'application/pdf' in content_type:
                filepath = os.path.join(base_dir, 'pdf', f'{filename}.pdf')
                content = response.content
                with open(filepath, 'wb') as f:
                    f.write(response.content)
                    
            else:
                # Get the correct extensions for other file types
                file_extension = content_type.split('/')[-1]
                filepath = os.path.join(base_dir, 'others', f'{filename}.{file_extension}')
                content = response.content
                with open(filepath, 'wb') as f:
                    f.write(response.content)
            
            # create content hash
            content_hash = generate_content_hash(content)
            
            # Append to the output list
            output_data.append([heading, sub_heading, title, url, filepath, content_type.split('/')[1].split(';')[0], content_hash])
        
        except requests.exceptions.RequestException as e:
            print(f"Failed to retrieve {url}: {e}")
            # In case of failure, append the error information
            output_data.append([heading, sub_heading, title, url, None, 'Error', None])

    # Create a DataFrame from the output data
    output_df = pd.DataFrame(output_data, columns=['Heading', 'Subheading', 'Title', 'URL', 'Filepath', 'Content Type', 'Content Hash'])
    
    # Append to the existing CSV file or create a new one if it doesn't exist
    if os.path.exists(output_file):
        output_df.to_csv(output_file, mode='a', header=False, index=False)
    else:
        output_df.to_csv(output_file, index=False)

    print(f"Processing completed. Output saved to {output_file}")

In [25]:
for filename in os.listdir('.'):
  if filename.endswith('.csv'):
    print(f'Now handling {filename}!')
    crawl_csv(filename)

Now handling missionary.csv!


  heading = row[0]
  sub_heading = row[1]
  title = row[2]
  url = row[3]


Failed to retrieve https://office365lds.sharepoint.com/sites/BYUPW-MissionaryServices/SitePages/SHEP--Improving-Your-Role-in-the-gathering.aspx: 403 Client Error: Forbidden for url: https://office365lds.sharepoint.com/sites/BYUPW-MissionaryServices/SitePages/SHEP--Improving-Your-Role-in-the-gathering.aspx
Failed to retrieve https://office365lds.sharepoint.com/sites/BYU-PathwayTechnicalSupport/SitePages/System-Status.aspx?xsdata=MDV8MDJ8ZWR3YXJkc3JpY2tpZUBieXVwYXRod2F5Lm9yZ3w1YWFlYjFhNDllOTY0MzZkNTE3MzA4ZGM2Zjg3OGY2OHw2MWU2ZWViMzVmZDc0YWFhYWUzYzYxZThkZWIwOWI3OXwwfDB8NjM4NTA3ODc1MjE5Nzg2NDg1fFVua25vd258VFdGcGJHWnNiM2Q4ZXlKV0lqb2lNQzR3TGpBd01EQWlMQ0pRSWpvaVYybHVNeklpTENKQlRpSTZJazFoYVd3aUxDSlhWQ0k2TW4wPXwwfHx8&sdata=QnlHMGZ5SXd5enNYRUM3bktoa0lNZ0Z3VE84Yk9EK0hhYmhLakM5aXVhcz0%3D&CT=1715195813843&OR=OWA-NT-Mail&CID=a0a1c942-2326-f38f-5387-ba456eb48a6a&clickParams=eyJYLUFwcE5hbWUiOiJNaWNyb3NvZnQgT3V0bG9vayBXZWIgQXBwIiwiWC1BcHBWZXJzaW9uIjoiMjAyNDA0MTkwMDcuMzUiLCJPUyI6Ik1hYyBPUyBYIHVuZGVmaW5lZ

  heading = row[0]
  sub_heading = row[1]
  title = row[2]
  url = row[3]


Processing completed. Output saved to ./output_data.csv
Now handling acc_site.csv!


  heading = row[0]
  sub_heading = row[1]
  title = row[2]
  url = row[3]


Failed to retrieve https://office365lds.sharepoint.com/sites/BYU-PathwayTechnicalSupport/SitePages/System-Status.aspx?xsdata=MDV8MDJ8ZWR3YXJkc3JpY2tpZUBieXVwYXRod2F5Lm9yZ3w1YWFlYjFhNDllOTY0MzZkNTE3MzA4ZGM2Zjg3OGY2OHw2MWU2ZWViMzVmZDc0YWFhYWUzYzYxZThkZWIwOWI3OXwwfDB8NjM4NTA3ODc1MjE5Nzg2NDg1fFVua25vd258VFdGcGJHWnNiM2Q4ZXlKV0lqb2lNQzR3TGpBd01EQWlMQ0pRSWpvaVYybHVNeklpTENKQlRpSTZJazFoYVd3aUxDSlhWQ0k2TW4wPXwwfHx8&sdata=QnlHMGZ5SXd5enNYRUM3bktoa0lNZ0Z3VE84Yk9EK0hhYmhLakM5aXVhcz0%3D&CT=1715195813843&OR=OWA-NT-Mail&CID=a0a1c942-2326-f38f-5387-ba456eb48a6a&clickParams=eyJYLUFwcE5hbWUiOiJNaWNyb3NvZnQgT3V0bG9vayBXZWIgQXBwIiwiWC1BcHBWZXJzaW9uIjoiMjAyNDA0MTkwMDcuMzUiLCJPUyI6Ik1hYyBPUyBYIHVuZGVmaW5lZCJ9: 403 Client Error: Forbidden for url: https://office365lds.sharepoint.com/sites/BYU-PathwayTechnicalSupport/SitePages/System-Status.aspx?xsdata=MDV8MDJ8ZWR3YXJkc3JpY2tpZUBieXVwYXRod2F5Lm9yZ3w1YWFlYjFhNDllOTY0MzZkNTE3MzA4ZGM2Zjg3OGY2OHw2MWU2ZWViMzVmZDc0YWFhYWUzYzYxZThkZWIwOWI3OXwwfDB8NjM4NTA3ODc1MjE5Nzg

In [28]:
dfout = pd.read_csv('output_data.csv')
dfout

Unnamed: 0,Heading,Subheading,Title,URL,Filepath,Content Type,Content Hash
0,BYU-Pathway Worldwide,BYU-Pathway Worldwide Overview,Apply For and Check Status of HJG Scholarship.pdf,https://missionaries.prod.byu-pathway.psdops.c...,./data/pdf/Apply-For-and-Check-Status-of-HJG-S...,pdf,f4037bb1aa9a14e28200af2a334c3ab45e1eeedfd06aab...
1,BYU-Pathway Worldwide,BYU-Pathway Worldwide Overview,Area Manager Assignments.pdf,https://missionaries.prod.byu-pathway.psdops.c...,./data/pdf/Area-Manager-Assignmentspdf.pdf,pdf,74378f11227dae89562d11a07a7121f5b1537a00bcfc07...
2,BYU-Pathway Worldwide,BYU-Pathway Worldwide Overview,Area Structure and Communication.pdf,https://missionaries.prod.byu-pathway.psdops.c...,./data/pdf/Area-Structure-and-Communicationpdf...,pdf,e298fd7abcd56223e147a9e259a92cc7a7231dd74e554d...
3,BYU-Pathway Worldwide,BYU-Pathway Worldwide Overview,BYU-Pathway History-Full.pdf,https://missionaries.prod.byu-pathway.psdops.c...,./data/pdf/BYU-Pathway-History-Fullpdf.pdf,pdf,aa0009e6dea84caa2688b394d758d442100a4be6644a1f...
4,BYU-Pathway Worldwide,BYU-Pathway Worldwide Overview,BYU-Pathway History-Short.pdf,https://missionaries.prod.byu-pathway.psdops.c...,./data/pdf/BYU-Pathway-History-Shortpdf.pdf,pdf,c211fb911700bb092e8b1dcc51c517b0fe4784118cc201...
...,...,...,...,...,...,...,...
514,Policies,,Policy PATH EnglishConnect Naming Convention f...,https://missionaries.prod.byu-pathway.psdops.c...,./data/pdf/Policy-PATH-EnglishConnect-Naming-C...,pdf,df64222e930bb67a242c004e826e30d6bc7aa85b86c2d8...
515,Policies,,Policy PATH Naming Convention for Groups.pdf,https://missionaries.prod.byu-pathway.psdops.c...,./data/pdf/Policy-PATH-Naming-Convention-for-G...,pdf,7973e56498e70b3e4b2aed20a7b76b04d89c7230aa343e...
516,Policies,,Policy Service Missionary Name Badges.pdf,https://missionaries.prod.byu-pathway.psdops.c...,./data/pdf/Policy-Service-Missionary-Name-Badg...,pdf,d96200430e850fdc5667d2935f973501b8ef625e50bce6...
517,Policies,,Policy Student Grievance Against Missionary Pu...,https://missionaries.prod.byu-pathway.psdops.c...,./data/pdf/Policy-Student-Grievance-Against-Mi...,pdf,76b8ca0083abf0549d0b5c14cf10699c2afa5c7e5f3ae9...


In [29]:
df_error = dfout[dfout['Content Type'] == 'Error']
df_error

Unnamed: 0,Heading,Subheading,Title,URL,Filepath,Content Type,Content Hash
123,Gatherings,Other Gathering Resources,Improving Your Role in the Gathering,https://office365lds.sharepoint.com/sites/BYUP...,,Error,
156,Software Systems,Troubleshooting,Check to See if a BYU-Pathway Worldwide System...,https://office365lds.sharepoint.com/sites/BYU-...,,Error,
187,PathwayConnect (PC),PathwayConnect General Information,Students Visiting Another Gathering.pdf,https://missionaries.prod.byu-pathway.psdops.c...,,Error,
501,Missionary Software & Uses,Troubleshooting,Check to See if a BYU-Pathway Worldwide System...,https://office365lds.sharepoint.com/sites/BYU-...,,Error,
