## Scrape with Firecrawl

Outputs: an HTML file and a Markdown file for each URL listed in the metadata CSV.

In [None]:
import pandas as pd
from firecrawl import FirecrawlApp
import os
from dotenv import load_dotenv
from time import sleep
import chardet

# Load environment variables; the Firecrawl API key is read from the environment below
load_dotenv()

# --- Run configuration ---
overwrite_existing = False  # Set to True to overwrite existing files
expected_catalog_years = 5  # Expected number of unique catalog years
metadata_path = '../staged/ug_cat/ug_cat_metadata.csv'
html_output_base_path = '../processing/ug_cat/fc_html/'
md_output_base_path = '../processing/ug_cat/fc_md/'

# Guess the CSV's encoding from its first 10KB, then load it with that encoding
with open(metadata_path, 'rb') as metadata_file:
    result = chardet.detect(metadata_file.read(10000))
encoding = result['encoding']
print(f"Detected encoding: {encoding}")
df = pd.read_csv(metadata_path, encoding=encoding)

# Collect the distinct catalog years present in the metadata
catalog_years = df['catalog_year'].unique()
print (f'Found {len(catalog_years)} unique catalog years: {catalog_years}')

# Sanity check only: a mismatch prints a warning but does not stop the run
year_count_matches = len(catalog_years) == expected_catalog_years
if not year_count_matches:
    print(f'Warning: Expected {expected_catalog_years} unique catalog years, found {len(catalog_years)}. Please check the metadata file.')

# Firecrawl client, authenticated with the FIRECRAWL_API_KEY environment variable
firecrawl = FirecrawlApp(api_key=os.environ.get('FIRECRAWL_API_KEY'))

# Scrape every URL listed in the metadata, one catalog year at a time, and save
# the HTML and Markdown returned by Firecrawl to
# <base path>/<catalog year>/<file_name>.html and .md respectively.
for year in catalog_years:
    print(f'Preprocessing catalog year: {year}')

    df_year = df[df['catalog_year'] == year]

    # str() keeps the path concatenation working even if pandas parsed the
    # catalog_year column as a non-string type.
    html_year_dir = html_output_base_path + str(year) + '/'
    md_year_dir = md_output_base_path + str(year) + '/'

    # Create the per-year output folders up front: open(..., 'w') does not
    # create missing parent directories, and failing after the scrape would
    # waste an API call.
    os.makedirs(html_year_dir, exist_ok=True)
    os.makedirs(md_year_dir, exist_ok=True)

    for _, row in df_year.iterrows():
        # Construct the output file paths for this row
        html_file_path = html_year_dir + row['file_name'] + '.html'
        md_file_path = md_year_dir + row['file_name'] + '.md'

        url = row['url']

        # Skip the scrape only when nothing would be written: both files exist
        # and we were not asked to overwrite. (Without the overwrite check,
        # overwrite_existing=True could never refresh a fully scraped page.)
        if (
            not overwrite_existing
            and os.path.exists(html_file_path)
            and os.path.exists(md_file_path)
        ):
            print(f"File {row['file_name']} already exists, skipping scrape entirely.")
            continue

        # Scrape once, requesting both formats in a single call
        fc_response = firecrawl.scrape_url(url, formats=['markdown', 'html'])
        html_content = fc_response.html
        md_content = fc_response.markdown

        # (destination, content, message printed when an existing file is kept)
        outputs = (
            (html_file_path, html_content,
             f"File {html_file_path} already exists, skipping write."),
            (md_file_path, md_content,
             f"File {md_file_path} already exists, skipping."),
        )

        for out_path, content, skip_message in outputs:
            already_exists = os.path.exists(out_path)

            if already_exists and not overwrite_existing:
                print(skip_message)
                continue

            # Defensive: never open the file when there is nothing to write.
            # Opening in 'w' mode creates/truncates the file first, so a failed
            # write would leave an empty file that later runs treat as scraped.
            if content is None:
                print(f"Warning: no content returned for {url}, not writing {out_path}")
                continue

            if already_exists:
                print(f"Overwriting existing file: {out_path}")
            with open(out_path, 'w', encoding='utf-8') as f:
                f.write(content)
            print(f"Scraped {url} and saved to {out_path}")

        # Sleep to avoid hitting rate limits (only reached after a real scrape)
        sleep(17)



Detected encoding: ISO-8859-1
Found 5 unique catalog years: ['21_22' '22_23' '23_24' '24_25' '25_26']
Preprocessing catalog year: 21_22
File BSCE Civil Engineering already exists, skipping.
File BSCE Civil Engineering, Energy Infrastructure Concentration already exists, skipping.
File BSCE Civil Engineering, Land Development Engineering Concentration already exists, skipping.
File BSCPE Computer Engineering already exists, skipping.
File BSCPE Computer Engineering, Machine Learning Concentration already exists, skipping.
File BS Dual Degree Physics and Computer Engineering already exists, skipping.
File BSEE Electrical Engineering already exists, skipping.
File BSEE Electrical Engineering, Machine Learning Concentration already exists, skipping.
File BSEE Electrical Engineering, Power and Energy Systems Concentration already exists, skipping.
File BS Dual Degree Physics and Electrical Engineering already exists, skipping.
File BSME Mechanical Engineering already exists, skipping.
File 

TODO: Consider switching to OpenAI.
TODO: Add detailed quality checks to verify that the scraped output is good.

Detected encoding: ISO-8859-1
Found 5 unique catalog years: ['21_22' '22_23' '23_24' '24_25' '25_26']
Processing catalog year: 21_22
File BSCE Civil Engineering already exists, skipping.
File BSCE Civil Engineering, Energy Infrastructure Concentration already exists, skipping.
File BSCE Civil Engineering, Land Development Engineering Concentration already exists, skipping.
File BSCPE Computer Engineering already exists, skipping.
File BSCPE Computer Engineering, Machine Learning Concentration already exists, skipping.
File BS Dual Degree Physics and Computer Engineering already exists, skipping.
File BSEE Electrical Engineering already exists, skipping.
File BSEE Electrical Engineering, Machine Learning Concentration already exists, skipping.
File BSEE Electrical Engineering, Power and Energy Systems Concentration already exists, skipping.
File BS Dual Degree Physics and Electrical Engineering already exists, skipping.
File BSME Mechanical Engineering already exists, skipping.
File BSM