In [4]:
# Load required libraries
import requests
from bs4 import BeautifulSoup
import csv
import zipfile
import io
from urllib.parse import urljoin
import os
import pandas as pd
import sys
import shutil

# Suppress pandas PerformanceWarning messages (advisory only)
from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

sys.path.append(os.path.expanduser('../../'))
from scripts.utils.file_helpers import upload_csv_aws
from scripts.utils.write_metadata import append_metadata

In [6]:
@append_metadata
def scrape_website(base_url, starting_string, gathered_links, download_dir, final_csv_file_name, export=False, varname=''):
    '''
Web scrapes EPA's annual summarized air quality index (AQI) data. The data cleaned and eventually merged into a single .csv file containing California county air quality data from 1980-2023. Data was sourced from the Environmental Protection Agency (EPA).
        
    Methods
    -------
    Data was scraped by looking for a link keyword shared by all counties and data years.
    Data was subsetted to California and separate files were merged into a single resulting csv file
    
    Parameters
    ----------
    base_url: string
              Use the url to EPA's annual summary AQI data: 
              https://aqs.epa.gov/aqsweb/airdata/download_files.html#Annual
    starting_string: string
              A shared string that all links to the data within the url share: annual_aqi_by_county
    gathered_links: string
              Name of the temporary csv file that stores the scraped link references while the data
              is downloaded ('.csv' is appended when missing); the file is deleted afterwards
    download_dir: string
              Name of the temporary folder which will hold the extracted data csv files; the folder
              is deleted once the data has been merged
    final_csv_file_name: string
              Name of the final single csv file containing all CA county air quality data 1980-2023
    export: bool
              If True, scrapes the data and exports final csv to AWS bucket;
              if False, nothing is downloaded and the function returns immediately
    varname: string
              Not used inside this function body.
              NOTE(review): presumably consumed by the append_metadata decorator -- confirm
    
    Script
    ------
    epa_air_quality_pull.ipynb
    '''
    # NOTE(review): the docstring above and the two prints below look like they are
    # consumed by the append_metadata decorator, so their summary/Methods/Script text
    # is left untouched -- confirm before rewording them.
    print('Data transformation: Data subsetted for California.')
    print('Data transformation: Data files merged into a single csv file.')

    # Without an export request no scraping happens at all.
    if not export:
        return

    # Honour the caller-supplied name for the link listing instead of a hard-coded one.
    links_csv = gathered_links if gathered_links.endswith('.csv') else f'{gathered_links}.csv'
    # Intermediate merged file; it is re-read with pandas so dtypes are inferred.
    merged_csv = 'all_ca_air_quality_data.csv'

    try:
        try:
            # Fetch the EPA download page (SSL certificate verification stays enabled).
            page_response = requests.get(base_url, verify=True, timeout=_REQUEST_TIMEOUT)

            if page_response.status_code == 200:
                scraped_links = _find_matching_links(page_response.text, starting_string)

                # Record the scraped links in a CSV file
                with open(links_csv, 'w', newline='') as csv_file:
                    csv_writer = csv.writer(csv_file)
                    csv_writer.writerow(['Link Text', 'Link Href'])  # Write header row
                    csv_writer.writerows(scraped_links)

                _download_and_extract(base_url, scraped_links, download_dir)
            else:
                print(f"Failed to retrieve data. Status code: {page_response.status_code}")

        except requests.exceptions.RequestException as e:
            print(f"An error occurred: {e}")

        header, filtered_data = _collect_state_rows(download_dir, 'California')
        if header is None:
            # Nothing was downloaded/extracted; stop before writing or uploading anything.
            print('No CSV data was found to merge; nothing was exported.')
            return

        # Write the filtered data to a single CSV file
        with open(merged_csv, 'w', newline='') as csv_file:
            csv_writer = csv.writer(csv_file)
            csv_writer.writerow(header)
            csv_writer.writerows(filtered_data)

        # Read the merged file back and drop duplicate years per county
        ca_air_quality = pd.read_csv(merged_csv)
        ca_air_quality = ca_air_quality.drop_duplicates(subset=['Year', 'County'])
        # The index column is still written, so the exported file layout is unchanged.
        ca_air_quality.to_csv(final_csv_file_name)

        bucket_name = 'ca-climate-index'
        directory = '1_pull_data/natural_systems/ecosystem_condition/epa'
        upload_csv_aws([final_csv_file_name], bucket_name, directory)
    finally:
        # Always tidy up the temporary artefacts, even when a step above failed.
        _remove_temporary_outputs([links_csv, merged_csv], download_dir)


# Seconds to wait on a silent connection before giving up on a request.
_REQUEST_TIMEOUT = 60


def _find_matching_links(page_html, starting_string):
    '''Return (link text, href) pairs for every anchor whose href starts with starting_string.'''
    soup = BeautifulSoup(page_html, 'html.parser')
    matches = []
    for anchor in soup.find_all('a'):
        href = anchor.get('href')
        if href and href.startswith(starting_string):
            matches.append((anchor.text, href))
    return matches


def _download_and_extract(base_url, links, download_dir):
    '''Download each linked ZIP archive and extract its contents into download_dir.'''
    for link_text, link_href in links:
        zip_url = urljoin(base_url, link_href)  # hrefs on the page are relative
        zip_response = requests.get(zip_url, timeout=_REQUEST_TIMEOUT)
        if zip_response.status_code != 200:
            print(f"Failed to download {zip_url}. Status code: {zip_response.status_code}")
            continue
        try:
            with zipfile.ZipFile(io.BytesIO(zip_response.content), 'r') as zip_ref:
                zip_ref.extractall(download_dir)
        except zipfile.BadZipFile:
            print(f"Failed to extract: {link_text} ({link_href}) is not a valid ZIP file.")


def _collect_state_rows(input_folder, state_name):
    '''
    Gather the rows whose 'State' column equals state_name from every csv under input_folder.

    Returns (header, rows); header is None when no usable csv file was found.
    The header of the first usable file is the one returned.
    '''
    header = None
    rows = []
    for root, _, files in os.walk(input_folder):
        # Sorted so the row order (and therefore which duplicate is kept) is reproducible.
        for file_name in sorted(files):
            if not file_name.endswith('.csv'):
                continue
            file_path = os.path.join(root, file_name)
            with open(file_path, 'r', newline='') as csv_file:
                csv_reader = csv.reader(csv_file)
                file_header = next(csv_reader, None)  # None for an empty file
                if file_header is None or 'State' not in file_header:
                    print(f"Skipping {file_path}: no 'State' column found.")
                    continue
                if header is None:
                    header = file_header
                state_index = file_header.index('State')
                rows.extend(
                    row for row in csv_reader
                    if len(row) > state_index and row[state_index] == state_name
                )
    return header, rows


def _remove_temporary_outputs(temp_files, folder):
    '''Delete the temporary files that exist and the temporary download folder.'''
    for temp_file in temp_files:
        if os.path.exists(temp_file):
            os.remove(temp_file)

    # Guard against wiping the notebook's own directory if it was passed as download_dir.
    if os.path.realpath(folder) == os.path.realpath(os.getcwd()):
        print(f"Refusing to remove the working directory: {folder}")
        return

    if os.path.isdir(folder):
        shutil.rmtree(folder)
    else:
        print(f"Folder not found: {folder}")

In [7]:
# Run the function to extract EPA's county-level annual air quality data
# With export=False this call only prints the transformation notes;
# nothing is downloaded or uploaded.
scrape_website(
    'https://aqs.epa.gov/aqsweb/airdata/download_files.html#Annual',
    'annual_aqi_by_county',
    'gathered_links',
    'air_quality_csv_files',
    'natural_epa_air_quality.csv',
    export=False,
    varname='natural_epa_air_quality',
)