<a href="https://colab.research.google.com/github/Chu-Yichen/QM2-Group-19/blob/main/QM2_Group_Haversine_Function_%2B_Data_Fetching.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install haversine
import pandas as pd
import numpy as np
import math
import haversine
import requests

Collecting haversine
  Downloading haversine-2.9.0-py2.py3-none-any.whl.metadata (5.8 kB)
Downloading haversine-2.9.0-py2.py3-none-any.whl (7.7 kB)
Installing collected packages: haversine
Successfully installed haversine-2.9.0


In [None]:
def get_site_distances_km(site_locations, lat, lng):
  """Finds the distance between the given co-ordinates and every site.

  Args:
    site_locations: A dataframe indexed by site ID whose two columns hold each
      site's latitude and longitude, in that order.
    lat: Latitude of the point to measure from.
    lng: Longitude of the point to measure from.

  Returns:
    A dictionary mapping each site ID to its haversine distance from (lat, lng).
  """
  origin = (lat, lng)
  # Each row unpacks as (index, first column, second column), i.e. (site ID, latitude, longitude).
  return {
      site_id: haversine.haversine(origin, (site_lat, site_lng))
      for site_id, site_lat, site_lng in site_locations.itertuples()
  }



Collecting haversine
  Downloading haversine-2.9.0-py2.py3-none-any.whl.metadata (5.8 kB)
Downloading haversine-2.9.0-py2.py3-none-any.whl (7.7 kB)
Installing collected packages: haversine
Successfully installed haversine-2.9.0


  monitors = pd.read_csv('https://aqs.epa.gov/aqsweb/airdata/aqs_monitors.zip', converters={'State Code': str, 'County Code': str, 'Site Number': str})


In [None]:
# Load the EPA AQS monitor listing, reading the three code columns as text.
monitors = pd.read_csv(
    'https://aqs.epa.gov/aqsweb/airdata/aqs_monitors.zip',
    converters={'State Code': str, 'County Code': str, 'Site Number': str},
)

# Build a unique ID for each site in the form state-county-site.
monitors['Site Id'] = monitors['State Code'].str.cat(
    [monitors['County Code'], monitors['Site Number']], sep='-'
)

# Keep only the monitors that measure the pollutants we are interested in.
monitors = monitors.loc[
    monitors['Parameter Code'].isin([
        81102,  # PM10
        88101,  # PM2.5
        44201,  # Ozone
        42401,  # Sulfur dioxide
        42101,  # Carbon monoxide
        42602,  # Nitrogen dioxide
    ])
]

# One row per distinct (site, latitude, longitude), without rows that lack co-ordinates, indexed by site ID.
site_locations = (
    monitors[['Site Id', 'Latitude', 'Longitude']]
    .drop_duplicates()
    .dropna()
    .set_index('Site Id')
)

# Distance from every site to Hartsfield-Jackson Atlanta International Airport.
distances = get_site_distances_km(site_locations, 33.6324, -84.4333)

# IDs of all sites less than 15 km from the airport.
nearby_site_ids = [site_id for site_id, km in distances.items() if km < 15]

# Earliest first year and latest sample date per pollutant across the nearby sites,
# to check whether they cover our desired time range of 25 years.
(
    monitors.loc[
        monitors['Site Id'].isin(nearby_site_ids),
        ['Site Id', 'Parameter Code', 'Parameter Name', 'First Year of Data', 'Last Sample Date'],
    ]
    .dropna()
    .groupby(['Parameter Code', 'Parameter Name'])
    .agg({'First Year of Data': 'min', 'Last Sample Date': 'max'})
)


Unnamed: 0_level_0,Unnamed: 1_level_0,First Year of Data,Last Sample Date
Parameter Code,Parameter Name,Unnamed: 2_level_1,Unnamed: 3_level_1
42101,Carbon monoxide,1972.0,2024-06-30
42401,Sulfur dioxide,1962.0,2024-06-30
42602,Nitrogen dioxide (NO2),1972.0,2024-05-31
44201,Ozone,1974.0,2024-05-31
81102,PM10 Total 0-10um STP,1993.0,2024-06-30
88101,PM2.5 - Local Conditions,1999.0,2024-06-30


In [None]:
# Show the IDs of the sites that were found within 15 km of the airport.
nearby_site_ids

['13-063-0091',
 '13-089-0002',
 '13-121-0001',
 '13-121-0002',
 '13-121-0031',
 '13-121-0041',
 '13-121-0043',
 '13-121-0047',
 '13-121-0053',
 '13-121-0055',
 '13-121-1001',
 '13-121-4001',
 '13-121-5001',
 '13-121-5002']

In [17]:
import datetime
import os
import time

import pandas as pd
import requests

# The AQS API credentials are read from the environment so that they are never stored in
# (or published with) the notebook. Set AQS_EMAIL and AQS_KEY before starting the kernel.
# NOTE(review): the key that used to be hardcoded here is still in the repository history
# and should be regenerated.
EMAIL_ADDRESS = os.environ.get("AQS_EMAIL", "")
KEY = os.environ.get("AQS_KEY", "")
API_BASE_URL = "https://aqs.epa.gov/data/api/"

if not (EMAIL_ADDRESS and KEY):
  print("AQS credentials are not configured: set the AQS_EMAIL and AQS_KEY environment variables before fetching data.")

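# This date function was generated with Gemini. It cuts a date range into consecutive pieces that each stay within one calendar year, so that get_daily_aqs_data can make one request per year.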
def split_date_range_by_year(start_date, end_date):
  """Splits a date range into consecutive ranges that each stay within one calendar year.

  Both ends of every range are inclusive. The inputs may be datetime.datetime or
  datetime.date values, but both arguments must be of the same kind.

  Args:
    start_date: The start date of the range.
    end_date: The end date of the range.

  Returns:
    A list of tuples in chronological order, where each tuple contains the start and
    end dates for a year. The list is empty if start_date is after end_date.
  """
  # The year-end marker has to be the same kind of object as the inputs, because Python
  # refuses to compare a datetime with a plain date. datetime.datetime is a subclass of
  # datetime.date, so it must be the one tested for.
  year_end_type = datetime.datetime if isinstance(start_date, datetime.datetime) else datetime.date

  date_ranges = []
  current_start_date = start_date
  while current_start_date <= end_date:
    year_end = year_end_type(current_start_date.year, 12, 31)
    # Stop at 31 December, or earlier if the overall range ends first.
    current_end_date = min(year_end, end_date)
    date_ranges.append((current_start_date, current_end_date))
    # The next range starts on the following day (1 January of the next year).
    current_start_date = current_end_date + datetime.timedelta(days=1)
  return date_ranges

def get_daily_aqs_data(start_date, end_date, state_code, county_code, site_code, parameter_code,
                       request_delay_seconds=10, request_timeout_seconds=60):
  """Gets daily AQS data using the EPA AQS API for a given date range for a single site and parameter code.

  This will batch the requests per year and also wait between requests (10 seconds by
  default) to avoid hitting the API limits.

  See: https://aqs.epa.gov/aqsweb/documents/data_api.html

  Args:
    start_date: The start date of the range to fetch data.
    end_date: The end date of the range to fetch data.
    state_code: The state code of the site (string or integer; zero-padded to 2 digits).
    county_code: The county code of the site (string or integer; zero-padded to 3 digits).
    site_code: The site code of the site (string or integer; zero-padded to 4 digits).
    parameter_code: The parameter code of the pollutant.
    request_delay_seconds: How long to wait between two consecutive requests.
    request_timeout_seconds: How long to wait for the API to answer a single request.

  Returns:
    A pandas dataframe containing the AQS data, this is also saved to a CSV file.
    An empty dataframe (and no CSV file) if the API had no data for the whole range.
    None if any request failed.
  """
  # Site IDs look like '13-063-0091'. Codes passed as integers (e.g. 6 for a state)
  # would otherwise lose their leading zeros, so pad them back to full width.
  state_code = str(state_code).zfill(2)
  county_code = str(county_code).zfill(3)
  site_code = str(site_code).zfill(4)
  site_label = f"{state_code}-{county_code}-{site_code}"

  # Split date range into year ranges
  date_ranges = split_date_range_by_year(start_date, end_date)
  print(f"Making {len(date_ranges)} requests for data for site {site_label} and parameter {parameter_code}")

  request_url = f"{API_BASE_URL}dailyData/bySite"
  dfs = []
  for request_number, (start, end) in enumerate(date_ranges, start=1):
    print(f"Making request for data from {start} to {end}")
    request_params = {
        "email": EMAIL_ADDRESS,
        "key": KEY,
        "param": parameter_code,
        "bdate": start.strftime("%Y%m%d"),
        "edate": end.strftime("%Y%m%d"),
        "state": state_code,
        "county": county_code,
        "site": site_code,
    }
    try:
      # Without a timeout a stalled connection would hang the notebook forever.
      response = requests.get(request_url, params=request_params, timeout=request_timeout_seconds)
    except requests.RequestException as error:
      # Only the error type is shown: the full message can contain the request URL,
      # which includes the API key.
      print(f"Oh no! We can't fetch your data. Try again later... ({type(error).__name__})")
      return None

    if not response.ok:
      print(f"Oh no! We can't fetch your data. Try again later... (HTTP {response.status_code})")
      return None

    data = response.json().get("Data")
    if data:
      dfs.append(pd.DataFrame(data))
      print(f"Got data from {start} to {end}")
    else:
      print(f"No data found from {start} to {end}")

    # Sleep between requests to avoid hitting API limits (the EPA will ban us if we do requests too quickly).
    # There is nothing to wait for after the last request.
    if request_number < len(date_ranges):
      print(f"Sleeping for {request_delay_seconds} seconds")
      time.sleep(request_delay_seconds)

  if not dfs:
    # pd.concat raises a ValueError on an empty list, so handle "no data at all" explicitly.
    print(f"No data found for site {site_label} and parameter {parameter_code}")
    return pd.DataFrame()

  combined_df = pd.concat(dfs)
  combined_df.to_csv(f"{site_label}-{parameter_code}.csv", index=False)
  return combined_df



In [18]:
# Example: daily ozone (parameter 44201) data for monitor 37-183-0014 in Raleigh, NC,
# from 1st Jan 2021 to 1st Nov 2024.
data = get_daily_aqs_data(
    start_date=datetime.datetime(2021, 1, 1),
    end_date=datetime.datetime(2024, 11, 1),
    state_code=37,
    county_code=183,
    site_code="0014",
    parameter_code=44201,
)

Making 4 requests for data for site 37-183-0014 and parameter 44201
Making request for data from 2021-01-01 00:00:00 to 2021-12-31 00:00:00
Got data from 2021-01-01 00:00:00 to 2021-12-31 00:00:00
Sleeping for 10 seconds
Making request for data from 2022-01-01 00:00:00 to 2022-12-31 00:00:00
Got data from 2022-01-01 00:00:00 to 2022-12-31 00:00:00
Sleeping for 10 seconds
Making request for data from 2023-01-01 00:00:00 to 2023-12-31 00:00:00
Got data from 2023-01-01 00:00:00 to 2023-12-31 00:00:00
Sleeping for 10 seconds
Making request for data from 2024-01-01 00:00:00 to 2024-11-01 00:00:00
Got data from 2024-01-01 00:00:00 to 2024-11-01 00:00:00
Sleeping for 10 seconds
