# 📥 Data Pull: OpenAQ API

This notebook pulls hourly PM2.5 measurements for California sensors from the OpenAQ v3 API and saves them as a CSV file under `../data/` for further analysis in downstream notebooks.


In [None]:
# Imports and Environment Setup
    # Import libraries
    # Set paths / env vars / API key / headers

# Standard libraries
import os
import sys
import json
import requests
import pandas as pd
from time import sleep
from datetime import datetime

# Put the parent of the current working directory on sys.path (once) so that
# the `scripts.config` import below can be resolved.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if not sys.path.count(project_root):
    sys.path.append(project_root)

# Import API keys securely from config file, which loads them from the .env file
from scripts.config import OPENAQ_API_KEY, GITHUB_PAT

# Define the HTTP headers sent with every OpenAQ API call in this notebook:
#   - "accept" asks for a JSON response body
#   - "X-API-Key" carries the API key imported from scripts.config
headers = {
    "accept": "application/json",
    "X-API-Key": OPENAQ_API_KEY
}

In [3]:
# Pull all locations in the US that monitor PM2.5
url_locations = "https://api.openaq.org/v3/locations"
params = {
    "country": "US",
    "parameters": "pm25",
    "limit": 1000  # only the first page is requested; paginate if more locations match
}
# NOTE(review): the OpenAQ v3 reference appears to name these filters `iso` and
# `parameters_id` — TODO confirm that "country"/"parameters" are honoured by the
# endpoint rather than silently ignored.
# The timeout keeps a stalled connection from hanging the notebook indefinitely.
response = requests.get(url_locations, headers=headers, params=params, timeout=30)
response.raise_for_status()  # fail loudly on 4xx/5xx instead of parsing an error payload
locations = response.json()["results"]

In [None]:
# Filter for California locations
def in_california(lat, lon):
    """Return True when (lat, lon) lies inside a rough bounding box around California.

    The box spans latitude 32.5..42.0 and longitude -124.5..-114.0 (inclusive).
    It is only an approximation, so nearby points outside the state can match.
    """
    # Latitude is checked first; longitude is only examined when latitude passes.
    if not (32.5 <= lat <= 42.0):
        return False
    return -124.5 <= lon <= -114.0

# One record per PM2.5 sensor at every location inside the California bounding box.
ca_sensors = []

for loc in locations:
    coords = loc.get("coordinates", {})
    lat, lon = coords.get("latitude"), coords.get("longitude")
    # Skip locations whose coordinates are missing/falsy or fall outside the box.
    if not (lat and lon and in_california(lat, lon)):
        continue
    ca_sensors.extend(
        {
            "sensor_id": sensor["id"],
            "location_id": loc["id"],
            "location_name": loc["name"],
            "lat": lat,
            "lon": lon,
            "datetimeLast": loc.get("datetimeLast")
        }
        for sensor in loc.get("sensors", [])
        if sensor.get("parameter", {}).get("name") == "pm25"
    )

[{'sensor_id': 350,
  'location_id': 207,
  'location_name': 'MMFRA1001',
  'lat': 39.482481,
  'lon': -121.221235,
  'datetimeLast': {'utc': '2016-03-16T22:00:00Z',
   'local': '2016-03-16T15:00:00-07:00'}},
 {'sensor_id': 354,
  'location_id': 211,
  'location_name': 'Felton Cal-Fire',
  'lat': 37.0481,
  'lon': -122.074603,
  'datetimeLast': {'utc': '2022-04-08T18:00:00Z',
   'local': '2022-04-08T11:00:00-07:00'}},
 {'sensor_id': 357,
  'location_id': 214,
  'location_name': 'MMFRA1001',
  'lat': 39.482385,
  'lon': -121.221128,
  'datetimeLast': {'utc': '2016-03-16T05:00:00Z',
   'local': '2016-03-15T22:00:00-07:00'}},
 {'sensor_id': 401,
  'location_id': 237,
  'location_name': 'San Ysidro',
  'lat': 32.543475,
  'lon': -117.029028,
  'datetimeLast': {'utc': '2016-03-22T15:00:00Z',
   'local': '2016-03-22T08:00:00-07:00'}},
 {'sensor_id': 25272,
  'location_id': 276,
  'location_name': 'Morro Bay',
  'lat': 35.36639,
  'lon': -120.8426,
  'datetimeLast': {'utc': '2023-06-29T16:00:

In [20]:
# Define utility + API functions
    # build save path
    # function to fetch hourly data with pagination for a single sensor
    # wrapper to loop over sensor list, with throttling, progress tracking and optional intermediate saving

# build save path
def build_save_path(prefix, region, date_from, date_to, ext="csv"):
    """Build the relative output path ``../data/<prefix>_<region>_<start>_to_<end>.<ext>``.

    Args:
        prefix (str): Leading part of the file name.
        region (str): Region tag placed after the prefix.
        date_from (str): ISO 8601 start datetime; any "Z" is stripped before parsing.
        date_to (str): ISO 8601 end datetime; any "Z" is stripped before parsing.
        ext (str): File extension without the dot.

    Returns:
        str: The path, with both dates rendered as YYYYMMDD.
    """
    start, end = (
        datetime.fromisoformat(stamp.replace("Z", "")).strftime("%Y%m%d")
        for stamp in (date_from, date_to)
    )
    return f"../data/{prefix}_{region}_{start}_to_{end}.{ext}"

# function to fetch hourly data with pagination for a single sensor
def fetch_sensor_hourly_data(sensor_id, headers, date_from=None, date_to=None, max_pages=100,
                             timeout=30, max_retries=3):
    """
    Fetch hourly aggregated data for a single sensor from OpenAQ API, paginated.

    Rate-limited responses (HTTP 429) and transport errors are retried up to
    ``max_retries`` times per page. Any other non-200 status ends pagination,
    and the records collected so far are still returned.

    Args:
        sensor_id (int): The sensor ID to query.
        headers (dict): HTTP headers including the API key.
        date_from (str): ISO8601 datetime string to filter start date (optional).
        date_to (str): ISO8601 datetime string to filter end date (optional).
        max_pages (int): Max pages to fetch to avoid infinite loops.
        timeout (float): Per-request timeout in seconds, so a stalled connection cannot hang the pull.
        max_retries (int): Extra attempts per page after a 429 or a transport error.

    Returns:
        pd.DataFrame: DataFrame containing hourly data for this sensor
        (empty when nothing could be fetched).
    """
    base_url = f"https://api.openaq.org/v3/sensors/{sensor_id}/hours"
    all_records = []

    for page in range(1, max_pages + 1):
        params = {
            "limit": 1000,
            "page": page,
        }
        if date_from:
            params["datetime_from"] = date_from
        if date_to:
            params["datetime_to"] = date_to

        response = _get_with_retry(base_url, headers, params, timeout, max_retries)
        if response is None:
            print(f"Warning: sensor {sensor_id} page {page} got no response after {max_retries} retries")
            break
        if response.status_code != 200:
            print(f"Warning: sensor {sensor_id} page {page} returned status {response.status_code}")
            break

        data = response.json()
        results = data.get("results", [])
        if not results:
            break  # No more data

        for entry in results:
            # `or {}` also covers keys that are present but null in the JSON payload.
            parameter = entry.get("parameter") or {}
            period = entry.get("period") or {}
            period_from = period.get("datetimeFrom") or {}
            period_to = period.get("datetimeTo") or {}
            coverage = entry.get("coverage") or {}
            all_records.append({
                "sensor_id": sensor_id,
                "value": entry.get("value"),
                "units": parameter.get("units"),
                "pollutant": parameter.get("name"),
                "datetime_from_utc": period_from.get("utc"),
                "datetime_from_local": period_from.get("local"),
                "datetime_to_utc": period_to.get("utc"),
                "datetime_to_local": period_to.get("local"),
                "coverage_pct": coverage.get("percentCoverage")
            })

        # Optional: Track API quota if headers have rate-limit info
        rate_limit = response.headers.get("X-RateLimit-Limit")
        remaining = response.headers.get("X-RateLimit-Remaining")
        reset = response.headers.get("X-RateLimit-Reset")
        print(f"Sensor {sensor_id} page {page}: fetched {len(results)} records; API quota {remaining}/{rate_limit} reset in {reset}s")

        sleep(1.2)  # throttle requests to avoid rate limits

    return pd.DataFrame(all_records)


def _get_with_retry(url, headers, params, timeout, max_retries):
    """GET `url`, retrying on HTTP 429 and transport errors; return the last response, or None if the last attempt raised."""
    response = None
    for attempt in range(max_retries + 1):
        try:
            response = requests.get(url, headers=headers, params=params, timeout=timeout)
        except requests.exceptions.RequestException as exc:
            print(f"Warning: request to {url} failed ({exc})")
            response = None
            wait = 5.0  # short fixed back-off for transport errors
        else:
            if response.status_code != 429:
                return response
            # X-RateLimit-Reset is treated as seconds until the quota window resets
            # (the recorded run shows it counting down from 60).
            try:
                wait = float(response.headers.get("X-RateLimit-Reset")) + 1.0
            except (TypeError, ValueError):
                wait = 60.0
            # Clamp so a missing or unexpected header value cannot stall the pull.
            wait = min(max(wait, 1.0), 120.0)

        if attempt < max_retries:
            print(f"Retrying in {wait:.0f}s (attempt {attempt + 1}/{max_retries})")
            sleep(wait)

    return response

# wrapper to loop over all sensors, with throttling, progress tracking and optional intermediate saving
def fetch_multiple_sensors_hourly_data(sensor_ids, headers, date_from=None, date_to=None, save_path=None, save_interval=10):
    """
    Fetch hourly data for multiple sensors, accumulating into one DataFrame.

    Per-sensor frames are collected in a list and concatenated only when a
    save is due and once at the end, instead of re-copying the whole
    accumulated frame after every sensor.

    Args:
        sensor_ids (list): List of sensor IDs.
        headers (dict): HTTP headers including API key.
        date_from (str): ISO8601 start date (optional).
        date_to (str): ISO8601 end date (optional).
        save_path (str): File path to save intermediate CSV files (optional).
            Its parent directory is created if it does not exist.
        save_interval (int): How often to save intermediate results (every N sensors).
            A falsy value (0 or None) disables intermediate saves.

    Returns:
        pd.DataFrame: Combined DataFrame for all sensors (empty if nothing was fetched).
    """
    sensor_ids = list(sensor_ids)  # len() is needed for the progress message

    # Create the output directory up front so the first save cannot fail
    # after a long pull and discard the data held in memory.
    if save_path:
        out_dir = os.path.dirname(save_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

    frames = []

    def _combined():
        # pd.concat raises on an empty list, so fall back to an empty frame.
        return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    for i, sensor_id in enumerate(sensor_ids):
        print(f"\nFetching data for sensor {i+1}/{len(sensor_ids)}: ID {sensor_id}")
        sensor_df = fetch_sensor_hourly_data(sensor_id, headers, date_from, date_to)
        if not sensor_df.empty:  # empty frames contribute no rows
            frames.append(sensor_df)

        if save_path and save_interval and (i + 1) % save_interval == 0:
            _combined().to_csv(save_path, index=False, encoding="utf-8")
            print(f"Intermediate data saved to {save_path} after {i+1} sensors.")

    combined_df = _combined()

    # Final save after all sensors
    if save_path:
        combined_df.to_csv(save_path, index=False, encoding="utf-8")
        print(f"Final data saved to {save_path}")

    return combined_df

In [None]:
# Define parameters

# IDs of the PM2.5 sensors collected for the California locations
sensor_ids = [entry["sensor_id"] for entry in ca_sensors]

# Start and end of the window to pull (ISO 8601, UTC)
date_from = "2016-01-01T00:00:00Z"
date_to = "2025-07-31T23:59:59Z"

# Output CSV path, derived from the region tag and the date window
save_path = build_save_path(
    prefix="openaq_hourly",
    region="ca",
    date_from=date_from,
    date_to=date_to,
)

../data/openaq_hourly_ca_20160101_to_20250731.csv


In [22]:
# Execute data pull

# Run the pull for every sensor; the combined frame is written to `save_path`
# after every 10th sensor and once more when the loop finishes.
df_all = fetch_multiple_sensors_hourly_data(
    sensor_ids=sensor_ids,
    headers=headers,
    date_from=date_from,
    date_to=date_to,
    save_path=save_path,
    save_interval=10,
)


Fetching data for sensor 1/59: ID 350
Sensor 350 page 1: fetched 64 records; API quota 59/60 reset in 60s

Fetching data for sensor 2/59: ID 354
Sensor 354 page 1: fetched 1000 records; API quota 57/60 reset in 58s
Sensor 354 page 2: fetched 1000 records; API quota 56/60 reset in 55s
Sensor 354 page 3: fetched 1000 records; API quota 55/60 reset in 52s
Sensor 354 page 4: fetched 1000 records; API quota 54/60 reset in 50s
Sensor 354 page 5: fetched 1000 records; API quota 53/60 reset in 46s
Sensor 354 page 6: fetched 1000 records; API quota 52/60 reset in 42s
Sensor 354 page 7: fetched 1000 records; API quota 51/60 reset in 38s
Sensor 354 page 8: fetched 1000 records; API quota 50/60 reset in 33s
Sensor 354 page 9: fetched 1000 records; API quota 59/60 reset in 60s
Sensor 354 page 10: fetched 1000 records; API quota 58/60 reset in 55s
Sensor 354 page 11: fetched 1000 records; API quota 57/60 reset in 49s
Sensor 354 page 12: fetched 1000 records; API quota 56/60 reset in 43s
Sensor 354 

In [23]:
# Sanity check: (rows, columns) of the combined DataFrame returned by the pull.
print(df_all.shape)

(851736, 9)
