# 🧪 Exploratory Development Workspace: OpenAQ API

This notebook is our scratchpad for pulling air quality measurements from the OpenAQ API, cleaning and transforming the data, conducting exploratory data analysis and modeling, and finally visualizing the data and extracting narrative insights.

- test API endpoints
- print out JSON blobs
- try different params and debug errors
- Rough out logic for looping, pagination, etc.
- clean and transform data
- EDA
- modeling
- produce visualizations and insights

In [3]:
# Imports and Environment Setup
 
# Standard libraries
import os
import sys
import json
import requests
import pandas as pd

# Add parent directory to sys.path
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
if project_root not in sys.path:
    sys.path.append(project_root)

# Import API keys securely from config file, which loads them from the .env file
from scripts.config import OPENAQ_API_KEY, GITHUB_PAT

# define headers for API call
headers = {
    "accept": "application/json",
    "X-API-Key": OPENAQ_API_KEY
}

In [None]:
# make a simple API Call; quick verification
url = "https://api.openaq.org/v3/countries"

response = requests.get(url, headers=headers)
response.raise_for_status()  # raises error if something went wrong

data = response.json()
print(data['results'][:3])  # print first 3 results

[{'id': 1, 'code': 'ID', 'name': 'Indonesia', 'datetimeFirst': '2016-01-30T01:00:00Z', 'datetimeLast': '2025-07-29T21:00:54.579000Z', 'parameters': [{'id': 1, 'name': 'pm10', 'units': 'µg/m³', 'displayName': None}, {'id': 2, 'name': 'pm25', 'units': 'µg/m³', 'displayName': None}, {'id': 3, 'name': 'o3', 'units': 'µg/m³', 'displayName': None}, {'id': 10, 'name': 'o3', 'units': 'ppm', 'displayName': None}, {'id': 11, 'name': 'bc', 'units': 'µg/m³', 'displayName': None}, {'id': 15, 'name': 'no2', 'units': 'ppb', 'displayName': None}, {'id': 19, 'name': 'pm1', 'units': 'µg/m³', 'displayName': None}, {'id': 21, 'name': 'co2', 'units': 'ppm', 'displayName': None}, {'id': 24, 'name': 'no', 'units': 'ppb', 'displayName': None}, {'id': 98, 'name': 'relativehumidity', 'units': '%', 'displayName': None}, {'id': 100, 'name': 'temperature', 'units': 'c', 'displayName': None}, {'id': 125, 'name': 'um003', 'units': 'particles/cm³', 'displayName': None}, {'id': 128, 'name': 'temperature', 'units': 'f'

In [None]:
# call the /parameters endpoint to list all available parameters

url_parameters = "https://api.openaq.org/v3/parameters"
response = requests.get(url_parameters, headers=headers)
response.raise_for_status()
parameters_data = response.json()
print(parameters_data['results'])

[{'id': 1, 'name': 'pm10', 'units': 'µg/m³', 'displayName': 'PM10', 'description': 'Particulate matter less than 10 micrometers in diameter mass concentration'}, {'id': 2, 'name': 'pm25', 'units': 'µg/m³', 'displayName': 'PM2.5', 'description': 'Particulate matter less than 2.5 micrometers in diameter mass concentration'}, {'id': 3, 'name': 'o3', 'units': 'µg/m³', 'displayName': 'O₃ mass', 'description': 'Ozone mass concentration'}, {'id': 4, 'name': 'co', 'units': 'µg/m³', 'displayName': 'CO mass', 'description': 'Carbon Monoxide mass concentration'}, {'id': 5, 'name': 'no2', 'units': 'µg/m³', 'displayName': 'NO₂ mass', 'description': 'Nitrogen Dioxide mass concentration'}, {'id': 6, 'name': 'so2', 'units': 'µg/m³', 'displayName': 'SO₂ mass', 'description': 'Sulfur Dioxide mass concentration'}, {'id': 7, 'name': 'no2', 'units': 'ppm', 'displayName': 'NO₂', 'description': 'Nitrogen Dioxide concentration'}, {'id': 8, 'name': 'co', 'units': 'ppm', 'displayName': 'CO', 'description': 'Car

In [5]:
# reshape parameters data
params_list = parameters_data['results']

# Convert to DataFrame
df_params = pd.DataFrame(params_list)

# Select and reorder columns for better readability
columns_to_show = ['id', 'name', 'displayName', 'units', 'description']
df_params = df_params[columns_to_show]

# Display the DataFrame nicely
pd.set_option('display.max_colwidth', 100)  # to avoid truncation of descriptions
print(df_params)

       id              name           displayName          units  \
0       1              pm10                  PM10          µg/m³   
1       2              pm25                 PM2.5          µg/m³   
2       3                o3               O₃ mass          µg/m³   
3       4                co               CO mass          µg/m³   
4       5               no2              NO₂ mass          µg/m³   
5       6               so2              SO₂ mass          µg/m³   
6       7               no2                   NO₂            ppm   
7       8                co                    CO            ppm   
8       9               so2                   SO₂            ppm   
9      10                o3                    O₃            ppm   
10     11                bc                    BC          µg/m³   
11     15               no2                   NO₂            ppb   
12     19               pm1                   PM1          µg/m³   
13     21               co2                   CO

# Next: Identify active PM2.5 sensors in California

We’ll do this by querying the /v3/locations endpoint, filtering for:
-  country=US
- parameters=pm25
- isActive=true
- Bounding box for California
- Optionally sort by recency or number of measurements

In [None]:
# define bounding box for California
bbox_ca = {
    "coordinates": [-124.48, 32.53, -114.13, 42.01]  # [west, south, east, north]
}

In [None]:
# Minimal call on /locations endpoint
url_locations = "https://api.openaq.org/v3/locations"
response = requests.get(url_locations, headers=headers)
response.raise_for_status()
locations_data = response.json()
print(locations_data['results'])

[{'id': 3, 'name': 'NMA - Nima', 'locality': None, 'timezone': 'Africa/Accra', 'country': {'id': 152, 'code': 'GH', 'name': 'Ghana'}, 'owner': {'id': 4, 'name': 'Unknown Governmental Organization'}, 'provider': {'id': 209, 'name': 'Dr. Raphael E. Arku and Colleagues'}, 'isMobile': False, 'isMonitor': True, 'instruments': [{'id': 2, 'name': 'Government Monitor'}], 'sensors': [{'id': 6, 'name': 'pm10 µg/m³', 'parameter': {'id': 1, 'name': 'pm10', 'units': 'µg/m³', 'displayName': 'PM10'}}, {'id': 5, 'name': 'pm25 µg/m³', 'parameter': {'id': 2, 'name': 'pm25', 'units': 'µg/m³', 'displayName': 'PM2.5'}}], 'coordinates': {'latitude': 5.58389, 'longitude': -0.19968}, 'licenses': None, 'bounds': [-0.19968, 5.58389, -0.19968, 5.58389], 'distance': None, 'datetimeFirst': None, 'datetimeLast': None}, {'id': 4, 'name': 'NMT - Nima', 'locality': None, 'timezone': 'Africa/Accra', 'country': {'id': 152, 'code': 'GH', 'name': 'Ghana'}, 'owner': {'id': 4, 'name': 'Unknown Governmental Organization'}, '

In [None]:
# Minimal call on /locations endpoint, including country code "US"

params_us = {
    "country": "US",
    "parameters": "pm25",
    "limit": 500  # or higher
}

# Define bounding box for California
def in_california(lat, lon):
    return (
        32.53 <= lat <= 42.01 and # CA latitudes
        -124.48 <= lon <= -114.13 # CA longitudes
    )

# API call
response = requests.get(url_locations, headers=headers, params=params_us)
response.raise_for_status()
locations_data = response.json()

# Count how many fall in CA
ca_count = 0
for loc in locations_data["results"]:
    coords = loc.get("coordinates", {})
    lat = coords.get("latitude")
    lon = coords.get("longitude")
    if lat and lon and in_california(lat, lon):
        ca_count += 1

print(f"Found {ca_count} PM2.5-monitoring locations in California.")

Found 15 PM2.5-monitoring locations in California.


In [None]:
# Collect CA sensor IDs

ca_sensor_ids = []

for loc in locations_data["results"]:
    coords = loc.get("coordinates", {})
    lat = coords.get("latitude")
    lon = coords.get("longitude")

    if lat and lon and in_california(lat, lon):
        sensors = loc.get("sensors", [])
        for sensor in sensors:
            param = sensor.get("parameter", {})
            if param.get("name") == "pm25":
                ca_sensor_ids.append(sensor["id"])

print(f"Found {len(ca_sensor_ids)} PM2.5 sensors in CA.")
print(ca_sensor_ids)

Found 10 PM2.5 sensors in CA.
[5, 8, 9, 12, 13, 15, 18, 19, 21, 62]


In [None]:
# Query measurements for a sensor

sensor_id = ca_sensor_ids[0]
url_sensor = f"https://api.openaq.org/v3/sensors/{sensor_id}/measurements"
params = {"limit": 100}
response = requests.get(url, headers=headers, params=params)
response.raise_for_status()
data = response.json()
display(data)
# unfortunately, there appears to be no data for any of the sensors we just saw in CA!

{'meta': {'name': 'openaq-api',
  'website': '/',
  'page': 1,
  'limit': 100,
  'found': 0},
 'results': []}

In [None]:
# Pull all locations in the US that monitor PM2.5
url_locations = "https://api.openaq.org/v3/locations"
params = {
    "country": "US",
    "parameters": "pm25",
    "limit": 1000  # paginate later if needed
}
response = requests.get(url_locations, headers=headers, params=params)
response.raise_for_status()
locations = response.json()["results"]

In [56]:
# Filter to locations inside California
def in_california(lat, lon):
    # Rough bounding box for CA
    return 32.5 <= lat <= 42.0 and -124.5 <= lon <= -114.0

ca_sensors = []

for loc in locations:
    coords = loc.get("coordinates", {})
    lat = coords.get("latitude")
    lon = coords.get("longitude")
    if lat and lon and in_california(lat, lon):
        # print('CA HIT')
        for sensor in loc.get("sensors", []):
            if sensor.get("parameter", {}).get("name") == "pm25":
                # print('PM25 HIT')
                ca_sensors.append({
                    "sensor_id": sensor["id"],
                    "location_id": loc["id"],
                    "location_name": loc["name"],
                    "lat": lat,
                    "lon": lon,
                    "datetimeLast": loc.get("datetimeLast")
                })

display(ca_sensors)

[{'sensor_id': 350,
  'location_id': 207,
  'location_name': 'MMFRA1001',
  'lat': 39.482481,
  'lon': -121.221235,
  'datetimeLast': {'utc': '2016-03-16T22:00:00Z',
   'local': '2016-03-16T15:00:00-07:00'}},
 {'sensor_id': 354,
  'location_id': 211,
  'location_name': 'Felton Cal-Fire',
  'lat': 37.0481,
  'lon': -122.074603,
  'datetimeLast': {'utc': '2022-04-08T18:00:00Z',
   'local': '2022-04-08T11:00:00-07:00'}},
 {'sensor_id': 357,
  'location_id': 214,
  'location_name': 'MMFRA1001',
  'lat': 39.482385,
  'lon': -121.221128,
  'datetimeLast': {'utc': '2016-03-16T05:00:00Z',
   'local': '2016-03-15T22:00:00-07:00'}},
 {'sensor_id': 401,
  'location_id': 237,
  'location_name': 'San Ysidro',
  'lat': 32.543475,
  'lon': -117.029028,
  'datetimeLast': {'utc': '2016-03-22T15:00:00Z',
   'local': '2016-03-22T08:00:00-07:00'}},
 {'sensor_id': 25272,
  'location_id': 276,
  'location_name': 'Morro Bay',
  'lat': 35.36639,
  'lon': -120.8426,
  'datetimeLast': {'utc': '2023-06-29T16:00:

In [53]:
# Try querying a recent sensor

sensor_id = ca_sensors[0]["sensor_id"]  # or loop
url = f"https://api.openaq.org/v3/sensors/{sensor_id}/hours"
params = {"limit": 100}
response = requests.get(url, headers=headers, params=params)
data = response.json()
display(data)


{'meta': {'name': 'openaq-api',
  'website': '/',
  'page': 1,
  'limit': 100,
  'found': 64},
 'results': [{'value': 9.0,
   'flagInfo': {'hasFlags': False},
   'parameter': {'id': 2,
    'name': 'pm25',
    'units': 'µg/m³',
    'displayName': None},
   'period': {'label': '1hour',
    'interval': '01:00:00',
    'datetimeFrom': {'utc': '2016-03-06T19:00:00Z',
     'local': '2016-03-06T11:00:00-08:00'},
    'datetimeTo': {'utc': '2016-03-06T20:00:00Z',
     'local': '2016-03-06T12:00:00-08:00'}},
   'coordinates': None,
   'summary': {'min': 9.0,
    'q02': 9.0,
    'q25': 9.0,
    'median': 9.0,
    'q75': 9.0,
    'q98': 9.0,
    'max': 9.0,
    'avg': 9.0,
    'sd': None},
   'coverage': {'expectedCount': 1,
    'expectedInterval': '01:00:00',
    'observedCount': 1,
    'observedInterval': '01:00:00',
    'percentComplete': 100.0,
    'percentCoverage': 100.0,
    'datetimeFrom': {'utc': '2016-03-06T19:00:00Z',
     'local': '2016-03-06T11:00:00-08:00'},
    'datetimeTo': {'utc':

In [None]:
# Query the average (mean) value for each hour, by sensor

url_hours = f"https://api.openaq.org/v3/sensors/{sensor_id}/hours"
params = {
    "limit": 100,              # max number of records
    "page": 1,                 # pagination
    "date_from": "2024-12-01", # optional
    "date_to": "2025-01-01",   # optional
}
response = requests.get(url_hours, headers=headers, params=params)
response.raise_for_status()
hourly_data = response.json()
display(hourly_data["results"])
display(hourly_data)

[{'value': 9.0,
  'flagInfo': {'hasFlags': False},
  'parameter': {'id': 2,
   'name': 'pm25',
   'units': 'µg/m³',
   'displayName': None},
  'period': {'label': '1hour',
   'interval': '01:00:00',
   'datetimeFrom': {'utc': '2016-03-06T19:00:00Z',
    'local': '2016-03-06T11:00:00-08:00'},
   'datetimeTo': {'utc': '2016-03-06T20:00:00Z',
    'local': '2016-03-06T12:00:00-08:00'}},
  'coordinates': None,
  'summary': {'min': 9.0,
   'q02': 9.0,
   'q25': 9.0,
   'median': 9.0,
   'q75': 9.0,
   'q98': 9.0,
   'max': 9.0,
   'avg': 9.0,
   'sd': None},
  'coverage': {'expectedCount': 1,
   'expectedInterval': '01:00:00',
   'observedCount': 1,
   'observedInterval': '01:00:00',
   'percentComplete': 100.0,
   'percentCoverage': 100.0,
   'datetimeFrom': {'utc': '2016-03-06T19:00:00Z',
    'local': '2016-03-06T11:00:00-08:00'},
   'datetimeTo': {'utc': '2016-03-06T20:00:00Z',
    'local': '2016-03-06T12:00:00-08:00'}}},
 {'value': 0.0,
  'flagInfo': {'hasFlags': False},
  'parameter': {

{'meta': {'name': 'openaq-api',
  'website': '/',
  'page': 1,
  'limit': 100,
  'found': 64},
 'results': [{'value': 9.0,
   'flagInfo': {'hasFlags': False},
   'parameter': {'id': 2,
    'name': 'pm25',
    'units': 'µg/m³',
    'displayName': None},
   'period': {'label': '1hour',
    'interval': '01:00:00',
    'datetimeFrom': {'utc': '2016-03-06T19:00:00Z',
     'local': '2016-03-06T11:00:00-08:00'},
    'datetimeTo': {'utc': '2016-03-06T20:00:00Z',
     'local': '2016-03-06T12:00:00-08:00'}},
   'coordinates': None,
   'summary': {'min': 9.0,
    'q02': 9.0,
    'q25': 9.0,
    'median': 9.0,
    'q75': 9.0,
    'q98': 9.0,
    'max': 9.0,
    'avg': 9.0,
    'sd': None},
   'coverage': {'expectedCount': 1,
    'expectedInterval': '01:00:00',
    'observedCount': 1,
    'observedInterval': '01:00:00',
    'percentComplete': 100.0,
    'percentCoverage': 100.0,
    'datetimeFrom': {'utc': '2016-03-06T19:00:00Z',
     'local': '2016-03-06T11:00:00-08:00'},
    'datetimeTo': {'utc':

In [None]:
# let's collect the hourly average (mean) value for the sensors we found in CA
