The sensor locations were acquired from OpenAQ locally, and the resulting CSV was imported into Databricks using the following code:

In [0]:
pip install openaq, python-dotenv

In [0]:
from openaq import OpenAQ
import os
from dotenv import load_dotenv

# Load variables from a .env file (if one exists) into the environment before
# reading the key; importing load_dotenv without calling it has no effect.
load_dotenv()
api_key = os.getenv('OPENAQ_API_KEY')

client = OpenAQ(api_key=api_key)

try:
    # Smoke test: request a very small sample of US locations so the structure
    # of a location object can be inspected before writing the full export.
    response = client.locations.list(iso="US", limit=2)  # only 2 results are requested
    locations = response.results  # Get the list of locations

    # Print each location object to inspect its fields
    for location in locations:
        print(location)

finally:
    # Ensure the client is closed even if the request fails
    client.close()

In [0]:
# Restart the Python process via the Databricks utility so newly installed libraries
# can be imported. NOTE(review): presumably meant to run right after the install
# cell — confirm the intended cell order.
dbutils.library.restartPython()

In [0]:
from openaq import OpenAQ
import csv
from math import ceil
import os
from dotenv import load_dotenv

# Load variables from a .env file (if one exists) into the environment before
# reading the key; importing load_dotenv without calling it has no effect.
load_dotenv()
api_key = os.getenv('OPENAQ_API_KEY')

client = OpenAQ(api_key=api_key)

output_file = "/dbfs/FileStore/us_sensor_locations.csv"

# CSV header: one column per flattened location field.
fieldnames = [
    "location_id", "name", "locality", "timezone",
    "country_id", "country_code", "country_name",
    "owner_id", "owner_name",
    "provider_id", "provider_name",
    "is_mobile", "is_monitor",
    "instruments", "sensors",
    "latitude", "longitude", "bounds",
    "distance", "datetime_first", "datetime_last"
]


def _field(obj, name, default=None):
    """Read ``name`` from a dict (by key) or any other object (by attribute).

    Returns ``default`` when ``obj`` is None or does not carry the field.
    """
    if obj is None:
        return default
    if isinstance(obj, dict):
        return obj.get(name, default)
    return getattr(obj, name, default)


def _instruments_summary(instruments):
    """Join instrument names with "; "; empty string when there are none."""
    if not instruments:
        return ""
    return "; ".join(inst.name for inst in instruments)


def _sensors_summary(sensors):
    """Render sensors as "id:name (parameter name)" items joined with "; "."""
    if not sensors:
        return ""
    items = []
    for sensor in sensors:
        # A missing or None parameter yields an empty name instead of raising.
        param_name = _field(_field(sensor, "parameter"), "name", "")
        items.append(f"{sensor.id}:{sensor.name} ({param_name})")
    return "; ".join(items)


def _location_row(location):
    """Flatten one location object into a dict keyed by ``fieldnames``."""
    country = _field(location, "country")
    owner = _field(location, "owner")
    provider = _field(location, "provider")
    return {
        "location_id": location.id,
        "name": location.name,
        "locality": location.locality,
        "timezone": location.timezone,
        "country_id": _field(country, "id"),
        "country_code": _field(country, "code"),
        "country_name": _field(country, "name"),
        "owner_id": _field(owner, "id"),
        "owner_name": _field(owner, "name"),
        "provider_id": _field(provider, "id"),
        "provider_name": _field(provider, "name"),
        "is_mobile": location.is_mobile,
        "is_monitor": location.is_monitor,
        "instruments": _instruments_summary(location.instruments),
        "sensors": _sensors_summary(location.sensors),
        "latitude": location.coordinates.latitude,
        "longitude": location.coordinates.longitude,
        "bounds": ", ".join(str(b) for b in location.bounds) if location.bounds else "",
        "distance": location.distance,
        # Written using the objects' default string form; a later cell in this
        # notebook pulls the UTC value out of that text with a regex.
        "datetime_first": location.datetime_first,
        "datetime_last": location.datetime_last,
    }


try:
    with open(output_file, "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        # Walk the result pages until the API returns an empty page.
        page = 1
        limit = 1000

        while True:
            print(f"Fetching page {page}...")
            response = client.locations.list(iso="US", limit=limit, page=page)
            locations = response.results

            if not locations:
                break

            for location in locations:
                # Locations without coordinates are skipped.
                if not location.coordinates:
                    continue
                writer.writerow(_location_row(location))
            page += 1

    print(f"Data saved to {output_file}")

finally:
    # Close the client even if a request or the file write fails.
    client.close()

In [0]:
# Read the locations CSV into Spark: the first row is the header and column
# types are inferred from the data.
# NOTE(review): this path differs from the export cell's output_file
# ("/dbfs/FileStore/us_sensor_locations.csv") — confirm this is the intended file.
sensor_locations_df = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv("dbfs:/FileStore/AirQuality/us_all_location_data.csv")
)

In [0]:
# Preview the loaded sensor locations.
display(sensor_locations_df)

In [0]:
# Sanity check: are all coordinates inside a bounding box around the U.S.
# (including Alaska and Hawaii)?
min_lat, max_lat = 18.0, 71.538800
min_lon, max_lon = -179.148909, -66.93457

# Column.between is inclusive on both ends, i.e. the same as a >= / <= pair.
lat_in_box = sensor_locations_df["latitude"].between(min_lat, max_lat)
lon_in_box = sensor_locations_df["longitude"].between(min_lon, max_lon)
valid_locations_df = sensor_locations_df.where(lat_in_box & lon_in_box)

# True only when no row was removed by the bounding-box filter.
valid_locations_df.count() == sensor_locations_df.count()

In [0]:
# Tighter bounding box that leaves out Hawaii and Alaska.
min_lat, max_lat =  24.396308, 49.384358
min_lon, max_lon = -125.0, -66.93457

# Column.between is inclusive on both ends, i.e. the same as a >= / <= pair.
inside_contiguous_box = (
    sensor_locations_df["latitude"].between(min_lat, max_lat)
    & sensor_locations_df["longitude"].between(min_lon, max_lon)
)
valid_locations_c_df = sensor_locations_df.where(inside_contiguous_box)

# True only when every location lies inside the tighter box.
valid_locations_c_df.count() == sensor_locations_df.count()

This verifies that there are sensors outside the contiguous US (the bounding box above excludes Alaska and Hawaii), which is a good thing!

Now check for duplicates and dedupe.


In [0]:
# Count how often each fully identical row occurs, keep only the rows that
# occur more than once, and print them.
duplicates_df = (
    sensor_locations_df
    .groupBy(sensor_locations_df.columns)
    .count()
    .where("count > 1")
)

duplicates_df.show()

In [0]:
# Number of distinct rows that appear more than once in the raw locations data.
repeated_rows = (
    sensor_locations_df
    .groupBy(sensor_locations_df.columns)
    .count()
    .filter("count > 1")
)
duplicate_count = repeated_rows.count()
print(f"Number of duplicate rows: {duplicate_count}")

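Drop the columns that won't be used, especially those that don't need to be flattened. The full list of columns is below; only `location_id`, `sensors`, `latitude`, `longitude`, and `datetime_last` are kept: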

location_id	
name	
locality	
timezone	
country_id	
country_code	
country_name	
owner_id	
owner_name	
provider_id	
provider_name	
is_mobile	
is_monitor	
instruments	
sensors	
latitude	
longitude	
bounds	
distance	
datetime_first	
datetime_last


In [0]:
# Keep only the columns needed downstream; everything else is dropped.
kept_columns = ["location_id", "sensors", "latitude", "longitude", "datetime_last"]
sensor_locations_df = sensor_locations_df.select(*kept_columns)

Flatten the `sensors` and `datetime_last` columns.

In [0]:
from pyspark.sql.functions import regexp_extract, col, to_date, lit

# datetime_last holds both UTC and local time in one string; the regex pulls out
# the utc='...' value and to_date keeps only the date part.
# The type guard makes this cell safe to re-run: the DataFrame is overwritten in
# place, and applying the regex to the already-converted date column would find
# no match and lose every date.
if dict(sensor_locations_df.dtypes)["datetime_last"] == "string":
    sensor_locations_df = sensor_locations_df.withColumn(
        "datetime_last",
        to_date(regexp_extract("datetime_last", r"utc='(.*?)'", 1)),
    )

In [0]:
# Preview the locations after the datetime_last conversion.
display(sensor_locations_df)

In [0]:
from pyspark.sql.functions import split, explode, trim, regexp_extract

# Input: sensor_locations_df with location_id, sensors, latitude, longitude, datetime_last.
# The sensors string is split on ";" and exploded so each sensor gets its own row;
# the individual fields are then pulled out of each trimmed item with regexes:
#   - sensor_id:       the digits before the colon
#   - parameter_name:  the first space-delimited token after the colon
#   - parameter_units: the token that follows parameter_name
df = (
    sensor_locations_df
    .withColumn("sensor_list", split("sensors", ";"))
    .withColumn("sensor_item", explode("sensor_list"))
    .withColumn("sensor_item", trim("sensor_item"))
    .withColumn("sensor_id", regexp_extract("sensor_item", r"^\s*(\d+):", 1))
    .withColumn("parameter_name", regexp_extract("sensor_item", r":\s*([^ ]+)", 1))
    .withColumn("parameter_units", regexp_extract("sensor_item", r":\s*[^ ]+\s+([^ ]+)", 1))
)

# Final shape: the location columns (minus the raw sensors string) plus the
# per-sensor fields.
result_df = df.select(
    "location_id",
    "latitude",
    "longitude",
    "datetime_last",
    "sensor_id",
    "parameter_name",
    "parameter_units",
)

display(result_df)

At this point, we need to filter out any sensors with a null `datetime_last`, as well as any whose `datetime_last` is not in 2025: they are no longer active sensors and shouldn't be used for the nearest-neighbor calculations.

In [0]:
from pyspark.sql.functions import col, to_date, lit

# How many sensor rows have no datetime_last at all?
missing_last_reading = col("datetime_last").isNull()
result_df.where(missing_last_reading).count()

There are null values, which indicates that there is no recorded data from those sensors at all.
Note that the `datetime_last` column is the last date that the sensor was active. It originally recorded both UTC and local time in the same string. The FIRMS data only works in UTC, so only the UTC value was kept (extracted with a regex in the earlier cell), and only the date, not the time, is used for filtering.

In [0]:
# Keep only sensors that are still active for the date range needed:
# datetime_last must be present and on or after 2025-01-01.
has_last_reading = col("datetime_last").isNotNull()
active_since_cutoff = col("datetime_last") >= to_date(lit("2025-01-01"))

filtered_sensors = result_df.where(has_last_reading & active_since_cutoff)

In [0]:
# Preview the sensors that passed the datetime_last filter.
display(filtered_sensors)

Verify that there are still no duplicates.

In [0]:
# Re-run the duplicate check on the filtered, per-sensor rows.
repeated_sensor_rows = (
    filtered_sensors
    .groupBy(filtered_sensors.columns)
    .count()
    .filter("count > 1")
)
duplicate_count = repeated_sensor_rows.count()
print(f"Number of duplicate rows: {duplicate_count}")

In [0]:
from pyspark.sql import functions as F

# One-row summary: each column holds the number of nulls found in that column.
# The loop variable is deliberately not named `col`, to avoid confusion with
# the col() function imported in earlier cells.
null_counts = filtered_sensors.select(
    [
        F.sum(F.when(F.col(column_name).isNull(), 1).otherwise(0)).alias(column_name)
        for column_name in filtered_sensors.columns
    ]
)

null_counts.show()

In [0]:
# Write the filtered sensors as a Delta table at an explicit DBFS path and
# register it under the name "filtered_sensors", overwriting any existing data.
(
    filtered_sensors.write
    .format("delta")
    .mode("overwrite")
    .option("path", "dbfs:/delta/filtered_sensors")
    .saveAsTable("filtered_sensors")
)

In [0]:
%sql
-- Read back every row of the registered filtered_sensors table.
SELECT * 
FROM filtered_sensors