In [0]:
%pip install awscli
# awscli provides the `aws` command that the shell cells in this notebook invoke.


In [0]:
%sh aws configure set region us-east-1
# Stores us-east-1 as the default region in the AWS CLI configuration.

In [0]:
# Restart the Python process after the %pip install above.
# NOTE(review): a restart discards Python variables defined so far, so this
# should stay ahead of every cell that builds state - confirm against the
# Databricks library-utility documentation.
dbutils.library.restartPython()

The backfill below only collected files from the LA region. I also need to collect files from other regions, but I don't need to run a backfill for those right now.

In [0]:
# import os
# import time
# import subprocess
# from pyspark.sql import SparkSession

# # Initialize the Spark session (Databricks automatically creates one)
# spark = SparkSession.builder.getOrCreate()

# # Read the Delta table that contains the location_id column.
# # Adjust the path as needed for your environment.
# delta_df = spark.read.format("delta").load("/Volumes/tabular/dataexpert/freshoats_capstone/la_sensors_delta_table")

# # Collect the location_ids into a Python list (assuming the column name is "location_id")
# location_ids = [row.location_id for row in delta_df.select("location_id").collect()]

# # Settings for the AWS S3 backfill
# year = '2025'
# month = '02'
# base_s3_path = "s3://openaq-data-archive/records/csv.gz/"

# # New folder in your volume where files will be saved
# local_base_dir = "/Workspace/Users/justin.papreck@gmail.com/AirQuality/AirQualityBackfill"

# # Create the new folder if it doesn't already exist
# if not os.path.exists(local_base_dir):
#     os.makedirs(local_base_dir)

# # Pause duration (in seconds) before retrying if an error is encountered
# retry_delay_sec = 60

# for loc_id in location_ids:
#     # Construct the S3 folder path for the specific location_id, year, and month
#     s3_path = f"{base_s3_path}locationid={loc_id}/year={year}/month={month}/"
    
#     # Create a subdirectory for this location within the new folder
#     local_dest = os.path.join(local_base_dir, f"location-{loc_id}")
#     if not os.path.exists(local_dest):
#         os.makedirs(local_dest)
    
#     # Build the AWS CLI command
#     command = f"aws s3 cp --no-sign-request --recursive {s3_path} {local_dest}"
#     print(f"Starting download for location {loc_id}")
#     print(f"Executing: {command}")
    
#     # Retry loop: if the command exits with a nonzero code (error), pause and retry.
#     while True:
#         result = subprocess.run(command, shell=True)
#         if result.returncode == 0:
#             print(f"Download for location {loc_id} completed successfully.\n")
#             break
#         else:
#             print(f"Error encountered for location {loc_id}. Pausing for {retry_delay_sec} seconds before retrying...\n")
#             time.sleep(retry_delay_sec)

Next, open these files and append them to a sensor measurements table, which will serve as the primary table to write to.

In [0]:
import os
import gzip
import shutil

# Decompress every downloaded "*.gz" file under the location folders into a
# single flat "Decompressed" directory, skipping files that were already done.

# Define the base directory where all location folders are stored
base_dir = "/Workspace/Users/justin.papreck@gmail.com/AirQuality/AirQualityBackfill"

# Define the target directory where decompressed CSV files will be saved
# This directory will be: AirQualityBackfill/Decompressed/
target_dir = os.path.join(base_dir, "Decompressed")
os.makedirs(target_dir, exist_ok=True)

# Iterate over each folder in the base directory
for folder in os.listdir(base_dir):
    folder_path = os.path.join(base_dir, folder)

    # Process only folders that represent locations; the "location-" prefix
    # check also skips the target directory itself.
    if not (os.path.isdir(folder_path) and folder.startswith("location-")):
        continue

    for file_name in os.listdir(folder_path):
        if not file_name.endswith(".gz"):
            continue

        gz_file_path = os.path.join(folder_path, file_name)

        # Remove the '.gz' extension to get the CSV file name, then prefix it
        # with the location folder name so names stay unique in the flat
        # target directory.
        csv_file_name = file_name[:-3]
        output_csv_filename = f"{folder}-{csv_file_name}"
        csv_file_path = os.path.join(target_dir, output_csv_filename)

        # Decompress only if the CSV file does not already exist
        if os.path.exists(csv_file_path):
            print(f"CSV file already exists: {csv_file_path}")
            continue

        print(f"Decompressing {gz_file_path} to {csv_file_path} ...")
        try:
            with gzip.open(gz_file_path, 'rb') as f_in, open(csv_file_path, 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)
        except BaseException:
            # A failed or cancelled decompression leaves a truncated CSV
            # behind. Because existing files are skipped above, that partial
            # file would be treated as complete on every later run, so remove
            # it before re-raising. BaseException also covers a cancelled cell
            # (KeyboardInterrupt).
            if os.path.exists(csv_file_path):
                os.remove(csv_file_path)
            raise
        print(f"Decompressed to {csv_file_path}")

In [0]:
import os
from pyspark.sql import SparkSession

# Copy every decompressed CSV from the workspace directory into DBFS.

# Reuse the active Spark session, or create one if none exists.
spark = SparkSession.builder.getOrCreate()

# Source directory holding the decompressed CSV files.
local_decompressed_dir = "/Workspace/Users/justin.papreck@gmail.com/AirQuality/AirQualityBackfill/Decompressed"

# Destination directory in DBFS.
dbfs_target_dir = "dbfs:/FileStore/Decompressed"

# Make sure the destination directory exists before copying into it.
dbutils.fs.mkdirs(dbfs_target_dir)

# Select the CSV files first, then copy them one at a time.
csv_file_names = [
    entry for entry in os.listdir(local_decompressed_dir) if entry.endswith(".csv")
]
for file_name in csv_file_names:
    # The "file:" prefix marks the source as a local-filesystem path.
    source_path = f"file:{local_decompressed_dir}/{file_name}"
    destination_path = f"{dbfs_target_dir}/{file_name}"
    print(f"Copying {source_path} to {destination_path}...")
    dbutils.fs.cp(source_path, destination_path)
    print(f"Copied {file_name} successfully.")

In [0]:
%sql
-- Start from a clean table: a later cell appends to sensor_measurements, so
-- re-running the notebook without dropping it first would duplicate rows.
-- IF EXISTS keeps this cell from failing when the table has not been created
-- yet (first run) or was already dropped.
DROP TABLE IF EXISTS sensor_measurements

In [0]:
# Load every decompressed CSV in the DBFS directory with one wildcard read.
# The header row supplies column names and Spark infers the column types.
df_all = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv("dbfs:/FileStore/Decompressed/*.csv")
)

# Quick sanity check: total row count and a 10-row preview.
print("Total rows read:", df_all.count())
df_all.show(10)

In [0]:
# Render the combined CSV data for visual inspection before it is written.
display(df_all)

In [0]:
# Append the combined CSV rows to the sensor_measurements Delta table.
(
    df_all.write
    .format("delta")
    .mode("append")
    .saveAsTable("sensor_measurements")
)

In [0]:
%sql
-- Inspect the contents of sensor_measurements after the append.
SELECT * 
FROM sensor_measurements

Prepare the Backfill data for the permanent measurement table 

In [0]:
# Load the sensor_measurements table as the source for the permanent table.
backfill_df = spark.read.table("sensor_measurements")

In [0]:
# Render the backfill data for visual inspection before selecting columns.
display(backfill_df)

In [0]:
# Rename "sensors_id" to "sensor_id" and keep only the columns that are
# written to the permanent measurements table.
backfill_filtered = (
    backfill_df
    .withColumnRenamed("sensors_id", "sensor_id")
    .select(
        "location_id",
        "sensor_id",
        "parameter",
        "units",
        "value",
        "datetime",
        "lat",
        "lon",
    )
)
display(backfill_filtered)

# Validate Idempotence before creating Permanent Table

In [0]:
# Validate that the backfill data has no duplicate measurements.
# A measurement is keyed by (location_id, sensor_id, datetime). The column is
# "sensor_id" because backfill_filtered renamed "sensors_id" to "sensor_id";
# grouping by the old name fails because that column no longer exists.
duplicate_key_columns = ["location_id", "sensor_id", "datetime"]
duplicate_check_df = (
    backfill_filtered
    .groupBy(*duplicate_key_columns)
    .count()
    .filter("count > 1")
)

if duplicate_check_df.count() > 0:
    print("Duplicates detected in the enriched data:")
    duplicate_check_df.show()
else:
    print("No duplicates found. Data is idempotent based on the common key and timestamp.")

In [0]:
# Append the selected backfill columns to the permanent Delta table.
# NOTE(review): append mode means re-running this cell adds the same rows
# again - confirm that is acceptable before running it more than once.
(
    backfill_filtered.write
    .format("delta")
    .mode("append")
    .saveAsTable("permanent_sensor_measurements")
)