# Ingestion of Wildfire Data
1. This ingests 3 days of data: the current day and the two days prior. Re-ingesting the earlier days ensures that late-arriving data is still collected, and may also give insight into conditions in the region prior to detection.
2. The data is partitioned by acquisition date and then Z-ordered by longitude.

In [0]:
pip install python-dotenv

In [0]:
# from delta.tables import DeltaTable
# import pandas as pd

# import os
# from dotenv import load_dotenv
# api_key = os.getenv('FIRMS_API_KEY')

# # API details
# api_key = api_key
# dataset = "VIIRS_NOAA20_NRT"  # VIIRS NOAA-20 Near Real-Time data
# country_code = "USA"          # Country code for the United States
# days = 2                      # Number of days of data

# # Construct the API URL
# api_url = f'https://firms.modaps.eosdis.nasa.gov/api/country/csv/{api_key}/{dataset}/{country_code}/{days}'

# # Fetch the new data from the FIRMS API
# new_data_df = pd.read_csv(api_url)

# # Convert the Pandas DataFrame to a Spark DataFrame
# new_spark_df = spark.createDataFrame(new_data_df)

# # Set the table name to be used in the metastore
# table_name = "firms_data"

# # Check if the table already exists using the Spark catalog
# if spark.catalog.tableExists(table_name):
#     # Load the existing Delta table using its table name
#     delta_table = DeltaTable.forName(spark, table_name)

#     # Merge new data with the existing table
#     delta_table.alias("target").merge(
#         new_spark_df.alias("source"),
#         """
#         target.latitude = source.latitude AND
#         target.longitude = source.longitude AND
#         target.acq_date = source.acq_date AND
#         target.acq_time = source.acq_time
#         """
#     ).whenMatchedUpdateAll() \
#      .whenNotMatchedInsertAll() \
#      .execute()

#     # Optimize the table (if your environment supports it)
#     spark.sql(f"OPTIMIZE {table_name} ZORDER BY (longitude)")
# else:
#     # If the table doesn't exist, create it as a managed Delta table with partitioning
#     new_spark_df.write.format("delta") \
#         .partitionBy("acq_date") \
#         .mode("overwrite") \
#         .saveAsTable(table_name)

#     # Optimize after initial write
#     spark.sql(f"OPTIMIZE {table_name} ZORDER BY (longitude)")

# print("Data merged and Delta table optimized.")

In [0]:
from delta.tables import DeltaTable
import pandas as pd
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
import time

import os
from dotenv import load_dotenv
# Load variables from a local .env file (when one exists) into the process
# environment *before* reading the key. Without this call the imported
# `load_dotenv` is never used and a key stored only in .env is never seen.
# With its default settings, load_dotenv() does not overwrite variables that
# are already set in the environment.
load_dotenv()
api_key = os.getenv('FIRMS_API_KEY')  # None when the variable is not set anywhere

# API details
dataset = "VIIRS_NOAA20_NRT"  # VIIRS NOAA-20 Near Real-Time data
country_code = "USA"          # Country code for the United States
days = 3                      # Number of days of data
table_name = "firms_data"     # Delta table name

# Function to fetch data with retry logic
def fetch_data_with_retries(api_url, retries=3, backoff_factor=1, timeout=60):
    """Download ``api_url`` with automatic retries and return the raw body.

    Args:
        api_url: URL to request with HTTP GET.
        retries: Value passed as ``total`` to the urllib3 ``Retry`` policy.
        backoff_factor: Value passed as ``backoff_factor`` to the ``Retry``
            policy (controls the delay between attempts).
        timeout: Timeout, in seconds, passed to ``session.get``.

    Returns:
        The response body as ``bytes`` on success, or ``None`` when the
        request ultimately fails. Failures are printed, not raised.
    """
    retry_policy = Retry(
        total=retries,
        backoff_factor=backoff_factor,
        status_forcelist=[429, 500, 502, 503, 504],  # Retry on these HTTP status codes
    )
    adapter = HTTPAdapter(max_retries=retry_policy)

    # Use the session as a context manager so it is closed (and its pooled
    # connections released) on every path, including the error path.
    with requests.Session() as session:
        session.mount("https://", adapter)
        session.mount("http://", adapter)

        try:
            response = session.get(api_url, timeout=timeout)  # Set a timeout for the request
            response.raise_for_status()  # Raise an exception for HTTP errors
            return response.content
        except requests.exceptions.RequestException as e:
            # NOTE(review): the exception text may contain the request URL;
            # if the URL embeds a secret (e.g. an API key) this print would
            # expose it in the cell output - confirm whether that matters.
            print(f"Error fetching data: {e}")
            return None

# Partition the 1-based day numbers 1..total_days into consecutive ranges
def split_days(total_days, chunk_size):
    """Return a list of ``range`` objects covering days 1..total_days.

    Each range holds at most ``chunk_size`` consecutive day numbers; the
    final range is shorter when ``total_days`` is not a multiple of
    ``chunk_size``.
    """
    end_exclusive = total_days + 1
    chunks = []
    for first_day in range(1, end_exclusive, chunk_size):
        # Clamp the upper bound so the last chunk never runs past total_days.
        upper = min(first_day + chunk_size, end_exclusive)
        chunks.append(range(first_day, upper))
    return chunks

# Columns that identify one detection; used both to de-duplicate the fetched
# rows and to build the MERGE condition, so the two can never drift apart.
_MERGE_KEYS = ("latitude", "longitude", "acq_date", "acq_time")


def _parse_daily_csv(csv_data, day):
    """Parse one raw CSV payload; return a non-empty DataFrame, else None."""
    import io  # local import: keeps this block self-contained

    try:
        # `pd.compat.StringIO` is not available in current pandas releases;
        # the standard-library io.StringIO is the supported equivalent.
        daily_data = pd.read_csv(io.StringIO(csv_data.decode('utf-8')))
    except (UnicodeDecodeError, pd.errors.EmptyDataError, pd.errors.ParserError) as e:
        print(f"Unexpected error for day {day}: {e}")
        return None

    if daily_data.empty:
        # A payload without data rows contributes nothing, and an all-empty
        # result cannot be turned into a Spark DataFrame later on.
        print(f"No rows returned for day {day}")
        return None
    return daily_data


def _upsert_into_delta(new_spark_df):
    """Merge the rows into `table_name` (creating it if absent), then optimize."""
    if spark.catalog.tableExists(table_name):
        merge_condition = " AND ".join(
            f"target.{key} = source.{key}" for key in _MERGE_KEYS
        )
        (
            DeltaTable.forName(spark, table_name)
            .alias("target")
            .merge(new_spark_df.alias("source"), merge_condition)
            .whenMatchedUpdateAll()
            .whenNotMatchedInsertAll()
            .execute()
        )
    else:
        # If the table doesn't exist, create it as a managed Delta table with partitioning
        (
            new_spark_df.write.format("delta")
            .partitionBy("acq_date")
            .mode("overwrite")
            .saveAsTable(table_name)
        )

    # Optimize the table (if your environment supports it)
    spark.sql(f"OPTIMIZE {table_name} ZORDER BY (longitude)")


# Main function to orchestrate data collection and merging
def collect_and_merge_data():
    """Fetch recent FIRMS data and upsert it into the Delta table.

    Requests one CSV per value in 1..`days`, parses each payload, combines
    the results, removes rows that repeat the same merge key, and then merges
    them into `table_name` (or creates the table on first run). Progress and
    failures are reported with `print`; a failed day is skipped rather than
    aborting the run. Returns None.
    """
    # Split the days into smaller chunks (e.g., 1 day per chunk)
    day_chunks = split_days(days, chunk_size=1)

    all_data = []  # To store all the collected data

    for chunk_index, chunk in enumerate(day_chunks):
        if chunk_index:
            # Pause *between* chunks to avoid overwhelming the API; there is
            # nothing to wait for before the first chunk or after the last.
            time.sleep(5)

        print(f"Processing days: {list(chunk)}")
        for day in chunk:
            # NOTE(review): the FIRMS documentation describes this trailing
            # path segment as a DAY_RANGE (number of most recent days), not a
            # day index, so successive requests may return overlapping rows.
            # The de-duplication below makes the merge safe either way.
            api_url = f'https://firms.modaps.eosdis.nasa.gov/api/country/csv/{api_key}/{dataset}/{country_code}/{day}'

            # Request errors are handled (and printed) inside the fetch helper.
            csv_data = fetch_data_with_retries(api_url, retries=3, backoff_factor=2, timeout=60)
            if not csv_data:
                print(f"Failed to fetch data for day {day}")
                continue

            daily_data = _parse_daily_csv(csv_data, day)
            if daily_data is not None:
                all_data.append(daily_data)
                print(f"Successfully fetched data for day {day}")

    if not all_data:
        print("No data was collected.")
        return

    # Combine everything, keeping a single row per merge key: a Delta MERGE
    # fails when several source rows match the same target row.
    new_data_df = pd.concat(all_data, ignore_index=True).drop_duplicates(
        subset=list(_MERGE_KEYS), keep="last"
    )

    # Convert the Pandas DataFrame to a Spark DataFrame and write it out
    _upsert_into_delta(spark.createDataFrame(new_data_df))

    print("Data merged and Delta table optimized.")

# Run the ingestion as soon as this cell executes: fetch the recent FIRMS
# data and merge it into the Delta table (progress is reported via print).
collect_and_merge_data()

Validate that there are no duplicates or missing dates

In [0]:
# %sql
# SELECT * 
# FROM firms_data
# WHERE acq_date = '2025-03-05' AND confidence = 'h'

