In [0]:
# ==============================================================
# 🧱 DATABRICKS HACKATHON PROJECT
# ==============================================================
# NOTEBOOK: 01_Historical_Weather_AirQuality_Ingestion
# PURPOSE:  Fetch historical weather & air quality data from Open-Meteo API 
#           for all active cities in City Master, and store in Bronze layer.
# AUTHOR:   Chintan Shah
# ==============================================================

# 🌍 Weather & Air Quality Bronze Layer

### Notebook: `01_Batch_Ingestion`

Unified ingestion pipeline for **Weather + Air Quality** data using the **Open-Meteo APIs**.

- Automatically creates Delta tables if missing  
- Fetches hourly and daily data for all active cities  
- Logs each run in `pipeline_log`  
- Uses caching and retry for API reliability  

**References**
- [Weather API Docs](https://open-meteo.com/en/docs)
- [Air Quality API Docs](https://open-meteo.com/en/docs/air-quality-api)


## 1️⃣ Setup and Imports

In [0]:
# Install the Open-Meteo client plus the caching and retry helpers used with it,
# then restart the Python process so the freshly installed packages can be imported.
# These installs can alternatively be moved to an environment init script.
# NOTE(review): versions are not pinned — consider pinning them for reproducible runs.
%pip install openmeteo_requests
%pip install requests_cache
%pip install retry_requests
dbutils.library.restartPython()

[43mNote: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.[0m
[43mNote: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.[0m
[43mNote: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.[0m


In [0]:
from datetime import datetime, timedelta, timezone
import pandas as pd
import numpy as np
import traceback
import openmeteo_requests
import requests_cache
from retry_requests import retry

from pyspark.sql import SparkSession
from pyspark.sql.utils import AnalysisException

from delta.tables import DeltaTable
from pyspark.sql.functions import col, min as spark_min, max as spark_max
import pyspark.sql.functions as F

# Reuse the active Spark session, or build one when none exists yet.
spark = SparkSession.builder.getOrCreate()

# Try to switch on the Delta "enableChangeDataFeed" default. Some workspaces reject
# this setting; in that case the error is reported and the notebook carries on.
cdf_default_sql = "SET spark.databricks.delta.properties.defaults.enableChangeDataFeed = true"
try:
    spark.sql(cdf_default_sql)
except Exception as cdf_error:
    print("⚠️ Skipping CDF config (not supported in this workspace): " + str(cdf_error))

⚠️ Skipping CDF config (not supported in this workspace): [CONFIG_NOT_AVAILABLE] Configuration spark.databricks.delta.properties.defaults.enableChangeDataFeed is not available. SQLSTATE: 42K0I;
SetCommand (spark.databricks.delta.properties.defaults.enableChangeDataFeed,Some(true))


JVM stacktrace:
org.apache.spark.sql.catalyst.ExtendedAnalysisException
	at com.databricks.sql.connect.SparkConnectConfig$.assertConfigAllowed(SparkConnectConfig.scala:284)
	at com.databricks.sql.connect.SparkConnectSetFilteringValidationCheck.$anonfun$apply$1(SparkConnectSetFilteringValidationCheck.scala:33)
	at com.databricks.sql.connect.SparkConnectSetFilteringValidationCheck.$anonfun$apply$1$adapted(SparkConnectSetFilteringValidationCheck.scala:27)
	at org.apache.spark.sql.catalyst.trees.TreeNode.foreach(TreeNode.scala:302)
	at com.databricks.sql.connect.SparkConnectSetFilteringValidationCheck.apply(SparkConnectSetFilteringValidationCheck.scala:27)
	at com.databricks.sql.connect.SparkConnectSetFiltering

## 2️⃣ Parameters and Config

In [0]:
# --- Pipeline context ---
pipeline_name = "BATCH_INGEST"
# TODO: Determine if Manual run or Scheduled run
triggered_by = "Manual"   # Can be Scheduled/Event driven if automated

# Defaults: the forecast endpoint is used unless the requested range is historical.
run_type = "HOURLY"   # Change to HISTORICAL / DAILY / 15MIN as per notebook context
weather_url = "https://api.open-meteo.com/v1/forecast"

# Default window (UTC): yesterday → today + 6 days.
today_utc = datetime.now(timezone.utc).date()
default_start = today_utc - timedelta(days=1)
default_end = today_utc + timedelta(days=6)


def _widget_value(widget_name, fallback):
    """
    Return the stripped value of a notebook widget, or `fallback` when the widget
    is not defined or is empty.

    dbutils.widgets.get() raises when the widget does not exist, so a plain
    `get(...) or default` is not enough for a fresh "Run All".
    """
    try:
        raw_value = dbutils.widgets.get(widget_name)
    except Exception:
        # Widget not defined in this context — fall back to the default window.
        raw_value = ""
    return (raw_value or "").strip() or fallback


start_date_str = _widget_value("start_date", default_start.strftime("%Y-%m-%d"))
end_date_str = _widget_value("end_date", default_end.strftime("%Y-%m-%d"))

# Validate & parse (only the parsing lives in the try so other errors are not masked).
try:
    start_date = datetime.strptime(start_date_str, "%Y-%m-%d").date()
    end_date = datetime.strptime(end_date_str, "%Y-%m-%d").date()
except ValueError as exc:
    raise ValueError(
        f"Invalid start_date or end_date format. Must be YYYY-MM-DD. Got: {start_date_str}, {end_date_str}"
    ) from exc

# Anything starting before the default window is served by the archive endpoint.
if start_date < default_start:
    run_type = "HISTORICAL"
    weather_url = "https://archive-api.open-meteo.com/v1/archive"
    if end_date > today_utc:
        print(f"⚠️ Adjusting end_date from {end_date} → {today_utc} (API historical limit)")
        # Keep end_date a `date` so the comparison below stays date-vs-date.
        end_date = today_utc

# end_date >= start_date Validation
if end_date < start_date:
    raise ValueError("Validation Error: end_date must be greater than or equal to start_date")

print(f"Processing {run_type} data for cities from {start_date} to {end_date}")

catalog = "env_catalog"
schema = "env_data"

city_master_tbl = f"{catalog}.{schema}.city_master"
pipeline_log_tbl = f"{catalog}.{schema}.pipeline_log"

weather_hourly_tbl = f"{catalog}.{schema}.bronze_weather_hourly"
weather_daily_tbl = f"{catalog}.{schema}.bronze_weather_daily"
air_hourly_tbl = f"{catalog}.{schema}.bronze_air_hourly"

Processing HISTORICAL data for cities from 2025-02-01 to 2025-02-01


## 3️⃣ Setup Open-Meteo Client (Cache + Retry)

In [0]:
# Cached HTTP session named ".cache"; cached entries expire after 6 hours.
cache_session = requests_cache.CachedSession(".cache", expire_after=6 * 60 * 60)

# Wrap the cached session with retry handling (3 retries, backoff factor 0.5).
retry_session = retry(cache_session, backoff_factor=0.5, retries=3)

# Open-Meteo client that sends its requests through the cached + retrying session.
openmeteo = openmeteo_requests.Client(session=retry_session)

## 4️⃣ Load City Master (Active Cities)

In [0]:
# Read City Master, keep only rows flagged active and latest, and bring them to the
# driver as a pandas frame plus a list of per-city dict records.
active_latest_filter = "is_active = true and is_latest = true"
city_df = spark.table(city_master_tbl).filter(active_latest_filter).toPandas()
cities_dict = city_df.to_dict(orient="records")
print("✅ Found {} active cities to process".format(len(cities_dict)))

✅ Found 2 active cities to process


## 5️⃣ Pipeline Logging — Capture Run Start, Completion, and Status

This section wraps the ingestion logic with automatic pipeline logging.
- Creates a structured `run_id` based on pipeline type and ingestion timestamp  
- Inserts a "RUNNING" entry into the `pipeline_log`  
- Updates the log with `SUCCESS` or `FAILED` at the end  
- Captures record counts and earliest/latest timestamps from the data  
- Ensures consistent auditability across Historical, Daily, and 15-min pipelines  

In [0]:
# --- Generate run_id with current timestamp for readability ---
start_time = datetime.now(timezone.utc)
run_id = f"{run_type}_{pipeline_name.upper()}_{start_time.strftime('%Y%m%d_%H%M%S')}"
print(f"🔹 Generated run_id: {run_id}")

# --- Log the start of the run ---
spark.sql(f"""
INSERT INTO env_catalog.env_data.pipeline_log
(run_id, pipeline_name, run_type, start_time, status, triggered_by, created_ts)
VALUES ('{run_id}', '{pipeline_name}', '{run_type}', TIMESTAMP '{start_time}', 'RUNNING', '{triggered_by}', CURRENT_TIMESTAMP())
""")

print(f"✅ Pipeline started: {run_id}")

🔹 Generated run_id: HISTORICAL_BATCH_INGEST_20251112_124744
✅ Pipeline started: HISTORICAL_BATCH_INGEST_20251112_124744


## 6️⃣ Define API Parameter Lists (Weather + Air Quality)

In [0]:
# Hourly weather variables requested from the weather endpoint ("hourly" parameter).
# The order of this list matters: when the response is processed, each variable is
# read back by position (Variables(index)) and stored under the name at that index.
weather_hourly_params = [
    "weather_code", 
    "temperature_2m", "apparent_temperature", "relative_humidity_2m",
    "pressure_msl", "surface_pressure",
    "wind_speed_10m", "wind_gusts_10m", "wind_direction_10m",
    "cloud_cover", "visibility",
    "precipitation", "rain", "showers", "snowfall",
    "uv_index", "uv_index_clear_sky", "is_day", 
]

# Daily weather variables requested from the weather endpoint ("daily" parameter).
# Same positional contract as above: do not reorder without reordering the consumer.
weather_daily_params = [
    "weather_code",
    "temperature_2m_mean", "temperature_2m_max", "temperature_2m_min",
    "apparent_temperature_mean", "apparent_temperature_max", "apparent_temperature_min",
    "relative_humidity_2m_mean", "relative_humidity_2m_max", "relative_humidity_2m_min",
    "pressure_msl_mean", "pressure_msl_max", "pressure_msl_min",
    "surface_pressure_mean", "surface_pressure_max", "surface_pressure_min",
    "wind_speed_10m_max", "wind_gusts_10m_max", "wind_direction_10m_dominant",
    "cloud_cover_mean", "cloud_cover_max", "cloud_cover_min",
    "visibility_mean", "visibility_max", "visibility_min",
    "precipitation_hours", "precipitation_sum", "rain_sum", "showers_sum", "snowfall_sum",
    "uv_index_max", "uv_index_clear_sky_max",
    "sunrise", "sunset", "daylight_duration", "sunshine_duration"
]

# Hourly variables requested from the air-quality endpoint ("hourly" parameter).
# Read back by position as well, so the order here defines the column mapping.
air_quality_params = [
    "pm10", "pm2_5", "carbon_monoxide", "carbon_dioxide",
    "nitrogen_monoxide", "nitrogen_dioxide", "sulphur_dioxide",
    "ozone", "aerosol_optical_depth", "dust",
    "uv_index", "uv_index_clear_sky", "ammonia", "methane",
    "european_aqi", "us_aqi"
]

## 7️⃣ Helper function to create delta tables

In [0]:
def create_table_if_missing(table_name: str, df_spark, partition_col="city"):
    """
    Make sure the Delta table `table_name` exists.

    When the table is already registered, nothing is written. Otherwise the table is
    created from `df_spark` (its schema and its rows) and partitioned by
    `partition_col`.
    """
    # Guard clause: nothing to do when the table is already there.
    if spark.catalog.tableExists(table_name):
        print(f"✅ Table {table_name} exists")
        return

    print(f"⚙️ Creating table {table_name} (partitioned by {partition_col})...")
    # Overwrite-mode write with overwriteSchema enabled, partitioned by the given column.
    table_writer = (
        df_spark.write
        .format("delta")
        .option("overwriteSchema", "true")
        .partitionBy(partition_col)
        .mode("overwrite")
    )
    table_writer.saveAsTable(table_name)
    print(f"✅ Created detla table: {table_name}")

## 8️⃣ Function to fetch the API data and build the pandas DataFrames


### 🔹 Timestamp Design (Reference)

| Column | Meaning | Source |
|---------|----------|--------|
| `data_timestamp` | Actual time of the event / reading from source (e.g. API timestamp). Used for joins | Source system |
| `load_timestamp` | Ingestion time into Databricks (Bronze → Silver → Gold). Used for auditing, backfills, debugging, and incremental loads to the next layers | Current UTC timestamp (`current_timestamp()` or `datetime.now(timezone.utc)`) |

In [0]:
def _variable_values(variable):
    """Return the values of a single response variable as an array-like."""
    try:
        values = variable.ValuesAsNumpy()
    except ValueError:
        # Fallback for categorical/string variables
        values = np.array(variable.Values(), dtype=object)

    # NOTE(review): for variables delivered as int64 (e.g. sunrise / sunset) the SDK
    # appears to return a scalar from ValuesAsNumpy() instead of an array, which would
    # silently fill the column with one constant. Prefer the int64 vector when it is
    # available — confirm against the Open-Meteo SDK documentation.
    if np.ndim(values) == 0 and hasattr(variable, "ValuesInt64AsNumpy"):
        int64_values = variable.ValuesInt64AsNumpy()
        if np.ndim(int64_values) > 0:
            values = int64_values
    return values


def _section_to_frame(section, param_names, dim_key, city):
    """
    Convert one response section (hourly or daily) into a pandas DataFrame.

    The frame carries the audit columns (dim_key, city, run_id, load_timestamp), a
    `data_timestamp` column spanning [Time(), TimeEnd()) at the section's Interval(),
    and one column per requested parameter. Variables are read by position, so
    `param_names` must be in the same order as the request.
    """
    frame_data = {
        "dim_key": dim_key,
        "city": city,
        "run_id": run_id,
        "load_timestamp": datetime.now(timezone.utc),
        "data_timestamp": pd.date_range(
            start=pd.to_datetime(section.Time(), unit="s", utc=True),
            end=pd.to_datetime(section.TimeEnd(), unit="s", utc=True),
            freq=pd.Timedelta(seconds=section.Interval()),
            inclusive="left",
        ),
    }
    for index, param in enumerate(param_names):
        frame_data[param] = _variable_values(section.Variables(index))
    return pd.DataFrame(frame_data)


def fetch_and_process_data(cities, start_date, end_date, cache):
    """
    Fetch weather and air-quality data for all cities and return pandas DataFrames.

    One request per dataset is issued for all cities at once (weather hourly + daily
    from `weather_url`, air quality hourly from the air-quality endpoint), both in GMT.

    Args:
        cities: list of dict records with "latitude", "longitude", "city_name" and
            "dim_key" keys.
        start_date: first day of the requested range.
        end_date: last day of the requested range.
        cache: unused; kept so existing callers that pass the cache session keep working.

    Returns:
        Tuple (weather_hourly_df, weather_daily_df, air_hourly_df).

    Raises:
        ValueError: if `cities` is empty.
    """
    if not cities:
        raise ValueError("No cities supplied - nothing to fetch.")

    latitudes = [c["latitude"] for c in cities]
    longitudes = [c["longitude"] for c in cities]

    # API Calls (One per dataset for all cities)
    weather_resp = openmeteo.weather_api(
        weather_url,
        params={
            "latitude": latitudes,
            "longitude": longitudes,
            "start_date": start_date,
            "end_date": end_date,
            "hourly": weather_hourly_params,
            "daily": weather_daily_params,
            "timezone": "GMT",
        },
    )

    air_resp = openmeteo.weather_api(
        "https://air-quality-api.open-meteo.com/v1/air-quality",
        params={
            "latitude": latitudes,
            "longitude": longitudes,
            "start_date": start_date,
            "end_date": end_date,
            "hourly": air_quality_params,
            "timezone": "GMT",
        },
    )

    # Process API Responses into Pandas DataFrames.
    # Responses are indexed by position, matching the order of `cities` in the request.
    weather_hourly_all, weather_daily_all, aq_hourly_all = [], [], []
    for i, city_info in enumerate(cities):
        city = city_info["city_name"]
        dim_key = city_info["dim_key"]

        weather_hourly_all.append(
            _section_to_frame(weather_resp[i].Hourly(), weather_hourly_params, dim_key, city)
        )
        weather_daily_all.append(
            _section_to_frame(weather_resp[i].Daily(), weather_daily_params, dim_key, city)
        )
        aq_hourly_all.append(
            _section_to_frame(air_resp[i].Hourly(), air_quality_params, dim_key, city)
        )

    # ignore_index avoids repeated 0..n-1 index labels across the per-city frames.
    weather_hourly_df = pd.concat(weather_hourly_all, ignore_index=True)
    weather_daily_df = pd.concat(weather_daily_all, ignore_index=True)
    air_hourly_df = pd.concat(aq_hourly_all, ignore_index=True)

    return weather_hourly_df, weather_daily_df, air_hourly_df

## 9️⃣ Function to merge the new rows in bronze table

🔁 Idempotent Merge for Bronze Tables (City + Timestamp/Date)

This replaces append writes with Delta `MERGE` (upsert).
- Matches on (`city`, `data_timestamp`)
- Updates all columns when matched, inserts when not matched.
- Automatically creates the target Delta table if it doesn’t exist.
- Updates pipeline_log with `records_processed`, `earliest_ts`, `latest_ts`.

In [0]:
def merge_from_sdf(target_table, staging_sdf, merge_keys, exclude_update_cols=None):
    """
    Merge (upsert) staging_sdf into target_table on merge_keys.

    Matched rows have every non-key, non-excluded column updated; unmatched rows are
    inserted. The target table is created from the staging frame (partitioned by the
    first merge key) when it does not exist yet.

    Args:
        target_table: fully qualified name of the Delta table to merge into.
        staging_sdf: Spark DataFrame with the new rows, or None to skip the merge.
        merge_keys: column names used in the MERGE ON clause (list or tuple).
        exclude_update_cols: optional column names that must not be updated on match.

    Returns:
        (rows_processed, earliest_ts, latest_ts) — staged row count and the min/max of
        the first of "data_timestamp", "date", "time" present in the staging frame
        (None, None when none of them is present).

    Raises:
        ValueError: if no merge key is given, a merge key is missing from the staging
            columns, or no column is left to update.
    """
    if staging_sdf is None:
        print(f"Skipping merge for {target_table} - no staging dataframe.")
        return 0, None, None

    # Accept tuples as well as lists for both arguments.
    merge_keys = list(merge_keys)
    exclude_update_cols = list(exclude_update_cols or [])
    if not merge_keys:
        raise ValueError("At least one merge key is required.")

    # normalize timestamp/date types if present
    if "load_timestamp" in staging_sdf.columns:
        staging_sdf = staging_sdf.withColumn("load_timestamp", F.to_timestamp(F.col("load_timestamp")))
    if "data_timestamp" in staging_sdf.columns:
        staging_sdf = staging_sdf.withColumn("data_timestamp", F.to_timestamp(F.col("data_timestamp")))

    # Validate the inputs before creating any table or view.
    cols = staging_sdf.columns
    for k in merge_keys:
        if k not in cols:
            raise ValueError(f"Merge key '{k}' not found in staging columns: {cols}")

    skip_on_update = set(merge_keys) | set(exclude_update_cols)
    update_cols = [c for c in cols if c not in skip_on_update]
    if not update_cols:
        raise ValueError("No updateable columns available (all are keys or excluded).")

    # ensure target exists
    create_table_if_missing(target_table, staging_sdf, partition_col=merge_keys[0])
    print("Bronze tables found/created")

    # create staging temp view (unique per run)
    safe_run = (run_id if 'run_id' in globals() else str(np.random.randint(10**9))).replace("-", "_")
    tmp_view = f"stg_{target_table.replace('.', '_')}_{safe_run}"
    staging_sdf.createOrReplaceTempView(tmp_view)
    print("Staging temp view created")

    on_clause = " AND ".join([f"t.`{k}` = s.`{k}`" for k in merge_keys])
    update_set = ", ".join([f"t.`{c}` = s.`{c}`" for c in update_cols])
    insert_cols = ", ".join([f"`{c}`" for c in cols])
    insert_vals = ", ".join([f"s.`{c}`" for c in cols])

    merge_sql = f"""
      MERGE INTO {target_table} t
      USING {tmp_view} s
      ON {on_clause}
      WHEN MATCHED THEN UPDATE SET {update_set}
      WHEN NOT MATCHED THEN INSERT ({insert_cols}) VALUES ({insert_vals})
    """

    try:
        print(f"Executing MERGE INTO {target_table} ...")
        spark.sql(merge_sql)
    finally:
        # The view is only needed for the MERGE statement; drop it so repeated runs
        # do not accumulate staging views in the session.
        spark.catalog.dropTempView(tmp_view)

    # Row count and earliest/latest timestamp in a single aggregation.
    ts_col = next((c for c in ("data_timestamp", "date", "time") if c in cols), None)
    agg_exprs = [F.count(F.lit(1)).alias("row_count")]
    if ts_col:
        agg_exprs.append(F.min(F.col(ts_col)).alias("min_ts"))
        agg_exprs.append(F.max(F.col(ts_col)).alias("max_ts"))
    stats = staging_sdf.agg(*agg_exprs).collect()[0]

    rowcount = stats["row_count"]
    earliest_ts = stats["min_ts"] if ts_col else None
    latest_ts = stats["max_ts"] if ts_col else None

    print(f"MERGE done for {target_table} — staged_rows={rowcount}, earliest_ts={earliest_ts}, latest_ts={latest_ts}")
    return rowcount, earliest_ts, latest_ts

## 🔟 Execute Data Ingestion with Logging Control
Wrap the full ingestion process inside a try/except/finally block  
so any failures are properly logged in the `pipeline_log`.

In [0]:
def _sql_string_literal(value):
    """Render `value` as a quoted SQL string literal, escaping backslashes and single quotes."""
    escaped = str(value).replace("\\", "\\\\").replace("'", "\\'")
    return f"'{escaped}'"


def _sql_timestamp_literal(value):
    """Render `value` as a SQL TIMESTAMP literal, or NULL when there is no value."""
    return "NULL" if value is None else f"TIMESTAMP '{value}'"


try:
    # -----------------------------------------------
    # 🌀 Main Ingestion Logic — Weather + Air Quality
    # -----------------------------------------------
    weather_df_hourly, weather_df_daily, air_df_hourly = fetch_and_process_data(
        cities=cities_dict,
        start_date=start_date,
        end_date=end_date,
        cache=cache_session
    )

    # Merge Data into Delta Tables (the merge helper creates missing tables itself)
    total_records = 0
    agg_earliest = None
    agg_latest = None

    targets = [
        (weather_hourly_tbl, spark.createDataFrame(weather_df_hourly),  ["city", "data_timestamp"]),
        (weather_daily_tbl,  spark.createDataFrame(weather_df_daily),    ["city", "data_timestamp"]),
        (air_hourly_tbl,     spark.createDataFrame(air_df_hourly),       ["city", "data_timestamp"])
    ]

    for tbl_name, sdf, keys in targets:
        recs, min_ts, max_ts = merge_from_sdf(tbl_name, sdf, merge_keys=keys, exclude_update_cols=[])  # optionally exclude ['run_id']
        total_records += recs
        if min_ts is not None:
            agg_earliest = min(min_ts, agg_earliest) if agg_earliest is not None else min_ts
        if max_ts is not None:
            agg_latest = max(max_ts, agg_latest) if agg_latest is not None else max_ts

    print(f"Total staged records processed: {total_records}  earliest={agg_earliest}  latest={agg_latest}")

    # Housekeeping: VACUUM with a 168-hour (7-day) retention. This removes data files
    # no longer referenced by the tables; it does not delete rows from them.
    for vacuum_tbl in (weather_hourly_tbl, weather_daily_tbl, air_hourly_tbl):
        spark.sql(f"""VACUUM {vacuum_tbl} RETAIN 168 HOURS""")

    # --- Mark success ---
    status = "SUCCESS"
    remarks = f"Successfully loaded {total_records} records across weather & air quality tables."
    earliest_ts = agg_earliest
    latest_ts = agg_latest

    print(f"🎯 Run {run_id} completed successfully for {len(cities_dict)} cities between {start_date} and {end_date}.")

except Exception as e:
    # NOTE(review): the exception is logged but not re-raised, so a scheduler running
    # this notebook will see the cell finish normally — confirm this is intended.
    status = "FAILED"
    remarks = f"Pipeline failed due to: {str(e)[:200]}"
    total_records = 0
    # Sentinel timestamps written to the log for failed runs.
    earliest_ts = datetime.strptime('2000-01-01', "%Y-%m-%d")
    latest_ts = datetime.strptime('2000-01-01', "%Y-%m-%d")

    traceback.print_exc()

finally:
    # --- Always log end state ---
    # Free-text values are escaped and missing timestamps become NULL, so an error
    # message containing quotes (or a run without timestamps) cannot break this UPDATE.
    end_time = datetime.now(timezone.utc)
    spark.sql(f"""
        UPDATE {pipeline_log_tbl}
        SET end_time = TIMESTAMP '{end_time}',
            status = {_sql_string_literal(status)},
            records_processed = {total_records},
            earliest_ts = {_sql_timestamp_literal(earliest_ts)},
            latest_ts = {_sql_timestamp_literal(latest_ts)},
            remarks = {_sql_string_literal(remarks)}
        WHERE run_id = {_sql_string_literal(run_id)}
    """)

    print(f"🏁 Pipeline {status}: {run_id}")

⚙️ Creating table env_catalog.env_data.bronze_weather_hourly (partitioned by city)...
✅ Created detla table: env_catalog.env_data.bronze_weather_hourly
Bronze tables found/created
Staging temp view created
Executing MERGE INTO env_catalog.env_data.bronze_weather_hourly ...
MERGE done for env_catalog.env_data.bronze_weather_hourly — staged_rows=48, earliest_ts=2025-02-01 00:00:00, latest_ts=2025-02-01 23:00:00
⚙️ Creating table env_catalog.env_data.bronze_weather_daily (partitioned by city)...
✅ Created detla table: env_catalog.env_data.bronze_weather_daily
Bronze tables found/created
Staging temp view created
Executing MERGE INTO env_catalog.env_data.bronze_weather_daily ...
MERGE done for env_catalog.env_data.bronze_weather_daily — staged_rows=2, earliest_ts=2025-02-01 00:00:00, latest_ts=2025-02-01 00:00:00
⚙️ Creating table env_catalog.env_data.bronze_air_hourly (partitioned by city)...
✅ Created detla table: env_catalog.env_data.bronze_air_hourly
Bronze tables found/created
Staging

## ✅ Summary
| Table | Type | Partition | Notes |
|--------|-------|------------|--------|
| `bronze_weather_hourly` | Weather (Hourly) | city | From Open-Meteo Archive / Weather API |
| `bronze_weather_daily` | Weather (Daily) | city | From Open-Meteo Archive / Weather API |
| `bronze_air_hourly` | Air Quality (Hourly) | city | From Air Quality API |

**Further Additions Planned:**  
- Create an Auto Loader–based ingestion for the past and next 24 hours of data, landing files in volumes every 15 minutes.
- However, outside North America and Central Europe the 15-minute data is interpolated, so it is of limited use there.
- The Auto Loader job can be executed separately.
- Add unit tests / validation.