In [1]:
# --- Cell 1: Notebook Header, Logging Configuration, and Library Imports ---

"""
Notebook: 01_ingest_earthquake_data.ipynb

Purpose:
This script manages the initial ingestion phase of raw earthquake data.
It connects to the USGS API to extract seismic information and defines the
necessary data schema for subsequent processing. The objective is to prepare
the data for storage in the Bronze layer of the Lakehouse architecture,
serving as a raw, untransformed data source.

Dependencies:
- Python 3.x
- requests library (for HTTP requests)
- pandas library (for data manipulation)
- pyspark library (for distributed processing and DataFrame operations)

Execution Environment:
This script is designed to run within an Apache Spark environment,
specifically optimized for platforms like Microsoft Fabric where a SparkSession
('spark') is pre-initialized.
"""

# Logging setup: route INFO-and-above records through the root logger using a
# "timestamp - level - message" layout, then expose a module-level `logger`
# that every later cell uses for progress, warning and error reporting.
import logging

logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Standard Python libraries are imported first for consistency.
import json
from datetime import datetime, timedelta, timezone # Added timezone for explicit UTC for robust time handling.

# Third-party libraries for data handling and HTTP requests are grouped.
import requests  # Used for making HTTP GET requests to the USGS API.
import pandas as pd  # Utilized for initial data structuring in a Pandas DataFrame.

# PySpark libraries for distributed processing and DataFrame manipulation are imported last.
# SparkSession: The entry point for programming Spark with the DataFrame and SQL API.
# StructType, StructField: Essential for defining the precise schema for Spark DataFrames,
#                           ensuring type consistency and data quality.
# Spark data types: Ensures correct data interpretation and type consistency across the pipeline.
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, LongType, TimestampType

logger.info("All necessary libraries have been imported successfully.")

StatementMeta(, db61d955-af2b-4104-b29b-c5b09f57c8af, 3, Finished, Available, Finished)

2025-06-12 19:35:07,937 - INFO - All necessary libraries have been imported successfully.


In [2]:
# --- Cell 2: Data Schema Definition ---

# Defines the explicit data schema for earthquake records obtained from the USGS API,
# along with additional metadata columns added during ingestion for auditing and lineage.
# This schema is critical for:
# 1. Ensuring strict type consistency when loading data into a Spark DataFrame.
# 2. Preventing schema inference issues that can lead to data quality problems in production.
# 3. Providing a clear contract for the data structure within the Lakehouse's Bronze layer.
# The schema is assembled from a (column name, Spark type, nullable) table so the
# column order, types and nullability can be read at a glance. The column order
# mirrors the order of the keys produced for each record by `fetch_earthquake_data`.
earthquake_schema = StructType([
    StructField(column_name, column_type, is_nullable)
    for column_name, column_type, is_nullable in (
        # --- Event identity and headline attributes ---
        ("id", StringType(), True),          # Unique event ID provided by USGS.
        ("mag", DoubleType(), True),         # Magnitude of the event.
        ("place", StringType(), True),       # Textual description of the event location.
        ("time", LongType(), True),          # Event time, milliseconds since the Unix epoch.
        ("updated", LongType(), True),       # Last-update time of the event, milliseconds since the Unix epoch.
        ("tz", StringType(), True),          # Timezone offset from UTC in minutes (historical field, often null).
        ("url", StringType(), True),         # URL with more details about the event.
        ("detail", StringType(), True),      # URL for additional event details.
        # --- Impact and review attributes ---
        ("felt", LongType(), True),          # Number of "felt" reports submitted by people.
        ("cdi", DoubleType(), True),         # Community Decimal Intensity (shaking intensity measure).
        ("mmi", DoubleType(), True),         # Modified Mercalli Intensity (shaking intensity measure).
        ("alert", StringType(), True),       # USGS alert level (e.g., green, yellow, orange, red).
        ("status", StringType(), True),      # Event status (e.g., automatic, reviewed, deleted).
        ("tsunami", LongType(), True),       # Tsunami indicator (1 if a warning was issued, 0 otherwise).
        ("sig", LongType(), True),           # Composite significance value of the event.
        # --- Provenance attributes ---
        ("net", StringType(), True),         # Seismic network that reported the event.
        ("code", StringType(), True),        # Event identification code within the reporting network.
        ("ids", StringType(), True),         # Comma-separated list of related event IDs.
        ("sources", StringType(), True),     # Comma-separated list of contributing data sources.
        ("types", StringType(), True),       # Comma-separated list of event types.
        # --- Location-quality attributes ---
        ("nst", LongType(), True),           # Number of stations used to locate the event.
        ("dmin", DoubleType(), True),        # Horizontal distance from the epicenter to the nearest station (degrees).
        ("rms", DoubleType(), True),         # Root-mean-square of the travel-time residuals (seconds).
        ("gap", DoubleType(), True),         # Largest azimuthal gap between stations (degrees).
        ("magType", StringType(), True),     # Magnitude type (e.g., mb, ml, Mww).
        ("type", StringType(), True),        # Primary event classification (e.g., earthquake, quarry blast).
        ("title", StringType(), True),       # Descriptive title of the event.
        # --- Geometry, flattened from the GeoJSON coordinates array ---
        ("longitude", DoubleType(), True),   # Epicenter longitude.
        ("latitude", DoubleType(), True),    # Epicenter latitude.
        ("depth", DoubleType(), True),       # Event depth in kilometers.
        # --- Ingestion metadata (the only non-nullable column) ---
        ("ingestion_timestamp_utc", TimestampType(), False),  # When the record was ingested, always UTC.
    )
])

logger.info("'earthquake_schema' data schema defined and ready for use, including 'id' and 'ingestion_timestamp_utc'.")

StatementMeta(, db61d955-af2b-4104-b29b-c5b09f57c8af, 4, Finished, Available, Finished)

2025-06-12 19:35:10,628 - INFO - 'earthquake_schema' data schema defined and ready for use, including 'id' and 'ingestion_timestamp_utc'.


In [3]:
# --- Cell 3: Configuration Parameters ---

# This section defines all key parameters for the data ingestion process.
# Centralizing these values ensures easy modification and maintainability,
# supporting environmental configuration if needed in future iterations and
# promoting best practices for parameter management in data pipelines.

# --- Source -----------------------------------------------------------------
# Query endpoint of the USGS earthquake event web service.
USGS_API_BASE_URL = "https://earthquake.usgs.gov/fdsnws/event/1/query"

# --- Extraction window --------------------------------------------------------
# How many days back from "now" the query window reaches. A larger value pulls
# more history, which suits initial full loads and backfills.
DAYS_TO_FETCH = 365  # One full year of history.

# --- Magnitude filter ---------------------------------------------------------
# Events below this magnitude are excluded by the API query. Lowering the value
# increases data volume; 2.5 keeps smaller but still relevant events while
# leaving out micro-earthquakes.
MIN_MAGNITUDE = 2.5

# --- Run metadata -------------------------------------------------------------
# Timezone-aware UTC instant at which this configuration cell ran. It is stamped
# on every record as `ingestion_timestamp_utc` for lineage and auditing.
INGESTION_DATETIME = datetime.now(timezone.utc)

# --- Target -------------------------------------------------------------------
# Name of the Bronze-layer table that receives the raw, untransformed records.
BRONZE_TABLE_NAME = "bronze_usgs_earthquakes"

# Emit one log line per setting so the exact parameters of a run can be traced
# back from the logs.
for _config_line in (
    "Ingestion Configuration Loaded:",
    f"  USGS API Base URL: {USGS_API_BASE_URL}",
    f"  Data Fetching Window: Last {DAYS_TO_FETCH} days",
    f"  Minimum Magnitude: {MIN_MAGNITUDE}",
    f"  Ingestion Datetime (UTC): {INGESTION_DATETIME.isoformat()}",  # ISO 8601 keeps the timestamp unambiguous.
    f"  Bronze Target Table: {BRONZE_TABLE_NAME}",
):
    logger.info(_config_line)

# Spark session bootstrap:
# Fabric notebooks normally expose a ready-made global `spark`, but
# `getOrCreate()` returns the existing session when there is one and builds a
# new one otherwise, so the same cell also works in other Spark environments.
try:
    spark = (
        SparkSession.builder
        .appName("USGSEarthquakeIngestion")
        .getOrCreate()
    )
except Exception as e:
    logger.error(f"Error initializing or retrieving Spark Session: {e}")
    raise  # Without a Spark session nothing downstream can run, so stop here.
else:
    logger.info("Spark Session initialized or retrieved successfully.")

StatementMeta(, db61d955-af2b-4104-b29b-c5b09f57c8af, 5, Finished, Available, Finished)

2025-06-12 19:35:10,907 - INFO - Ingestion Configuration Loaded:
2025-06-12 19:35:10,908 - INFO -   USGS API Base URL: https://earthquake.usgs.gov/fdsnws/event/1/query
2025-06-12 19:35:10,908 - INFO -   Data Fetching Window: Last 365 days
2025-06-12 19:35:10,909 - INFO -   Minimum Magnitude: 2.5
2025-06-12 19:35:10,910 - INFO -   Ingestion Datetime (UTC): 2025-06-12T19:35:10.907037+00:00
2025-06-12 19:35:10,911 - INFO -   Bronze Target Table: bronze_usgs_earthquakes


In [4]:
# --- Cell 4: Function to Extract Earthquake Data ---

def _to_usgs_timestamp(moment: datetime) -> str:
    """
    Formats a datetime as the second-precision ISO 8601 string sent to the USGS API.

    Timezone-aware values are converted to UTC first, because the format string
    carries no offset and a non-UTC value would otherwise be sent with its
    local wall-clock digits. Naive values are formatted unchanged.
    """
    if moment.tzinfo is not None:
        moment = moment.astimezone(timezone.utc)
    return moment.strftime('%Y-%m-%dT%H:%M:%S')


def _feature_to_record(feature: dict, ingestion_timestamp: datetime) -> dict:
    """
    Flattens one GeoJSON feature into a flat dict keyed by the `earthquake_schema` column names.

    Missing or null 'properties', 'geometry' and 'coordinates' members are treated
    as empty, so an incomplete feature yields None values instead of raising.
    """
    properties = feature.get('properties') or {}
    geometry = feature.get('geometry') or {}
    # Coordinates are expected as [longitude, latitude, depth_km]; pad short arrays with None.
    coordinates = list(geometry.get('coordinates') or [])
    longitude, latitude, depth = (coordinates + [None, None, None])[:3]

    # Key order is kept identical to the column order of `earthquake_schema`.
    return {
        'id': feature.get('id'),                 # Taken from the top-level feature object.
        'mag': properties.get('mag'),
        'place': properties.get('place'),
        'time': properties.get('time'),
        'updated': properties.get('updated'),
        'tz': properties.get('tz'),
        'url': properties.get('url'),
        'detail': properties.get('detail'),
        'felt': properties.get('felt'),
        'cdi': properties.get('cdi'),
        'mmi': properties.get('mmi'),
        'alert': properties.get('alert'),
        'status': properties.get('status'),
        'tsunami': properties.get('tsunami', 0),  # Defaults to 0 when the key is absent.
        'sig': properties.get('sig'),
        'net': properties.get('net'),
        'code': properties.get('code'),
        'ids': properties.get('ids'),
        'sources': properties.get('sources'),
        'types': properties.get('types'),
        'nst': properties.get('nst'),
        'dmin': properties.get('dmin'),
        'rms': properties.get('rms'),
        'gap': properties.get('gap'),
        'magType': properties.get('magType'),
        'type': properties.get('type'),
        'title': properties.get('title'),
        'longitude': longitude,
        'latitude': latitude,
        'depth': depth,
        'ingestion_timestamp_utc': ingestion_timestamp  # Lineage/auditing timestamp of this run.
    }


def fetch_earthquake_data(start_time: datetime, end_time: datetime, min_magnitude: float = 2.5, limit: int = 20000, timeout: float = 120) -> pd.DataFrame:
    """
    Fetches earthquake data from the USGS API for a time range and minimum magnitude.

    The API is queried page by page: each request asks for up to `limit` events,
    starting at a 1-based `offset` that is advanced by `limit` after every full
    page. Pagination stops as soon as a page comes back empty or with fewer than
    `limit` features.

    Args:
        start_time (datetime): Start of the query window. Timezone-aware values are
            converted to UTC before being sent; naive values are sent as-is.
        end_time (datetime): End of the query window, handled like `start_time`.
        min_magnitude (float): Minimum magnitude of earthquakes to fetch. Defaults to 2.5.
        limit (int): Maximum number of events requested per API call. Defaults to 20000.
        timeout (float): Timeout in seconds passed to each HTTP request. Defaults to 120.

    Returns:
        pd.DataFrame: One row per event, with columns named after the fields of
                      `earthquake_schema`. An empty DataFrame (no columns) is
                      returned when no events match or when any request fails;
                      in the failure case pages already downloaded are discarded.
    """
    # Query-string parameters for the USGS API request.
    params = {
        'format': 'geojson',             # Request data in GeoJSON format.
        'starttime': _to_usgs_timestamp(start_time),
        'endtime': _to_usgs_timestamp(end_time),
        'minmagnitude': min_magnitude,   # Filter by minimum earthquake magnitude.
        'limit': limit,                  # Number of results per page/request.
        'offset': 1                      # USGS API offset is 1-based for pagination.
    }

    all_features = []  # Accumulates the features of every page.

    logger.info(f"Initiating data fetch from USGS API for time range: {start_time.isoformat()} to {end_time.isoformat()} with min magnitude {min_magnitude}.")

    request_count = 0
    while True:
        request_count += 1
        logger.info(f"Making API request {request_count} with offset: {params['offset']}")
        try:
            # The timeout keeps the notebook from hanging on an unresponsive endpoint.
            response = requests.get(USGS_API_BASE_URL, params=params, timeout=timeout)
            response.raise_for_status()  # Raise an HTTPError for 4xx/5xx status codes.

            # Decode the body in its own try block: the decode error raised by
            # `response.json()` can also be a RequestException subclass, so the
            # outer handlers would otherwise report it as a network error.
            try:
                data = response.json()
            except ValueError as e:
                logger.error(f"Error decoding JSON response from USGS API: {e}. Response content (first 500 chars): {response.text[:500]}...")
                return pd.DataFrame()  # Return empty DataFrame on invalid JSON.

            # 'features' holds the individual events; treat a missing or null member as empty.
            features = data.get('features') or []
            all_features.extend(features)
            logger.info(f"Received {len(features)} features in current API response. Total features collected: {len(all_features)}.")

            # A short page means the last page was reached. The explicit empty-page
            # check also guarantees termination when `limit` is 0.
            if not features or len(features) < limit:
                logger.info("Reached end of data or received fewer features than limit, stopping pagination.")
                break

            # Advance to the next page.
            params['offset'] += limit

        except requests.exceptions.Timeout as e:
            # Report the configured timeout; no response object exists when a request times out.
            logger.error(f"API request timed out after {timeout} seconds: {e}")
            return pd.DataFrame()  # Return empty DataFrame to signal failure without crashing the pipeline.
        except requests.exceptions.RequestException as e:
            logger.error(f"Error fetching data from USGS API (RequestException - e.g., network error, bad URL): {e}")
            return pd.DataFrame()  # Return empty DataFrame on HTTP/network errors.
        except Exception as e:
            logger.error(f"An unexpected error occurred during API fetch: {e}", exc_info=True)  # exc_info=True logs the full traceback.
            return pd.DataFrame()

    if not all_features:
        logger.warning("No earthquake features found for the specified criteria. Returning empty DataFrame.")
        return pd.DataFrame()

    # Flatten every GeoJSON feature into a record and build the DataFrame in one step.
    earthquakes_list = [_feature_to_record(feature, INGESTION_DATETIME) for feature in all_features]
    df = pd.DataFrame(earthquakes_list)
    logger.info(f"Successfully extracted and prepared {len(df)} earthquake records as a Pandas DataFrame.")
    return df

StatementMeta(, db61d955-af2b-4104-b29b-c5b09f57c8af, 6, Finished, Available, Finished)

In [5]:
# --- Cell 5: Execute Data Extraction ---

# This cell orchestrates the data extraction by defining the precise time window
# for the API query and calling the `fetch_earthquake_data` function.
# It serves as the main execution point for the data retrieval logic.

# Query window: it ends at the current UTC instant and reaches back
# `DAYS_TO_FETCH` days from there. Both bounds are timezone-aware UTC values.
end_datetime_utc = datetime.now(timezone.utc)
start_datetime_utc = end_datetime_utc - timedelta(days=DAYS_TO_FETCH)

# Record the exact window in the log so the scope of this run can be audited.
logger.info(f"Initiating earthquake data fetch for the period: {start_datetime_utc.isoformat()} to {end_datetime_utc.isoformat()} (UTC).")

# Run the extraction. The resulting Pandas DataFrame is converted to a Spark
# DataFrame in the next cell.
df_earthquakes_pd = fetch_earthquake_data(
    start_time=start_datetime_utc,
    end_time=end_datetime_utc,
    min_magnitude=MIN_MAGNITUDE,
)

# Report the outcome of the extraction.
if df_earthquakes_pd.empty:
    # Nothing came back: either the API call failed, the parameters matched no
    # events, or there simply is no data for the period.
    logger.warning("No earthquake data was fetched for the specified criteria. The DataFrame is empty.")
    # Depending on how critical the data is, a production pipeline could instead:
    # 1. Raise an alert or fail the job when data presence is mandatory.
    # 2. Let downstream steps skip gracefully (the approach taken here).
else:
    # Log the record count plus a small sample as a quick sanity check of the
    # structure and content of the fetched data.
    logger.info(f"Successfully fetched {len(df_earthquakes_pd)} earthquake records.")
    logger.info(f"First 5 records of the fetched data:\n{df_earthquakes_pd.head().to_string()}")

StatementMeta(, db61d955-af2b-4104-b29b-c5b09f57c8af, 7, Finished, Available, Finished)

2025-06-12 19:35:11,536 - INFO - Initiating earthquake data fetch for the period: 2024-06-12T19:35:11.536063+00:00 to 2025-06-12T19:35:11.536063+00:00 (UTC).
2025-06-12 19:35:11,537 - INFO - Initiating data fetch from USGS API for time range: 2024-06-12T19:35:11.536063+00:00 to 2025-06-12T19:35:11.536063+00:00 with min magnitude 2.5.
2025-06-12 19:35:11,537 - INFO - Making API request 1 with offset: 1
2025-06-12 19:35:15,737 - INFO - Received 20000 features in current API response. Total features collected: 20000.
2025-06-12 19:35:15,738 - INFO - Making API request 2 with offset: 20001
2025-06-12 19:35:19,408 - INFO - Received 4588 features in current API response. Total features collected: 24588.
2025-06-12 19:35:19,409 - INFO - Reached end of data or received fewer features than limit, stopping pagination.
2025-06-12 19:35:19,609 - INFO - Successfully extracted and prepared 24588 earthquake records as a Pandas DataFrame.
2025-06-12 19:35:19,639 - INFO - Successfully fetched 24588 ear

In [6]:
#--- Cell 6: Convert Pandas DataFrame to Spark DataFrame and Save to Bronze Layer ---

# This cell performs the critical step of transforming the Pandas DataFrame
# containing the extracted data into a Spark DataFrame. It then persists this
# Spark DataFrame to the Bronze layer of the Lakehouse, which serves as the
# raw, immutable landing zone for all ingested data, providing a single source
# of truth for raw seismic information.

if not df_earthquakes_pd.empty:
    logger.info(f"Converting Pandas DataFrame with {len(df_earthquakes_pd)} records to Spark DataFrame.")

    try:
        # Fabric notebooks normally expose a ready-made global `spark`. If it is
        # missing or of an unexpected type, fall back to getting or creating a session.
        if 'spark' not in globals() or not isinstance(spark, SparkSession):
            logger.error("SparkSession 'spark' is not initialized. Attempting to get or create a new one.")
            spark = SparkSession.builder.appName("USGSEarthquakeIngestion").getOrCreate()

        # Build the Spark DataFrame with the explicit `earthquake_schema` rather
        # than relying on schema inference.
        df_earthquakes_spark = spark.createDataFrame(df_earthquakes_pd, schema=earthquake_schema)

        # `count()` is a Spark action that runs a job each time it is called, so
        # it is evaluated once and the result reused by both log messages below.
        spark_record_count = df_earthquakes_spark.count()
        logger.info(f"Successfully converted Pandas DataFrame to Spark DataFrame with {spark_record_count} records.")
        logger.info("Spark DataFrame Schema:")
        df_earthquakes_spark.printSchema()  # Print the Spark DataFrame schema for verification.

        # Persist to the Bronze layer as a Delta table:
        # `format("delta")`                  : store the data in Delta Lake format.
        # `mode("overwrite")`                : replace the whole table with this run's data
        #                                      (a full refresh; incremental loads would use
        #                                      `append` or a merge instead).
        # `option("overwriteSchema", "true")`: let the table schema be replaced by the
        #                                      incoming DataFrame's schema.
        # `saveAsTable(BRONZE_TABLE_NAME)`   : register the data as a named, SQL-queryable table.
        logger.info(f"Writing {spark_record_count} records to Bronze table: {BRONZE_TABLE_NAME} using 'overwrite' mode.")
        df_earthquakes_spark.write \
                            .format("delta") \
                            .mode("overwrite") \
                            .option("overwriteSchema", "true") \
                            .saveAsTable(BRONZE_TABLE_NAME)

        logger.info(f"Data successfully persisted to Bronze table: {BRONZE_TABLE_NAME}.")

    except Exception as e:
        logger.error(f"An error occurred during Spark DataFrame conversion or saving to Bronze layer: {e}", exc_info=True)
        # Re-raise so a failed conversion or write fails the cell (and any
        # orchestrated run) instead of being reported as a success with no data
        # written. The Spark session setup handles its errors the same way.
        raise
else:
    logger.warning("Skipping Spark DataFrame processing and saving to Bronze layer as no data was fetched in the previous step.")

StatementMeta(, db61d955-af2b-4104-b29b-c5b09f57c8af, 8, Finished, Available, Finished)

2025-06-12 19:35:21,254 - INFO - Converting Pandas DataFrame with 24588 records to Spark DataFrame.
2025-06-12 19:35:23,992 - INFO - Writing 24588 records to Bronze table: bronze_usgs_earthquakes using 'overwrite' mode.
2025-06-12 19:35:54,636 - INFO - Data successfully persisted to Bronze table: bronze_usgs_earthquakes.


In [7]:
# --- Cell 7: Display Sample from Bronze Table (Optional Verification) ---

# This cell performs a quick verification by querying the newly created
# or updated Bronze Delta table and displaying a sample of its contents.
# This helps confirm that the data was written correctly, is accessible
# within the Lakehouse environment, and that the schema has been applied as expected.
# This step is optional but highly recommended for development and initial deployment.

if df_earthquakes_pd.empty:
    # Nothing was ingested in this run, so there is nothing new to verify.
    logger.info("Skipping Bronze table display as no data was processed and written to the table in prior steps.")
else:
    try:
        # Guard for running this cell on its own: make sure a usable Spark
        # session exists before querying the table.
        if 'spark' not in globals() or not isinstance(spark, SparkSession):
            logger.error("SparkSession 'spark' is not initialized for table verification. Attempting to get or create one.")
            spark = SparkSession.builder.appName("USGSEarthquakeVerification").getOrCreate()

        logger.info(f"Displaying a sample of 5 records from the Bronze table '{BRONZE_TABLE_NAME}' for verification.")
        # Show the first 5 rows; `truncate=False` prints full column values
        # instead of shortening them.
        spark.table(BRONZE_TABLE_NAME).show(5, truncate=False)

        # Print the table schema and log the total row count as a programmatic
        # check of the table's structure and size.
        logger.info(f"Bronze table '{BRONZE_TABLE_NAME}' schema:")
        spark.table(BRONZE_TABLE_NAME).printSchema()
        logger.info(f"Total records in Bronze table '{BRONZE_TABLE_NAME}': {spark.table(BRONZE_TABLE_NAME).count()}")

    except Exception as e:
        logger.error(f"An error occurred while trying to read and display data from Bronze table '{BRONZE_TABLE_NAME}': {e}", exc_info=True)

StatementMeta(, db61d955-af2b-4104-b29b-c5b09f57c8af, 9, Finished, Available, Finished)

2025-06-12 19:35:56,992 - INFO - Displaying a sample of 5 records from the Bronze table 'bronze_usgs_earthquakes' for verification.
2025-06-12 19:36:02,562 - INFO - Total records in Bronze table 'bronze_usgs_earthquakes': 24588
