In [2]:
# --- Cell 1: Notebook Header, Logging Configuration, and Library Imports ---

"""
Notebook: 06_ml_tsunami_prediction.ipynb

Purpose:
This notebook applies machine learning techniques to predict the likelihood of a tsunami
warning being issued based on earthquake characteristics. The goal is to build a binary
classification model and evaluate its performance.

"""

# Configure a basic logging system for effective monitoring in production.
import logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# PySpark libraries for DataFrame operations and functions.
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import DoubleType

# PySpark ML libraries for feature engineering, modeling, and evaluation.
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.functions import vector_to_array

# Standard Python libraries for visualization and data manipulation.
import pandas as pd

# Obtain a Spark session: reuse the global `spark` when the runtime already
# provides one, otherwise build (or fetch) a session for this notebook.
try:
    _existing_session = globals().get("spark")
    if isinstance(_existing_session, SparkSession):
        logger.info("Spark session 'spark' is already initialized and available.")
    else:
        logger.info("SparkSession 'spark' not found. Attempting to get or create a new one.")
        spark = SparkSession.builder.appName("TsunamiPredictionML").getOrCreate()
        logger.info("Spark session initialized successfully.")
except Exception as session_error:
    # Log with the traceback, then abort: nothing below can run without Spark.
    logger.error(
        f"FATAL ERROR: Failed to initialize or retrieve Spark session: {session_error}",
        exc_info=True,
    )
    raise Exception(f"Failed to initialize Spark session: {session_error}")

StatementMeta(, 2d57995c-a530-4f2b-b9a8-44d67657fe31, 4, Finished, Available, Finished)

2025-07-29 03:38:26,647 - INFO - Spark session 'spark' is already initialized and available.


In [3]:
# --- Cell 2: Configuration Parameters ---

# This section defines all key parameters for the machine learning process.
# Centralizing these values ensures easy modification and maintainability.

# Input: the Silver-layer table that supplies the training data.
SILVER_TABLE_NAME = "silver_earthquakes_cleaned"

# Target: the binary column the classifier learns to predict.
LABEL_COL = "tsunami_warning"

# Model inputs read from the Silver table. Author's rationale for each:
#   magnitude    - the primary driver of tsunami potential
#   depth_km     - shallow earthquakes are more likely to displace water
#   latitude,
#   longitude    - location matters; tsunamis typically originate in subduction zones
#   significance - USGS-calculated impact score, correlated with tsunami potential
FEATURE_COLS = [
    "magnitude",
    "depth_km",
    "latitude",
    "longitude",
    "significance",
]

# SQL predicate applied to the Silver table before any ML processing.
ML_FILTER_CONDITION = "event_type = 'earthquake'"

# Lakehouse "Files" location where the fitted pipeline model is written.
MODEL_SAVE_PATH = "Files/ml_models/tsunami_prediction_rf_model"

# Echo the effective configuration so each run records what it used.
_config_summary = (
    "Machine Learning Configuration Loaded:",
    f"  Reading from Silver table: {SILVER_TABLE_NAME}",
    f"  Label Column: {LABEL_COL}",
    f"  Feature Columns: {FEATURE_COLS}",
    f"  Model Save Path: {MODEL_SAVE_PATH}",
)
for _summary_line in _config_summary:
    logger.info(_summary_line)

StatementMeta(, 2d57995c-a530-4f2b-b9a8-44d67657fe31, 5, Finished, Available, Finished)

2025-07-29 03:38:27,095 - INFO - Machine Learning Configuration Loaded:
2025-07-29 03:38:27,096 - INFO -   Reading from Silver table: silver_earthquakes_cleaned
2025-07-29 03:38:27,097 - INFO -   Feature Columns: ['magnitude', 'depth_km', 'latitude', 'longitude', 'significance']
2025-07-29 03:38:27,097 - INFO -   Model Save Path: Files/ml_models/tsunami_prediction_rf_model


In [4]:
# --- Cell 3: Load and Prepare Data for ML ---

# This cell loads the Silver data, applies necessary filters, and addresses
# a critical issue in classification problems: class imbalance.

try:
    # Load data from the Silver layer Delta table and apply initial filters.
    df_silver = spark.table(SILVER_TABLE_NAME).filter(ML_FILTER_CONDITION)

    # Keep only the columns the model needs and drop rows with a null in any of them.
    # Cached because the counts, class split and sampling below each trigger a
    # separate Spark action that would otherwise re-read and re-filter the table.
    df_ml_source = df_silver.select(FEATURE_COLS + [LABEL_COL]).na.drop().cache()

    logger.info(f"Loaded and filtered {df_ml_source.count()} records for ML.")

    # --- Handling Class Imbalance ---
    # Tsunami warnings are rare events, leading to a highly imbalanced dataset.
    # If not handled, a model might achieve high accuracy by simply always predicting "no tsunami".
    # The majority class (no tsunami) is downsampled to roughly the minority class size.

    # Show the class distribution before balancing.
    df_ml_source.groupBy(LABEL_COL).count().show()

    # Separate the majority and minority classes.
    # NOTE(review): the recorded output shows the label as 0.0/1.0, so these boolean
    # comparisons rely on Spark's implicit boolean/numeric coercion - confirm this is intended.
    df_majority = df_ml_source.filter(F.col(LABEL_COL) == False)
    df_minority = df_ml_source.filter(F.col(LABEL_COL) == True)

    minority_count = df_minority.count()
    majority_count = df_majority.count()

    if minority_count == 0:
        logger.error("No positive samples (tsunami_warning = True) found. Cannot train the model.")
        raise Exception("No positive samples for tsunami prediction found.")

    # Without this guard the ratio below would fail with ZeroDivisionError, and a
    # single-class dataset cannot train a binary classifier anyway.
    if majority_count == 0:
        logger.error("No negative samples (tsunami_warning = False) found. Cannot train the model.")
        raise Exception("No negative samples for tsunami prediction found.")

    # Fraction of the majority class to keep. Clamped to 1.0 because sampling without
    # replacement cannot keep more than every row (the case where the "minority" class
    # is actually the larger one).
    ratio = min(1.0, minority_count / majority_count)
    logger.warning(f"Class imbalance detected. Minority class count: {minority_count}, Majority class count: {majority_count}. Downsampling majority class with ratio: {ratio:.4f}")

    # `sample` keeps each row independently with probability `ratio`, so the resulting
    # size only approximates the minority count (e.g. 130 vs 123 in the recorded run).
    df_majority_downsampled = df_majority.sample(withReplacement=False, fraction=ratio, seed=42)

    # Combine the downsampled majority class with the original minority class.
    # `union` replaces the deprecated `unionAll`; both keep duplicates and match columns by position.
    df_balanced = df_majority_downsampled.union(df_minority)

    logger.info(f"Balanced dataset created with {df_balanced.count()} records.")
    logger.info("Balanced dataset class distribution:")
    df_balanced.groupBy(LABEL_COL).count().show()

except Exception as e:
    logger.error(f"FATAL ERROR: Failed to load or prepare data for ML. Error: {e}", exc_info=True)
    # Chain explicitly so the original failure stays attached as the cause.
    raise Exception(f"Failed to prepare data for ML: {e}") from e

StatementMeta(, 2d57995c-a530-4f2b-b9a8-44d67657fe31, 6, Finished, Available, Finished)

+---------------+-----+
|tsunami_warning|count|
+---------------+-----+
|            0.0|24727|
|            1.0|  123|
+---------------+-----+

+---------------+-----+
|tsunami_warning|count|
+---------------+-----+
|            0.0|  130|
|            1.0|  123|
+---------------+-----+



2025-07-29 03:38:47,456 - INFO - Balanced dataset created with 253 records.
2025-07-29 03:38:47,457 - INFO - Balanced dataset class distribution:


In [5]:
# --- Cell 4: Feature Engineering and ML Pipeline Creation ---

# This cell defines the stages for transforming data and the machine learning model,
# then assembles them into a PySpark ML Pipeline.
# Using a Pipeline is a best practice as it ensures that the same transformations
# are consistently applied to both training and test data, preventing data leakage.

logger.info("Defining ML pipeline stages.")

# Stage 1 - VectorAssembler
# Packs the individual feature columns into one vector column, the input format
# Spark ML estimators expect. handleInvalid="skip" drops rows with invalid values
# instead of failing the job.
assembler = VectorAssembler(
    inputCols=FEATURE_COLS,
    outputCol="unscaled_features",
    handleInvalid="skip",
)

# Stage 2 - StandardScaler
# Standardizes the assembled vector: withMean=True centers each feature and
# withStd=True scales it to unit standard deviation.
# NOTE(review): tree ensembles split on per-feature thresholds, so this stage is not
# expected to change the Random Forest itself; it mainly keeps the pipeline reusable
# with scale-sensitive estimators (e.g. regularized logistic regression) - confirm intent.
scaler = StandardScaler(
    inputCol="unscaled_features",
    outputCol="features",
    withMean=True,
    withStd=True,
)

# Stage 3 - RandomForestClassifier
# Ensemble classifier trained on the scaled vector against the configured label.
# A fixed seed keeps training reproducible.
rf_classifier = RandomForestClassifier(
    labelCol=LABEL_COL,
    featuresCol="features",
    seed=42,
)

# Chain the stages; they are executed in list order for both fit() and transform().
ml_stages = [assembler, scaler, rf_classifier]
pipeline = Pipeline(stages=ml_stages)
logger.info("ML Pipeline defined successfully.")

StatementMeta(, 2d57995c-a530-4f2b-b9a8-44d67657fe31, 7, Finished, Available, Finished)

2025-07-29 03:38:50,019 - INFO - Defining ML pipeline stages.
2025-07-29 03:38:50,166 - INFO - ML Pipeline defined successfully.


In [6]:
# --- Cell 5: Data Splitting and Model Training ---

# This cell splits the balanced dataset into training and testing sets,
# then trains the ML pipeline on the training data.

logger.info("Splitting data into training and testing sets.")

# Cache the balanced dataset before splitting. Spark re-evaluates an uncached lineage
# for every action, so pinning the parent keeps the counts, the fit below and the later
# evaluation working on the same, disjoint set of rows.
df_balanced.cache()

# Split the data into 80% for training and 20% for testing.
# A `seed` is used to ensure the split is reproducible.
(train_data, test_data) = df_balanced.randomSplit([0.8, 0.2], seed=42)

# Count once and reuse the numbers for both logging and validation.
train_count = train_data.count()
test_count = test_data.count()
logger.info(f"Training data count: {train_count}, Test data count: {test_count}")

# The split is random, so a very small balanced dataset can leave one side empty.
if train_count == 0:
    raise ValueError("Training split is empty; cannot fit the pipeline.")
if test_count == 0:
    logger.warning("Test split is empty; evaluation metrics will not be meaningful.")

# Train the pipeline model.
# The `fit()` method runs the training data through all pipeline stages,
# learning the scaling parameters and training the Random Forest model.
logger.info("Training the Random Forest model...")
model = pipeline.fit(train_data)
logger.info("Model training complete.")

StatementMeta(, 2d57995c-a530-4f2b-b9a8-44d67657fe31, 8, Finished, Available, Finished)

2025-07-29 03:38:50,985 - INFO - Splitting data into training and testing sets.
2025-07-29 03:38:52,740 - INFO - Training data count: 205, Test data count: 48
2025-07-29 03:38:52,743 - INFO - Training the Random Forest model...
2025-07-29 03:38:58,757 - INFO - Request URL: 'https://onelake.dfs.fabric.microsoft.com/ed6f8caf-8e4a-4dc4-9934-0c0368d29949/7ada3b0a-f2f1-4e0f-8424-abae2a0e4039%2Fb44c410d-fb10-4de1-a40f-0b92020a0acc%2Fartifacts%2Festimator_info.json?resource=REDACTED'
Request method: 'PUT'
Request headers:
    'x-ms-version': 'REDACTED'
    'Accept': 'application/json'
    'User-Agent': 'azsdk-python-storage-dfs/12.14.0 Python/3.11.8 (Linux-5.15.182.1-1.cm2-x86_64-with-glibc2.35)'
    'x-ms-date': 'REDACTED'
    'x-ms-client-request-id': '8fa1f1c4-6c2d-11f0-9852-7ced8d70a41f'
    'Authorization': 'REDACTED'
No body was attached to the request
2025-07-29 03:38:58,902 - INFO - Response status: 201
Response headers:
    'Content-Length': '0'
    'Date': 'Tue, 29 Jul 2025 03:38:58

2025-07-29 03:39:27,394 - INFO - Model training complete.


In [7]:
# --- Cell 6: Prediction and Model Evaluation ---

# This cell uses the trained model to make predictions on the unseen test data
# and evaluates its performance using appropriate metrics for binary classification.

logger.info("Making predictions on the test data.")
# `transform()` applies the fitted pipeline to the test data. The result is cached
# because every `show()` / `evaluate()` below is a separate Spark action that would
# otherwise re-run the whole pipeline on the test set.
predictions = model.transform(test_data).cache()

logger.info("Sample predictions:")
predictions.select(LABEL_COL, "prediction", "probability").show(10, truncate=False)

# --- Model Evaluation ---
# Accuracy alone is misleading for rare-event problems, so ranking metrics and
# per-class metrics are reported as well.
# NOTE(review): the test set comes from the downsampled (roughly balanced) data, so
# these figures do not reflect performance at the natural class frequency - confirm
# whether an evaluation on non-downsampled hold-out data is also required.

# 1. Area Under ROC (AUC-ROC)
# Measures the model's ability to distinguish between positive and negative classes.
# A value closer to 1.0 is better.
evaluator_roc = BinaryClassificationEvaluator(labelCol=LABEL_COL, rawPredictionCol="rawPrediction", metricName="areaUnderROC")
auc_roc = evaluator_roc.evaluate(predictions)
logger.info(f"Area Under ROC (AUC-ROC) on test data: {auc_roc:.4f}")

# 2. Area Under PR (AUC-PR)
# A more informative metric for highly imbalanced datasets.
evaluator_pr = BinaryClassificationEvaluator(labelCol=LABEL_COL, rawPredictionCol="rawPrediction", metricName="areaUnderPR")
auc_pr = evaluator_pr.evaluate(predictions)
logger.info(f"Area Under Precision-Recall (AUC-PR) on test data: {auc_pr:.4f}")

# 3. Weighted F1, Precision and Recall
# These are averages over BOTH classes, weighted by class frequency. Weighted recall
# in particular is numerically identical to plain accuracy.
evaluator_f1 = MulticlassClassificationEvaluator(labelCol=LABEL_COL, predictionCol="prediction", metricName="f1")
f1_score = evaluator_f1.evaluate(predictions)
logger.info(f"F1-Score on test data: {f1_score:.4f}")

evaluator_precision = MulticlassClassificationEvaluator(labelCol=LABEL_COL, predictionCol="prediction", metricName="weightedPrecision")
precision = evaluator_precision.evaluate(predictions)
logger.info(f"Weighted Precision on test data: {precision:.4f}")

evaluator_recall = MulticlassClassificationEvaluator(labelCol=LABEL_COL, predictionCol="prediction", metricName="weightedRecall")
recall = evaluator_recall.evaluate(predictions)
logger.info(f"Weighted Recall on test data: {recall:.4f}")

# 4. Precision and Recall for the tsunami class only
# - Precision: of all predicted tsunami warnings, how many were correct? (false positives)
# - Recall: of all actual tsunami warnings, how many did the model find? (false negatives)
# The recorded class distribution shows the label encoded as 0.0 / 1.0, so 1.0 is the
# positive class.
POSITIVE_LABEL = 1.0
evaluator_tsunami_precision = MulticlassClassificationEvaluator(
    labelCol=LABEL_COL, predictionCol="prediction",
    metricName="precisionByLabel", metricLabel=POSITIVE_LABEL,
)
tsunami_precision = evaluator_tsunami_precision.evaluate(predictions)
logger.info(f"Tsunami-class Precision on test data: {tsunami_precision:.4f}")

evaluator_tsunami_recall = MulticlassClassificationEvaluator(
    labelCol=LABEL_COL, predictionCol="prediction",
    metricName="recallByLabel", metricLabel=POSITIVE_LABEL,
)
tsunami_recall = evaluator_tsunami_recall.evaluate(predictions)
logger.info(f"Tsunami-class Recall on test data: {tsunami_recall:.4f}")

# 5. Confusion Matrix
# Provides a detailed breakdown of correct and incorrect predictions.
logger.info("Confusion Matrix:")
predictions.groupBy(LABEL_COL, "prediction").count().show()

StatementMeta(, 2d57995c-a530-4f2b-b9a8-44d67657fe31, 9, Finished, Available, Finished)

2025-07-29 03:39:29,036 - INFO - Making predictions on the test data.
2025-07-29 03:39:29,097 - INFO - Sample predictions:
2025-07-29 03:39:32,595 - INFO - Request URL: 'https://onelake.dfs.fabric.microsoft.com/ed6f8caf-8e4a-4dc4-9934-0c0368d29949/7ada3b0a-f2f1-4e0f-8424-abae2a0e4039%2Fb44c410d-fb10-4de1-a40f-0b92020a0acc%2Fartifacts%2Fmetric_info.json?resource=REDACTED'
Request method: 'PUT'
Request headers:
    'x-ms-version': 'REDACTED'
    'Accept': 'application/json'
    'User-Agent': 'azsdk-python-storage-dfs/12.14.0 Python/3.11.8 (Linux-5.15.182.1-1.cm2-x86_64-with-glibc2.35)'
    'x-ms-date': 'REDACTED'
    'x-ms-client-request-id': 'a3cd4dec-6c2d-11f0-9852-7ced8d70a41f'
    'Authorization': 'REDACTED'
No body was attached to the request
2025-07-29 03:39:32,611 - INFO - Response status: 201
Response headers:
    'Content-Length': '0'
    'Date': 'Tue, 29 Jul 2025 03:39:31 GMT'
    'Server': 'Windows-Azure-HDFS/1.0, Microsoft-HTTPAPI/2.0'
    'ETag': '"0x8DDCE5188036FF8"'
    'L

+---------------+----------+------------------------------------------+
|tsunami_warning|prediction|probability                               |
+---------------+----------+------------------------------------------+
|0.0            |0.0       |[0.9953192007797271,0.004680799220272904] |
|0.0            |0.0       |[0.9971710526315789,0.0028289473684210524]|
|0.0            |0.0       |[0.9971710526315789,0.0028289473684210524]|
|0.0            |0.0       |[0.9953192007797271,0.004680799220272904] |
|0.0            |0.0       |[0.6477960526315789,0.35220394736842103]  |
|0.0            |0.0       |[1.0,0.0]                                 |
|0.0            |0.0       |[0.9971710526315789,0.0028289473684210524]|
|0.0            |0.0       |[0.9971710526315789,0.0028289473684210524]|
|0.0            |0.0       |[0.9971710526315789,0.0028289473684210524]|
|0.0            |0.0       |[0.9971710526315789,0.0028289473684210524]|
+---------------+----------+------------------------------------------+
only showing top 10 rows

+---------------+----------+-----+
+--

In [8]:
# --- Cell 7: Model Interpretation (Feature Importance) ---

# This cell extracts and displays the feature importances from the trained Random Forest model.
# This helps understand which features were most influential in the model's predictions.

logger.info("Extracting feature importances from the trained model.")
# The Random Forest model is the last stage of our pipeline.
rf_model_extracted = model.stages[-1]

# The `featureImportances` attribute is an ML vector of importance scores.
importances = rf_model_extracted.featureImportances

# Convert the vector to plain Python floats explicitly instead of relying on
# implicit iteration over the vector type.
importance_values = importances.toArray().tolist()

# `zip` silently truncates on a length mismatch, which would mislabel or drop features.
# Presumably each entry in FEATURE_COLS is a scalar column (one vector slot each);
# fail loudly if that assumption does not hold.
if len(importance_values) != len(FEATURE_COLS):
    raise ValueError(
        f"Expected {len(FEATURE_COLS)} feature importances but the model returned "
        f"{len(importance_values)}."
    )

# Pair each feature with its score and sort in descending order of importance.
feature_importance_list = sorted(
    zip(FEATURE_COLS, importance_values),
    key=lambda pair: pair[1],
    reverse=True,
)

logger.info("Feature Importances for Tsunami Prediction:")
for feature, importance in feature_importance_list:
    print(f"  - {feature}: {importance:.4f}")

# This insight is critical for explaining the model's behavior and for further feature engineering.
# In the recorded run 'significance' dominated (about 0.59) and 'depth_km' ranked last
# (about 0.01), contrary to the initial expectation that 'magnitude' and 'depth_km'
# would be the most important features.

StatementMeta(, 2d57995c-a530-4f2b-b9a8-44d67657fe31, 10, Finished, Available, Finished)

  - significance: 0.5917
  - latitude: 0.1622
  - magnitude: 0.1421
  - longitude: 0.0912
  - depth_km: 0.0128


In [9]:
# --- Cell 8: Save the Trained Model ---

# This cell saves the entire trained ML Pipeline to the "Files" section of the Lakehouse.
# Saving the model allows it to be reloaded later for batch predictions on new data
# without needing to retrain it, which is a key step in productionizing an ML model.

try:
    logger.info(f"Saving the trained pipeline model to: {MODEL_SAVE_PATH}")
    # Build a writer that replaces whatever model already exists at the target path,
    # then persist the complete fitted pipeline.
    model_writer = model.write().overwrite()
    model_writer.save(MODEL_SAVE_PATH)
    logger.info("Model saved successfully.")

    # To reuse the model later without retraining:
    #   from pyspark.ml import PipelineModel
    #   loaded_model = PipelineModel.load(MODEL_SAVE_PATH)

except Exception as save_error:
    # Record the failure with its traceback and stop the run; in production a
    # failed save might additionally trigger an alert.
    logger.error(f"Error saving the model: {save_error}", exc_info=True)
    raise Exception(f"Failed to save the trained model: {save_error}")

logger.info("Tsunami Prediction ML Notebook complete.")

StatementMeta(, 2d57995c-a530-4f2b-b9a8-44d67657fe31, 11, Finished, Available, Finished)

2025-07-29 03:41:07,748 - INFO - Saving the trained pipeline model to: Files/ml_models/tsunami_prediction_rf_model
2025-07-29 03:41:25,245 - INFO - Model saved successfully.
2025-07-29 03:41:25,245 - INFO - Tsunami Prediction ML Notebook complete.


In [12]:
# --- Cell 9: Save Enriched Predictions to Silver Layer ---

"""
This cell saves the predictions and probabilities from the ML model
back into the Silver layer as a new, enriched table. This makes the model's
output available for further analysis or as a feature for downstream models
without cluttering the primary cleaned table.
"""

logger.info("Preparing to save model predictions to a new Silver layer table.")

# Name of the Silver table that receives the model output.
SILVER_ML_PREDICTIONS_TABLE = "silver_earthquake_ml_predictions"

# This cell scores data with the fitted pipeline, so the trained `model` is the real
# prerequisite. Checking it by name avoids running a Spark job merely to decide
# whether to proceed.
if "model" in globals():

    # 1. Build the scoring input.
    #    The source is re-read with `event_id` included so every prediction can be
    #    joined back to the cleaned Silver table. The same event filter and null
    #    handling as training are applied, but no downsampling.
    logger.info("Joining predictions with original event_id for traceability.")

    df_full_ml_source = (
        spark.table(SILVER_TABLE_NAME)
        .filter(ML_FILTER_CONDITION)
        .select(FEATURE_COLS + [LABEL_COL, "event_id"])
        .na.drop()
    )

    logger.info(f"Making predictions on the full, unfiltered dataset of {df_full_ml_source.count()} records for saving.")
    # NOTE(review): this scores every event, including rows the model was trained on,
    # and the model was fitted on a downsampled (roughly 1:1) set - so the stored
    # probabilities are presumably not calibrated to the natural tsunami frequency.
    full_predictions = model.transform(df_full_ml_source)

    # Python helper kept, together with its SQL registration and column wrapper below,
    # for backward compatibility with any caller that still uses them. The DataFrame
    # path in step 2 uses the native `vector_to_array` instead, which avoids row-wise
    # Python serialization.
    def get_probability_udf(vector, index):
        """Return element `index` of `vector` as a float, or None if it cannot be read."""
        try:
            return float(vector[index])
        except (IndexError, TypeError):
            return None  # Null for a malformed vector or an out-of-range index.

    spark.udf.register("get_probability", get_probability_udf, DoubleType())
    get_probability_col = F.udf(get_probability_udf, DoubleType())

    # 2. Select the final columns for the new Silver table.
    df_predictions_to_save = full_predictions.select(
        F.col("event_id"),  # Unique identifier from the source
        F.col(LABEL_COL).alias("actual_tsunami_warning"),
        F.col("prediction").cast("boolean").alias("predicted_tsunami_warning"),
        # The probability vector is [prob_class_0, prob_class_1]; index 1 is the
        # tsunami (positive) class.
        vector_to_array(F.col("probability"))[1].alias("tsunami_probability"),
        F.current_timestamp().alias("prediction_timestamp_utc"),  # Audit timestamp
    )

    logger.info(f"Target table for predictions: {SILVER_ML_PREDICTIONS_TABLE}")

    try:
        # 3. Write the DataFrame to the Silver layer.
        #    'overwrite' refreshes the table on every run; 'overwriteSchema' lets the
        #    output schema change between runs.
        df_predictions_to_save.write \
                              .format("delta") \
                              .mode("overwrite") \
                              .option("overwriteSchema", "true") \
                              .saveAsTable(SILVER_ML_PREDICTIONS_TABLE)

        # 4. Report from the saved table rather than from `df_predictions_to_save`:
        #    counting or showing the unsaved DataFrame would re-run the full scoring
        #    pipeline (and re-evaluate the timestamp) for each action, whereas reading
        #    back also confirms what was actually persisted.
        df_saved_predictions = spark.table(SILVER_ML_PREDICTIONS_TABLE)
        logger.info(f"Successfully saved {df_saved_predictions.count()} predictions to: {SILVER_ML_PREDICTIONS_TABLE}")

        logger.info("Sample of the saved predictions table:")
        df_saved_predictions.show(10, truncate=False)

    except Exception as e:
        logger.error(f"FATAL ERROR: Failed to save predictions to Silver table '{SILVER_ML_PREDICTIONS_TABLE}': {e}", exc_info=True)
        # Depending on the pipeline, this could be a critical failure.
        raise Exception(f"Failed to save ML predictions to the Silver layer: {e}") from e

else:
    logger.warning("Trained pipeline model is not available. Skipping save operation.")

StatementMeta(, 2d57995c-a530-4f2b-b9a8-44d67657fe31, 14, Finished, Available, Finished)

2025-07-29 04:00:44,796 - INFO - Preparing to save model predictions to a new Silver layer table.
2025-07-29 04:00:46,222 - INFO - Joining predictions with original event_id for traceability.
2025-07-29 04:00:48,234 - INFO - Making predictions on the full, unfiltered dataset of 24850 records for saving.
2025-07-29 04:00:48,338 - INFO - Target table for predictions: silver_earthquake_ml_predictions


+------------+----------------------+-------------------------+---------------------+--------------------------+
|event_id    |actual_tsunami_warning|predicted_tsunami_warning|tsunami_probability  |prediction_timestamp_utc  |
+------------+----------------------+-------------------------+---------------------+--------------------------+
|ak024fg2aa79|0.0                   |false                    |0.0                  |2025-07-29 04:00:59.724778|
|ak024fhiemtu|0.0                   |false                    |0.0                  |2025-07-29 04:00:59.724778|
|ak024fhoojey|0.0                   |false                    |0.050624999999999996 |2025-07-29 04:00:59.724778|
|ak024fj7koem|0.0                   |false                    |0.4                  |2025-07-29 04:00:59.724778|
|ak024fkx0eg4|0.0                   |false                    |0.0                  |2025-07-29 04:00:59.724778|
|ak024fmhg0de|0.0                   |false                    |0.0                  |2025-07-29 04:00:59.724778|
|ak024fmnfh08|0.0                   |false                    |0.0024768518518518516|2025-07-29 