In [0]:
# COMMAND ----------
# ONE-TIME FIX: Run this cell once to delete the old table with the wrong schema.
# WARNING: this is destructive -- it recursively removes the whole feature Delta
# directory. Do not leave it enabled for "Run All" or scheduled job runs.
# NOTE(review): the target is an abfss:// URI, so storage access presumably has to be
# configured already (cluster config or an earlier session) -- confirm before running.

storage_account = "finlakeadlsa3b3"
container_feature = "feature"
feature_delta_path = f"abfss://{container_feature}@{storage_account}.dfs.core.windows.net/delta/feature_transactions"

print(f"Attempting to delete old table at: {feature_delta_path}")
# dbutils.fs.rm returns a boolean; report what actually happened instead of
# unconditionally claiming success.
was_deleted = dbutils.fs.rm(feature_delta_path, recurse=True)
if was_deleted:
    print("✅ Old feature table successfully deleted. You can now rerun the main notebook.")
else:
    print(f"⚠️ Nothing was deleted at {feature_delta_path} (the path may not exist). You can now rerun the main notebook.")

In [0]:
# Databricks notebook source
# =======================================================================================
# 03_feature_engineering_pro
#
# Description:
#   1️⃣ Authenticates to ADLS Gen2 using a robust, encapsulated function.
#   2️⃣ Reads the cleaned Delta dataset for a specific ingestion date.
#   3️⃣ Engineers powerful, transaction-level behavioral features using window functions.
#      - Compares current transaction amount to the user's recent average.
#      - Calculates user's transaction frequency over recent time windows.
#   4️⃣ Writes the enriched feature data back to a new Delta table, ready for ML modeling.
#
# What's New (Professional Enhancements):
#   - MODULARITY: Code is organized into functions for clarity and reusability.
#   - ADVANCED FEATURES: Switched from simple aggregation to powerful window functions
#     to create behavioral features (e.g., amount deviation, transaction velocity).
#     This provides far more predictive power for fraud models.
#   - IDEMPOTENCY: The final write overwrites only the rows for the current ingest_date
#     (Delta `replaceWhere`), so rerunning the notebook for the same day replaces that
#     day's data instead of appending duplicates.
#   - ROBUSTNESS: Clear parameterization and error handling remain.
# =======================================================================================

from pyspark.sql import SparkSession, Window
from pyspark.sql import functions as F

# --------------------------------------------------------------------------------------
# 1️⃣ Encapsulated Function for ADLS Authentication
# --------------------------------------------------------------------------------------
def setup_spark_adls_auth(spark, storage_account, scope, client_id_key, tenant_id_key, client_secret_key):
    """
    Wire up OAuth (service principal) access to an ADLS Gen2 account on a Spark session.

    The three credentials are read from the given Databricks secret scope via
    ``dbutils.secrets.get`` and written into the session's ``fs.azure.account.*``
    settings for ``<storage_account>.dfs.core.windows.net``.

    Args:
        spark: Spark session whose ``conf`` receives the settings.
        storage_account (str): ADLS Gen2 storage account name.
        scope (str): Secret scope that holds the service principal credentials.
        client_id_key (str): Secret key of the client ID.
        tenant_id_key (str): Secret key of the tenant ID.
        client_secret_key (str): Secret key of the client secret.

    Raises:
        Exception: Any error from the secret lookups or ``spark.conf.set`` is
            re-raised after a failure message has been printed.
    """
    print(f"🔐 Authenticating to ADLS Gen2 storage account: {storage_account}...")
    try:
        # Secrets are fetched lazily, in this order: client id, tenant id, client secret.
        secret_keys = (client_id_key, tenant_id_key, client_secret_key)
        client_id, tenant_id, client_secret = (
            dbutils.secrets.get(scope=scope, key=secret_key) for secret_key in secret_keys
        )

        account_host = f"{storage_account}.dfs.core.windows.net"
        token_endpoint = f"https://login.microsoftonline.com/{tenant_id}/oauth2/token"

        # (setting prefix, value) pairs; each prefix is suffixed with the account host.
        oauth_settings = (
            ("fs.azure.account.auth.type", "OAuth"),
            ("fs.azure.account.oauth.provider.type", "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider"),
            ("fs.azure.account.oauth2.client.id", client_id),
            ("fs.azure.account.oauth2.client.secret", client_secret),
            ("fs.azure.account.oauth2.client.endpoint", token_endpoint),
        )
        for setting_prefix, setting_value in oauth_settings:
            spark.conf.set(f"{setting_prefix}.{account_host}", setting_value)

        print("✅ ADLS Gen2 authentication configured successfully.")
    except Exception as auth_error:
        print("❌ Failed to configure ADLS authentication.")
        raise auth_error

# --------------------------------------------------------------------------------------
# 2️⃣ Main Feature Engineering Logic
# --------------------------------------------------------------------------------------
def _trailing_user_window(user_id_col, lookback_seconds):
    """Per-user range window covering the current row and the preceding `lookback_seconds`."""
    return (
        Window.partitionBy(user_id_col)
        # Range frames need a numeric ordering key. Assumes `clean_ts` is a timestamp
        # so the cast to long yields epoch seconds -- TODO confirm against the clean table.
        .orderBy(F.col("clean_ts").cast("long"))
        .rangeBetween(-lookback_seconds, 0)
    )


def engineer_behavioral_features(df, user_id_col='V1'):
    """
    Engineers transaction-level features using window functions.

    Columns added:
        hour                     - F.hour of `clean_ts`.
        day_of_week              - `clean_ts` formatted with the pattern "E".
        amount_log               - log1p of `Amount`.
        avg_amount_user_24h      - mean `Amount` in the user's trailing 24h window.
        stddev_amount_user_24h   - F.stddev of `Amount` in the same window.
        txn_count_user_1h        - row count in the user's trailing 1h window.
        txn_count_user_24h       - row count in the user's trailing 24h window.
        amount_deviation_zscore  - (Amount - 24h mean) / 24h stddev; 0 when the
                                   stddev is null or zero.

    Both windows include the current transaction (the frame's upper bound is 0).

    Args:
        df (DataFrame): Input DataFrame of cleaned transactions. Must contain
                        `clean_ts`, `Amount` and `user_id_col`.
        user_id_col (str): The column to use as a proxy for user/customer ID.
                           The Kaggle dataset is anonymized, so we use one of the
                           principal components ('V1', 'V2', etc.) as a stand-in.

    Returns:
        DataFrame: DataFrame enriched with new behavioral features. Null/NaN values
        in numeric columns are replaced with 0 (see the note on `fillna` below).
    """
    print(f"🧠 Engineering behavioral features. Using '{user_id_col}' as user identifier.")

    # Window sizes are expressed in seconds to match the epoch-seconds ordering key.
    seconds_per_hour = 3600
    user_window_24h = _trailing_user_window(user_id_col, 24 * seconds_per_hour)
    user_window_1h = _trailing_user_window(user_id_col, seconds_per_hour)

    df_feat = (
        df
        # --- Basic Time-Based Features ---
        .withColumn("hour", F.hour("clean_ts"))
        .withColumn("day_of_week", F.date_format("clean_ts", "E"))
        .withColumn("amount_log", F.log1p("Amount"))

        # --- Behavioral Features using Window Functions ---
        # Average transaction amount for this user over the last 24 hours
        .withColumn("avg_amount_user_24h", F.avg("Amount").over(user_window_24h))

        # Standard deviation of transaction amount for this user over the last 24 hours
        .withColumn("stddev_amount_user_24h", F.stddev("Amount").over(user_window_24h))

        # Number of transactions for this user in the last hour
        .withColumn("txn_count_user_1h", F.count("*").over(user_window_1h))

        # Number of transactions for this user in the last 24 hours
        .withColumn("txn_count_user_24h", F.count("*").over(user_window_24h))
    )

    # --- Deviation Features (very predictive for fraud) ---
    # Z-score: how many standard deviations is this transaction from the user's 24h average?
    # The denominator is turned into null when the stddev is exactly 0 (e.g. several
    # identical amounts in the window). Dividing by null yields null, whereas dividing
    # by 0 raises an error when Spark runs in ANSI SQL mode.
    stddev_24h = F.col("stddev_amount_user_24h")
    nonzero_stddev_24h = F.when(stddev_24h != 0, stddev_24h)  # no otherwise() -> null

    df_final = df_feat.withColumn(
        "amount_deviation_zscore",
        (F.col("Amount") - F.col("avg_amount_user_24h")) / nonzero_stddev_24h
    ).fillna(0)
    # fillna(0) turns the null stddev / z-score (e.g. only one transaction in the
    # window, or zero stddev) into 0.
    # NOTE(review): fillna(0) has no `subset`, so it also zero-fills nulls in every
    # other numeric column of `df`, including source columns. Kept as-is for backward
    # compatibility; consider restricting it to the engineered columns.

    print("✅ Feature engineering complete.")
    return df_final

# ======================================================================================
# Main Execution Block
# ======================================================================================
if __name__ == "__main__":
    # End-to-end run: configure storage auth, load one ingest_date of cleaned data,
    # engineer features, preview them, and overwrite that date in the feature table.
    spark = SparkSession.builder.appName("AdvancedFeatureEngineering").getOrCreate()

    # ------------------------------------------------------------
    # Parameters and Configuration
    # ------------------------------------------------------------
    storage_account = "finlakeadlsa3b3"
    container_clean = "clean"
    container_feature = "feature"
    scope = "finlake_scope"

    # The ingest date is a notebook widget so jobs can override the default.
    dbutils.widgets.text("ingest_date", "2025-10-10")
    ingest_date = dbutils.widgets.get("ingest_date")

    clean_delta_path = f"abfss://{container_clean}@{storage_account}.dfs.core.windows.net/delta/clean_transactions"
    feature_delta_path = f"abfss://{container_feature}@{storage_account}.dfs.core.windows.net/delta/feature_transactions"

    print("=== PARAMETERS ===")
    print(f"ingest_date: {ingest_date}")
    print(f"source_path: {clean_delta_path}")
    print(f"destination_path: {feature_delta_path}")
    print("===================")

    # ------------------------------------------------------------
    # Authentication
    # ------------------------------------------------------------
    setup_spark_adls_auth(
        spark,
        storage_account,
        scope,
        client_id_key="finlake-sp-client-id",
        tenant_id_key="finlake-sp-tenant-id",
        client_secret_key="finlake-sp-client-secret"
    )

    # ------------------------------------------------------------
    # Load Cleaned Data
    # ------------------------------------------------------------
    print(f"📂 Loading cleaned Delta data for ingest_date = {ingest_date}...")
    try:
        df_clean = (
            spark.read.format("delta")
            .load(clean_delta_path)
            .filter(F.col("ingest_date") == ingest_date)
        )
        # count() is the first action, so read/permission errors surface here.
        row_count = df_clean.count()
    except Exception:
        print(f"❌ Error loading data from {clean_delta_path}")
        raise  # bare raise keeps the original traceback intact

    # The empty-input exit is deliberately outside the try block above: if
    # dbutils.notebook.exit stops the run by raising, it must not be caught and
    # reported as a data-loading error.
    if row_count == 0:
        print(f"⚠️ No data found for ingest_date={ingest_date}. Exiting gracefully.")
        dbutils.notebook.exit(f"No data found for ingest_date={ingest_date}")

    print(f"✅ Loaded {row_count} records from cleaned data.")

    # ------------------------------------------------------------
    # Apply Feature Engineering
    # ------------------------------------------------------------
    df_features = engineer_behavioral_features(df_clean)

    print("🔎 Engineered features preview:")
    # NOTE(review): the preview assumes the cleaned table carries an `is_fraud` column.
    display(df_features.select(
        "Amount",
        "avg_amount_user_24h",
        "stddev_amount_user_24h",
        "amount_deviation_zscore",
        "txn_count_user_1h",
        "is_fraud"
    ).limit(10))

    # ------------------------------------------------------------
    # Write to Feature Store Delta Table (Idempotent Write)
    # ------------------------------------------------------------
    print(f"💾 Writing enriched feature data to: {feature_delta_path}")
    (
        df_features.write
        .format("delta")
        .mode("overwrite")
        # Overwrite only the rows of this ingest_date so a rerun for the same day
        # replaces that day's data instead of duplicating it.
        .option("replaceWhere", f"ingest_date = '{ingest_date}'")
        # Enables schema evolution for Delta write
        .option("mergeSchema", "true")
        .partitionBy("ingest_date")
        .save(feature_delta_path)
    )

    print("✅ Feature data successfully written.")
    print("🎉 Advanced feature engineering pipeline completed successfully!")
