In [0]:
# ============================================
# 02_clean_transform
# ============================================
# Purpose: Clean and standardize raw transactions before feature engineering.
# ============================================

# --- 1️⃣ WIDGETS ---
# Text widgets exposed by this notebook, as name -> default value.
# Declaration order is preserved (dicts keep insertion order).
_widget_defaults = {
    "raw_delta_path": "abfss://raw@finlakeadlsa3b3.dfs.core.windows.net/delta/raw_transactions",
    "clean_delta_path": "abfss://clean@finlakeadlsa3b3.dfs.core.windows.net/delta/clean_transactions",
    "ingest_date": "",
}
for _widget_name, _widget_default in _widget_defaults.items():
    dbutils.widgets.text(_widget_name, _widget_default)

# Read the current widget values back, in the same order they were declared.
raw_delta_path, clean_delta_path, ingest_date = (
    dbutils.widgets.get(_widget_name) for _widget_name in _widget_defaults
)

# Echo the effective parameters so each run records what it was invoked with.
for _banner_line in (
    "=== PARAMETERS ===",
    f"raw_delta_path : {raw_delta_path}",
    f"clean_delta_path: {clean_delta_path}",
    f"ingest_date     : {ingest_date}",
    "==================",
):
    print(_banner_line)

# --- 2️⃣ CONFIGURE STORAGE ACCESS (using secret scope) ---
# Spark config entry that holds the account key for the ADLS Gen2 endpoint.
_account_key_conf = "fs.azure.account.key.finlakeadlsa3b3.dfs.core.windows.net"

# The key is fetched from the secret scope and passed straight to the Spark
# session config; it is deliberately not bound to a notebook variable.
spark.conf.set(_account_key_conf, dbutils.secrets.get(scope="finlake_scope", key="adls-key"))

# --- 3️⃣ IMPORTS ---
from pyspark.sql import functions as F
from pyspark.sql.types import (
    DecimalType,
    DoubleType,
    FloatType,
    IntegerType,
    LongType,
)

# --- 4️⃣ READ RAW DELTA DATA ---
# Load the raw Delta table once; when an ingest_date was supplied, narrow the
# read to that date, otherwise keep the full dataset.
try:
    if ingest_date:
        print(f"📥 Loading partition for ingest_date={ingest_date}")
    else:
        print("📥 Loading full raw_delta dataset")

    df_raw = spark.read.format("delta").load(raw_delta_path)
    if ingest_date:
        df_raw = df_raw.filter(F.col("ingest_date") == ingest_date)

    # count() is the first action on df_raw, so read failures surface here.
    raw_row_count = df_raw.count()
    print(f"✅ Loaded raw data: {raw_row_count} rows")
except Exception:
    print("❌ ERROR: Could not load raw delta data.")
    raise

# Quick visual sanity check of the loaded rows.
display(df_raw.limit(5))

# --- 5️⃣ CLEANING STEPS ---
# Standardizes the raw DataFrame (df_raw) and produces df_clean:
#   (a) spaces in column names -> underscores
#   (b) Amount cast to double
#   (c) Class renamed to is_fraud
#   (d) nulls in numeric columns replaced by 0
#   (e) duplicates removed
#   (f) clean_ts / processing_status metadata columns added
print("🔧 Starting cleaning transformations...")

# (a) Normalize column names (remove spaces)
# toDF renames positionally in a single projection; names without spaces are
# passed through unchanged.
df_raw = df_raw.toDF(*[c.replace(" ", "_") for c in df_raw.columns])

# (b) Cast Amount to Double
if "Amount" in df_raw.columns:
    df_raw = df_raw.withColumn("Amount", F.col("Amount").cast(DoubleType()))

# (c) Rename 'Class' → 'is_fraud'
if "Class" in df_raw.columns and "is_fraud" not in df_raw.columns:
    df_raw = df_raw.withColumnRenamed("Class", "is_fraud")

# (d) Fill numeric nulls with 0
# Match on the type classes rather than on str(dataType): the string form of a
# decimal carries its precision/scale (e.g. "DecimalType(10,0)") and newer
# PySpark releases render every type with trailing parentheses, so comparing
# against bare class names silently matches nothing.
_NUMERIC_TYPES = (IntegerType, LongType, DoubleType, FloatType, DecimalType)
numeric_fields = [
    field for field in df_raw.schema.fields
    if isinstance(field.dataType, _NUMERIC_TYPES)
]
numeric_cols = [field.name for field in numeric_fields]
for field in numeric_fields:
    # Cast the literal to the column's own type so the column type (including
    # decimal precision/scale) is not widened by the fill.
    df_raw = df_raw.withColumn(
        field.name,
        F.coalesce(F.col(field.name), F.lit(0).cast(field.dataType)),
    )

# (e) Remove duplicates
# Prefer the business key when present; otherwise fall back to whole-row
# de-duplication.
if "TransactionID" in df_raw.columns:
    df_clean = df_raw.dropDuplicates(["TransactionID"])
else:
    df_clean = df_raw.dropDuplicates()

# (f) Add metadata
df_clean = df_clean.withColumn("clean_ts", F.current_timestamp())
df_clean = df_clean.withColumn("processing_status", F.lit("cleaned"))

print(f"✅ Cleaned rows: {df_clean.count()}")

# --- 6️⃣ WRITE CLEAN DATA TO DELTA ---
# Appends df_clean to the clean Delta location, partitioned by ingest_date.
# NOTE(review): the write mode is "append", so re-running this notebook for the
# same ingest_date adds the rows again — confirm whether reruns are expected
# to be idempotent.
try:
    clean_writer = df_clean.write.format("delta")
    clean_writer = clean_writer.mode("append").partitionBy("ingest_date")
    clean_writer.save(clean_delta_path)
    print(f"✅ Wrote cleaned Delta to: {clean_delta_path}")
except Exception:
    print("❌ ERROR writing cleaned data to Delta.")
    raise

# --- 7️⃣ DISPLAY SAMPLE ---
# Show a small sample of the cleaned output, then signal completion.
clean_sample = df_clean.limit(10)
display(clean_sample)
print("🎯 Cleaning pipeline completed successfully!")
