In [0]:
# Notebook 1: Create Worklist with Data (CORRECTED v2)
# Creates a worklist table with actual blob data for efficient processing

from pyspark.sql import functions as F
from pyspark.sql.window import Window
from datetime import datetime
import time

# --- Run configuration ------------------------------------------------------
# Source/target tables and the staging schema used by this notebook.
SOURCE_TABLE = "4_prod.raw.mill_ce_blob"
TARGET_TABLE = "4_prod.bronze.mill_blob_text"
STAGING_DB = "4_prod.tmp"

# Second-resolution timestamp of this run; it suffixes the worklist table name.
RUN_ID = f"{datetime.now():%Y%m%d_%H%M%S}"
WORKLIST_TABLE = ".".join((STAGING_DB, "blob_worklist_" + RUN_ID))

# Byte threshold compared against an event's summed chunk sizes.
MAX_BLOB_SIZE = 16 << 20  # 16 MB

# Start-of-run banner.
_RULE = "=" * 80
print(
    _RULE,
    f"CREATING WORKLIST: {WORKLIST_TABLE}",
    f"Run ID: {RUN_ID}",
    _RULE,
    sep="\n",
)

# Step 1: Get events not in target
# Upper bound on the number of (EVENT_ID, ADC_UPDT) pairs taken per run.
MAX_EVENTS_PER_RUN = 250000

print(f"[{datetime.now().strftime('%H:%M:%S')}] Identifying new events...")

# Distinct EVENT_IDs already present in the target; used only as an exclusion list.
already_loaded = spark.table(TARGET_TABLE).select("EVENT_ID").distinct()

# new_events is lazy and gets evaluated more than once: by count() below and
# again when it is joined back to the source. A bare limit() gives no guarantee
# about WHICH rows are returned, so each evaluation could pick a different
# subset and the reported count could disagree with the rows actually
# processed. Ordering on the full distinct key before the limit makes the
# selection reproducible as long as source and target are unchanged.
# NOTE(review): this also means rows with the lowest ADC_UPDT are picked
# first - confirm that priority is acceptable for the pipeline.
new_events = (spark.table(SOURCE_TABLE)
              .select("EVENT_ID", "ADC_UPDT")
              .distinct()
              .join(already_loaded, on="EVENT_ID", how="left_anti")
              .orderBy("ADC_UPDT", "EVENT_ID")
              .limit(MAX_EVENTS_PER_RUN))

new_event_count = new_events.count()
print(f"Found {new_event_count:,} events to process")

if new_event_count == 0:
    print("No new events to process!")
    # Nothing to do: leave the notebook with "NO_WORK" as its exit value.
    dbutils.notebook.exit("NO_WORK")

# Step 2: Create worklist with actual data and metadata
print(f"[{datetime.now().strftime('%H:%M:%S')}] Building worklist with blob data...")

# Columns carried from the source table into the worklist.
META_AND_BLOB_COLS = [
    "EVENT_ID", "BLOB_SEQ_NUM",
    "VALID_UNTIL_DT_TM", "VALID_FROM_DT_TM",
    "UPDT_DT_TM", "UPDT_ID", "UPDT_TASK", "UPDT_CNT", "UPDT_APPLCTX",
    "LAST_UTC_TS", "ADC_UPDT", "COMPRESSION_CD", "BLOB_CONTENTS", "BLOB_LENGTH"
]

# Every column except EVENT_ID goes into the per-chunk struct (including
# ADC_UPDT for each chunk). BLOB_SEQ_NUM is deliberately the FIRST field:
# sort_array() compares structs field by field, so it orders chunks by
# BLOB_SEQ_NUM.
CHUNK_STRUCT_COLS = [c for c in META_AND_BLOB_COLS if c != "EVENT_ID"]

# Load source rows belonging to the selected (EVENT_ID, ADC_UPDT) pairs.
# NOTE(review): an equi-join never matches NULL keys, so a pair whose ADC_UPDT
# is NULL would be counted in new_events but dropped here - confirm ADC_UPDT is
# never NULL in the source.
source_data = (spark.table(SOURCE_TABLE)
               .join(F.broadcast(new_events), on=["EVENT_ID", "ADC_UPDT"], how="inner")
               .select(*META_AND_BLOB_COLS))

# Keep one row per (EVENT_ID, BLOB_SEQ_NUM): the one with the greatest
# VALID_UNTIL_DT_TM, then UPDT_DT_TM, then LAST_UTC_TS.
w_temporal = Window.partitionBy("EVENT_ID", "BLOB_SEQ_NUM").orderBy(
    F.col("VALID_UNTIL_DT_TM").desc(),
    F.col("UPDT_DT_TM").desc(),
    F.col("LAST_UTC_TS").desc()
)

deduped_data = (source_data
                .withColumn("version_rank", F.row_number().over(w_temporal))
                .filter(F.col("version_rank") == 1)
                .drop("version_rank"))

# Collapse to one worklist row per (EVENT_ID, ADC_UPDT), carrying every chunk.
# NOTE(review): dedup is keyed on (EVENT_ID, BLOB_SEQ_NUM) but grouping is on
# (EVENT_ID, ADC_UPDT); if chunks of one event carry different ADC_UPDT values
# the event is split across several worklist rows - confirm this is intended.
worklist_with_meta = (deduped_data
    # A missing BLOB_LENGTH counts as 0 bytes towards the event total.
    .withColumn("chunk_size", F.coalesce(F.col("BLOB_LENGTH").cast("long"), F.lit(0)))
    .groupBy("EVENT_ID", "ADC_UPDT")
    .agg(
        F.sum("chunk_size").alias("compressed_size"),
        F.count("*").alias("chunk_count"),
        # collect_list() alone returns elements in no guaranteed order, which
        # would make "chunks_data[0]" an arbitrary chunk. Sorting pins the
        # order to ascending BLOB_SEQ_NUM; schema and contents are unchanged.
        # NOTE(review): ordering follows BLOB_SEQ_NUM's column type - if it is
        # a string the order is lexicographic, not numeric; verify.
        F.sort_array(F.collect_list(F.struct(*CHUNK_STRUCT_COLS))).alias("chunks_data")
    )
    # Events whose summed chunk sizes exceed the limit are flagged, not processed.
    .withColumn("status",
                F.when(F.col("compressed_size") > MAX_BLOB_SIZE, "oversized")
                .otherwise("pending"))
    # Bookkeeping columns, initialised empty here.
    .withColumn("batch_num", F.lit(0))
    .withColumn("process_ts", F.lit(None).cast("timestamp"))
    .withColumn("error_msg", F.lit(None).cast("string")))

# Write worklist table
print(f"[{datetime.now().strftime('%H:%M:%S')}] Writing worklist table...")
worklist_with_meta.write.mode("overwrite").saveAsTable(WORKLIST_TABLE)

# OPTIMIZE ... ZORDER BY rewrites the table's files so rows with similar
# (ADC_UPDT, EVENT_ID) values sit together. This is a data-layout
# optimisation for later filtered reads, not an index.
spark.sql(f"OPTIMIZE {WORKLIST_TABLE} ZORDER BY (ADC_UPDT, EVENT_ID)")

# Row count per status value, read back from the table that was just written.
stats = spark.table(WORKLIST_TABLE).groupBy("status").count().collect()
stats_dict = {row["status"]: row["count"] for row in stats}

# sum() over zero rows yields NULL (None in Python); formatting None with ":,"
# raises TypeError, so fall back to 0 for an empty worklist.
total_chunks = spark.table(WORKLIST_TABLE).select(F.sum(F.col('chunk_count'))).collect()[0][0] or 0

print(f"\n[{datetime.now().strftime('%H:%M:%S')}] Worklist created successfully:")
print(f"  - Pending: {stats_dict.get('pending', 0):,}")
print(f"  - Oversized: {stats_dict.get('oversized', 0):,}")
print(f"  - Total chunks stored: {total_chunks:,}")

# Oversized events are not processed further: a placeholder row per event is
# appended straight to the target, with all content columns NULL and the size
# recorded in BINARY_SIZE and STATUS.
oversized_count = stats_dict.get('oversized', 0)
if oversized_count > 0:
    print(f"\n[{datetime.now().strftime('%H:%M:%S')}] Writing {oversized_count} oversized events to target...")

    # Worklist-level fields plus element 0 of chunks_data, which is used only
    # as the source of the per-chunk metadata columns.
    oversized = (spark.table(WORKLIST_TABLE)
                 .where(F.col("status") == "oversized")
                 .select(
                     "EVENT_ID",
                     "ADC_UPDT",
                     "compressed_size",
                     F.col("chunks_data").getItem(0).alias("first_chunk"),
                 ))

    first_chunk = F.col("first_chunk")
    passthrough_fields = ["VALID_UNTIL_DT_TM", "VALID_FROM_DT_TM", "UPDT_DT_TM"]
    long_fields = ["UPDT_ID", "UPDT_TASK", "UPDT_CNT", "UPDT_APPLCTX"]

    # insertInto() below matches columns by POSITION, so the order of this
    # select list is significant and must not be rearranged.
    oversized_output = oversized.select(
        "EVENT_ID",
        *[first_chunk.getField(name).alias(name) for name in passthrough_fields],
        *[first_chunk.getField(name).cast("long").alias(name) for name in long_fields],
        first_chunk.getField("LAST_UTC_TS").alias("LAST_UTC_TS"),
        "ADC_UPDT",  # taken from the worklist row, not from the chunk struct
        F.lit(None).cast("binary").alias("BLOB_BINARY"),
        *[F.lit(None).cast("string").alias(name)
          for name in ("CONTENT_TYPE", "ENCODING", "BLOB_TEXT")],
        F.col("compressed_size").alias("BINARY_SIZE"),
        F.lit(None).cast("long").alias("TEXT_LENGTH"),
        F.concat(
            F.lit("Compressed Too Large: "),
            F.col("compressed_size"),
            F.lit(" bytes"),
        ).alias("STATUS"),
        F.lit(None).cast("string").alias("anon_text"),
    )

    oversized_output.write.mode("append").insertInto(TARGET_TABLE)
    print(f"  Wrote {oversized_count} oversized events to target")

# Hand-off information for the next notebook
print(f"\nWorklist table: {WORKLIST_TABLE}")
print(f"Run ID: {RUN_ID}")

# Pipeline-coordination metadata.
# The table is dropped and rebuilt on every run, so this cell leaves it
# containing only the single row inserted below (the current run).
METADATA_TABLE = f"{STAGING_DB}.pipeline_metadata"

print(f"\n[{datetime.now().strftime('%H:%M:%S')}] Recreating metadata table...")

drop_metadata_sql = f"DROP TABLE IF EXISTS {METADATA_TABLE}"

create_metadata_sql = f"""
    CREATE TABLE IF NOT EXISTS {METADATA_TABLE} (
        run_id STRING,
        worklist_table STRING,
        total_events INT,
        pending_events INT,
        oversized_events INT,
        created_ts TIMESTAMP,
        completed_ts TIMESTAMP,
        merged_ts TIMESTAMP,
        status STRING,
        batch_tables STRING,
        processed_events INT
    ) USING DELTA
"""

# Values are positional and follow the column order of the CREATE TABLE above;
# completed_ts, merged_ts, batch_tables and processed_events start as NULL.
insert_metadata_sql = f"""
    INSERT INTO {METADATA_TABLE}
    VALUES (
        '{RUN_ID}',
        '{WORKLIST_TABLE}',
        {new_event_count},
        {stats_dict.get('pending', 0)},
        {stats_dict.get('oversized', 0)},
        current_timestamp(),
        NULL,
        NULL,
        'worklist_created',
        NULL,
        NULL
    )
"""

# Run the statements strictly in this order: drop, create, insert.
for metadata_statement in (drop_metadata_sql, create_metadata_sql, insert_metadata_sql):
    spark.sql(metadata_statement)

print(f"\nPipeline metadata saved to: {METADATA_TABLE}")
print(f"Run ID for next notebooks: {RUN_ID}")
