In [0]:
# 02_clean_silver (global_temp only)
from perf_lab_utils import perf_lab
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Source views (bronze), addressed with their global_temp qualifier.
CUSTOMERS_BRONZE = "global_temp.customers_bronze_v"
ORDERS_BRONZE    = "global_temp.orders_bronze_v"
ITEMS_BRONZE     = "global_temp.order_items_bronze_v"

# Target views (silver). These names are unqualified: they are registered as
# global temp views below and read back as global_temp.<name>.
CUSTOMERS_SILVER = "customers_silver"
ORDERS_SILVER    = "orders_silver"
ITEMS_SILVER     = "order_items_silver"

# -----------------------------
# 1) Load bronze
# -----------------------------
customers_b, orders_b, items_b = (
    spark.table(view_name)
    for view_name in (CUSTOMERS_BRONZE, ORDERS_BRONZE, ITEMS_BRONZE)
)

# Row counts of the raw inputs, printed in the order customers, orders, items.
print(
    "Bronze counts:",
    "customers=", customers_b.count(),
    "orders=", orders_b.count(),
    "items=", items_b.count(),
)

# -----------------------------
# 2) Helpers
# -----------------------------
def dedup_latest(df, key_cols, ts_col="ingest_ts"):
    """
    Keep a single row per combination of ``key_cols``.

    If ``ts_col`` is a column of ``df``, rows are numbered per key in
    descending ``ts_col`` order and only row number 1 is kept ("latest wins").
    No further tie-breaker is applied, so for rows sharing key and ``ts_col``
    value the surviving row is whichever Spark happens to number first.
    If ``ts_col`` is not a column of ``df``, the result is simply
    ``df.dropDuplicates(key_cols)``.

    Args:
        df: DataFrame to deduplicate.
        key_cols: List of column names that identify a logical record.
        ts_col: Name of the column used to pick the newest row.

    Returns:
        The deduplicated DataFrame. The helper column ``_rn`` used for the
        ranking is dropped again before returning.
    """
    if ts_col in df.columns:
        # Newest row first within each key.
        newest_first = Window.partitionBy(*key_cols).orderBy(F.desc(ts_col))
        numbered = df.withColumn("_rn", F.row_number().over(newest_first))
        return numbered.where(F.col("_rn") == 1).drop("_rn")

    # No timestamp available: fall back to plain key-based deduplication.
    return df.dropDuplicates(key_cols)

def ensure_col(df, col_name, dtype):
    """Return ``df`` with a column named ``col_name`` guaranteed to exist.

    An existing column is returned untouched (its type is NOT changed to
    ``dtype``). A missing column is added as an all-NULL column cast to
    ``dtype``.
    """
    if col_name in df.columns:
        return df
    null_value = F.lit(None).cast(dtype)
    return df.withColumn(col_name, null_value)

# -----------------------------
# 3) Customers clean
# -----------------------------
# Key column: add it as NULL if bronze lacks it, then force string type.
c = (
    ensure_col(customers_b, "customer_id", "string")
    .withColumn("customer_id", F.col("customer_id").cast("string"))
)

# Trim the common free-text fields, but only those that are present.
for colname in ("first_name", "last_name", "country"):
    if colname not in c.columns:
        continue
    c = c.withColumn(colname, F.trim(F.col(colname)))

# E-mail addresses are trimmed and lower-cased.
if "email" in c.columns:
    c = c.withColumn("email", F.lower(F.trim(F.col("email"))))

# The key must be present: drop rows whose customer_id is NULL or empty.
c = c.where(
    F.col("customer_id").isNotNull() & (F.length(F.col("customer_id")) > 0)
)

# One row per customer_id; the latest ingest_ts wins if that column exists.
c_silver_df = dedup_latest(c, ["customer_id"], ts_col="ingest_ts")

# -----------------------------
# 4) Orders clean
# -----------------------------
# Key columns: add them as NULL if bronze lacks them, then force string type.
o = ensure_col(ensure_col(orders_b, "order_id", "string"), "customer_id", "string")
o = (
    o.withColumn("order_id", F.col("order_id").cast("string"))
     .withColumn("customer_id", F.col("customer_id").cast("string"))
)

# Normalise the order timestamp: prefer order_ts, fall back to order_date,
# otherwise provide an all-NULL timestamp column.
if "order_ts" in o.columns:
    order_ts_expr = F.to_timestamp(F.col("order_ts"))
elif "order_date" in o.columns:
    order_ts_expr = F.to_timestamp(F.col("order_date"))
else:
    order_ts_expr = F.lit(None).cast("timestamp")
o = o.withColumn("order_ts", order_ts_expr)

# Status values are trimmed and upper-cased when the column exists.
if "status" in o.columns:
    o = o.withColumn("status", F.upper(F.trim(F.col("status"))))

# Validate keys: both order_id and customer_id must be non-NULL and non-empty.
o = (
    o.where(F.col("order_id").isNotNull() & (F.length(F.col("order_id")) > 0))
     .where(F.col("customer_id").isNotNull() & (F.length(F.col("customer_id")) > 0))
)

# One row per order_id; the latest ingest_ts wins if that column exists.
o = dedup_latest(o, ["order_id"], ts_col="ingest_ts")

# Referential integrity (orders -> customers): keep only orders whose
# customer_id appears in the cleaned customers, and report how many were lost.
o_before = o.count()
known_customers = c_silver_df.select("customer_id").dropDuplicates(["customer_id"])
o_silver_df = o.join(known_customers, on="customer_id", how="inner")
print("Orders removed due to missing customer reference:", o_before - o_silver_df.count())

# -----------------------------
# 5) Order Items clean
# -----------------------------
i = items_b

# Make sure every column used below exists (missing ones become all-NULL).
i = ensure_col(i, "order_id", "string")
i = ensure_col(i, "product_id", "string")
i = ensure_col(i, "quantity", "int")
i = ensure_col(i, "unit_price", "double")

# Bronze may deliver the measures under the short names qty / price or already
# under the canonical names quantity / unit_price. Read from the short name
# when it is present (it overrides an existing canonical column, as before);
# otherwise re-cast the canonical column, which the ensure_col calls above
# guarantee to exist. Referencing qty / price unconditionally would fail for
# inputs that only carry the canonical names.
quantity_src = "qty" if "qty" in i.columns else "quantity"
unit_price_src = "price" if "price" in i.columns else "unit_price"

i = (i
     .withColumn("order_id", F.col("order_id").cast("string"))
     .withColumn("product_id", F.col("product_id").cast("string"))
     .withColumn("quantity", F.col(quantity_src).cast("int"))
     .withColumn("unit_price", F.col(unit_price_src).cast("double"))
     # Dropping by name is harmless when qty / price are not in the schema.
     .drop("qty", "price")
)

# Validate keys: order_id and product_id must be non-NULL and non-empty.
i = i.filter(
    F.col("order_id").isNotNull() & (F.length(F.col("order_id")) > 0) &
    F.col("product_id").isNotNull() & (F.length(F.col("product_id")) > 0)
)

# Plausibility checks: negative quantities / prices are removed, while rows
# with a NULL quantity or unit_price are kept.
i = i.filter((F.col("quantity").isNull()) | (F.col("quantity") >= 0))
i = i.filter((F.col("unit_price").isNull()) | (F.col("unit_price") >= 0))

# Enrichment: line_amount = quantity * unit_price, NULL if either is NULL.
i = i.withColumn(
    "line_amount",
    F.when(F.col("quantity").isNotNull() & F.col("unit_price").isNotNull(),
           F.col("quantity") * F.col("unit_price"))
     .otherwise(F.lit(None).cast("double"))
)

# Deduplicate on the minimal key (order_id + product_id); the latest
# ingest_ts wins if that column exists.
i = dedup_latest(i, ["order_id", "product_id"], ts_col="ingest_ts")

# Referential integrity (items -> orders): keep only items whose order_id
# appears in the cleaned orders, and report how many were lost.
i_before = i.count()
i_silver_df = i.join(
    o_silver_df.select("order_id").dropDuplicates(["order_id"]),
    on="order_id",
    how="inner"
)
print("Items removed due to missing order reference:", i_before - i_silver_df.count())

# -----------------------------
# 6) Optional: order_total_amount in Orders
# -----------------------------
# Sum of line_amount per order, computed from the cleaned items.
order_totals = i_silver_df.groupBy("order_id").agg(
    F.sum("line_amount").alias("order_total_amount")
)

# Left join: orders without any item keep a NULL order_total_amount.
o_silver_df = o_silver_df.join(order_totals, on="order_id", how="left")

# -----------------------------
# 7) Quality checks, then register as GLOBAL TEMP VIEWS
# -----------------------------

# Perf + quality checks (silver). perf_lab is imported from perf_lab_utils and
# its implementation is not visible here; presumably it writes its results to
# the view named by emit_view -- confirm against perf_lab_utils.
perf_lab(
    c_silver_df,
    "customers_silver",
    keys=["customer_id"],
    null_cols=["customer_id"],
    emit_view="perf_metrics",
)
perf_lab(
    o_silver_df,
    "orders_silver",
    keys=["order_id"],
    null_cols=["order_id", "customer_id"],
    emit_view="perf_metrics",
)
perf_lab(
    i_silver_df,
    "order_items_silver",
    keys=["order_id", "product_id"],
    null_cols=["order_id", "product_id"],
    negative_cols=["quantity", "unit_price"],
    emit_view="perf_metrics",
)

# Publish the three silver DataFrames as global temp views.
c_silver_df.createOrReplaceGlobalTempView(CUSTOMERS_SILVER)
o_silver_df.createOrReplaceGlobalTempView(ORDERS_SILVER)
i_silver_df.createOrReplaceGlobalTempView(ITEMS_SILVER)

silver_view_names = ", ".join(
    f"global_temp.{name}"
    for name in (CUSTOMERS_SILVER, ORDERS_SILVER, ITEMS_SILVER)
)
print("Created global_temp views:", silver_view_names)

# -----------------------------
# 8) Quick checks
# -----------------------------
# Read the freshly registered views back and print their row counts.
print(
    "Silver counts:",
    "customers=", spark.table("global_temp." + CUSTOMERS_SILVER).count(),
    "orders=", spark.table("global_temp." + ORDERS_SILVER).count(),
    "items=", spark.table("global_temp." + ITEMS_SILVER).count(),
)

# -----------------------------
# 9) Display Silver Tables
# -----------------------------
# Show each silver DataFrame under its own heading.
for title, frame in (
    ("Customers Silver", c_silver_df),
    ("Orders Silver", o_silver_df),
    ("Order Items Silver", i_silver_df),
):
    print(f"\n=== {title} ===")
    display(frame)