In [0]:
# 03_build_wide_silver (global_temp only)
from pyspark.sql import functions as F

# -----------------------------
# 1) Load the silver views
# -----------------------------
c = spark.table("global_temp.customers_silver")
o = spark.table("global_temp.orders_silver")
i = spark.table("global_temp.order_items_silver")

# Each count() is a separate Spark action; they are evaluated in the order
# customers, orders, items before anything is printed.
input_counts = (
    "customers=", c.count(),
    "orders=", o.count(),
    "items=", i.count(),
)
print("Input counts:", *input_counts)

# -----------------------------
# 2) Build the joins (item-level wide table)
# -----------------------------
# Items are the driving side: items -> orders on order_id (left join, so
# every item row is kept even when no order matches).
items_side = i.alias("i")
orders_side = o.alias("o")
io = items_side.join(orders_side, on="order_id", how="left")

# Then attach customers on customer_id, again as a left join.
customers_side = c.alias("c")
wide = io.join(customers_side, on="customer_id", how="left")

# -----------------------------
# 3) Column selection + renaming (keeps the result tidy)
# -----------------------------
# We pick the typical columns; any that do not exist in the DataFrame are
# simply left out of the selection.
def pick_if_exists(df, cols):
    """Return ``F.col`` expressions for the names in ``cols`` that ``df`` has.

    Args:
        df: Object exposing a ``columns`` attribute listing its column names.
        cols: Iterable of candidate column names (hashable, e.g. strings).

    Returns:
        A list of ``F.col(name)`` expressions, in the order given by ``cols``,
        restricted to names present in ``df.columns``. Names that are missing
        are skipped silently. Matching is an exact (case-sensitive) comparison.
    """
    # Read df.columns once instead of once per candidate name.
    available = set(df.columns)
    return [F.col(name) for name in cols if name in available]

# After the joins, every column sits "flat" in the wide DataFrame.
# Build a curated select list with meaningful names.

# Snapshot the column names once; membership is tested many times below.
wide_columns = set(wide.columns)

# Pass-through columns, in output order. Any name that is not present in the
# wide DataFrame is skipped.
passthrough_columns = [
    # Keys
    "order_id", "customer_id", "product_id",
    # Order fields
    "order_ts", "status", "order_total_amount",
    # Item fields
    "quantity", "unit_price", "line_amount",
]
select_exprs = [F.col(name) for name in passthrough_columns if name in wide_columns]

# Customer fields (renamed so their origin is unambiguous)
# Example: first_name -> customer_first_name
rename_map = {
    "first_name": "customer_first_name",
    "last_name": "customer_last_name",
    "email": "customer_email",
    "country": "customer_country"
}
for src, dst in rename_map.items():
    if src in wide_columns:
        select_exprs.append(F.col(src).alias(dst))

# Ingest metadata (optional).
# wide.columns only lists bare column names, so a qualified name such as
# "c.ingest_ts" is never a member of it and the columns would never be picked.
# Instead, check each source DataFrame, resolve the column through the alias
# used in the join ("c", "i", "o"), and give every copy a distinct output
# name so the resulting view has no duplicate column names.
ingest_sources = [
    ("c", c, "customer_ingest_ts"),
    ("i", i, "item_ingest_ts"),
    ("o", o, "order_ingest_ts"),
]
for join_alias, source_df, out_name in ingest_sources:
    if "ingest_ts" in source_df.columns:
        select_exprs.append(F.col(join_alias + ".ingest_ts").alias(out_name))

wide_curated = wide.select(*select_exprs)

# -----------------------------
# 4) Register the wide view
# -----------------------------
wide_curated.createOrReplaceGlobalTempView("sales_wide_silver")
print("Created view: global_temp.sales_wide_silver")

# -----------------------------
# 5) Checks + warnings
# -----------------------------
wide_count = wide_curated.count()
print("Wide count:", wide_count)

# Very important: the wide table is item-level, i.e. several rows per order_id.
# So when computing order-level KPIs, aggregate on order_id first!
orders_distinct = None
if "order_id" in wide_curated.columns:
    order_ids = wide_curated.select("order_id")
    orders_distinct = order_ids.dropDuplicates().count()
    print("Distinct orders in wide:", orders_distinct)

# Example check: how many rows have no customer match?
# A NULL customer_first_name is used as the indicator, so rows whose value is
# NULL for any other reason are counted as well.
if "customer_first_name" in wide_curated.columns:
    first_name_is_null = F.col("customer_first_name").isNull()
    missing_customer = wide_curated.filter(first_name_is_null).count()
    print("Rows with missing customer join:", missing_customer)