# 🥈 Silver Layer — Cleaning & Enrichment

**Annie's Magic Numbers Medallion Architecture**

This notebook reads the Bronze Delta tables, cleans and type-casts the data, enriches purchases with reference prices, and writes the results to the Silver layer as Delta tables.

### 🔐 Configuration — ADLS Gen2 Authentication

In [None]:
# ============================================================
# CELL 0 — Spark session tuning & Azure Data Lake Gen2 authentication
# ============================================================
# Session-level settings: shuffle partition count and the two Delta
# optimized-write flags, applied in the order listed.
for conf_name, conf_value in (
    ("spark.sql.shuffle.partitions", "8"),
    ("spark.databricks.delta.optimizeWrite.enabled", "true"),
    ("spark.databricks.delta.autoOptimize.optimizeWrite", "true"),
):
    spark.conf.set(conf_name, conf_value)

# Account-key setting for the storage account that hosts the lake.
# NOTE(review): the key is embedded in the notebook as a literal — consider
# loading it from a secret store instead; confirm with the notebook owner.
adls_key_setting = "fs.azure.account.key.anniedatalake123.dfs.core.windows.net"
spark.conf.set(adls_key_setting, "<REDACTED_AZURE_STORAGE_KEY>")

### 🟦 Path Setup

In [None]:
from pyspark.sql import functions as F
from pyspark.sql.types import DoubleType, IntegerType, LongType, StringType, DateType

# Storage coordinates of the data lake.
storage_account = "anniedatalake123"
container_name = "annie-data"

# Root abfss:// URI of the container; the layer folders hang off it and
# every path keeps a trailing slash so table names can be appended directly.
base_path = "abfss://{}@{}.dfs.core.windows.net/".format(container_name, storage_account)
bronze_path = f"{base_path}bronze/"
silver_path = f"{base_path}silver/"

### 🟦 Helper Functions

In [None]:
def normalize_columns(df):
    """Rename the columns of ``df`` to snake_case.

    Columns whose name starts with an underscore are left untouched.
    Surrounding whitespace is removed from a name before it is converted.

    Args:
        df: DataFrame-like object exposing ``columns`` and
            ``withColumnRenamed(old, new)``.

    Returns:
        The DataFrame with every eligible column renamed; columns that are
        already in snake_case are not touched.
    """
    import re

    def to_snake(name):
        # Trim BEFORE inserting underscores: otherwise a leading space in
        # front of a capital letter (" Brand") is seen as the character that
        # precedes the word and the result becomes "_brand".
        trimmed = name.strip()
        # "InventoryId" -> "Inventory_Id", "PONumber" -> "PO_Number"
        s1 = re.sub(r'(.)([A-Z][a-z]+)', r'\1_\2', trimmed)
        # "onHand" -> "on_Hand"; then lower-case everything.
        return re.sub(r'([a-z0-9])([A-Z])', r'\1_\2', s1).lower()

    new_cols = {c: to_snake(c) for c in df.columns if not c.startswith("_")}
    for old, new in new_cols.items():
        if old != new:
            df = df.withColumnRenamed(old, new)
    return df

def write_silver(df, table_name):
    """Overwrite the Silver Delta table ``table_name`` with ``df`` and print its row count.

    The data is saved under ``silver_path + table_name`` in ``overwrite`` mode
    with ``overwriteSchema`` enabled; no ``partitionBy`` is applied.

    Args:
        df: DataFrame to persist.
        table_name: Folder name of the target table below the Silver path.
    """
    destination = silver_path + table_name

    delta_writer = df.write.format("delta").mode("overwrite")
    delta_writer = delta_writer.option("overwriteSchema", "true")
    delta_writer.save(destination)

    # Counted after the write for user feedback; the callers in this notebook
    # cache ``df`` beforehand so this count can be served from the cache.
    count = df.count()
    print(f"   ✅ silver.{table_name} guardada exitosamente → {count:,} filas")


### 🥈 Silver — Beginning Inventory

In [None]:
print("Processing silver.beg_inventory ...")
beg_inv_raw = spark.read.format("delta").load(bronze_path + "begin_inventory")

# Step 1 — snake_case the column names, cast the fields and derive
# total_cost from the already-cast quantity and price.
beg_inv = normalize_columns(beg_inv_raw)
beg_inv = beg_inv.withColumn("on_hand", F.col("on_hand").cast("int"))
beg_inv = beg_inv.withColumn("price", F.col("price").cast("double"))
beg_inv = beg_inv.withColumn("total_cost", F.round(F.col("on_hand") * F.col("price"), 2))
# NOTE(review): assumes start_date is formatted as yyyy-MM-dd — confirm against Bronze.
beg_inv = beg_inv.withColumn("start_date", F.to_date(F.col("start_date"), "yyyy-MM-dd"))

# Step 2 — keep rows that have an id, a brand and a strictly positive price
# (a null price fails the comparison and is dropped as well).
beg_inv = beg_inv.where(
    F.col("inventory_id").isNotNull()
    & F.col("brand").isNotNull()
    & (F.col("price") > 0)
)

# Step 3 — one row per inventory_id, then expose total_cost as inventory_value.
beg_inv = beg_inv.drop_duplicates(["inventory_id"])
beg_inv = beg_inv.withColumn("inventory_value", F.col("total_cost"))

# Cache the result and force it to materialise before writing, so the write
# and the row count inside write_silver reuse the cached data.
beg_inv = beg_inv.repartition(4).cache()
beg_inv.count()
write_silver(beg_inv, "beg_inventory")


### 🥈 Silver — Ending Inventory

In [None]:
print("Processing silver.end_inventory ...")
end_inv_raw = spark.read.format("delta").load(bronze_path + "end_inventory")

# Normalise the column names, then cast quantity/price and compute the
# extended cost from the cast values.
end_inv = normalize_columns(end_inv_raw)
end_inv = end_inv.withColumn("on_hand", F.col("on_hand").cast("int"))
end_inv = end_inv.withColumn("price", F.col("price").cast("double"))
end_inv = end_inv.withColumn("total_cost", F.round(F.col("on_hand") * F.col("price"), 2))
# NOTE(review): assumes end_date is formatted as yyyy-MM-dd — confirm against Bronze.
end_inv = end_inv.withColumn("end_date", F.to_date(F.col("end_date"), "yyyy-MM-dd"))

# Discard rows without an id or brand and rows whose price is not strictly
# positive (null prices fail the comparison too).
end_inv = end_inv.where(
    F.col("inventory_id").isNotNull()
    & F.col("brand").isNotNull()
    & (F.col("price") > 0)
)

# Deduplicate on the inventory key and mirror total_cost as inventory_value.
end_inv = end_inv.drop_duplicates(["inventory_id"])
end_inv = end_inv.withColumn("inventory_value", F.col("total_cost"))

# Materialise the cache before the write so write_silver can reuse it.
end_inv = end_inv.repartition(4).cache()
end_inv.count()
write_silver(end_inv, "end_inventory")


### 🥈 Silver — Purchase Prices

In [None]:
print("Processing silver.purchase_prices ...")
pp_raw = spark.read.format("delta").load(bronze_path + "prices")
pp = normalize_columns(pp_raw)

pp = (
    pp
    .withColumn("price", F.col("price").cast(DoubleType()))
    # Cast the brand key BEFORE the null check: a value that cannot be cast
    # to an integer becomes NULL, and filtering first would let such rows
    # through with a NULL brand. This matches the cast-then-filter order used
    # for `brand` in the Purchases cell.
    .withColumn("brand", F.col("brand").cast(IntegerType()))
    .filter(F.col("brand").isNotNull())
    .filter(F.col("description").isNotNull())
    # Strictly positive prices only; a null price fails the comparison too.
    .filter(F.col("price") > 0)
    # One reference row per (brand, description); the Purchases cell joins on
    # exactly this pair.
    .dropDuplicates(["brand", "description"])
)


# Cache and materialise: `pp` is reused by the Purchases cell as a lookup.
pp = pp.repartition(4).cache()
pp.count()
write_silver(pp, "purchase_prices")

### 🥈 Silver — Invoice Purchases

In [None]:
print("Processing silver.invoice_purchases ...")
inv_raw = spark.read.format("delta").load(bronze_path + "invoices")
inv = normalize_columns(inv_raw)

# Numeric casts, applied in a fixed order.
for inv_field, inv_type in (
    ("vendor_number", "int"),
    ("quantity", "int"),
    ("dollars", "double"),
    ("freight", "double"),
):
    inv = inv.withColumn(inv_field, F.col(inv_field).cast(inv_type))

# NOTE(review): dates are parsed as MM/dd/yyyy here while the inventory cells
# parse yyyy-MM-dd — confirm the format against the Bronze data.
for inv_date_field in ("invoice_date", "pay_date"):
    inv = inv.withColumn(inv_date_field, F.to_date(F.col(inv_date_field), "MM/dd/yyyy"))

# Keep invoices that carry a PO number, one row per vendor / PO / invoice date.
inv = inv.where(F.col("po_number").isNotNull())
inv = inv.drop_duplicates(["vendor_number", "po_number", "invoice_date"])

# Materialise the cache before the write so write_silver can reuse it.
inv = inv.repartition(4).cache()
inv.count()
write_silver(inv, "invoice_purchases")

### 🥈 Silver — Purchases (Enriched)

In [None]:
print("Processing silver.purchases ...")
purch_raw = spark.read.format("delta").load(bronze_path + "purchases")
purch = normalize_columns(purch_raw)

# Numeric casts, applied in a fixed order.
for purch_field, purch_type in (
    ("vendor_number", "int"),
    ("quantity", "int"),
    ("dollars", "double"),
    ("purchase_price", "double"),
    ("brand", "int"),
):
    purch = purch.withColumn(purch_field, F.col(purch_field).cast(purch_type))

# NOTE(review): assumes the three date fields are formatted as MM/dd/yyyy —
# confirm against the Bronze data.
for purch_date_field in ("po_date", "receiving_date", "invoice_date"):
    purch = purch.withColumn(purch_date_field, F.to_date(F.col(purch_date_field), "MM/dd/yyyy"))

# Keep rows with a brand and a strictly positive quantity, then deduplicate
# on the receipt-level key.
purch = purch.where(F.col("brand").isNotNull() & (F.col("quantity") > 0))
purch = purch.drop_duplicates(
    ["vendor_number", "po_number", "brand", "description", "receiving_date"]
)

# OPTIMIZATION: reuse the cached `pp` DataFrame built in the Purchase Prices
# cell instead of reading the table back from storage. The columns get a
# `ref_` prefix so they cannot clash with the purchase columns in the join.
pp_lookup = pp.select(
    *[F.col(src).alias(f"ref_{src}") for src in ("brand", "description", "price")]
)

# Left join against the broadcast lookup on (brand, description).
purch_enriched = purch.join(
    F.broadcast(pp_lookup),
    on=(purch["brand"] == pp_lookup["ref_brand"])
    & (purch["description"] == pp_lookup["ref_description"]),
    how="left",
)
# Prefer the reference price; fall back to the purchase's own price when the
# lookup has no match.
purch_enriched = purch_enriched.withColumn(
    "cost_per_unit",
    F.round(F.coalesce(F.col("ref_price"), F.col("purchase_price")), 4),
)
purch_enriched = purch_enriched.drop("ref_brand", "ref_description", "ref_price")
purch_enriched = purch_enriched.withColumn(
    "total_cost",
    F.round(F.col("cost_per_unit") * F.col("quantity"), 2),
)

# Materialise the cache before the write so write_silver can reuse it.
purch_enriched = purch_enriched.repartition(8).cache()
purch_enriched.count()
write_silver(purch_enriched, "purchases")

### 🥈 Silver — Sales

In [None]:
print('Sales...')
s_raw = spark.read.format('delta').load(bronze_path + 'sales')

# Sales only get their column names normalised; no casts or filters are applied.
sales = normalize_columns(s_raw)
sales = sales.repartition(8).cache()

# Materialise the cache before the write so write_silver can reuse it.
sales.count()
write_silver(sales, 'sales')
