# 🥈 Silver Layer — Cleaning & Enrichment

**Annie's Magic Numbers Medallion Architecture**

This notebook reads Bronze Delta tables, cleans the data, and writes to the Silver layer.

### 🔐 Configuration — ADLS Gen2 Authentication

In [None]:
# Register the storage-account access key on the Spark session configuration.
# NOTE(review): the value below is a placeholder to be filled in by hand;
# avoid committing a real account key with the notebook.
_account_key_conf = "fs.azure.account.key.anniedatalake123.dfs.core.windows.net"
spark.conf.set(_account_key_conf, "<PASTE_STORAGE_ACCOUNT_KEY_1_HERE>")

### 🟦 Path Setup

In [None]:
from pyspark.sql import functions as F
from pyspark.sql.types import DoubleType, IntegerType, LongType, StringType, DateType

# Storage coordinates of the data lake.
storage_account = "anniedatalake123"
container_name = "annie-data"

# Root abfss:// URI of the container; each medallion layer is a sub-folder.
base_path = "abfss://{}@{}.dfs.core.windows.net/".format(
    container_name, storage_account
)
bronze_path = f"{base_path}bronze/"
silver_path = f"{base_path}silver/"

### 🟦 Helper Functions

In [None]:
def normalize_columns(df):
    """Rename the columns of ``df`` to lower snake_case.

    CamelCase / PascalCase boundaries become underscores
    (``InventoryId`` -> ``inventory_id``, ``PONumber`` -> ``po_number``).
    Surrounding whitespace is trimmed *before* the case conversion and inner
    whitespace runs become a single underscore, so ``" Brand"`` -> ``brand``
    (not ``_brand``) and ``"Sales Dollars"`` -> ``sales_dollars``.
    Columns whose name starts with ``_`` are left untouched.

    Args:
        df: Object exposing ``columns`` and ``withColumnRenamed(old, new)``
            (a Spark DataFrame in this notebook).

    Returns:
        The result of chaining ``withColumnRenamed`` for every column whose
        normalized name differs from its current name; ``df`` itself when
        nothing needs renaming.
    """
    import re

    def camel_to_snake(token):
        # Break before a capitalised word, then between lower/digit and upper.
        step = re.sub(r'(.)([A-Z][a-z]+)', r'\1_\2', token)
        return re.sub(r'([a-z0-9])([A-Z])', r'\1_\2', step).lower()

    def to_snake(name):
        # str.split() with no argument trims both ends and splits on
        # whitespace runs, so whitespace can never reach the regexes above.
        return "_".join(camel_to_snake(part) for part in name.split())

    for old in df.columns:
        if old.startswith("_"):
            continue
        new = to_snake(old)
        # An all-whitespace name normalizes to "", which is not a usable
        # column name; keep such a column as it is.
        if new and new != old:
            df = df.withColumnRenamed(old, new)
    return df

def write_silver(df, table_name, partition_by=None):
    """Overwrite a Silver Delta table with ``df`` and print its row count.

    Args:
        df: DataFrame to persist.
        table_name: Folder name appended to the module-level ``silver_path``
            to form the target location.
        partition_by: Optional value handed to ``partitionBy``; partitioning
            is skipped when this is falsy.

    The data is written in ``overwrite`` mode with ``overwriteSchema``
    enabled. The table is then read back from the target path, so the
    printed count is what was actually persisted.
    """
    writer = (
        df.write
          .format("delta")
          .mode("overwrite")
          .option("overwriteSchema", "true")
    )
    if partition_by:
        writer = writer.partitionBy(partition_by)

    target_path = silver_path + table_name
    writer.save(target_path)

    # Count from storage rather than from `df` to report the persisted rows.
    count = spark.read.format("delta").load(target_path).count()
    # Status line: the check mark and arrow were previously mojibake.
    print(f"   ✅  silver.{table_name} saved to {target_path}  →  {count:,} rows")

### 🥈 Silver — Beginning Inventory

In [None]:
print("Processing silver.beg_inventory ...")

# Load the Bronze table and bring its column names to snake_case.
beg_inv_raw = spark.read.format("delta").load(bronze_path + "begin_inventory")
beg_inv = normalize_columns(beg_inv_raw)

# Step 1: enforce column types and parse the snapshot date.
beg_inv = beg_inv.withColumn("on_hand", F.col("on_hand").cast("int"))
beg_inv = beg_inv.withColumn("price", F.col("price").cast("double"))
beg_inv = beg_inv.withColumn("total_cost", F.col("total_cost").cast("double"))
beg_inv = beg_inv.withColumn("start_date", F.to_date("start_date", "yyyy-MM-dd"))

# Step 2: keep rows that have an id, a brand and a strictly positive price,
# then retain a single row per inventory_id.
beg_inv = beg_inv.where(
    F.col("inventory_id").isNotNull()
    & F.col("brand").isNotNull()
    & (F.col("price") > 0)
).dropDuplicates(["inventory_id"])

# Step 3: derive the stock value (units on hand x unit price, 2 decimals).
beg_inv = beg_inv.withColumn(
    "inventory_value",
    F.round(F.col("on_hand") * F.col("price"), 2),
)

write_silver(beg_inv, "beg_inventory")

### 🥈 Silver — Ending Inventory

In [None]:
print("Processing silver.end_inventory ...")

# Bronze source with snake_case column names.
end_inv_raw = spark.read.format("delta").load(bronze_path + "end_inventory")
end_inv = normalize_columns(end_inv_raw)

# Typed view of the raw columns: integer stock, double amounts, parsed date.
end_inv = (
    end_inv
    .withColumn("on_hand", F.col("on_hand").cast("int"))
    .withColumn("price", F.col("price").cast("double"))
    .withColumn("total_cost", F.col("total_cost").cast("double"))
    .withColumn("end_date", F.to_date("end_date", "yyyy-MM-dd"))
)

# Row-quality rules: id and brand must be present, price strictly positive.
end_inv = end_inv.where(
    F.col("inventory_id").isNotNull()
    & F.col("brand").isNotNull()
    & (F.col("price") > 0)
)

# One row per inventory_id, plus the derived stock value.
end_inv = end_inv.dropDuplicates(["inventory_id"]).withColumn(
    "inventory_value",
    F.round(F.col("on_hand") * F.col("price"), 2),
)

write_silver(end_inv, "end_inventory")

### 🥈 Silver — Purchase Prices

In [None]:
print("Processing silver.purchase_prices ...")

# Bronze price list with snake_case column names.
pp_raw = spark.read.format("delta").load(bronze_path + "prices")
pp = normalize_columns(pp_raw)

pp = (
    pp
    # Cast BEFORE the null checks: a brand value that cannot be represented as
    # an integer may come back from the cast as NULL, and such rows must be
    # caught by the isNotNull filter below (same order as the purchases and
    # sales cells).
    .withColumn("brand", F.col("brand").cast(IntegerType()))
    .withColumn("price", F.col("price").cast(DoubleType()))
    .filter(F.col("brand").isNotNull())
    .filter(F.col("description").isNotNull())
    .filter(F.col("price") > 0)
    # One reference price per (brand, description) pair.
    .dropDuplicates(["brand", "description"])
)

write_silver(pp, "purchase_prices")

### 🥈 Silver — Invoice Purchases

In [None]:
print("Processing silver.invoice_purchases ...")

# Bronze invoices with snake_case column names.
inv_raw = spark.read.format("delta").load(bronze_path + "invoices")
inv = normalize_columns(inv_raw)

# Numeric columns: integer identifiers/counts, double amounts.
inv = (
    inv
    .withColumn("vendor_number", F.col("vendor_number").cast("int"))
    .withColumn("quantity", F.col("quantity").cast("int"))
    .withColumn("dollars", F.col("dollars").cast("double"))
    .withColumn("freight", F.col("freight").cast("double"))
)

# Both date columns are parsed with the MM/dd/yyyy pattern.
inv = inv.withColumn("invoice_date", F.to_date("invoice_date", "MM/dd/yyyy"))
inv = inv.withColumn("pay_date", F.to_date("pay_date", "MM/dd/yyyy"))

# Require a PO number and keep one row per (vendor, PO, invoice date).
inv = inv.where(F.col("po_number").isNotNull())
inv = inv.dropDuplicates(["vendor_number", "po_number", "invoice_date"])

write_silver(inv, "invoice_purchases")

### 🥈 Silver — Purchases (Enriched)

In [None]:
print("Processing silver.purchases ...")

# Bronze purchases with snake_case column names.
purch_raw = spark.read.format("delta").load(bronze_path + "purchases")
purch = normalize_columns(purch_raw)

# Numeric columns: integer identifiers/counts, double amounts.
purch = (
    purch
    .withColumn("vendor_number", F.col("vendor_number").cast("int"))
    .withColumn("quantity", F.col("quantity").cast("int"))
    .withColumn("dollars", F.col("dollars").cast("double"))
    .withColumn("purchase_price", F.col("purchase_price").cast("double"))
    .withColumn("brand", F.col("brand").cast("int"))
)

# The three date columns are parsed with the MM/dd/yyyy pattern.
purch = (
    purch
    .withColumn("po_date", F.to_date("po_date", "MM/dd/yyyy"))
    .withColumn("receiving_date", F.to_date("receiving_date", "MM/dd/yyyy"))
    .withColumn("invoice_date", F.to_date("invoice_date", "MM/dd/yyyy"))
)

# Require a brand and a strictly positive quantity, then de-duplicate on the
# receipt-level key.
purch = purch.where(F.col("brand").isNotNull() & (F.col("quantity") > 0))
purch = purch.dropDuplicates(
    ["vendor_number", "po_number", "brand", "description", "receiving_date"]
)

# Reference prices from the Silver price table, renamed with a ref_ prefix so
# they cannot clash with the purchase columns after the join.
pp_silver = (
    spark.read.format("delta")
    .load(silver_path + "purchase_prices")
    .select(
        F.col("brand").alias("ref_brand"),
        F.col("description").alias("ref_description"),
        F.col("price").alias("ref_price"),
    )
)

# Left join keeps every purchase row, with or without a reference price.
purch_enriched = purch.join(
    pp_silver,
    on=(
        (purch["brand"] == pp_silver["ref_brand"])
        & (purch["description"] == pp_silver["ref_description"])
    ),
    how="left",
)

# Unit cost: the reference price when present, else the row's purchase_price.
purch_enriched = purch_enriched.withColumn(
    "cost_per_unit",
    F.round(F.coalesce(F.col("ref_price"), F.col("purchase_price")), 4),
)
purch_enriched = purch_enriched.drop("ref_brand", "ref_description", "ref_price")

# Line total = unit cost x quantity, rounded to 2 decimals.
purch_enriched = purch_enriched.withColumn(
    "total_cost",
    F.round(F.col("cost_per_unit") * F.col("quantity"), 2),
)

write_silver(purch_enriched, "purchases", partition_by="brand")

### 🥈 Silver — Sales

In [None]:
print("Processing silver.sales ...")

# Bronze sales with snake_case column names.
sales_raw = spark.read.format("delta").load(bronze_path + "sales")
sales = normalize_columns(sales_raw)

# Numeric columns: integer brand/quantity, double amounts and volume.
sales = (
    sales
    .withColumn("brand", F.col("brand").cast("int"))
    .withColumn("sales_quantity", F.col("sales_quantity").cast("int"))
    .withColumn("sales_dollars", F.col("sales_dollars").cast("double"))
    .withColumn("sales_price", F.col("sales_price").cast("double"))
    .withColumn("excise_tax", F.col("excise_tax").cast("double"))
    .withColumn("volume", F.col("volume").cast("double"))
)

# Parse the sale date (MM/dd/yyyy), then derive calendar attributes from it.
sales = sales.withColumn("sales_date", F.to_date("sales_date", "MM/dd/yyyy"))
sales = (
    sales
    .withColumn("sale_year", F.year(F.col("sales_date")))
    .withColumn("sale_month", F.month(F.col("sales_date")))
    .withColumn("sale_month_name", F.date_format(F.col("sales_date"), "MMMM"))
    .withColumn("sale_week", F.weekofyear(F.col("sales_date")))
)

# Row-quality rules: brand and date present, amount and quantity positive.
sales = sales.where(
    F.col("brand").isNotNull()
    & (F.col("sales_dollars") > 0)
    & (F.col("sales_quantity") > 0)
    & F.col("sales_date").isNotNull()
)

# Drop rows that repeat the same store/brand/description/date/quantity/amount.
sales = sales.dropDuplicates(
    ["store", "brand", "description", "sales_date", "sales_quantity", "sales_dollars"]
)

write_silver(sales, "sales", partition_by="brand")