# Ingest to Gold Layer

Transforms Silver-layer tables into curated Gold-layer tables (`customer_orders`, `customer_summary`, and `product_summary`) for business intelligence and analytics.


In [0]:
# Load the three `*_valid` Silver-layer Delta tables that the Gold tables are built from.
customers_silver = spark.read.load("/mnt/silver/customers_valid", format="delta")
product_silver = spark.read.load("/mnt/silver/products_valid", format="delta")
orders_silver = spark.read.load("/mnt/silver/orders_valid", format="delta")

In [0]:
from pyspark.sql.functions import col

# Short SQL-style aliases keep the qualified column references below compact:
#   a = customers, b = orders, c = products.
customers = customers_silver.alias("a")
orders = orders_silver.alias("b")
products = product_silver.alias("c")

# Attach every order to its customer and its product. Both joins use the
# default join type (inner), so rows without a match on either key are dropped.
on_customer = col("a.customer_id") == col("b.customer_id")
on_product = col("b.product_id") == col("c.product_id")
orders_enriched = customers.join(orders, on_customer).join(products, on_product)

# Output columns, grouped by the table they come from. The order of these
# lists defines the column order of `customer_orders`.
order_columns = [
    col("b.order_id"),
    col("b.order_date"),
]
customer_columns = [
    col("a.customer_id"),
    col("a.first_name").alias("customer_first_name"),
    col("a.last_name").alias("customer_last_name"),
    col("a.email"),
    col("a.city"),
    col("a.state"),
]
product_columns = [
    col("c.product_id"),
    col("c.product_name"),
    col("c.brand"),
    col("c.category"),
    col("c.subcategory"),
]
pricing_columns = [
    col("b.quantity"),
    col("c.unit_price"),
    # Value of the order row: quantity ordered times the product's unit price.
    (col("b.quantity") * col("c.unit_price")).alias("order_total"),
]

customer_orders = orders_enriched.select(
    *order_columns,
    *customer_columns,
    *product_columns,
    *pricing_columns,
)

order_id,order_date,customer_id,customer_first_name,customer_last_name,email,city,state,product_id,product_name,brand,category,subcategory,quantity,unit_price,order_total
1014,2025-02-16,148,Stephanie,Stevenson,stephanie.stevenson@yahoo.com,Phoenix,AZ,31,BrandB Floor Lamp,BrandB,Home,Decor,3,88.89,266.67
931,2024-12-22,463,Jim,Barnett,jim.barnett@yahoo.com,Brookline,MA,24,BrandB New Balance 990,BrandB,Clothing,Shoes,4,256.48,1025.92
89,2025-09-20,471,Michelle,Jackson,michelle.jackson@outlook.com,Chula Vista,CA,7,BrandA Samsung Galaxy S23,BrandA,Electronics,Smartphone,2,1044.48,2088.96
546,2025-04-02,496,Sabrina,Ramirez,sabrina.ramirez@gmail.com,Warren,MI,22,BrandE Nike Air Max,BrandE,Clothing,Shoes,4,191.84,767.36
394,2025-04-29,833,Sherry,Wood,sherry.wood@yahoo.com,Atlanta,GA,16,BrandA Oxford Shirt,BrandA,Clothing,Shirt,2,45.79,91.58
1810,2025-06-13,243,Cassidy,Williams,cassidy.williams@gmail.com,Dallas,TX,18,BrandA Levi's Jeans,BrandA,Clothing,Pants,2,61.78,123.56
121,2024-12-08,392,Erin,Moreno,erin.moreno@hotmail.com,Miami,FL,1,BrandB MacBook Pro,BrandB,Electronics,Laptop,2,1821.21,3642.42
859,2024-11-13,540,Colton,Hardy,colton.hardy@icloud.com,Livonia,MI,33,BrandC Wall Art,BrandC,Home,Decor,2,141.98,283.96
22,2025-07-13,623,Valerie,Powell,valerie.powell@yahoo.com,St Paul,MN,27,BrandD West Elm Table,BrandD,Home,Furniture,5,1458.57,7292.85
1310,2025-10-03,737,Kimberly,Dyer,kimberly.dyer@outlook.com,Sunnyvale,CA,19,BrandE Chinos,BrandE,Clothing,Pants,1,63.7,63.7


In [0]:
from pyspark.sql.functions import count, sum

# Roll `customer_orders` up to one row per customer.
# NOTE: `count` and `sum` are the Spark aggregate functions imported at the top
# of this cell (the import shadows Python's builtin `sum`).
customer_keys = [
    "customer_id",
    "customer_first_name",
    "customer_last_name",
    "email",
    "city",
    "state",
]

# Number of order rows and summed `order_total` for each customer.
orders_per_customer = count("order_id").alias("total_orders")
spend_per_customer = sum("order_total").alias("total_spent")

customer_summary = customer_orders.groupBy(*customer_keys).agg(
    orders_per_customer,
    spend_per_customer,
)

customer_id,customer_first_name,customer_last_name,email,city,state,total_orders,total_spent
65,Kevin,Sutton,kevin.sutton@hotmail.com,Charlotte,NC,1,485.16
230,Anne,Hansen,anne.hansen@hotmail.com,North Las Vegas,NV,1,308.9
137,Brittany,Rose,brittany.rose@gmail.com,Bellevue,WA,3,14637.25
330,Derek,Carrillo,derek.carrillo@yahoo.com,Beaverton,OR,3,657.77
791,Sean,Hughes,sean.hughes@hotmail.com,Evanston,IL,1,3883.48
627,Douglas,Dyer,,Pearland,TX,6,13352.81
712,Bonnie,Thomas,bonnie.thomas@hotmail.com,Chicago,IL,5,2217.02
174,Joshua,Lloyd,joshua.lloyd@outlook.com,Henderson,NV,1,3590.4
694,Rachel,Parker,rachel.parker@yahoo.com,Livonia,MI,2,7300.68
523,Mark,Bridges,mark.bridges@gmail.com,Hialeah,FL,2,14131.83


In [0]:
# Roll `customer_orders` up to one row per product: units sold, revenue, and
# the number of order rows that included the product.
product_keys = ["product_id", "product_name", "brand", "category", "subcategory"]

product_summary = customer_orders.groupBy(*product_keys).agg(
    sum("quantity").alias("units_sold"),
    sum("order_total").alias("total_revenue"),
    count("order_id").alias("times_ordered"),
)

# Show the aggregated result in the notebook output.
display(product_summary)

product_id,product_name,brand,category,subcategory,units_sold,total_revenue,times_ordered
16,BrandA Oxford Shirt,BrandA,Clothing,Shirt,209,9570.11,64
27,BrandD West Elm Table,BrandD,Home,Furniture,184,268376.88,59
22,BrandE Nike Air Max,BrandE,Clothing,Shoes,189,36257.76,59
18,BrandA Levi's Jeans,BrandA,Clothing,Pants,155,9575.9,48
28,BrandA Crate & Barrel Chair,BrandA,Home,Furniture,171,166018.77,60
8,BrandB Google Pixel 7,BrandB,Electronics,Smartphone,187,168761.89,62
9,BrandB OnePlus 11,BrandB,Electronics,Smartphone,174,93605.04,56
12,BrandD AirPods Pro,BrandD,Electronics,Headphones,165,57684.0,56
3,BrandC HP Spectre,BrandC,Electronics,Laptop,205,356242.85,69
31,BrandB Floor Lamp,BrandB,Home,Decor,193,17155.77,58


In [0]:
# Persist each Gold DataFrame as a Delta table under /mnt/gold. The "overwrite"
# save mode replaces whatever data is already stored at the target path.
# The paths are plain string literals: they contain no placeholders, so no
# f-string prefix is needed.
(
    customer_orders.write.format("delta")
    .mode("overwrite")
    .save("/mnt/gold/customer_orders")
)

(
    customer_summary.write.format("delta")
    .mode("overwrite")
    .save("/mnt/gold/customer_summary")
)

(
    product_summary.write.format("delta")
    .mode("overwrite")
    .save("/mnt/gold/product_summary")
)