## Gold notebook cleanup: checkpoints + gold tables only



In [0]:
# Notebook parameter: the target environment. Its value is interpolated into
# the catalog name (`{env}_catalog`) used by the cleanup steps in this notebook.
dbutils.widgets.text(name = "env", defaultValue = '', label = "Enter the environment in lower case")
env = dbutils.widgets.get("env")

# The widget defaults to an empty string. Stop here instead of letting the
# delete/drop steps of this notebook run against a blank environment name.
if not env.strip():
    raise ValueError(
        "Widget 'env' is empty; enter the environment in lower case before running the gold cleanup."
    )

print(f"Gold cleanup for environment: {env}")

%run "./commons"


# 1. Delete GOLD checkpoints
# NOTE(review): `checkpoint` is not defined in this cell; presumably it is
# provided by the `%run "./commons"` notebook — confirm.
print("Deleting GOLD streaming checkpoints (if any)...")

gold_checkpoint_paths = [
    f"{checkpoint}/GoldOrderItemsLoad/Checkpt",
    f"{checkpoint}/GoldDailySalesSummary/Checkpt",
]

for path in gold_checkpoint_paths:
    try:
        removed = dbutils.fs.rm(path, recurse = True)
    except Exception as e:
        # Best-effort cleanup: keep processing the remaining paths, but report
        # the actual error instead of labelling every failure as "not found".
        print(f"Could not delete checkpoint (skipped): {path} -> {e}")
        continue

    # Only an explicit False result is treated as "nothing was removed"; any
    # other result is reported as a successful delete, as before.
    if removed is False:
        print(f"Checkpoint not found or already deleted (skipped): {path}")
    else:
        print(f"Deleted checkpoint: {path}")

# 2. Drop GOLD tables only
# Each listed table is dropped from the `gold` schema of `{env}_catalog`.
# DROP TABLE IF EXISTS is used, so tables that are absent are simply skipped.
# NOTE(review): `fact_sales` is not in this list — confirm whether that is intended.
print("Dropping GOLD tables created by gold notebook...")

gold_tables = [
    "gold_order_items",
    "daily_sales_summary",
    "sales_by_product_category",
    "sales_by_region",
    "top_customers_ltv",
]

for table in gold_tables:
    drop_stmt = f"DROP TABLE IF EXISTS `{env}_catalog`.`gold`.`{table}`"
    spark.sql(drop_stmt)
    print(f"Dropped gold table (if existed): {table}")

print("Gold cleanup completed!")


In [0]:
# Gold sales analytics - built from silver Orders & OrderDetails
# Register the `env` text widget (empty by default) and read its current value;
# `env` selects the `{env}_catalog` catalog used by the cells of this notebook.
dbutils.widgets.text(
    name="env",
    defaultValue="",
    label="Enter the environment in lower case",
)
env = dbutils.widgets.get("env")

print(f"Running Gold Sales Analytics for environment: {env}")


In [0]:
%run "./commons"

In [0]:
from pyspark.sql.functions import col, sum as _sum, countDistinct

## Read silver tables


In [0]:
# Load the silver orders table for the selected environment and preview it.
print(f"Reading [silver_orders] table from {env}_catalog ...")

silver_orders_table = f"`{env}_catalog`.`silver`.`silver_orders`"
df_silver_orders = spark.table(silver_orders_table)

# Preview only: show at most 10 rows.
display(df_silver_orders.limit(10))


In [0]:
# Load the silver order-details table for the selected environment and preview it.
print(f"Reading [silver_orderdetails] table from {env}_catalog ...")

silver_orderdetails_table = f"`{env}_catalog`.`silver`.`silver_orderdetails`"
df_silver_orderdetails = spark.table(silver_orderdetails_table)

# Preview only: show at most 10 rows.
display(df_silver_orderdetails.limit(10))


In [0]:
# Load the silver product table for the selected environment and preview it.
print(f"Reading [silver_product] table from {env}_catalog ...")

silver_product_table = f"`{env}_catalog`.`silver`.`silver_product`"
df_silver_product = spark.table(silver_product_table)

# Preview only: show at most 10 rows.
display(df_silver_product.limit(10))

In [0]:
# Load the silver customer table for the selected environment and preview it.
print(f"Reading [silver_customer] table from {env}_catalog ...")

silver_customer_table = f"`{env}_catalog`.`silver`.`silver_customer`"
df_silver_customer = spark.table(silver_customer_table)

# Preview only: show at most 10 rows.
display(df_silver_customer.limit(10))

## Build a simple gold fact table: fact_sales


In [0]:
# Build a simple gold fact table: fact_sales
# Inner-joins silver_orderdetails (od) with silver_orders (o) on OrderID,
# silver_product (p) on ProductID and silver_customer (c) on CustomerID.
# Because every join is "inner", order lines without a matching order,
# product or customer do not appear in the result.
print("Building [gold.fact_sales] DataFrame ...")

# (qualified source column, output column name), in output order.
fact_sales_columns = [
    # Keys
    ("o.OrderID", "OrderID"),
    ("o.OrderDateParsed", "OrderDate"),
    ("o.CustomerID", "CustomerID"),
    ("od.ProductID", "ProductID"),
    # Product attributes
    ("p.ProductName", "ProductName"),
    ("p.CategoryName", "CategoryName"),
    # Measures
    ("od.OrderItemQuantity", "OrderItemQuantity"),
    ("od.PerUnitPrice", "PerUnitPrice"),
    ("od.LineSalesAmount", "LineSalesAmount"),
]

fact_sales_joined = (
    df_silver_orderdetails.alias("od")
    .join(df_silver_orders.alias("o"), col("od.OrderID") == col("o.OrderID"), "inner")
    .join(df_silver_product.alias("p"), col("od.ProductID") == col("p.ProductID"), "inner")
    .join(df_silver_customer.alias("c"), col("o.CustomerID") == col("c.CustomerID"), "inner")
)

df_fact_sales = fact_sales_joined.select(
    *[col(source).alias(output) for source, output in fact_sales_columns]
)

# Preview only: show at most 10 rows.
display(df_fact_sales.limit(10))


In [0]:
# Make sure the gold schema exists, then (re)write gold.fact_sales from
# df_fact_sales as a Delta table.
print(f"Ensuring `{env}_catalog`.`gold` schema exists ...")
create_schema_stmt = f"CREATE DATABASE IF NOT EXISTS `{env}_catalog`.`gold`"
spark.sql(create_schema_stmt)

# "overwrite" mode replaces the existing contents of the table on every run.
print("Writing [gold.fact_sales] table (OVERWRITE mode) ...")

fact_sales_table = f"`{env}_catalog`.`gold`.`fact_sales`"
fact_sales_writer = df_fact_sales.write.format("delta").mode("overwrite")
fact_sales_writer.saveAsTable(fact_sales_table)

print("Success! [gold.fact_sales] created/overwritten.")



In [0]:
# Check gold.fact_sales: read the table back and show up to 20 rows.
print(f"Reading [gold.fact_sales] table from {env}_catalog ...")

fact_sales_preview = spark.sql(f"SELECT * FROM `{env}_catalog`.`gold`.`fact_sales` LIMIT 20")
display(fact_sales_preview)

## Build gold aggregations (analytics-focused)

In [0]:
# Example 1) Daily sales summary
# One row per OrderDate from gold.fact_sales: total sales amount, total
# quantity and the number of distinct orders, sorted by OrderDate.
print("Calculating [Daily Sales Summary] ...")

daily_sales_query = f"""
    SELECT
        OrderDate,
        SUM(LineSalesAmount)      AS TotalSalesAmount,
        SUM(OrderItemQuantity)    AS TotalQuantity,
        COUNT(DISTINCT OrderID)   AS NumberOfOrders
    FROM `{env}_catalog`.`gold`.`fact_sales`
    GROUP BY OrderDate
    ORDER BY OrderDate
"""
df_daily_sales = spark.sql(daily_sales_query)

display(df_daily_sales)



In [0]:
# Example 2) Top 10 products by total sales
# Aggregates gold.fact_sales per product (ID, name, category) and keeps the
# 10 rows with the highest total sales amount.
print("Calculating [Top 10 Products by Total Sales] ...")

top_products_query = f"""
    SELECT
        ProductID,
        ProductName,
        CategoryName,
        SUM(LineSalesAmount) AS TotalSalesAmount,
        SUM(OrderItemQuantity) AS TotalQuantitySold
    FROM `{env}_catalog`.`gold`.`fact_sales`
    GROUP BY ProductID, ProductName, CategoryName
    ORDER BY TotalSalesAmount DESC
    LIMIT 10
"""
df_top_products = spark.sql(top_products_query)

display(df_top_products)

In [0]:
# Example 3) Top 10 customers by total spend
# Aggregates gold.fact_sales per CustomerID (total sales amount and number of
# distinct orders) and keeps the 10 rows with the highest total sales amount.
print("Calculating [Top 10 Customers by Sales] ...")

top_customers_query = f"""
    SELECT
        CustomerID,
        SUM(LineSalesAmount)    AS TotalSalesAmount,
        COUNT(DISTINCT OrderID) AS NumberOfOrders
    FROM `{env}_catalog`.`gold`.`fact_sales`
    GROUP BY CustomerID
    ORDER BY TotalSalesAmount DESC
    LIMIT 10
"""
df_top_customers = spark.sql(top_customers_query)

display(df_top_customers)