# GOLD NOTEBOOK: Create Gold Materialized Lake Views (source: dbo.mv_fact_sales)


In [1]:
%%sql
-- Make sure the target schema for the gold views exists; IF NOT EXISTS keeps
-- this cell safe to re-run.
CREATE SCHEMA IF NOT EXISTS gold;

StatementMeta(, 6d77f00b-323c-4bc8-af16-fd63e7928704, 2, Finished, Available, Finished)

<Spark SQL result set with 0 rows and 0 fields>

### 1. SALES SUMMARY

In [2]:
from datetime import datetime

# Start timestamp for the sales-summary build. The logging cell for this view
# subtracts it from its end timestamp to compute the logged run duration.
PROCESSING_START_TIME = datetime.now()

StatementMeta(, 6d77f00b-323c-4bc8-af16-fd63e7928704, 4, Finished, Available, Finished)

In [3]:
%%sql

-- Gold view: sales, profit, quantity and average discount aggregated per
-- category / sub-category / region and per calendar year + month of the order.
-- NOTE(review): assumes order_date is stored as a 'dd-MM-yyyy' string in
-- dbo.mv_fact_sales — confirm against the source view definition.
CREATE MATERIALIZED LAKE VIEW IF NOT EXISTS gold.mv_gold_sales_summary
COMMENT 'Materialized summary of sales by category, region, and month'
AS
SELECT
    f.category,
    f.sub_category,
    f.region,
    YEAR(TO_DATE(f.order_date, 'dd-MM-yyyy'))  AS year,
    MONTH(TO_DATE(f.order_date, 'dd-MM-yyyy')) AS month,
    SUM(f.sales)    AS total_sales,
    SUM(f.profit)   AS total_profit,
    SUM(f.quantity) AS total_quantity,
    AVG(f.discount) AS avg_discount
FROM dbo.mv_fact_sales AS f
GROUP BY
    f.category,
    f.sub_category,
    f.region,
    YEAR(TO_DATE(f.order_date, 'dd-MM-yyyy')),
    MONTH(TO_DATE(f.order_date, 'dd-MM-yyyy'));


StatementMeta(, 6d77f00b-323c-4bc8-af16-fd63e7928704, 5, Finished, Available, Finished)

<Spark SQL result set with 8 rows and 2 fields>

In [4]:


# Append one audit row describing the mv_gold_sales_summary build to the
# pipeline log table.
LOG_TABLE = "dbo.pipeline_log"
path = "abfss://fabric_dev@onelake.dfs.fabric.microsoft.com/fabric_LH_sales.Lakehouse/Tables/gold/mv_gold_sales_summary"

# The count runs before the end timestamp is taken, so the logged duration
# includes the time spent counting rows.
row_count = spark.table("gold.mv_gold_sales_summary").count()
PROCESSING_END_TIME = datetime.now()

elapsed = PROCESSING_END_TIME - PROCESSING_START_TIME
duration_seconds = elapsed.total_seconds()
duration_minutes = round(duration_seconds / 60, 2)  # run_duration is logged in minutes

log_columns = [
    "Dataset",
    "Start_Timestamp",
    "End_Timestamp",
    "run_duration",
    "Stage",
    "Destination",
    "Row_Count",
]
log_data = [
    (
        "gold_sales_summary",
        PROCESSING_START_TIME,
        PROCESSING_END_TIME,
        duration_minutes,
        "Gold-Layer",
        path,
        row_count,
    )
]
log_df = spark.createDataFrame(log_data, log_columns)

# Append-only write; mergeSchema is enabled on the Delta writer.
(
    log_df.write
    .format("delta")
    .mode("append")
    .option("mergeSchema", "true")
    .saveAsTable(LOG_TABLE)
)

print(" Pipeline load logged successfully.")


StatementMeta(, 6d77f00b-323c-4bc8-af16-fd63e7928704, 6, Finished, Available, Finished)

 Pipeline load logged successfully.


### 2. TOP CUSTOMERS

In [5]:
from datetime import datetime

# Start timestamp for the top-customers build. The logging cell for this view
# subtracts it from its end timestamp to compute the logged run duration.
PROCESSING_START_TIME_CUS = datetime.now()

StatementMeta(, 6d77f00b-323c-4bc8-af16-fd63e7928704, 7, Finished, Available, Finished)

In [3]:
%%sql

-- Gold view: the 50 highest-selling customer rows, with total sales, total
-- profit and distinct order count. Rows with a NULL region are excluded.
--
-- region is part of the GROUP BY, so a customer_id that has sales in more
-- than one region yields one row per region, and each row competes for the
-- top 50 separately.
--
-- Ordering is fully specified (sales, then profit, then the grouping keys) so
-- that ties at the rank-50 cut-off resolve the same way on every refresh.
-- With ORDER BY total_sales alone, tied rows could swap in and out of the view.
--
-- NOTE: because of IF NOT EXISTS, an already-existing view keeps its previous
-- definition; drop and recreate it to pick up a changed definition.
CREATE MATERIALIZED LAKE VIEW IF NOT EXISTS gold.mv_gold_top_customers
COMMENT 'Materialized lake view of top 50 customers by total sales and profit'
AS
SELECT
    customer_id,
    customer_name,
    segment,
    region,
    SUM(sales) AS total_sales,
    SUM(profit) AS total_profit,
    COUNT(DISTINCT order_id) AS total_orders
FROM dbo.mv_fact_sales
WHERE region IS NOT NULL
GROUP BY
    customer_id,
    customer_name,
    segment,
    region
ORDER BY
    total_sales DESC,
    total_profit DESC,
    customer_id,
    customer_name,
    segment,
    region
LIMIT 50;


StatementMeta(, a739a281-ff37-407f-9c40-90fbb3668333, 4, Finished, Available, Finished)

<Spark SQL result set with 8 rows and 2 fields>

In [7]:


# Append one audit row describing the mv_gold_top_customers build to the
# pipeline log table.
LOG_TABLE = "dbo.pipeline_log"
path_CUS = "abfss://fabric_dev@onelake.dfs.fabric.microsoft.com/fabric_LH_sales.Lakehouse/Tables/gold/mv_gold_top_customers"

# The count runs before the end timestamp is taken, so the logged duration
# includes the time spent counting rows.
row_count_CUS = spark.table("gold.mv_gold_top_customers").count()
PROCESSING_END_TIME_CUS = datetime.now()

elapsed_CUS = PROCESSING_END_TIME_CUS - PROCESSING_START_TIME_CUS
duration_seconds_CUS = elapsed_CUS.total_seconds()
duration_minutes_CUS = round(duration_seconds_CUS / 60, 2)  # run_duration is logged in minutes

log_columns = [
    "Dataset",
    "Start_Timestamp",
    "End_Timestamp",
    "run_duration",
    "Stage",
    "Destination",
    "Row_Count",
]
log_data = [
    (
        "gold_top_customers",
        PROCESSING_START_TIME_CUS,
        PROCESSING_END_TIME_CUS,
        duration_minutes_CUS,
        "Gold-Layer",
        path_CUS,
        row_count_CUS,
    )
]
log_df = spark.createDataFrame(log_data, log_columns)

# Append-only write; mergeSchema is enabled on the Delta writer.
(
    log_df.write
    .format("delta")
    .mode("append")
    .option("mergeSchema", "true")
    .saveAsTable(LOG_TABLE)
)

print(" Pipeline load logged successfully.")


StatementMeta(, 6d77f00b-323c-4bc8-af16-fd63e7928704, 9, Finished, Available, Finished)

 Pipeline load logged successfully.


### 3. PRODUCT SUMMARY

In [8]:
from datetime import datetime

# Start timestamp for the product-summary build. The logging cell for this view
# subtracts it from its end timestamp to compute the logged run duration.
PROCESSING_START_TIME_PRO = datetime.now()

StatementMeta(, 6d77f00b-323c-4bc8-af16-fd63e7928704, 10, Finished, Available, Finished)

In [9]:
%%sql

-- Gold view: one row per product (id, name, sub-category, category) with
-- summed sales / profit / quantity and the average discount rounded to
-- 4 decimal places.
CREATE MATERIALIZED LAKE VIEW IF NOT EXISTS gold.mv_gold_product_summary
COMMENT 'Materialized lake view summarizing product-level sales, profit, and discounts'
AS
SELECT
    f.product_id,
    f.product_name,
    f.sub_category,
    f.category,
    SUM(f.sales)              AS total_sales,
    SUM(f.profit)             AS total_profit,
    SUM(f.quantity)           AS total_quantity,
    ROUND(AVG(f.discount), 4) AS avg_discount
FROM dbo.mv_fact_sales AS f
GROUP BY
    f.product_id,
    f.product_name,
    f.sub_category,
    f.category;


StatementMeta(, 6d77f00b-323c-4bc8-af16-fd63e7928704, 11, Finished, Available, Finished)

<Spark SQL result set with 8 rows and 2 fields>

In [10]:


# Append one audit row describing the mv_gold_product_summary build to the
# pipeline log table.
LOG_TABLE = "dbo.pipeline_log"
path_PRO = "abfss://fabric_dev@onelake.dfs.fabric.microsoft.com/fabric_LH_sales.Lakehouse/Tables/gold/mv_gold_product_summary"

# The count runs before the end timestamp is taken, so the logged duration
# includes the time spent counting rows.
row_count_PRO = spark.table("gold.mv_gold_product_summary").count()
PROCESSING_END_TIME_PRO = datetime.now()

elapsed_PRO = PROCESSING_END_TIME_PRO - PROCESSING_START_TIME_PRO
duration_seconds_PRO = elapsed_PRO.total_seconds()
duration_minutes_PRO = round(duration_seconds_PRO / 60, 2)  # run_duration is logged in minutes

log_columns = [
    "Dataset",
    "Start_Timestamp",
    "End_Timestamp",
    "run_duration",
    "Stage",
    "Destination",
    "Row_Count",
]
log_data = [
    (
        "gold_product_summary",
        PROCESSING_START_TIME_PRO,
        PROCESSING_END_TIME_PRO,
        duration_minutes_PRO,
        "Gold-Layer",
        path_PRO,
        row_count_PRO,
    )
]
log_df = spark.createDataFrame(log_data, log_columns)

# Append-only write; mergeSchema is enabled on the Delta writer.
(
    log_df.write
    .format("delta")
    .mode("append")
    .option("mergeSchema", "true")
    .saveAsTable(LOG_TABLE)
)

print(" Pipeline load logged successfully.")


StatementMeta(, 6d77f00b-323c-4bc8-af16-fd63e7928704, 12, Finished, Available, Finished)

 Pipeline load logged successfully.


### 4. REGION PERFORMANCE

In [11]:
from datetime import datetime

# Start timestamp for the region-performance build. The logging cell for this
# view subtracts it from its end timestamp to compute the logged run duration.
PROCESSING_START_TIME_RG = datetime.now()

StatementMeta(, 6d77f00b-323c-4bc8-af16-fd63e7928704, 13, Finished, Available, Finished)

In [12]:
%%sql

-- Gold view: one row per region / country / state with the number of distinct
-- customers, summed sales and profit, and the average discount rounded to
-- 4 decimal places.
CREATE MATERIALIZED LAKE VIEW IF NOT EXISTS gold.mv_gold_region_performance
COMMENT 'Materialized lake view summarizing regional performance metrics (sales, profit, discount)'
AS
SELECT
    f.region,
    f.country,
    f.state,
    COUNT(DISTINCT f.customer_id) AS total_customers,
    SUM(f.sales)                  AS total_sales,
    SUM(f.profit)                 AS total_profit,
    ROUND(AVG(f.discount), 4)     AS avg_discount
FROM dbo.mv_fact_sales AS f
GROUP BY
    f.region,
    f.country,
    f.state;


StatementMeta(, 6d77f00b-323c-4bc8-af16-fd63e7928704, 14, Finished, Available, Finished)

<Spark SQL result set with 8 rows and 2 fields>

In [13]:


# Append one audit row describing the mv_gold_region_performance build to the
# pipeline log table.
LOG_TABLE = "dbo.pipeline_log"
path_RG = "abfss://fabric_dev@onelake.dfs.fabric.microsoft.com/fabric_LH_sales.Lakehouse/Tables/gold/mv_gold_region_performance"

# The count runs before the end timestamp is taken, so the logged duration
# includes the time spent counting rows.
row_count_RG = spark.table("gold.mv_gold_region_performance").count()
PROCESSING_END_TIME_RG = datetime.now()

elapsed_RG = PROCESSING_END_TIME_RG - PROCESSING_START_TIME_RG
duration_seconds_RG = elapsed_RG.total_seconds()
duration_minutes_RG = round(duration_seconds_RG / 60, 2)  # run_duration is logged in minutes

log_columns = [
    "Dataset",
    "Start_Timestamp",
    "End_Timestamp",
    "run_duration",
    "Stage",
    "Destination",
    "Row_Count",
]
log_data = [
    (
        "gold_region_performance",
        PROCESSING_START_TIME_RG,
        PROCESSING_END_TIME_RG,
        duration_minutes_RG,
        "Gold-Layer",
        path_RG,
        row_count_RG,
    )
]
log_df = spark.createDataFrame(log_data, log_columns)

# Append-only write; mergeSchema is enabled on the Delta writer.
(
    log_df.write
    .format("delta")
    .mode("append")
    .option("mergeSchema", "true")
    .saveAsTable(LOG_TABLE)
)

print(" Pipeline load logged successfully.")


StatementMeta(, 6d77f00b-323c-4bc8-af16-fd63e7928704, 15, Finished, Available, Finished)

 Pipeline load logged successfully.


### 5. MONTHLY TRENDS

In [14]:
from datetime import datetime

# Start timestamp for the monthly-trends build. The logging cell for this view
# subtracts it from its end timestamp to compute the logged run duration.
PROCESSING_START_TIME_MT = datetime.now()

StatementMeta(, 6d77f00b-323c-4bc8-af16-fd63e7928704, 16, Finished, Available, Finished)

In [15]:
%%sql

-- Gold view: one row per calendar month of the order date (month_start is the
-- order date truncated to the first of its month) with summed sales, profit
-- and quantity plus the distinct order count.
--
-- No ORDER BY in the definition: the result is materialized as a table, and
-- readers of a table get no ordering guarantee, so sorting here only adds
-- work on each refresh. Consumers that need chronological output should add
-- ORDER BY month_start to their own query.
--
-- NOTE(review): assumes order_date is stored as a 'dd-MM-yyyy' string in
-- dbo.mv_fact_sales — confirm against the source view definition.
-- NOTE: because of IF NOT EXISTS, an already-existing view keeps its previous
-- definition; drop and recreate it to pick up a changed definition.
CREATE MATERIALIZED LAKE VIEW IF NOT EXISTS gold.mv_gold_monthly_trends
COMMENT 'Materialized lake view showing monthly sales, profit, and order trends'
AS
SELECT
    DATE_TRUNC('month', TO_DATE(order_date, 'dd-MM-yyyy')) AS month_start,
    SUM(sales) AS total_sales,
    SUM(profit) AS total_profit,
    SUM(quantity) AS total_quantity,
    COUNT(DISTINCT order_id) AS total_orders
FROM dbo.mv_fact_sales
GROUP BY DATE_TRUNC('month', TO_DATE(order_date, 'dd-MM-yyyy'));



StatementMeta(, 6d77f00b-323c-4bc8-af16-fd63e7928704, 17, Finished, Available, Finished)

<Spark SQL result set with 8 rows and 2 fields>

In [16]:


# Append one audit row describing the mv_gold_monthly_trends build to the
# pipeline log table.
LOG_TABLE = "dbo.pipeline_log"
path_MT = "abfss://fabric_dev@onelake.dfs.fabric.microsoft.com/fabric_LH_sales.Lakehouse/Tables/gold/mv_gold_monthly_trends"

# The count runs before the end timestamp is taken, so the logged duration
# includes the time spent counting rows.
row_count_MT = spark.table("gold.mv_gold_monthly_trends").count()
PROCESSING_END_TIME_MT = datetime.now()

elapsed_MT = PROCESSING_END_TIME_MT - PROCESSING_START_TIME_MT
duration_seconds_MT = elapsed_MT.total_seconds()
duration_minutes_MT = round(duration_seconds_MT / 60, 2)  # run_duration is logged in minutes

log_columns = [
    "Dataset",
    "Start_Timestamp",
    "End_Timestamp",
    "run_duration",
    "Stage",
    "Destination",
    "Row_Count",
]
log_data = [
    (
        "gold_monthly_trends",
        PROCESSING_START_TIME_MT,
        PROCESSING_END_TIME_MT,
        duration_minutes_MT,
        "Gold-Layer",
        path_MT,
        row_count_MT,
    )
]
log_df = spark.createDataFrame(log_data, log_columns)

# Append-only write; mergeSchema is enabled on the Delta writer.
(
    log_df.write
    .format("delta")
    .mode("append")
    .option("mergeSchema", "true")
    .saveAsTable(LOG_TABLE)
)

print(" Pipeline load logged successfully.")


StatementMeta(, 6d77f00b-323c-4bc8-af16-fd63e7928704, 18, Finished, Available, Finished)

 Pipeline load logged successfully.


### 6. CUSTOMER SUMMARY

In [17]:
from datetime import datetime

# Start timestamp for the customer-summary build. The logging cell for this
# view subtracts it from its end timestamp to compute the logged run duration.
PROCESSING_START_TIME_CS = datetime.now()

StatementMeta(, 6d77f00b-323c-4bc8-af16-fd63e7928704, 19, Finished, Available, Finished)

In [5]:
%%sql

-- Gold view: one row per customer_id / customer_name / segment / region with
-- distinct order count, summed sales, profit and quantity, and the average
-- discount rounded to 4 decimal places.
CREATE MATERIALIZED LAKE VIEW IF NOT EXISTS gold.mv_gold_customer_summary
COMMENT 'Materialized lake view showing per-customer sales, profit, and order summary'
AS
SELECT
    f.customer_id,
    f.customer_name,
    f.segment,
    f.region,
    COUNT(DISTINCT f.order_id) AS total_orders,
    SUM(f.sales)               AS total_sales,
    SUM(f.profit)              AS total_profit,
    ROUND(AVG(f.discount), 4)  AS avg_discount,
    SUM(f.quantity)            AS total_quantity
FROM dbo.mv_fact_sales AS f
GROUP BY
    f.customer_id,
    f.customer_name,
    f.segment,
    f.region;



StatementMeta(, a739a281-ff37-407f-9c40-90fbb3668333, 6, Finished, Available, Finished)

<Spark SQL result set with 8 rows and 2 fields>

In [19]:


# Append one audit row describing the mv_gold_customer_summary build to the
# pipeline log table.
LOG_TABLE = "dbo.pipeline_log"
path_CS = "abfss://fabric_dev@onelake.dfs.fabric.microsoft.com/fabric_LH_sales.Lakehouse/Tables/gold/mv_gold_customer_summary"

# The count runs before the end timestamp is taken, so the logged duration
# includes the time spent counting rows.
row_count_CS = spark.table("gold.mv_gold_customer_summary").count()
PROCESSING_END_TIME_CS = datetime.now()

elapsed_CS = PROCESSING_END_TIME_CS - PROCESSING_START_TIME_CS
duration_seconds_CS = elapsed_CS.total_seconds()
duration_minutes_CS = round(duration_seconds_CS / 60, 2)  # run_duration is logged in minutes

log_columns = [
    "Dataset",
    "Start_Timestamp",
    "End_Timestamp",
    "run_duration",
    "Stage",
    "Destination",
    "Row_Count",
]
log_data = [
    (
        "gold_customer_summary",
        PROCESSING_START_TIME_CS,
        PROCESSING_END_TIME_CS,
        duration_minutes_CS,
        "Gold-Layer",
        path_CS,
        row_count_CS,
    )
]
log_df = spark.createDataFrame(log_data, log_columns)

# Append-only write; mergeSchema is enabled on the Delta writer.
(
    log_df.write
    .format("delta")
    .mode("append")
    .option("mergeSchema", "true")
    .saveAsTable(LOG_TABLE)
)

print(" Pipeline load logged successfully.")


StatementMeta(, 6d77f00b-323c-4bc8-af16-fd63e7928704, 21, Finished, Available, Finished)

 Pipeline load logged successfully.


### 7. SHIPPING PERFORMANCE

In [20]:
from datetime import datetime

# Start timestamp for the shipping-performance build. The logging cell for this
# view subtracts it from its end timestamp to compute the logged run duration.
PROCESSING_START_TIME_SP = datetime.now()

StatementMeta(, 6d77f00b-323c-4bc8-af16-fd63e7928704, 22, Finished, Available, Finished)

In [21]:
%%sql

-- Gold view: one row per ship_mode / region with the distinct order count,
-- summed sales, and avg_ship_days = mean of (ship_date - order_date) in days,
-- rounded to 2 decimals. The mean is taken over fact rows, not over distinct
-- orders.
-- NOTE(review): assumes ship_date and order_date are stored as 'dd-MM-yyyy'
-- strings in dbo.mv_fact_sales — confirm against the source view definition.
CREATE MATERIALIZED LAKE VIEW IF NOT EXISTS gold.mv_gold_shipping_performance
COMMENT 'Materialized lake view showing shipping performance and sales metrics by region and ship mode'
AS
SELECT
    f.ship_mode,
    f.region,
    COUNT(DISTINCT f.order_id) AS total_orders,
    ROUND(
        AVG(
            DATEDIFF(
                TO_DATE(f.ship_date, 'dd-MM-yyyy'),
                TO_DATE(f.order_date, 'dd-MM-yyyy')
            )
        ),
        2
    ) AS avg_ship_days,
    SUM(f.sales) AS total_sales
FROM dbo.mv_fact_sales AS f
GROUP BY
    f.ship_mode,
    f.region;



StatementMeta(, 6d77f00b-323c-4bc8-af16-fd63e7928704, 23, Finished, Available, Finished)

<Spark SQL result set with 8 rows and 2 fields>

In [22]:

# Append one audit row describing the mv_gold_shipping_performance build to the
# pipeline log table.
LOG_TABLE = "dbo.pipeline_log"
path_SP = "abfss://fabric_dev@onelake.dfs.fabric.microsoft.com/fabric_LH_sales.Lakehouse/Tables/gold/mv_gold_shipping_performance"

# The count runs before the end timestamp is taken, so the logged duration
# includes the time spent counting rows.
row_count_SP = spark.table("gold.mv_gold_shipping_performance").count()
PROCESSING_END_TIME_SP = datetime.now()

elapsed_SP = PROCESSING_END_TIME_SP - PROCESSING_START_TIME_SP
duration_seconds_SP = elapsed_SP.total_seconds()
duration_minutes_SP = round(duration_seconds_SP / 60, 2)  # run_duration is logged in minutes

log_columns = [
    "Dataset",
    "Start_Timestamp",
    "End_Timestamp",
    "run_duration",
    "Stage",
    "Destination",
    "Row_Count",
]
log_data = [
    (
        "gold_shipping_performance",
        PROCESSING_START_TIME_SP,
        PROCESSING_END_TIME_SP,
        duration_minutes_SP,
        "Gold-Layer",
        path_SP,
        row_count_SP,
    )
]
log_df = spark.createDataFrame(log_data, log_columns)

# Append-only write; mergeSchema is enabled on the Delta writer.
(
    log_df.write
    .format("delta")
    .mode("append")
    .option("mergeSchema", "true")
    .saveAsTable(LOG_TABLE)
)

print(" Pipeline load logged successfully.")


StatementMeta(, 6d77f00b-323c-4bc8-af16-fd63e7928704, 24, Finished, Available, Finished)

 Pipeline load logged successfully.
