In [0]:
import dlt
from pyspark.sql.functions import (
    col, current_timestamp, sha2, concat_ws,year,month,dayofmonth,dayofweek,weekofyear,quarter,date_format,expr
)


 # Stage Product Data

@dlt.view
def staged_product():
    """Stage the denormalised product rows consumed by the ``dim_product`` flow.

    Streams ``product_silver`` (aliased ``product``) and left-joins it to batch
    reads of the product model, model/description/culture bridge, subcategory,
    category and description silver tables. A ``product_sk`` column (SHA-256
    of ProductID and ModifiedDate joined with ``||``) is added, the dimension
    columns are projected, and rows whose ProductID is null are dropped.

    NOTE(review): the bridge table is joined on ProductModelID only, so a
    model with several culture rows would presumably yield several rows per
    ProductID sharing the same ModifiedDate -- confirm this is acceptable for
    the downstream CDC flow keyed on ProductID.
    """
    products = dlt.readStream("training.ashish.product_silver").alias("product")
    models = dlt.read("training.ashish.productmodel_silver")
    subcategories = dlt.read("training.ashish.productsubcategory_silver")
    categories = dlt.read("training.ashish.productcategory_silver")
    descriptions = dlt.read("training.ashish.productdescription_silver")
    model_desc_culture = dlt.read(
        "training.ashish.productmodelproductdescriptionculture_silver"
    )

    # Every lookup is a left join so products without a match are retained.
    joined = (
        products
        .join(
            models,
            products["ProductModelID"] == models["ProductModelID"],
            "left",
        )
        .join(
            model_desc_culture,
            model_desc_culture["ProductModelID"] == models["ProductModelID"],
            "left",
        )
        .join(
            subcategories,
            products["ProductSubcategoryID"] == subcategories["ProductSubcategoryID"],
            "left",
        )
        .join(
            categories,
            subcategories["ProductCategoryID"] == categories["ProductCategoryID"],
            "left",
        )
        .join(
            descriptions,
            descriptions["ProductDescriptionID"]
            == model_desc_culture["ProductDescriptionID"],
            "left",
        )
    )

    # Surrogate key: hash of the business key plus its modification timestamp.
    surrogate_key = sha2(
        concat_ws("||", col("product.ProductID"), col("product.ModifiedDate")),
        256,
    )
    keyed = joined.withColumn("product_sk", surrogate_key)

    projection = [
        keyed["product_sk"],
        products["ProductID"],
        products["Name"].alias("Product_Name"),
        products["ProductNumber"],
        products["MakeFlag"],
        products["FinishedGoodsFlag"],
        products["Color"],
        products["StandardCost"],
        products["SellStartDate"],
        products["SellEndDate"],
        products["ModifiedDate"],
        models["ProductModelID"].alias("ProductModelID"),
        models["Name"].alias("ProductModel_Name"),
        subcategories["ProductSubcategoryID"].alias("ProductSubcategoryID"),
        subcategories["Name"].alias("Subcategory_Name"),
        categories["ProductCategoryID"].alias("ProductCategoryID"),
        categories["Name"].alias("ProductCategory_Name"),
        model_desc_culture["CultureID"],
        descriptions["ProductDescriptionID"],
        descriptions["Description"],
        current_timestamp().alias("event_timestamp"),
    ]

    return keyed.select(*projection).filter(col("ProductID").isNotNull())

# Target streaming table that stores the product dimension.
dlt.create_streaming_table(
    name="dim_product",
    comment="Product Dimension with SCD Type 2",
)

# Feed dim_product from the staged_product view: rows are keyed on ProductID,
# sequenced by ModifiedDate and stored as SCD type 2.
dlt.create_auto_cdc_flow(
    target="dim_product",
    source="staged_product",
    keys=["ProductID"],
    sequence_by="ModifiedDate",
    stored_as_scd_type="2",
)


 # Stage Customer Data
@dlt.view(
    name="customer_combined_view"
)
def customer_changes():
    """Stage the combined person/customer rows for the ``dim_customer`` flow.

    Streams ``person_silver`` and left-joins it to batch reads of the customer,
    business-entity-address, address-type, address and email-address silver
    tables. A ``customer_sk`` column (SHA-256 of the customer's CustomerID and
    ModifiedDate joined with ``||``) is added, AccountNumber and EmailAddress
    are passed through the ``training.ashish.customer_mask`` and
    ``training.ashish.mask_email`` SQL functions, and rows with a null
    CustomerID are dropped -- which also removes persons that have no matching
    customer row, since customer is attached with a left join.

    NOTE(review): ``customer_sk`` is built from the customer's CustomerID and
    ModifiedDate, while the view exposes person-based PersonID and
    Person_ModifiedDate columns as well -- confirm the surrogate key matches
    the key/sequence columns the consuming CDC flow uses.
    NOTE(review): a person with several addresses or e-mail rows would
    presumably appear more than once per PersonID -- confirm against the data.
    """
    customer = dlt.read("training.ashish.customer_silver").alias("customer")
    person = dlt.readStream("training.ashish.person_silver")
    businessentityaddress = dlt.read("training.ashish.businessentityaddress_silver")
    addresstype = dlt.read("training.ashish.addresstype_silver")
    emailaddress = dlt.read("training.ashish.emailaddress_silver")
    address = dlt.read("training.ashish.address_silver")

    # The streaming person table drives the view; every lookup is a left join.
    customer_mergeddf = (
        person
        .join(customer, person["BusinessEntityID"] == customer["PersonID"], "left")
        .join(
            businessentityaddress,
            person["BusinessEntityID"] == businessentityaddress["BusinessEntityID"],
            "left",
        )
        .join(
            addresstype,
            businessentityaddress["AddressTypeID"] == addresstype["AddressTypeID"],
            "left",
        )
        .join(
            address,
            businessentityaddress["AddressID"] == address["AddressID"],
            "left",
        )
        .join(
            emailaddress,
            person["BusinessEntityID"] == emailaddress["BusinessEntityID"],
            "left",
        )
    )

    # Surrogate key hashed from the aliased customer columns.
    customer_mergeddf = customer_mergeddf.withColumn(
        "customer_sk",
        sha2(
            concat_ws("||", col("customer.CustomerID"), col("customer.ModifiedDate")),
            256,
        ),
    )

    customer_final = customer_mergeddf.select(
        customer_mergeddf["customer_sk"],
        customer["CustomerID"],
        customer["PersonID"].alias("Customer_PersonID"),
        customer["StoreID"],
        customer["TerritoryID"],
        # Masked through a catalog SQL function rather than exposed raw.
        expr("training.ashish.customer_mask(AccountNumber)").alias("AccountNumber"),
        customer["ModifiedDate"],
        businessentityaddress["BusinessEntityID"],
        businessentityaddress["AddressTypeID"],
        businessentityaddress["AddressID"],
        person["BusinessEntityID"].alias("PersonID"),
        person["PersonType"],
        person["NameStyle"],
        person["FirstName"],
        person["MiddleName"],
        person["LastName"],
        person["ModifiedDate"].alias("Person_ModifiedDate"),
        expr("training.ashish.mask_email(EmailAddress)").alias("EmailAddress"),
        address["AddressLine1"],
        address["AddressLine2"],
        emailaddress["EmailAddressID"],
        customer["rowguid"],
        current_timestamp().alias("Ingested_time"),
    ).filter(col("CustomerID").isNotNull())

    return customer_final
 # Create Streaming Table for Gold Dimension
dlt.create_streaming_table(
    name="dim_customer",
    comment="customer with SCD Type 2",
)

# Apply changes (SCD Type 2). Uses the AUTO CDC API, matching the dim_product
# flow in this file, instead of the legacy apply_changes name; the arguments
# (flow name, target, source, keys, sequencing column, SCD type) are unchanged.
dlt.create_auto_cdc_flow(
    name="apply_the_changes_to_the_dim_customer_gold_table",
    target="dim_customer",
    source="customer_combined_view",
    keys=["PersonID"],
    sequence_by="Person_ModifiedDate",
    stored_as_scd_type=2,
)

 #Date Dimension

@dlt.table(
    name="dim_date",
    comment="Date Dimension"
)
def dim_date():
    """Build the date dimension from the sales order detail stream.

    Reads ``salesorderdetail_silver`` as a stream, takes the de-duplicated
    ModifiedDate values, and derives calendar attributes (year, month, day,
    day of week, week of year, quarter, month name, day name) plus a
    ``yyyyMMdd``-formatted ``DateKey`` for each value.

    NOTE(review): ModifiedDate is used as-is; if it carries a time component
    the de-duplication happens per timestamp rather than per calendar day --
    confirm against the source schema.
    """
    order_details = dlt.readStream("training.ashish.salesorderdetail_silver")

    # dropDuplicates() is the streaming-supported deduplication operator;
    # distinct() on a streaming DataFrame is planned as a streaming aggregation,
    # which append-mode output rejects without a watermark.
    dates = order_details.select(col("ModifiedDate").alias("date")).dropDuplicates()

    return dates.select(
        col("date").alias("Date"),
        year("date").alias("Year"),
        month("date").alias("Month"),
        dayofmonth("date").alias("Day"),
        dayofweek("date").alias("DayOfWeek"),
        weekofyear("date").alias("WeekOfYear"),
        quarter("date").alias("Quarter"),
        date_format(col("date"), "MMMM").alias("MonthName"),
        date_format(col("date"), "EEEE").alias("DayName"),
        date_format(col("date"), "yyyyMMdd").alias("DateKey"),
    )




 # Fact Sales Order Detail

@dlt.table(
    name="Sales_fact_table",
    comment="Fact table for sales with FK to product, customer, and date dimensions",
)
def fact_sales():
    """Build the sales fact rows from order details enriched with dimensions.

    Streams ``salesorderdetail_silver``, left-joins the batch
    ``salesorderheader_silver`` on SalesOrderID, then left-joins the product,
    customer and date dimensions. For the two SCD2 dimensions only rows whose
    ``__END_AT`` is null are used. The result carries the order identifiers,
    the natural keys (as ``product_key``, ``customer_key``, ``date_key``), the
    order measures, both dimensions' ModifiedDate values and both surrogate
    keys.

    NOTE(review): the customer dimension's ``ModifiedDate`` column is emitted
    under the name ``Person_ModifiedDate`` -- confirm that is the intended
    source column.
    NOTE(review): no column of the date dimension is selected, so that join
    only influences row multiplicity -- confirm it is still needed.
    """
    detail = dlt.readStream("training.ashish.salesorderdetail_silver")
    header = dlt.read("training.ashish.salesorderheader_silver")

    # Order lines with the header attributes needed for the dimension lookups.
    order_lines = detail.join(
        header,
        detail["SalesOrderID"] == header["SalesOrderID"],
        "left",
    ).select(
        detail["SalesOrderID"].alias("SalesOrderID"),
        detail["SalesOrderDetailID"].alias("SalesOrderDetailID"),
        detail["ProductID"],
        detail["OrderQty"].alias("OrderQty"),
        detail["UnitPrice"].alias("UnitPrice"),
        detail["UnitPriceDiscount"].alias("UnitPriceDiscount"),
        detail["LineTotal"].alias("LineTotal"),
        header["CustomerID"],
        header["OrderDate"].alias("OrderDate"),
        header["ShipDate"].alias("ShipDate"),
        header["ShipMethodID"].alias("ShipMethodID"),
    )

    # Current SCD2 versions only: rows that have not been end-dated.
    is_current = col("__END_AT").isNull()
    current_customers = dlt.read("training.ashish.dim_customer").filter(is_current)
    current_products = dlt.read("training.ashish.dim_product").filter(is_current)
    dates = dlt.read("training.ashish.dim_date")

    enriched = (
        order_lines
        .join(
            current_products,
            order_lines["ProductID"] == current_products["ProductID"],
            "left",
        )
        .join(
            current_customers,
            order_lines["CustomerID"] == current_customers["CustomerID"],
            "left",
        )
        .join(dates, order_lines["OrderDate"] == dates["Date"], "left")
    )

    fact_columns = [
        order_lines["SalesOrderID"].alias("order_id"),
        order_lines["SalesOrderDetailID"].alias("order_detail_id"),
        order_lines["ProductID"].alias("product_key"),    # FK to dim_product
        order_lines["CustomerID"].alias("customer_key"),  # FK to dim_customer
        order_lines["OrderDate"].alias("date_key"),       # FK to dim_date
        order_lines["OrderQty"],
        order_lines["UnitPrice"],
        order_lines["UnitPriceDiscount"],
        order_lines["LineTotal"],
        current_customers["ModifiedDate"].alias("Person_ModifiedDate"),
        current_products["ModifiedDate"].alias("modifiedDate"),
        current_products["product_sk"],
        current_customers["customer_sk"],
    ]
    return enriched.select(*fact_columns)

In [0]:
# import dlt
# from pyspark.sql.functions import (
#     col, current_timestamp, sha2, concat_ws,year,month,dayofmonth,dayofweek,weekofyear,quarter,date_format,expr
# )

# @dlt.view
# def staged_product():
#     product = dlt.readStream("training.ashish.product_silver").alias("product")
#     prodmodel = dlt.read("training.ashish.productmodel_silver")
#     subcategory = dlt.read("training.ashish.productsubcategory_silver")
#     category = dlt.read("training.ashish.productcategory_silver")
#     productdesc = dlt.read("training.ashish.productdescription_silver")
#     prodmodelproddesccult = dlt.read("training.ashish.productmodelproductdescriptionculture_silver")

#     df = product \
#         .join(prodmodel, "ProductModelID", "left") \
#         .join(prodmodelproddesccult, "ProductModelID", "left") \
#         .join(subcategory, "ProductSubcategoryID", "left") \
#         .join(category, "ProductCategoryID", "left") \
#         .join(productdesc, "ProductDescriptionID", "left")

#     df = df.withColumn(
#         "product_sk",
#         sha2(concat_ws("||", col('product.ProductID'), col('product.ModifiedDate')), 256)
#     )

#     return df.select(
#         "product_sk",
#         product["ProductID"],
#         product["Name"].alias("Product_Name"),
#         product["ProductNumber"],
#         product["MakeFlag"],
#         product["FinishedGoodsFlag"],
#         product["Color"],
#         product["StandardCost"],
#         product["SellStartDate"],
#         product["SellEndDate"],
#         product["ModifiedDate"],
#         prodmodel["ProductModelID"],
#         prodmodel["Name"].alias("ProductModel_Name"),
#         subcategory["ProductSubcategoryID"],
#         subcategory["Name"].alias("Subcategory_Name"),
#         category["ProductCategoryID"],
#         category["Name"].alias("ProductCategory_Name"),
#         prodmodelproddesccult["CultureID"],
#         productdesc["ProductDescriptionID"],
#         productdesc["Description"]
#     )

# dlt.create_streaming_table(name="dim_product", comment="Product Dimension with explicit surrogate key")

# dlt.create_auto_cdc_flow(
#     target="dim_product",
#     source="staged_product",
#     keys=["ProductID"],
#     sequence_by="ModifiedDate",
#     stored_as_scd_type="2"
# )


In [0]:
# @dlt.view(name="customer_combined_view")
# def customer_changes():
#     customer = dlt.read("training.ashish.customer_silver").alias("cutomer")
#     person = dlt.readStream("training.ashish.person_silver")
#     businessentityaddress = dlt.read("training.ashish.businessentityaddress_silver")
#     addresstype = dlt.read("training.ashish.addresstype_silver")
#     emailaddress = dlt.read("training.ashish.emailaddress_silver")
#     address = dlt.read("training.ashish.address_silver")

#     df = person \
#         .join(customer, person["BusinessEntityID"] == customer["PersonID"], "left") \
#         .join(businessentityaddress, "BusinessEntityID", "left") \
#         .join(addresstype, "AddressTypeID", "left") \
#         .join(address, "AddressID", "left") \
#         .join(emailaddress, "BusinessEntityID", "left")

#     df = df.withColumn(
#         "customer_sk",
#         sha2(concat_ws("||", col('cutomer.CustomerID'), col('cutomer.ModifiedDate')), 256)
#     )

#     return df.select(
#         "customer_sk",
#         customer["CustomerID"],
#         customer["PersonID"].alias("Customer_PersonID"),
#         customer["StoreID"],
#         customer["TerritoryID"],
#         expr("training.ashish.customer_mask(AccountNumber)").alias("AccountNumber"),
#         customer["ModifiedDate"],
#         businessentityaddress["BusinessEntityID"],
#         businessentityaddress["AddressTypeID"],
#         businessentityaddress["AddressID"],
#         person["BusinessEntityID"].alias("PersonID"),
#         person["PersonType"],
#         person["NameStyle"],
#         person["FirstName"],
#         person["MiddleName"],
#         person["LastName"],
#         person["ModifiedDate"].alias("Person_ModifiedDate"),
#         emailaddress["EmailAddressID"],
#         expr("training.ashish.mask_email(EmailAddress)").alias("EmailAddress"),
#         address["AddressLine1"],
#         address["AddressLine2"],
#         customer["rowguid"],
#         current_timestamp().alias("Ingested_time")
#     )

# dlt.create_streaming_table(name="dim_customer", comment="Customer Dimension with explicit surrogate key")

# dlt.create_auto_cdc_flow(
#     target="dim_customer",
#     source="customer_combined_view",
#     keys=["CustomerID"],
#     sequence_by="ModifiedDate",
#     stored_as_scd_type="2"
# )


In [0]:
# @dlt.table(name="dim_date", comment="Date Dimension")
# def dim_date():
#     df = dlt.readStream("training.ashish.salesorderdetail_silver")
#     df = df.select(col("ModifiedDate").alias("date")).distinct()

#     return df.select(
#         col("date").alias("Date"),
#         year("date").alias("Year"),
#         month("date").alias("Month"),
#         dayofmonth("date").alias("Day"),
#         dayofweek("date").alias("DayOfWeek"),
#         weekofyear("date").alias("WeekOfYear"),
#         quarter("date").alias("Quarter"),
#         date_format(col("date"), "MMMM").alias("MonthName"),
#         date_format(col("date"), "EEEE").alias("DayName"),
#         date_format(col("date"), "yyyyMMdd").alias("DateKey")
#     )


In [0]:
# @dlt.table(name="Sales_fact_table", comment="Fact table using explicit surrogate keys")
# def fact_sales():
#     sales_order_detail = dlt.readStream("training.ashish.salesorderdetail_silver")
#     sales_order_header = dlt.read("training.ashish.salesorderheader_silver")

#     sales_df = sales_order_detail.join(
#         sales_order_header,
#         "SalesOrderID",
#         "left"
#     ).select(
#         "SalesOrderID", "SalesOrderDetailID", "ProductID", "OrderQty", "UnitPrice",
#         "UnitPriceDiscount", "LineTotal", "CustomerID", "OrderDate", "ShipDate", "ShipMethodID"
#     )

#     customer_dim_df = dlt.read("dim_customer").filter(col("__END_AT").isNull())
#     product_dim_df = dlt.read("dim_product").filter(col("__END_AT").isNull())
#     date_dim_df = dlt.read("dim_date")

#     enriched_df = sales_df \
#         .join(product_dim_df, "ProductID", "left") \
#         .join(customer_dim_df, "CustomerID", "left") \
#         .join(date_dim_df, sales_df["OrderDate"] == date_dim_df["Date"], "left")

#     return enriched_df.select(
#         sales_df["SalesOrderID"].alias("order_id"),
#         sales_df["SalesOrderDetailID"].alias("order_detail_id"),
#         product_dim_df["product_sk"],
#         customer_dim_df["customer_sk"],
#         date_dim_df["DateKey"].alias("date_sk"),
#         sales_df["OrderQty"],
#         sales_df["UnitPrice"],
#         sales_df["UnitPriceDiscount"],
#         sales_df["LineTotal"],
#         sales_df["ShipDate"],
#         sales_df["ShipMethodID"],
#         customer_dim_df["ModifiedDate"].alias("Person_ModifiedDate"),
#         product_dim_df["ModifiedDate"].alias("modifiedDate")
#     )
