In [0]:
from pyspark.sql import SparkSession 

In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.functions import sha2, concat_ws, upper
from pyspark.sql.types import StructType, StructField, LongType, StringType, DateType, IntegerType

In [0]:
# Reuse the active Spark session if there is one; otherwise start a new one
# registered under this application name.
spark = (
    SparkSession.builder
    .appName('FactSales2Dimensionality')
    .getOrCreate()
)

In [0]:
# Read the Delta table stored at /delta/fact_sales_final; the dimension
# tables built in later cells are all derived from this frame.
df = (
    spark.read
    .format("delta")
    .load("/delta/fact_sales_final")
)

In [0]:
# Print a plain-text preview of df (show() defaults to the first 20 rows,
# with long cell values truncated).
df.show()

+---------------+-----------+------+-----------+---------+---------+--------------+-----------+---------------+---------+---------+--------+----------+
|ProductCategory|ProductName| Brand|StoreRegion|StoreName|StoreType|      SalesRep| Department|   EmployeeRole|UnitsSold|UnitPrice|Discount|  SaleDate|
+---------------+-----------+------+-----------+---------+---------+--------------+-----------+---------------+---------+---------+--------+----------+
|      Furniture|    T-shirt|BrandB|       East|   StoreX|Franchise|   Martha Long|Electronics|        Cashier|     12.0|   279.35|     5.0|2022-12-14|
|       Clothing|     Tablet|BrandC|       East|   StoreZ|Franchise|   Martha Long|       Home|Sales Associate|    33.67|   272.49|     0.0|2023-02-24|
|       Clothing|     Tablet|BrandA|      South|   StoreX|   Retail| Emily Vazquez|    Apparel|        Cashier|    33.67|   484.75|    15.0|2025-03-24|
|    Electronics| Smartphone|BrandB|       West|   StoreY|   Outlet|Charles Fields|    A

In [0]:
# Build the product dimension: one row per distinct
# (ProductCategory, ProductName, Brand) combination found in df, keyed by a
# SHA-256 hash of those attributes, plus an "N/A" default member.
product_cols = ["ProductCategory", "ProductName", "Brand"]

# The key is defined once as a column expression and applied to both frames,
# so the data rows and the default row can never drift apart.
# NOTE(review): concat_ws skips NULL inputs, so ("A", NULL, "B") and
# ("A", "B", NULL) would both hash "A||B" and share a key. Confirm these
# source columns are never NULL before relying on key uniqueness.
product_key = sha2(concat_ws("||", *product_cols), 256)

default_product = (
    spark.createDataFrame([("N/A", "N/A", "N/A")], product_cols)
    .withColumn("DIM_ProductId", product_key)
)

dim_product = (
    df.select(*product_cols)
    .distinct()
    .withColumn("DIM_ProductId", product_key)
    # Align columns by name rather than by position.
    .unionByName(default_product)
    # If the source already contains an all-"N/A" member, the union adds a
    # second identical row; drop it so each DIM_ProductId appears once.
    .distinct()
)

In [0]:
# Render dim_product as a table for visual inspection.
# NOTE(review): display is not imported in this notebook; presumably the
# Databricks notebook built-in -- confirm before running elsewhere.
display(dim_product)

ProductCategory,ProductName,Brand,DIM_ProductId
Electronics,Smartphone,BrandC,fc08e15f7ca6c6df9df7da9e4815aaf89a29e1e19459ee7d969019e8cbb37123
Electronics,Smartphone,BrandB,6a421110b266104bf1cf67be336c9ca4cfff1f897b231e0631948bb3f034677e
Electronics,T-shirt,BrandA,cd2f205a3b37acc265cf4867714abf35098ab215ba53c4864cd498abeda41f19
Electronics,Desk,BrandC,d77d44bf969beb3dbe1c220e980137a5a50d692fcb5488788dcab5776eea5935
Clothing,Tablet,BrandA,a8406b374712399cebdf16a3e8a8c56cf72bf72db75cb2ad0dc3a1ff71e61285
Electronics,Chair,BrandA,dcd1602d12efc29c8c4598b7843ec57a534683119f15c2042695e5fe71031ffd
Furniture,Desk,BrandA,b5c80882f8e29da43355e05bb4fef15d73e22fc3e8a99209a389132cb1eaa6f2
Furniture,Tablet,BrandA,1839298fc3616a7fb8581b8eb5373248e28632ab00dfdec326053bc309f58bca
Electronics,Desk,BrandA,27280aade539db2d7433159d56c762fc2f0d86f6c030d9edc55eaaf355997161
Electronics,Tablet,BrandB,36fc80bfe93a526d8a175456e8243770785451a22d4ce473e3bbf391080eeba1


In [0]:
# Build the store dimension: one row per distinct
# (StoreRegion, StoreName, StoreType) combination found in df, keyed by a
# SHA-256 hash of those attributes, plus an "N/A" default member.
store_cols = ["StoreRegion", "StoreName", "StoreType"]

# The key is defined once as a column expression and applied to both frames,
# so the data rows and the default row can never drift apart.
# NOTE(review): concat_ws skips NULL inputs, so ("A", NULL, "B") and
# ("A", "B", NULL) would both hash "A||B" and share a key. Confirm these
# source columns are never NULL before relying on key uniqueness.
store_key = sha2(concat_ws("||", *store_cols), 256)

# Members observed in the data (no default row yet).
dim_store = (
    df.select(*store_cols)
    .distinct()
    .withColumn("DIM_StoreId", store_key)
)

default_store = (
    spark.createDataFrame([("N/A", "N/A", "N/A")], store_cols)
    .withColumn("DIM_StoreId", store_key)
)

dim_store_final = (
    dim_store
    # Align columns by name rather than by position.
    .unionByName(default_store)
    # If the source already contains an all-"N/A" member, the union adds a
    # second identical row; drop it so each DIM_StoreId appears once.
    .distinct()
)

In [0]:
# Render dim_store_final as a table for visual inspection.
# NOTE(review): display is not imported in this notebook; presumably the
# Databricks notebook built-in -- confirm before running elsewhere.
display(dim_store_final)

StoreRegion,StoreName,StoreType,DIM_StoreId
North,StoreY,Outlet,94ba623007bdf38a8d9dd25b359ba7a49b0a434a8c49e2aaff76cc12c50a8a77
South,StoreY,Retail,24007c12903a7fd1b36a5faa4c4069a2be77b043fb7b335f4a558f650aa68072
East,StoreX,Retail,eaaa9336e570e385b77b0050a85962732193ca23004af89ae72f2889abd91bd3
North,StoreX,Outlet,cd1aac1b6cedfa5d51c325a3c19d2132ce133ac285ce754fdf436527f68ab882
North,StoreZ,Retail,30d46cf1deb74c2d33b4ea7e7f6798ae0623d5195c080721dbf022942ba48b72
South,StoreZ,Franchise,b697ea824f0541eb515498c5eb6f2ce43947880d645dbe09ee14075c732aef48
South,StoreZ,Retail,1120c453edb7620e31fdc4578bc15c8cb5aa664c7d30218eefcaec0b67e511ce
East,StoreX,Franchise,3c7aa2361bb113e0892437ecba18d0fca00119aefb95552a6b18bb7eee14759a
South,StoreX,Outlet,76868e571d2e34422cf3c39e724b18da8fe589d29e3bd589ee5390c548aaabd5
East,StoreZ,Franchise,20f1ebafb8d3513848d887c5554eb0ce0ea2cc393afc88041813c6f9714ee7df


In [0]:
# Build the employee dimension: one row per distinct
# (SalesRep, Department, EmployeeRole) combination found in df, keyed by a
# SHA-256 hash of those attributes, plus an "N/A" default member.
employee_cols = ["SalesRep", "Department", "EmployeeRole"]

# The key is defined once as a column expression and applied to both frames,
# so the data rows and the default row can never drift apart.
# NOTE(review): concat_ws skips NULL inputs, so ("A", NULL, "B") and
# ("A", "B", NULL) would both hash "A||B" and share a key. Confirm these
# source columns are never NULL before relying on key uniqueness.
employee_key = sha2(concat_ws("||", *employee_cols), 256)

# Members observed in the data (no default row yet).
dim_employee = (
    df.select(*employee_cols)
    .distinct()
    .withColumn("DIM_EmployeeId", employee_key)
)

default_employee = (
    spark.createDataFrame([("N/A", "N/A", "N/A")], employee_cols)
    .withColumn("DIM_EmployeeId", employee_key)
)

dim_employee_final = (
    dim_employee
    # Align columns by name rather than by position.
    .unionByName(default_employee)
    # If the source already contains an all-"N/A" member, the union adds a
    # second identical row; drop it so each DIM_EmployeeId appears once.
    .distinct()
)

In [0]:
# Render dim_employee_final as a table for visual inspection.
# NOTE(review): display is not imported in this notebook; presumably the
# Databricks notebook built-in -- confirm before running elsewhere.
display(dim_employee_final)

SalesRep,Department,EmployeeRole,DIM_EmployeeId
Kyle Lin,Electronics,Sales Associate,b90eb8c52995b65d912ee69813964cf8b27d10fea22e5176a0efde3a2c331ad0
Charles Fields,Apparel,Manager,11e3df9450f3951d15425406cfd773e170e7e53297b071b8dfcfa87a688599c3
Wendy Castillo,Home,Manager,c4a16c19bcbaa9d8754690e3a800e18c1e1ee9c48dde85e5b64bcf7f30578cab
Wendy Castillo,Electronics,Manager,5b35a50e37f2fb896eef5da16148195eff3e208575ad22d8a6edfbff8511201c
Charles Fields,Home,Cashier,8b9d3284385f39a45911d6eb2bb4611a58d6ec152bee163ad1e886bd1083cd46
Kyle Lin,Home,Cashier,e3729a173a8c99d2fbdf176c56d695c1576bcc8e1a39a695a626c2fb1badb7a2
John Harris,Electronics,Manager,87ac6d5a4977bf75fdd34375ba02238b096ce6503cb4b6c9cd785114665a4d03
John Harris,Apparel,Manager,880162d4763b81e4ac57db4b957d788ae22f0ff2f3673c6d7c15f3a56a63ea54
Billy Perez,Electronics,Cashier,0b98f44b61cd1b506c8069d037184c4fde9d6b5b111aaa06ecb519d90d626be7
John Harris,Home,Cashier,fdedff2cb72a6a9be5669b3ceb5a76b5954802411dda3d1e83ddb183d5a86548


In [0]:

# Persist each dimension frame as a Delta table at its own path. Every write
# replaces the existing table contents ("overwrite") and is allowed to
# replace the stored schema as well ("overwriteSchema").
dimension_targets = [
    (dim_product, "/delta/dim_product"),
    (dim_store_final, "/delta/dim_store_final"),
    (dim_employee_final, "/delta/dim_employee_final"),
]

for dimension_frame, target_path in dimension_targets:
    writer = dimension_frame.write.format("delta")
    writer = writer.option("overwriteSchema", "true").mode("overwrite")
    writer.save(target_path)