In [0]:
from pyspark.sql import SparkSession 

In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.functions import *
from pyspark.sql.types import StructType, StructField, LongType, StringType, DateType, IntegerType

In [0]:
# Get (or reuse) the Spark session used by every cell in this notebook.
spark = (
    SparkSession.builder
    .appName('FactSales2Dimensionality')
    .getOrCreate()
)

In [0]:
# Load the stage-1 fact table from its Delta location.
df = (
    spark.read
    .format("delta")
    .load("/delta/fact_sales_stage1")
)

In [0]:
# Mean UnitsSold per ProductCategory. Despite the variable name this is a
# plain group-by aggregate, not a window function.
windowed_avg = (
    df.groupBy("ProductCategory")
    .agg(F.avg("UnitsSold").alias("avg_units"))
)

In [0]:
# Attach each category's mean, use it wherever UnitsSold is null, round the
# result to 2 decimal places, then discard the helper column.
df = (
    df.join(windowed_avg, on="ProductCategory", how="left")
    .withColumn(
        "UnitsSold",
        F.round(F.coalesce(F.col("UnitsSold"), F.col("avg_units")), 2),
    )
    .drop("avg_units")
)

In [0]:
# Render the current contents of df for visual inspection.
display(df)

ProductCategory,ProductName,Brand,StoreRegion,StoreName,StoreType,SalesRep,Department,EmployeeRole,UnitsSold,UnitPrice,Discount,SaleDate
Furniture,T-shirt,BrandB,East,StoreX,Franchise,Martha Long,Electronics,Cashier,12.0,-1.0,5.0,2022-12-14
Clothing,Tablet,BrandC,East,StoreZ,Franchise,Martha Long,Home,Sales Associate,33.67,272.49,,2023-02-24
Clothing,Tablet,BrandA,South,StoreX,Retail,Emily Vazquez,Apparel,Cashier,33.67,484.75,15.0,2025-03-24
Electronics,Smartphone,BrandB,West,StoreY,Outlet,Charles Fields,Apparel,Cashier,26.0,205.74,10.0,2023-09-30
Furniture,T-shirt,BrandC,East,StoreZ,Outlet,Wendy Castillo,Home,Manager,46.0,20.25,5.0,2022-10-14
Furniture,T-shirt,BrandC,South,StoreY,Retail,Wendy Castillo,Home,Manager,22.5,361.06,10.0,2024-02-23
Clothing,T-shirt,BrandC,South,StoreY,Outlet,John Harris,Home,Cashier,37.0,492.65,5.0,2024-05-06
Electronics,Smartphone,BrandC,South,StoreX,Outlet,Charles Fields,Home,Sales Associate,37.0,293.87,15.0,2023-04-04
Clothing,Jeans,BrandA,South,StoreY,Retail,Wendy Castillo,Electronics,Manager,23.0,189.47,15.0,2022-12-26
Furniture,T-shirt,BrandB,East,StoreZ,Franchise,Charles Fields,Apparel,Manager,25.0,359.08,10.0,2022-10-28


We filled null UnitsSold values with the mean UnitsSold of their ProductCategory, rounded to 2 decimal places.

In [0]:
# Checkpoint the UnitsSold-imputed data as an intermediate stage.
# This cell previously saved to /delta/fact_sales_final, so the final output
# location held partially cleaned data until a later cell overwrote it.
df.write.format("delta").mode("overwrite").save("/delta/fact_sales_stage2")

In [0]:
# Approximate median UnitPrice per ProductCategory. Rows whose price is -1 are
# left out of the calculation (the comparison also drops null prices).
median_values = (
    df.where(F.col("UnitPrice") != -1)
    .groupBy("ProductCategory")
    .agg(F.expr('percentile_approx(UnitPrice, 0.5)').alias("median_price"))
)

We fill UnitPrice values that are null or -1 with the median UnitPrice of their ProductCategory, rounded to 2 decimal places.

In [0]:
# Keep UnitPrice when it holds a real value. When it is -1 or null the inner
# `when` (which has no `otherwise`) yields null, so `coalesce` falls back to
# the category median. The result is rounded to 2 decimal places.
df = (
    df.join(median_values, on="ProductCategory", how="left")
    .withColumn(
        "UnitPrice",
        F.round(
            F.coalesce(
                F.when(F.col("UnitPrice") != -1, F.col("UnitPrice")),
                F.col("median_price"),
            ),
            2,
        ),
    )
    .drop("median_price")
)

In [0]:
# Render the current contents of df for visual inspection.
display(df)

ProductCategory,ProductName,Brand,StoreRegion,StoreName,StoreType,SalesRep,Department,EmployeeRole,UnitsSold,UnitPrice,Discount,SaleDate
Furniture,T-shirt,BrandB,East,StoreX,Franchise,Martha Long,Electronics,Cashier,12.0,279.35,5.0,2022-12-14
Clothing,Tablet,BrandC,East,StoreZ,Franchise,Martha Long,Home,Sales Associate,33.67,272.49,,2023-02-24
Clothing,Tablet,BrandA,South,StoreX,Retail,Emily Vazquez,Apparel,Cashier,33.67,484.75,15.0,2025-03-24
Electronics,Smartphone,BrandB,West,StoreY,Outlet,Charles Fields,Apparel,Cashier,26.0,205.74,10.0,2023-09-30
Furniture,T-shirt,BrandC,East,StoreZ,Outlet,Wendy Castillo,Home,Manager,46.0,20.25,5.0,2022-10-14
Furniture,T-shirt,BrandC,South,StoreY,Retail,Wendy Castillo,Home,Manager,22.5,361.06,10.0,2024-02-23
Clothing,T-shirt,BrandC,South,StoreY,Outlet,John Harris,Home,Cashier,37.0,492.65,5.0,2024-05-06
Electronics,Smartphone,BrandC,South,StoreX,Outlet,Charles Fields,Home,Sales Associate,37.0,293.87,15.0,2023-04-04
Clothing,Jeans,BrandA,South,StoreY,Retail,Wendy Castillo,Electronics,Manager,23.0,189.47,15.0,2022-12-26
Furniture,T-shirt,BrandB,East,StoreZ,Franchise,Charles Fields,Apparel,Manager,25.0,359.08,10.0,2022-10-28


In [0]:
# Save the current df as the stage-3 Delta table, replacing existing data.
(
    df.write
    .format("delta")
    .mode("overwrite")
    .save("/delta/fact_sales_stage3")
)

In [0]:
# Reload the stage-3 table that was just written.
df = (
    spark.read
    .format("delta")
    .load("/delta/fact_sales_stage3")
)

In [0]:
# Approximate median Discount per ProductCategory.
discount_median = (
    df.groupBy("ProductCategory")
    .agg(F.expr("percentile_approx(Discount, 0.5)").alias("median_discount"))
)

We use the median to replace nulls in the Discount column; however, we will get 0s after rounding to 2 decimal places, as the discount values come in steps of 0, 5, 10, 15, ...


In [0]:
# Attach each category's median discount, use it wherever Discount is null,
# round to 2 decimal places, then discard the helper column.
df = (
    df.join(discount_median, on="ProductCategory", how="left")
    .withColumn(
        "Discount",
        F.round(F.coalesce(F.col("Discount"), F.col("median_discount")), 2),
    )
    .drop("median_discount")
)

In [0]:
# Render the current contents of df for visual inspection.
display(df)

ProductCategory,ProductName,Brand,StoreRegion,StoreName,StoreType,SalesRep,Department,EmployeeRole,UnitsSold,UnitPrice,Discount,SaleDate
Furniture,T-shirt,BrandB,East,StoreX,Franchise,Martha Long,Electronics,Cashier,12.0,279.35,5.0,2022-12-14
Clothing,Tablet,BrandC,East,StoreZ,Franchise,Martha Long,Home,Sales Associate,33.67,272.49,0.0,2023-02-24
Clothing,Tablet,BrandA,South,StoreX,Retail,Emily Vazquez,Apparel,Cashier,33.67,484.75,15.0,2025-03-24
Electronics,Smartphone,BrandB,West,StoreY,Outlet,Charles Fields,Apparel,Cashier,26.0,205.74,10.0,2023-09-30
Furniture,T-shirt,BrandC,East,StoreZ,Outlet,Wendy Castillo,Home,Manager,46.0,20.25,5.0,2022-10-14
Furniture,T-shirt,BrandC,South,StoreY,Retail,Wendy Castillo,Home,Manager,22.5,361.06,10.0,2024-02-23
Clothing,T-shirt,BrandC,South,StoreY,Outlet,John Harris,Home,Cashier,37.0,492.65,5.0,2024-05-06
Electronics,Smartphone,BrandC,South,StoreX,Outlet,Charles Fields,Home,Sales Associate,37.0,293.87,15.0,2023-04-04
Clothing,Jeans,BrandA,South,StoreY,Retail,Wendy Castillo,Electronics,Manager,23.0,189.47,15.0,2022-12-26
Furniture,T-shirt,BrandB,East,StoreZ,Franchise,Charles Fields,Apparel,Manager,25.0,359.08,10.0,2022-10-28


In [0]:
# Save the current df to the final Delta location, replacing existing data.
(
    df.write
    .format("delta")
    .mode("overwrite")
    .save("/delta/fact_sales_final")
)