In [0]:
from pyspark.sql import SparkSession 
# Obtain the Spark session used by every cell below (created if none exists yet),
# registered under the application name 'Mapping'.
spark = (
    SparkSession.builder
    .appName('Mapping')
    .getOrCreate()
)

In [0]:
# List the files under /FileStore/tables/ so the CSV inputs read by the cells below can be confirmed.
# The bare expression is left as the last line so its value is displayed as the cell output.
dbutils.fs.ls("/FileStore/tables/")

Out[2]: [FileInfo(path='dbfs:/FileStore/tables/Custom_Mapping_DIM.csv', name='Custom_Mapping_DIM.csv', size=224, modificationTime=1750890784000),
 FileInfo(path='dbfs:/FileStore/tables/Dim_CustomerType.csv', name='Dim_CustomerType.csv', size=65, modificationTime=1750890993000),
 FileInfo(path='dbfs:/FileStore/tables/Dim_ProductCategory.csv', name='Dim_ProductCategory.csv', size=113, modificationTime=1750891112000),
 FileInfo(path='dbfs:/FileStore/tables/Dim_StoreRegion.csv', name='Dim_StoreRegion.csv', size=69, modificationTime=1750891157000),
 FileInfo(path='dbfs:/FileStore/tables/Fact_Transactions.csv', name='Fact_Transactions.csv', size=330, modificationTime=1750891215000),
 FileInfo(path='dbfs:/FileStore/tables/sales.csv', name='sales.csv', size=3308303, modificationTime=1750795238000)]

In [0]:
# Load the custom mapping CSV (first row is the header, column types are inferred) and preview it.
df_Custom_Mapping_DIM = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("/FileStore/tables/Custom_Mapping_DIM.csv")
)
df_Custom_Mapping_DIM.show()

+---------------+-----------+------------+-----------------+
|ProductCategory|StoreRegion|CustomerType|     MappingLabel|
+---------------+-----------+------------+-----------------+
|         Tablet|       East|      Retail|        Side Head|
|         Laptop|       West|      Retail|      Premium Box|
|         Mobile|      North|   Wholesale|North Dist Mobile|
|         Tablet|      South|      Retail|      Tablet Push|
|         Laptop|      North|   Wholesale|    Laptop Supply|
+---------------+-----------+------------+-----------------+



In [0]:
# Load the customer-type dimension CSV (header row, inferred column types) and preview it.
df_Dim_CustomerType = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("/FileStore/tables/Dim_CustomerType.csv")
)
df_Dim_CustomerType.show()

+------------+--------------+-------+
|CustomerType|CustomerTypeID|Segment|
+------------+--------------+-------+
|      Retail|             1|    B2C|
|   Wholesale|             2|    B2B|
+------------+--------------+-------+



In [0]:
# Load the product-category dimension CSV (header row, inferred column types) and preview it.
df_Dim_ProductCategory = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("/FileStore/tables/Dim_ProductCategory.csv")
)
df_Dim_ProductCategory.show()

+---------------+-----------------+-------------+
|ProductCategory|ProductCategoryID|CategoryGroup|
+---------------+-----------------+-------------+
|         Laptop|              101|    Computing|
|         Mobile|              102|  Electronics|
|         Tablet|              103|    Computing|
+---------------+-----------------+-------------+



In [0]:
# Load the store-region dimension CSV (header row, inferred column types) and preview it.
df_Dim_StoreRegion = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("/FileStore/tables/Dim_StoreRegion.csv")
)
df_Dim_StoreRegion.show()

+-----------+--------+---------+
|StoreRegion|RegionID|Territory|
+-----------+--------+---------+
|      North|       1|        A|
|      South|       2|        B|
|       East|       3|        C|
|       West|       4|        D|
+-----------+--------+---------+



In [0]:
# Load the transaction fact CSV (header row, inferred column types) and preview it.
df_Fact_Transactions = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("/FileStore/tables/Fact_Transactions.csv")
)
df_Fact_Transactions.show()

+-------------+---------------+-----------+------------+------+
|TransactionID|ProductCategory|StoreRegion|CustomerType|Amount|
+-------------+---------------+-----------+------------+------+
|            1|         Laptop|      North|      Retail|  1200|
|            2|         Mobile|      South|   Wholesale|   800|
|            3|         Tablet|       East|      Retail|   300|
|            4|         Laptop|       West|      Retail|  1500|
|            5|         Tablet|      North|   Wholesale|   400|
|            6|         Mobile|       East|      Retail|   900|
|            7|         Mobile|      South|      Retail|   850|
|            8|         Laptop|       West|   Wholesale|  1700|
|            9|         Tablet|       East|      Retail|   350|
|           10|         Mobile|      North|      Retail|   950|
+-------------+---------------+-----------+------------+------+



In [0]:
from pyspark.sql.functions import sha2, concat_ws, col
# Build the composite natural key "ProductCategory||StoreRegion||CustomerType" and store its
# SHA-256 hex digest in DIM_CustomLabelID.
custom_label_natural_key = concat_ws(
    "||",
    col("ProductCategory"),
    col("StoreRegion"),
    col("CustomerType"),
)
df_Custom_Mapping_DIM = df_Custom_Mapping_DIM.withColumn(
    "DIM_CustomLabelID",
    sha2(custom_label_natural_key, 256),
)

In [0]:
from pyspark.sql.functions import hash, trim, upper, col, sha2, concat_ws

# Derive the numeric surrogate key straight from the natural-key columns instead of from the
# current content of DIM_CustomLabelID. Hashing the column's own previous value made this cell
# non-idempotent: running it a second time hashed the already-hashed id and produced different
# keys. Recomputing hash(sha2(natural key)) yields the same ids as the first run of the original
# cell and stays stable however many times the cell is re-executed.
# NOTE(review): Spark's `hash` is documented as a 32-bit Murmur3 hash, so the bigint cast widens
# the column type but not the key space; consider `xxhash64` if collisions become a concern.
# NOTE(review): the key is computed from the values as loaded (not upper-cased/trimmed), unlike
# the other dimension keys in this notebook, which hash the normalized text.
df_Custom_Mapping_DIM = df_Custom_Mapping_DIM.withColumn(
    "DIM_CustomLabelID",
    hash(
        sha2(
            concat_ws("||", col("ProductCategory"), col("StoreRegion"), col("CustomerType")),
            256,
        )
    ).cast("bigint"),
)

# Put the surrogate key first, followed by the natural-key columns and the label.
df_Custom_Mapping_DIM = df_Custom_Mapping_DIM.select("DIM_CustomLabelID", "ProductCategory", "StoreRegion", "CustomerType", "MappingLabel")
df_Custom_Mapping_DIM.show()

+-----------------+---------------+-----------+------------+-----------------+
|DIM_CustomLabelID|ProductCategory|StoreRegion|CustomerType|     MappingLabel|
+-----------------+---------------+-----------+------------+-----------------+
|       -109225604|         Tablet|       East|      Retail|        Side Head|
|      -1989734020|         Laptop|       West|      Retail|      Premium Box|
|       1586346454|         Mobile|      North|   Wholesale|North Dist Mobile|
|      -1031006801|         Tablet|      South|      Retail|      Tablet Push|
|       -132973822|         Laptop|      North|   Wholesale|    Laptop Supply|
+-----------------+---------------+-----------+------------+-----------------+



In [0]:
# Persist the custom label dimension as the Delta table DIM_CustomLabel (overwrite mode).
(
    df_Custom_Mapping_DIM.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("DIM_CustomLabel")
)

In [0]:
# For each dimension: upper-case and trim the natural-key text in place, then add a
# surrogate key column holding the hash of that normalized text (cast to bigint).

# ProductCategory dimension
df_Dim_ProductCategory = (
    df_Dim_ProductCategory
    .withColumn("ProductCategory", trim(upper(col("ProductCategory"))))
    .withColumn("DIM_ProductCategoryID", hash(col("ProductCategory")).cast("bigint"))
)

# StoreRegion dimension
df_Dim_StoreRegion = (
    df_Dim_StoreRegion
    .withColumn("StoreRegion", trim(upper(col("StoreRegion"))))
    .withColumn("DIM_StoreRegionID", hash(col("StoreRegion")).cast("bigint"))
)

# CustomerType dimension
df_Dim_CustomerType = (
    df_Dim_CustomerType
    .withColumn("CustomerType", trim(upper(col("CustomerType"))))
    .withColumn("DIM_CustomerTypeID", hash(col("CustomerType")).cast("bigint"))
)

In [0]:
# Preview the product-category dimension, now with upper-cased names and the DIM_ProductCategoryID column.
df_Dim_ProductCategory.show()

+---------------+-----------------+-------------+---------------------+
|ProductCategory|ProductCategoryID|CategoryGroup|DIM_ProductCategoryID|
+---------------+-----------------+-------------+---------------------+
|         LAPTOP|              101|    Computing|           2110147584|
|         MOBILE|              102|  Electronics|          -1314911383|
|         TABLET|              103|    Computing|            178668753|
+---------------+-----------------+-------------+---------------------+



In [0]:
# Preview the store-region dimension, now with upper-cased names and the DIM_StoreRegionID column.
df_Dim_StoreRegion.show()

+-----------+--------+---------+-----------------+
|StoreRegion|RegionID|Territory|DIM_StoreRegionID|
+-----------+--------+---------+-----------------+
|      NORTH|       1|        A|       1869582694|
|      SOUTH|       2|        B|        -96241737|
|       EAST|       3|        C|      -1592160435|
|       WEST|       4|        D|       -402154387|
+-----------+--------+---------+-----------------+



In [0]:
# Preview the customer-type dimension, now with upper-cased names and the DIM_CustomerTypeID column.
df_Dim_CustomerType.show()

+------------+--------------+-------+------------------+
|CustomerType|CustomerTypeID|Segment|DIM_CustomerTypeID|
+------------+--------------+-------+------------------+
|      RETAIL|             1|    B2C|       -2034590472|
|   WHOLESALE|             2|    B2B|        2128323039|
+------------+--------------+-------+------------------+



In [0]:
# Persist each hashed dimension as its own Delta table (overwrite mode), in the same order as before.
for dim_frame, dim_table in (
    (df_Dim_ProductCategory, "DIM_ProductCategory_Hashed"),
    (df_Dim_StoreRegion, "DIM_StoreRegion_Hashed"),
    (df_Dim_CustomerType, "DIM_CustomerType_Hashed"),
):
    dim_frame.write.format("delta").mode("overwrite").saveAsTable(dim_table)


In [0]:
# Upper-case and trim the three join-key columns of the fact table so they compare equal to
# the normalized dimension values.
df_fact_cleaned_input = df_Fact_Transactions
for key_column in ("ProductCategory", "StoreRegion", "CustomerType"):
    df_fact_cleaned_input = df_fact_cleaned_input.withColumn(
        key_column, upper(trim(col(key_column)))
    )

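Normalize these tables so that the join keys (ProductCategory, StoreRegion, CustomerType) are upper-cased and trimmed consistently in the fact table and in every dimension before joining.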

In [0]:
def _normalize_keys(frame, key_columns):
    """Return `frame` with each column in `key_columns` replaced by its upper-cased, trimmed value."""
    for key_column in key_columns:
        frame = frame.withColumn(key_column, upper(trim(col(key_column))))
    return frame


# Fact table and custom mapping share the full three-column composite key.
df_fact_norm = _normalize_keys(
    df_fact_cleaned_input, ["ProductCategory", "StoreRegion", "CustomerType"]
)

# Each single-key dimension is normalized on its own natural key.
df_Dim_ProductCategory_norm = _normalize_keys(df_Dim_ProductCategory, ["ProductCategory"])

df_Dim_StoreRegion_norm = _normalize_keys(df_Dim_StoreRegion, ["StoreRegion"])

df_Dim_CustomerType_norm = _normalize_keys(df_Dim_CustomerType, ["CustomerType"])

df_Custom_Mapping_DIM_norm = _normalize_keys(
    df_Custom_Mapping_DIM, ["ProductCategory", "StoreRegion", "CustomerType"]
)


In [0]:
# Attach each dimension's surrogate key to the fact rows. Left joins keep every fact row even
# when a dimension has no matching value; the join order (ProductCategory, StoreRegion,
# CustomerType) is unchanged.
df_fact_enriched = (
    df_fact_norm
    # ProductCategory dimension
    .join(
        df_Dim_ProductCategory_norm.select("ProductCategory", "DIM_ProductCategoryID"),
        on="ProductCategory",
        how="left",
    )
    # StoreRegion dimension
    .join(
        df_Dim_StoreRegion_norm.select("StoreRegion", "DIM_StoreRegionID"),
        on="StoreRegion",
        how="left",
    )
    # CustomerType dimension
    .join(
        df_Dim_CustomerType_norm.select("CustomerType", "DIM_CustomerTypeID"),
        on="CustomerType",
        how="left",
    )
)


In [0]:
# Look up the custom label key by the full (ProductCategory, StoreRegion, CustomerType)
# combination. With a left join, combinations absent from the mapping get a null DIM_CustomLabelID.
custom_label_keys = ["ProductCategory", "StoreRegion", "CustomerType"]
custom_label_lookup = df_Custom_Mapping_DIM_norm.select(*custom_label_keys, "DIM_CustomLabelID")
df_fact_final = df_fact_enriched.join(custom_label_lookup, on=custom_label_keys, how="left")

In [0]:
# Preview the fact rows together with the four surrogate key columns added by the joins.
df_fact_final.show()

+---------------+-----------+------------+-------------+------+---------------------+-----------------+------------------+-----------------+
|ProductCategory|StoreRegion|CustomerType|TransactionID|Amount|DIM_ProductCategoryID|DIM_StoreRegionID|DIM_CustomerTypeID|DIM_CustomLabelID|
+---------------+-----------+------------+-------------+------+---------------------+-----------------+------------------+-----------------+
|         LAPTOP|      NORTH|      RETAIL|            1|  1200|           2110147584|       1869582694|       -2034590472|             null|
|         MOBILE|      SOUTH|   WHOLESALE|            2|   800|          -1314911383|        -96241737|        2128323039|             null|
|         TABLET|       EAST|      RETAIL|            3|   300|            178668753|      -1592160435|       -2034590472|       -109225604|
|         LAPTOP|       WEST|      RETAIL|            4|  1500|           2110147584|       -402154387|       -2034590472|      -1989734020|
|         TAB

In [0]:
# Keep only the transaction id, the four dimension surrogate keys and the Amount measure,
# then display the result as a check.
fact_output_columns = [
    "TransactionID",
    "DIM_ProductCategoryID",
    "DIM_StoreRegionID",
    "DIM_CustomerTypeID",
    "DIM_CustomLabelID",
    "Amount",
]
df_fact_cleaned = df_fact_final.select(*fact_output_columns)
df_fact_cleaned.show()

+-------------+---------------------+-----------------+------------------+-----------------+------+
|TransactionID|DIM_ProductCategoryID|DIM_StoreRegionID|DIM_CustomerTypeID|DIM_CustomLabelID|Amount|
+-------------+---------------------+-----------------+------------------+-----------------+------+
|            1|           2110147584|       1869582694|       -2034590472|             null|  1200|
|            2|          -1314911383|        -96241737|        2128323039|             null|   800|
|            3|            178668753|      -1592160435|       -2034590472|       -109225604|   300|
|            4|           2110147584|       -402154387|       -2034590472|      -1989734020|  1500|
|            5|            178668753|       1869582694|        2128323039|             null|   400|
|            6|          -1314911383|      -1592160435|       -2034590472|             null|   900|
|            7|          -1314911383|        -96241737|       -2034590472|             null|   850|


In [0]:
# Persist the cleaned fact table as the Delta table fact_transactions_cleaned
# (overwrite mode, with the overwriteSchema option enabled).
(
    df_fact_cleaned.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("fact_transactions_cleaned")
)