In [0]:
from pyspark.sql.functions import *
from delta.tables import DeltaTable
from pyspark.sql.window import Window

In [0]:
%run /Workspace/Consolidated_pipeline/1_setup/utilities

In [0]:
# Echo the bronze/silver/gold schema names used for table paths below. They are not
# defined in this notebook — presumably supplied by the `%run` of the utilities
# notebook in the previous cell; verify there.
print(bronze_schema, silver_schema, gold_schema)

In [0]:
# Notebook parameters: (widget name, default value, label shown in the UI).
for widget_name, default_value, widget_label in (
    ("catalog", "fmcg", "Catalog"),
    ("data_source", "gross_price", "Data Source"),
):
    dbutils.widgets.text(widget_name, default_value, widget_label)

# Resolve the current widget values; these drive every table name below.
catalog = dbutils.widgets.get("catalog")
data_source = dbutils.widgets.get("data_source")

# Glob matching every CSV file under this data source's S3 prefix.
base_path = f's3://sportsbar-dj047/{data_source}/*.csv'
print(base_path)

### Bronze Data Processing

In [0]:
# Load every CSV for this data source (header row, inferred column types),
# stamp each row with the ingestion time, and carry the source file's name
# and size alongside the data columns.
csv_reader = (
    spark.read.format('csv')
    .option('inferSchema', True)
    .option('header', True)
)
df = (
    csv_reader.load(base_path)
    .withColumn("read_timestamp", current_timestamp())
    .select("*", "_metadata.file_name", "_metadata.file_size")
)

display(df.limit(10))

In [0]:
# Inspect the column types produced by `inferSchema` before persisting to bronze.
df.printSchema()

In [0]:
# Persist the raw ingest as the bronze Delta table, replacing any previous load.
bronze_table = f"{catalog}.{bronze_schema}.{data_source}"
bronze_writer = (
    df.write
    .format("delta")
    .option("delta.enableChangeDataFeed", "true")
    .mode("overwrite")
)
bronze_writer.saveAsTable(bronze_table)

### Silver Data Processing

In [0]:
# Re-read the persisted bronze table as the input to the silver transformations.
# `spark.table` is used (as for the other table reads in this notebook) instead of
# building a `select *` SQL string from widget values.
df_bronze = spark.table(f"{catalog}.{bronze_schema}.{data_source}")

Transformation

1. Normalize Month Field

In [0]:
# List the distinct raw `month` values so the date formats present in the data can be seen.
df_bronze.select(col('month')).distinct().show()

In [0]:

# 1. Parse `month` from multiple possible formats.
# Formats are tried in list order and the first successful parse wins; a value
# matching none of them becomes NULL (try_to_date returns NULL instead of raising).
date_formats = ["yyyy/MM/dd", "dd/MM/yyyy", "yyyy-MM-dd", "dd-MM-yyyy"]

# Build the candidates from `date_formats` so the list is the single source of
# truth (previously the list was defined but the formats were repeated by hand).
df_silver = df_bronze.withColumn(
    "month",
    coalesce(*[try_to_date(col("month"), date_format) for date_format in date_formats]),
)

In [0]:
# Re-check the distinct `month` values after parsing; a NULL here means a raw value
# matched none of the formats tried in the previous cell.
df_silver.select('month').distinct().show()

2. Handling Gross Price

In [0]:
# Preview the first rows to inspect the raw `gross_price` values before cleaning them.
df_silver.show(10)

In [0]:
# Clean `gross_price`:
#   - a value counts as numeric only if it matches an optional minus sign, digits,
#     and an optional decimal part (no whitespace, exponent, or thousands separators);
#   - numeric values are cast to double and negative prices are flipped to positive;
#   - everything else (including non-numeric text) is replaced with 0.0.
# `abs` here is the Spark column function brought in by the star import from
# pyspark.sql.functions, not the Python builtin.
df_silver = df_silver.withColumn(
    "gross_price",
    when(
        col("gross_price").rlike(r'^-?\d+(\.\d+)?$'),
        abs(col("gross_price").cast("double")),
    ).otherwise(lit(0.0)),  # explicit double literal keeps the column type unambiguous
)

In [0]:
# Show the silver DataFrame after the month and gross_price clean-up for a visual check.
display(df_silver)

In [0]:
# Enrich the silver dataset with `product_code` by joining to the silver products
# table on `product_id`. This is an inner join, so price rows whose `product_id`
# has no match in products are dropped.
# The table name is built from the `catalog` widget and `silver_schema`, like every
# other table in this notebook, instead of a hard-coded "fmcg.silver.products".
# NOTE(review): assumes the products table lives at {catalog}.{silver_schema}.products — confirm.
df_products = spark.table(f"{catalog}.{silver_schema}.products")
df_joined = (
    df_silver
    .join(df_products.select("product_id", "product_code"), on="product_id", how="inner")
    .select("product_id", "product_code", "month", "gross_price", "read_timestamp", "file_name", "file_size")
)

df_joined.show(5)

In [0]:
# Persist the enriched data as the silver Delta table, replacing the previous contents.
silver_table = f"{catalog}.{silver_schema}.{data_source}"
silver_writer = (
    df_joined.write
    .format("delta")
    .option("delta.enableChangeDataFeed", "true")
    .option("mergeSchema", "true")
    .mode("overwrite")
)
silver_writer.saveAsTable(silver_table)

### Gold Data Processing

In [0]:
# Re-read the persisted silver table as the input to the gold layer.
# `spark.table` replaces the string-built `SELECT * ...;` statement (and its
# trailing semicolon), matching the other table reads in this notebook.
df_silver = spark.table(f"{catalog}.{silver_schema}.{data_source}")

In [0]:
# Project the silver table down to the columns written to the gold table.
gold_columns = ["product_code", "month", "gross_price"]
df_gold = df_silver.select(*gold_columns)
df_gold.show(5)

In [0]:
# Persist the gold projection as a Delta table named `sb_dim_<data_source>`,
# replacing the previous contents.
gold_table = f"{catalog}.{gold_schema}.sb_dim_{data_source}"
gold_writer = (
    df_gold.write
    .format("delta")
    .option("delta.enableChangeDataFeed", "true")
    .mode("overwrite")
)
gold_writer.saveAsTable(gold_table)

Merging Data Source with Parent

In [0]:
# Read back the gold table written in the previous step. The name is built from
# the same widget/schema values used for the write, rather than the hard-coded
# "fmcg.gold.sb_dim_gross_price", so a non-default `catalog` reads the table
# this run actually produced.
df_gold_price = spark.table(f"{catalog}.{gold_schema}.sb_dim_{data_source}")
df_gold_price.show(5)

In [0]:
# Flag zero prices so they sort last: 0 = non-zero price, 1 = zero price.
zero_price_flag = when(col("gross_price") == 0, 1).otherwise(0)

# Add the calendar year of `month` and the zero-price flag.
df_gold_price = df_gold_price.withColumn("year", year("month"))
df_gold_price = df_gold_price.withColumn("is_zero", zero_price_flag)

# Within each (product_code, year): non-zero prices first, then most recent month first.
latest_price_window = Window.partitionBy("product_code", "year").orderBy(
    col("is_zero"),
    col("month").desc(),
)

# Keep only the top-ranked row per (product_code, year).
ranked_prices = df_gold_price.withColumn("rnk", row_number().over(latest_price_window))
df_gold_latest_price = ranked_prices.filter(col("rnk") == 1)

In [0]:
# Show the single selected price row per (product_code, year), including the helper
# columns (`is_zero`, `rnk`) that are dropped in the next cell.
display(df_gold_latest_price)

In [0]:
# Final shape: product_code, price_inr (renamed from gross_price), year (as string).
df_gold_latest_price = df_gold_latest_price.select(
    "product_code",
    col("gross_price").alias("price_inr"),
    col("year").cast("string").alias("year"),
)

display(df_gold_latest_price)