## Bronze Data Processing


In [0]:
from pyspark.sql.functions import *
from delta.tables import DeltaTable

In [0]:
%run /Workspace/Consolidated_pipeline/1_setup/utilities

In [0]:
# Print the bronze/silver/gold schema names. These variables are not defined in
# this notebook; presumably they come from the utilities notebook run above
# (TODO confirm).
print(bronze_schema, silver_schema, gold_schema)

In [0]:
# Declare the notebook's text widgets as (name, default value, label) triples.
for widget_name, default_value, widget_label in (
    ("catalog", "fmcg", "Catalog"),
    ("data_source", "customers", "Data Source"),
):
    dbutils.widgets.text(widget_name, default_value, widget_label)

In [0]:
# Read the current widget values back into plain variables.
catalog, data_source = (
    dbutils.widgets.get(widget) for widget in ("catalog", "data_source")
)

# Glob matching every CSV file under this data source's S3 prefix.
base_path = f"s3://sportsbar-dj047/{data_source}/*.csv"
print(base_path)

In [0]:
# Load the raw CSV files (header row, inferred column types) and tag every row
# with the load time plus the name and size of the file it was read from.
csv_reader = (
    spark.read.format("csv")
    .option("header", True)
    .option("inferSchema", True)
)
df = (
    csv_reader.load(base_path)
    .withColumn("read_timestamp", current_timestamp())
    .withColumn("file_name", col("_metadata.file_name"))
    .withColumn("file_size", col("_metadata.file_size"))
)
display(df, truncate=False)

In [0]:
# Save the raw rows as the bronze Delta table, overwriting any previous load.
bronze_writer = (
    df.write.format("delta")
    .option("delta.enableChangeDataFeed", "true")
    .mode("overwrite")
)
bronze_writer.saveAsTable(f"{catalog}.{bronze_schema}.{data_source}")

## Silver Data Processing

In [0]:
# Read the bronze table back as the starting point for the silver layer.
bronze_query = f"SELECT * FROM {catalog}.{bronze_schema}.{data_source}"
df_bronze = spark.sql(bronze_query)
df_bronze.display(10)

In [0]:
# Inspect the column names and types of the bronze DataFrame.
df_bronze.printSchema()

In [0]:
# customer_id values that appear on more than one bronze row.
rows_per_customer = df_bronze.groupBy(col("customer_id")).count()
df_duplicate = rows_per_customer.filter(col("count") > 1)
df_duplicate.display()

In [0]:

# Keep one row per customer_id and report the row count on either side.
rows_before = df_bronze.count()
print('Rows before Dupclicates dropped:', rows_before)

df_silver = df_bronze.dropDuplicates(['customer_id'])

rows_after = df_silver.count()
print('Rows after Dupclicates dropped:', rows_after)


In [0]:
# Strip leading and trailing whitespace from customer names.
trimmed_name = trim(col("customer_name"))
df_silver = df_silver.withColumn("customer_name", trimmed_name)

In [0]:
# (df_silver.filter(col("customer_name") != trim(col('customer_name')))).display()

In [0]:
# List the distinct city values currently present in the silver DataFrame.
df_silver.select('city').distinct().display()

In [0]:
# Known misspellings -> canonical city names.
city_mapping = {
    "Bengaluruu": "Bengaluru",
    "Bengalore": "Bengaluru",
    "Hyderabadd": "Hyderabad",
    "Hyderbad": "Hyderabad",
    "NewDelhi": "New Delhi",
    "NewDheli": "New Delhi",
    "NewDelhee": "New Delhi",
}

# City values that are kept once the corrections have been applied.
allowed = ["Bengaluru", "Hyderabad", "New Delhi"]

# Apply the spelling corrections, then keep the city only when it is in
# `allowed`. A `when` without `otherwise` yields NULL for every non-matching
# row, and a NULL city never satisfies `isin`, so NULL cities stay NULL and
# unrecognised cities become NULL.
corrected_cities = df_silver.replace(city_mapping, subset=["city"])
df_silver = corrected_cities.withColumn(
    "city",
    when(col("city").isin(allowed), col("city")),
)

In [0]:
# Show the distinct city values that remain in the silver DataFrame.
df_silver.select('city').distinct().show()

In [0]:
# Show the distinct customer names currently in the silver DataFrame.
df_silver.select('customer_name').distinct().show()

In [0]:
# Title-case every non-NULL customer name. Rows with a NULL name match no
# `when` branch and therefore stay NULL.
name_col = col("customer_name")
df_silver = df_silver.withColumn(
    "customer_name",
    when(name_col.isNotNull(), initcap(name_col)),
)

df_silver.select("customer_name").distinct().show()

In [0]:
# Preview the current silver DataFrame.
df_silver.display()

In [0]:
# Show the rows whose city is NULL.
df_silver.filter(col('city').isNull()).display()

In [0]:
# Customer names to inspect. Judging by the variable name these are presumably
# the customers found with a NULL city (TODO confirm against the data).
null_cutomer_names = [
    "Sprintx Nutrition",
    "Zenathlete Foods",
    "Primefuel Nutrition",
    "Recovery Lane",
]
is_listed_customer = col("customer_name").isin(null_cutomer_names)
df_silver.filter(is_listed_customer).display()

In [0]:
# Manual city assignments keyed by customer_id; the trailing comment on each
# entry names the customer it belongs to.
customer_city_fix = {
    789403: "New Delhi",  # Sprintx Nutrition
    789420: "Bengaluru",  # Zenathlete Foods
    789521: "Hyderabad",  # Primefuel Nutrition
    789603: "Hyderabad",  # Recovery Lane
}

# One (customer_id, fixed_city) row per dictionary entry.
fix_rows = list(customer_city_fix.items())
df_fix = spark.createDataFrame(fix_rows, ["customer_id", "fixed_city"])
df_fix.display()

In [0]:
# Left-join the manual fixes on customer_id and use fixed_city only where the
# existing city is NULL; the helper column is dropped afterwards.
silver_with_fixes = df_silver.join(df_fix, "customer_id", "left")
df_silver = silver_with_fixes.withColumn(
    "city", coalesce("city", "fixed_city")
).drop("fixed_city")

In [0]:
# Preview the current silver DataFrame.
df_silver.display()

In [0]:
# Store customer_id as a string and confirm the resulting schema.
customer_id_as_text = col("customer_id").cast("string")
df_silver = df_silver.withColumn("customer_id", customer_id_as_text)
df_silver.printSchema()

In [0]:
# Shape the silver rows for the customer dimension:
#   - customer_id is renamed to customer_code
#   - customer is "<customer_name>_<city>", with "Unknown" used for a NULL city
#   - market, platform and channel are constant columns
# The platform literal is spelled "Sports Bar" (previously misspelled as
# "Sprts Bar"). The channel column is named in lower case so that it matches the
# `channel` name selected by the gold-layer cells even when Spark runs with
# case-sensitive column resolution.
customer_label = concat_ws(
    "_",
    col("customer_name"),
    coalesce(col("city"), lit("Unknown")),
)
df_silver = (
    df_silver.withColumnRenamed("customer_id", "customer_code")
    .withColumn("customer", customer_label)
    .withColumn("market", lit("India"))
    .withColumn("platform", lit("Sports Bar"))
    .withColumn("channel", lit("Acquisition"))
)

df_silver.display()

In [0]:
# Save the cleaned rows as the silver Delta table, overwriting earlier contents.
silver_table = f"{catalog}.{silver_schema}.{data_source}"
(
    df_silver.write.format("delta")
    .option("delta.enableChangeDataFeed", "true")
    .option("mergeSchema", "true")
    .mode("overwrite")
    .saveAsTable(silver_table)
)

## Gold Data Processing

In [0]:
# Reload silver from the saved table.
df_silver = spark.sql(f"SELECT * FROM {catalog}.{silver_schema}.{data_source};")

# Gold keeps only the dimension attributes; the load-audit columns
# (read_timestamp, file_name, file_size) are not selected.
gold_columns = [
    "customer_code",
    "customer_name",
    "city",
    "customer",
    "market",
    "platform",
    "channel",
]
df_gold = df_silver.select(*gold_columns)

In [0]:
# Save the selected columns as this source's gold table (sb_dim_<data_source>),
# overwriting earlier contents.
gold_table = f"{catalog}.{gold_schema}.sb_dim_{data_source}"
(
    df_gold.write.format("delta")
    .option("delta.enableChangeDataFeed", "true")
    .mode("overwrite")
    .saveAsTable(gold_table)
)

### Merging Data Source with Parent

In [0]:
# Build the merge inputs from the same catalog/schema/data-source values that
# the gold write and the final read in this notebook use, instead of the
# hard-coded "fmcg.gold" names, so that changing the `catalog` widget (or the
# configured gold schema) cannot make the merge touch a different table than
# the one just written.

# Target: the parent customer dimension.
delta_table = DeltaTable.forName(spark, f"{catalog}.{gold_schema}.dim_customers")

# Source: the per-source gold table, restricted to the columns that are merged
# into the parent.
df_child_customers = spark.table(
    f"{catalog}.{gold_schema}.sb_dim_{data_source}"
).select(
    "customer_code",
    "customer",
    "market",
    "platform",
    "channel",
)

In [0]:
# Upsert the child customers into the parent dimension, matching on
# customer_code: matched rows are updated from the source, unmatched source
# rows are inserted.
customer_upsert = delta_table.alias("target").merge(
    source=df_child_customers.alias("source"),
    condition="target.customer_code = source.customer_code",
)
customer_upsert.whenMatchedUpdateAll().whenNotMatchedInsertAll().execute()

In [0]:
# Read the dim_customers table from the gold schema and display its rows.
df_gold = spark.sql(f"SELECT * FROM {catalog}.{gold_schema}.dim_customers")
df_gold.display()