## Silver Pipeline – Cleansed & Standardized Data
Here we shall refine the raw Bronze data through validation, deduplication, schema alignment, and business‑rule transformations.

In [0]:
# Load bronze data
df_events = spark.read.table("workspace.default.df_bronze_events")
display(df_events.limit(10))

event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
2019-10-01 00:00:00 UTC,view,44600062,2103807459595387724,,shiseido,35.79,541312140,72d76fde-8bb3-4e00-8c23-a032dfed738c
2019-10-01 00:00:00 UTC,view,3900821,2053013552326770905,appliances.environment.water_heater,aqua,33.2,554748717,9333dfbd-b87a-4708-9857-6336556b0fcc
2019-10-01 00:00:01 UTC,view,17200506,2053013559792632471,furniture.living_room.sofa,,543.1,519107250,566511c2-e2e3-422b-b695-cf8e6e792ca8
2019-10-01 00:00:01 UTC,view,1307067,2053013558920217191,computers.notebook,lenovo,251.74,550050854,7c90fc70-0e80-4590-96f3-13c02c18c713
2019-10-01 00:00:04 UTC,view,1004237,2053013555631882655,electronics.smartphone,apple,1081.98,535871217,c6bd7419-2748-4c56-95b4-8cec9ff8b80d
2019-10-01 00:00:05 UTC,view,1480613,2053013561092866779,computers.desktop,pulser,908.62,512742880,0d0d91c2-c9c2-4e81-90a5-86594dec0db9
2019-10-01 00:00:08 UTC,view,17300353,2053013553853497655,,creed,380.96,555447699,4fe811e9-91de-46da-90c3-bbd87ed3a65d
2019-10-01 00:00:08 UTC,view,31500053,2053013558031024687,,luminarc,41.16,550978835,6280d577-25c8-4147-99a7-abc6048498d6
2019-10-01 00:00:10 UTC,view,28719074,2053013565480109009,apparel.shoes.keds,baden,102.71,520571932,ac1cd4e5-a3ce-4224-a2d7-ff660a105880
2019-10-01 00:00:11 UTC,view,1004545,2053013555631882655,electronics.smartphone,huawei,566.01,537918940,406c46ed-90a4-4787-a43b-59a410c1a5fb


### HANDLED MISSING VALUES

In [0]:
df_events = df_events.dropna(subset=["product_id"])
df_events = df_events.dropna(subset=["user_id"])
print("✅Dropped null values")

✅Dropped null values


### REMOVE DUPLICATES

In [0]:
df_events = df_events.dropDuplicates(["product_id", "user_id"])
print("✅Dropped duplicates based on product_id and user_id")

✅Dropped duplicates based on product_id and user_id


### STANDARDISE DATA TYPES

#### Addition of new columns 

In [0]:

# Add event_date and event_year columns
from pyspark.sql.functions import to_date, year

df_events = df_events.withColumn("event_date", to_date("event_time"))
df_events = df_events.withColumn("event_year", year("event_time"))
display(df_events.limit(10))

event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session,event_date,event_year
2019-11-17 08:43:15 UTC,view,2601500,2053013563970159485,,artel,117.09,572493724,2f9284c6-3213-425a-88c3-1683af98f3a7,2019-11-17,2019
2019-11-17 08:43:20 UTC,cart,32403829,2053013566562238479,,,20.59,572259923,b5b6ec96-3689-4b53-811f-9eed2bfedf9a,2019-11-17,2019
2019-11-17 08:43:20 UTC,view,1201292,2172371436436455782,electronics.tablet,apple,334.09,518536694,82c32b08-7db0-4576-87a0-1c84d2cc50c6,2019-11-17,2019
2019-11-17 08:43:25 UTC,view,1307240,2053013558920217191,computers.notebook,asus,385.57,530827719,3517cc5b-47ad-41fe-b8f5-834bfbcdbfd8,2019-11-17,2019
2019-11-17 08:43:52 UTC,view,1201479,2172371436436455782,electronics.tablet,lenovo,306.06,518307425,865c008f-24ed-41b7-8b43-5a112652f278,2019-11-17,2019
2019-11-17 08:44:04 UTC,view,4804295,2053013554658804075,electronics.audio.headphone,xiaomi,33.21,568514979,42e89d6f-4b15-4c9d-b175-0de3487c7786,2019-11-17,2019
2019-11-17 08:44:07 UTC,view,5801656,2053013553945772349,electronics.audio.subwoofer,element,131.28,515654856,a3bc13ae-6066-40b3-af64-5e8e5bbd6f01,2019-11-17,2019
2019-11-17 08:44:07 UTC,view,1004250,2053013555631882655,electronics.smartphone,apple,870.78,515470439,59789a31-90d5-409c-ac9f-d79636e45326,2019-11-17,2019
2019-11-17 08:44:27 UTC,view,15700033,2053013559733912211,,,202.58,554591196,b44105fb-ea13-4649-9700-f5b8f60a8d67,2019-11-17,2019
2019-11-17 08:44:47 UTC,view,14300088,2053013557603205653,electronics.audio.music_tools.piano,yamaha,161.11,513834891,524854f3-74b4-4fe4-9e85-a188d462b6a1,2019-11-17,2019


#### Changed Date format


In [0]:
from pyspark.sql.functions import date_format

# format: dd-MM-yyyy
df_events = df_events.withColumn("date", date_format("event_date", "dd-MM-yyyy"))
display(df_events.limit(3))

event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session,event_date,event_year,date
2019-11-17 08:43:15 UTC,view,2601500,2053013563970159485,,artel,117.09,572493724,2f9284c6-3213-425a-88c3-1683af98f3a7,2019-11-17,2019,17-11-2019
2019-11-17 08:43:20 UTC,cart,32403829,2053013566562238479,,,20.59,572259923,b5b6ec96-3689-4b53-811f-9eed2bfedf9a,2019-11-17,2019,17-11-2019
2019-11-17 08:43:20 UTC,view,1201292,2172371436436455782,electronics.tablet,apple,334.09,518536694,82c32b08-7db0-4576-87a0-1c84d2cc50c6,2019-11-17,2019,17-11-2019


####  String to date

In [0]:
from pyspark.sql.functions import to_date

df_events = df_events.withColumn("date", to_date("date", "dd-MM-yyyy"))
display(df_events.limit(3))

event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session,event_date,event_year,date
2019-11-17 08:43:15 UTC,view,2601500,2053013563970159485,,artel,117.09,572493724,2f9284c6-3213-425a-88c3-1683af98f3a7,2019-11-17,2019,2019-11-17
2019-11-17 08:43:20 UTC,cart,32403829,2053013566562238479,,,20.59,572259923,b5b6ec96-3689-4b53-811f-9eed2bfedf9a,2019-11-17,2019,2019-11-17
2019-11-17 08:43:20 UTC,view,1201292,2172371436436455782,electronics.tablet,apple,334.09,518536694,82c32b08-7db0-4576-87a0-1c84d2cc50c6,2019-11-17,2019,2019-11-17


#### Sorting data

In [0]:

# Sort by date
df_events = df_events.orderBy("event_date", ascending=True)
display(df_events.limit(3))

event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session,event_date,event_year,date
2019-10-01 02:28:25 UTC,view,11000101,2053013554960793975,appliances.kitchen.toster,philips,28.29,518497482,547ed012-4c21-4490-b9c5-80778b6d4780,2019-10-01,2019,2019-10-01
2019-10-01 02:33:57 UTC,view,39800121,2086471240842740173,apparel.shirt,dion,29.6,521245875,550e3f91-3053-4fb0-ae1f-20a712d67255,2019-10-01,2019,2019-10-01
2019-10-01 02:42:42 UTC,view,1005105,2053013555631882655,electronics.smartphone,apple,1415.48,553019427,a20b68b2-484f-4732-b4b9-b02a35c8fbc4,2019-10-01,2019,2019-10-01


#### Getting rid of unimportant columns

In [0]:
# Deleting columns
df_events = df_events.drop("event_time", "category_id", "user_session", "event_date")
display(df_events.limit(3))

event_type,product_id,category_code,brand,price,user_id,event_year,date
view,11000101,appliances.kitchen.toster,philips,28.29,518497482,2019,2019-10-01
view,39800121,apparel.shirt,dion,29.6,521245875,2019,2019-10-01
view,1005105,electronics.smartphone,apple,1415.48,553019427,2019,2019-10-01


### CORRECTING INCONSISTENCIES

In [0]:
from pyspark.sql.functions import col, round

# Convert price from string to float, round to two decimal places, then cast to integer
df_events = df_events.withColumn("price", round(col("price").cast("float"), 2).cast("int"))
display(df_events.limit(3))

event_type,product_id,category_code,brand,price,user_id,event_year,date
view,11000101,appliances.kitchen.toster,philips,28,518497482,2019,2019-10-01
view,39800121,apparel.shirt,dion,29,521245875,2019,2019-10-01
view,1005105,electronics.smartphone,apple,1415,553019427,2019,2019-10-01


### CLEAN CATEGORIAL VALUES

In [0]:
display(df_events.select("category_code").distinct().orderBy("category_code", ascending=True))

# Count the number of nulls in category_code
display(df_events.filter(col("category_code").isNull()).count())

category_code
""
accessories.bag
accessories.umbrella
accessories.wallet
apparel.belt
apparel.costume
apparel.dress
apparel.glove
apparel.jacket
apparel.jeans


20094946

#### Category split

In [0]:
from pyspark.sql.functions import split

df_events = df_events.withColumn("category", split("category_code", "\\.").getItem(0))
df_events=df_events.withColumn("product_name",split("category_code","\\.").getItem(1))
display(df_events.limit(3))

event_type,product_id,category_code,brand,price,user_id,event_year,date,category,product_name
view,11000101,appliances.kitchen.toster,philips,28,518497482,2019,2019-10-01,appliances,kitchen
view,39800121,apparel.shirt,dion,29,521245875,2019,2019-10-01,apparel,shirt
view,1005105,electronics.smartphone,apple,1415,553019427,2019,2019-10-01,electronics,smartphone


#### Rearrangement of columns

In [0]:
# Rearranged columns
df_events = df_events.select("date", "event_year", "user_id", "product_id", "event_type", "category_code", "category","product_name","brand", "price")
display(df_events)

date,event_year,user_id,product_id,event_type,category_code,category,product_name,brand,price
2019-10-01,2019,529059532,5701009,view,auto.accessories.player,auto,accessories,,270
2019-10-01,2019,538071503,1005171,view,electronics.smartphone,electronics,smartphone,samsung,254
2019-10-01,2019,555538291,4802747,view,electronics.audio.headphone,electronics,audio,ritmix,11
2019-10-01,2019,554065445,15200067,view,,,,bosch,126
2019-10-01,2019,514432403,7005315,view,kids.carriage,kids,carriage,belecoo,101
2019-10-01,2019,555516331,18100049,view,,,,rezult,16
2019-10-01,2019,555598489,1801790,view,electronics.video.tv,electronics,video,elenberg,189
2019-10-01,2019,552630864,1005129,view,electronics.smartphone,electronics,smartphone,apple,1413
2019-10-01,2019,555556504,28716679,view,apparel.shoes,apparel,shoes,respect,120
2019-10-01,2019,532909958,34800089,view,,,,carfashion,60


#### Finding total Null values in category

In [0]:
from pyspark.sql.functions import col

null_counts = df_events.select(
    (col("category_code").isNull()).cast("int").alias("category_code_null"),
    (col("category").isNull()).cast("int").alias("category_null")
).groupBy().count()

display(null_counts)

count
56371590


### Handling Missing values in category and category_code
Over 10 million records were missing category codes; removing them would significantly weaken the analysis. To preserve completeness, the missing category values were imputed with ‘Others’ as a practical fallback.

In [0]:
from pyspark.sql.functions import when

df_events = df_events.withColumn(
    "category_code",
    when(col("category_code").isNull(), "others").otherwise(col("category_code"))
).withColumn(
    "category",
    when(col("category").isNull(), "others").otherwise(col("category"))
)

### Handling missing values in Brand and Product names

In [0]:
df_events = df_events.withColumn(
    "brand",
    when(col("brand").isNull(), "Unspecified").otherwise(col("brand"))
)
df_events = df_events.withColumn(
    "product_name",
    when(col("product_name").isNull(), "Unspecified").otherwise(col("product_name"))
)


### Saving the silver events

In [0]:
df_events = df_events.withColumn("date", df_events["date"].cast("string"))
df_events.write.mode("overwrite").option("overwriteSchema", "true").saveAsTable("df_silver_events")



### Produced clean, analytics‑ready dataset that maintain fidelity while removing noise, inconsistencies, and structural issues.