# Transformation using PySpark/SQL

#### Loaded the silver dataset

In [0]:
# Read the silver events table into a DataFrame; every cell below starts from `df`.
silver_table = "workspace.default.df_silver_events"
df = spark.read.table(silver_table)


#### Handling Null values

In [0]:
from pyspark.sql.functions import when

# Rows with no category get the catch-all label "others"; all other rows keep
# their existing category value.
category_missing = df["category"].isNull()
df = df.withColumn(
    "category",
    when(category_missing, "others").otherwise(df["category"]),
)

#### Total purchase price by category


In [0]:
# Sum the price of purchase events per category.
# The aggregate column is named "sum(price)"; the next cell filters on that name.
result = (
    df.where(df["event_type"] == "purchase")
    .groupBy("category")
    .sum("price")
)

display(result)

category,sum(price)
apparel,9725
auto,14588
others,327411
construction,8478
kids,7895
furniture,31065
computers,143726
appliances,211776
electronics,2257324
accessories,555


#### Filter to categories whose total purchase price is greater than 100,000

In [0]:
from pyspark.sql.functions import col

# Keep only the categories whose summed purchase price is above the threshold.
min_total_price = 100000
exceeds_threshold = col("sum(price)") > min_total_price

filtered_result = result.where(exceeds_threshold)
display(filtered_result)

category,sum(price)
others,327411
computers,143726
appliances,211776
electronics,2257324


#### Total purchase price by product name (electronics and appliances only, totals above 100,000)

In [0]:
from pyspark.sql.functions import col, sum as spark_sum

# Step 1: purchase events in the electronics and appliances categories only.
is_purchase = col("event_type") == "purchase"
in_target_categories = col("category").isin("electronics", "appliances")
filtered_df = df.where(is_purchase & in_target_categories)

# Step 2: total price per product name.
total_price = spark_sum("price").alias("total_price")
grouped_df = filtered_df.groupBy("product_name").agg(total_price)

# Step 3: keep product names whose total is above 100000.
result_df = grouped_df.where(col("total_price") > 100000)

display(result_df)

product_name,total_price
smartphone,1930041
kitchen,152309
video,153817


#### Finding the cheapest non-zero priced record for each product name

In [0]:
# Start again from the silver table (this replaces the `df` built in earlier cells).
df = spark.read.table("workspace.default.df_silver_events")

from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Keep only rows whose price is not zero. Rows with a NULL price are dropped as
# well, because the comparison evaluates to NULL rather than true.
df = df.filter(F.col("price") != 0)

# Rank rows inside each product_name from cheapest upwards.
# Ordering by price alone leaves ties unordered, so row_number() would pick an
# arbitrary row among equally cheap records and the result could change between
# runs. The extra sort keys make the choice repeatable.
w = Window.partitionBy("product_name").orderBy(
    F.col("price").asc(),
    F.col("date").asc(),
    F.col("user_id").asc(),
    F.col("product_id").asc(),
    F.col("event_type").asc(),
)

# rn = 1 is the cheapest record of each product_name; the helper column `rn`
# is kept in the displayed result.
df = df.withColumn("rn", F.row_number().over(w)) \
       .filter("rn = 1")
display(df)

date,event_year,user_id,product_id,event_type,category_code,category,product_name,brand,price,rn
2019-10-15,2019,512637703,18001268,view,others,others,Unspecified,samsung,1,1
2019-11-30,2019,521904265,55500002,view,auto.accessories.anti_freeze,auto,accessories,wurth,5,1
2019-10-15,2019,513083223,22400046,view,electronics.audio.microphone,electronics,audio,ritmix,1,1
2019-10-15,2019,512543923,52900149,view,accessories.bag,accessories,bag,stanley,1,1
2019-11-30,2019,512392966,13800602,view,furniture.bathroom.toilet,furniture,bathroom,Unspecified,3,1
2019-10-15,2019,559066029,22900106,view,furniture.bedroom.pillow,furniture,bedroom,tamish,2,1
2019-10-15,2019,514171862,28500074,view,apparel.belt,apparel,belt,Unspecified,15,1
2019-11-26,2019,512541412,48100011,view,sport.bicycle,sport,bicycle,author,3,1
2019-10-15,2019,516601388,2300177,view,electronics.camera.video,electronics,camera,veho,25,1
2019-10-11,2019,514781060,7004721,view,kids.carriage,kids,carriage,bambola,25,1


#### With the transformation layer complete, my data has now moved through the full Medallion architecture.