Product Recommendation Logs A recommendation engine generates user-product interaction logs (views, clicks, purchases). The logs arrive every hour in Parquet format. How would you design a pipeline to maintain an aggregated interaction table (per user, per product)? How would you prevent data skew if 10% of the products account for 90% of the interactions? How would you optimize storage (file size, partitioning, Z-Ordering)? this one i take can you tell step by step solution that solution should like you re expert in databricks data enigner

In [0]:
%sql
DROP TABLE IF EXISTS learning_db.qna.user_interactions;
CREATE TABLE learning_db.qna.user_interactions (
  event_time TIMESTAMP,
  user_id STRING,
  product_id STRING,
  interaction_type STRING
);

INSERT INTO learning_db.qna.user_interactions (event_time, user_id, product_id, interaction_type) VALUES
('2025-08-20 10:01:00', 'U1', 'P100', 'view'),
('2025-08-20 10:02:00', 'U1', 'P100', 'click'),
('2025-08-20 10:05:00', 'U2', 'P200', 'view'),
('2025-08-20 10:06:00', 'U2', 'P200', 'view'),
('2025-08-20 10:10:00', 'U1', 'P100', 'purchase'),
('2025-08-20 10:12:00', 'U3', 'P300', 'view');

num_affected_rows,num_inserted_rows
6,6


In [0]:
from pyspark.sql.functions import col, sum, when, rand, lit, to_date, max

df = spark.table("learning_db.qna.user_interactions")
display(df)

event_time,user_id,product_id,interaction_type
2025-08-20T10:01:00.000Z,U1,P100,view
2025-08-20T10:02:00.000Z,U1,P100,click
2025-08-20T10:05:00.000Z,U2,P200,view
2025-08-20T10:06:00.000Z,U2,P200,view
2025-08-20T10:10:00.000Z,U1,P100,purchase
2025-08-20T10:12:00.000Z,U3,P300,view


In [0]:
# add event_date for partioning
df = df.withColumn("event_date", to_date(col("event_time")))
display(df)

event_time,user_id,product_id,interaction_type,event_date
2025-08-20T10:01:00.000Z,U1,P100,view,2025-08-20
2025-08-20T10:02:00.000Z,U1,P100,click,2025-08-20
2025-08-20T10:05:00.000Z,U2,P200,view,2025-08-20
2025-08-20T10:06:00.000Z,U2,P200,view,2025-08-20
2025-08-20T10:10:00.000Z,U1,P100,purchase,2025-08-20
2025-08-20T10:12:00.000Z,U3,P300,view,2025-08-20


In [0]:
# aggregate interactions per user x product x date
agg_df = df.groupBy("user_id", "product_id", "event_date")\
    .agg(
        sum(when(col("interaction_type") == 'view', 1).otherwise(0)).alias("view"),
        sum(when(col("interaction_type") == 'click', 1).otherwise(0)).alias("click"),
        sum(when(col("interaction_type") == 'purchase', 1).otherwise(0)).alias("purchase"),
        max("event_time").alias("last_interaction_time")
    )

display(agg_df)

user_id,product_id,event_date,view,click,purchase,last_interaction_time
U1,P100,2025-08-20,1,1,1,2025-08-20T10:10:00.000Z
U3,P300,2025-08-20,1,0,0,2025-08-20T10:12:00.000Z
U2,P200,2025-08-20,2,0,0,2025-08-20T10:06:00.000Z


In [0]:
# handle data skew(hot products)
# fix : adding salting
# aggregation in two phases

df_salting = df.withColumn(
    "salt",
    when(col("product_id") == "P100", (rand() * 10). cast("int")).otherwise(lit(0))
)
# phase 1 agg with salting method

preagg_df = df_salting.groupBy("user_id", "product_id", "event_date", "salt")\
    .agg(
        sum(when(col("interaction_type") == 'view', 1).otherwise(0)).alias("view"),
        sum(when(col("interaction_type") == 'click', 1).otherwise(0)).alias("click"),
        sum(when(col("interaction_type") == 'purchase', 1).otherwise(0)).alias("purchase"),
        max("event_time").alias("last_interaction_time")
    )

# phase 2 collapse salts to get final aggates
agg_df = preagg_df.groupBy("user_id", "product_id", "event_date", "salt")\
    .agg(
        sum("view").alias("view"),
        sum("click").alias("click"),
        sum("purchase").alias("purchase"),
        max("last_interaction_time").alias("last_interaction_time")
    )



user_id,product_id,event_date,salt,view,click,purchase,last_interaction_time
U1,P100,2025-08-20,0,1,0,0,2025-08-20T10:01:00.000Z
U3,P300,2025-08-20,0,1,0,0,2025-08-20T10:12:00.000Z
U1,P100,2025-08-20,7,0,0,1,2025-08-20T10:10:00.000Z
U1,P100,2025-08-20,2,0,1,0,2025-08-20T10:02:00.000Z
U2,P200,2025-08-20,0,2,0,0,2025-08-20T10:06:00.000Z


In [0]:
agg_df.write.format("delta")\
    .mode("append")\
    .partitionBy("event_date")\
    .save("/Volumes/learning_db/qna/landing/user_interaction")

In [0]:
%sql
OPTIMIZE delta.`/Volumes/learning_db/qna/landing/user_interaction`
ZORDER BY (user_id, product_id);

VACUUM delta.`/Volumes/learning_db/qna/landing/user_interaction` RETAIN 168 HOURS; --keep 7 days histroy