In [0]:
# Echo the `spark` object so the notebook renders its repr
# (presumably the active SparkSession provided by the runtime — confirm).
spark


In [0]:
# Load every source table of the ad dataset.
df_user = spark.table("workspace.ad_tables.user_profile")
df_genre = spark.table("workspace.ad_tables.user_app_genre")
df_camp = spark.table("workspace.ad_tables.campaigns")
df_req = spark.table("workspace.ad_tables.requests")
df_imps = spark.table("workspace.ad_tables.impressions")
df_clks = spark.table("workspace.ad_tables.clicks")

# Preview the tables by iterating over a single tuple instead of repeating
# each name by hand: the hand-written version referenced `df_clsk`, a
# misspelling of `df_clks`, which raised NameError and stopped the cell
# before any of the display() calls ran.
source_frames = (df_user, df_genre, df_camp, df_req, df_imps, df_clks)

# First a plain-text dump of 5 rows per table ...
for frame in source_frames:
    frame.show(5)

# ... then the notebook's rich display() rendering of each table, in the same order.
for frame in source_frames:
    display(frame)

In [0]:
%sql
-- Inspect table-level details of the impressions table.
describe detail workspace.ad_tables.impressions

In [0]:
from pyspark.sql.functions import col, lower, when, coalesce, lit, hour, dayofweek

# ---------------------------------------------------------------------------
# Step 1 - Bronze inputs
# ---------------------------------------------------------------------------
# Fully-qualified schema that holds the source tables; edit this single value
# if the tables live under a different name in your catalog.
schema = "workspace.ad_tables"

df_req = spark.read.table(f"{schema}.requests")
df_imp = spark.read.table(f"{schema}.impressions")
df_clk = spark.read.table(f"{schema}.clicks")
df_usr = spark.read.table(f"{schema}.user_profile")

# ---------------------------------------------------------------------------
# Step 2 - Standardize user attributes (silver logic)
# ---------------------------------------------------------------------------
# Gender collapses to "M" / "F" based on the first letter (case-insensitive);
# anything else, including NULL, becomes "Unknown". State is lower-cased.
gender_lower = lower(col("gender"))
gender_code = (
    when(gender_lower.startswith("m"), "M")
    .when(gender_lower.startswith("f"), "F")
    .otherwise("Unknown")
)
# NOTE(review): df_usr_clean is not referenced again in this cell - confirm a
# later cell consumes it.
df_usr_clean = (
    df_usr
    .withColumn("gender", gender_code)
    .withColumn("state", lower(col("state")))
)

# ---------------------------------------------------------------------------
# Step 3 - Deduplicate (data quality)
# ---------------------------------------------------------------------------
# Keep one row per request_id and one click row per id_md5.
# NOTE(review): requests are deduplicated on request_id but joined on id_md5
# below, and impressions are joined without deduplication. If id_md5 is not
# unique in either input, request rows will fan out - confirm.
df_req_dedup = df_req.dropDuplicates(["request_id"])
df_clk_dedup = df_clk.dropDuplicates(["id_md5"])

# ---------------------------------------------------------------------------
# Step 4 - Assemble the request -> impression -> click funnel
# ---------------------------------------------------------------------------
# Left joins keep every request, including those that never produced an
# impression or a click.
requests_r = df_req_dedup.alias("r")
impressions_i = df_imp.alias("i")
clicks_c = df_clk_dedup.alias("c")

funnel_joined = (
    requests_r
    .join(impressions_i, on="id_md5", how="left")
    .join(clicks_c, on="id_md5", how="left")
)

funnel_labeled = funnel_joined.select(
    col("r.id_md5"),
    col("r.user_id"),
    col("r.timestamp"),
    col("r.device_type"),
    col("r.network_type"),
    col("i.campaign_id"),
    col("i.ad_position"),
    col("i.campaign_type"),
    # ML target labels: a missing match on the right side of a join yields
    # NULL, which is turned into 0 here.
    coalesce(col("i.has_impression"), lit(0)).alias("is_impression"),
    coalesce(col("c.has_click"), lit(0)).alias("is_click"),
)

# ---------------------------------------------------------------------------
# Step 5 - Time-based features for downstream ML
# ---------------------------------------------------------------------------
event_ts = col("timestamp")
silver_funnel = (
    funnel_labeled
    .withColumn("hour_of_day", hour(event_ts))
    .withColumn("day_of_week", dayofweek(event_ts))
)

# ---------------------------------------------------------------------------
# Step 6 - Persist as the silver table
# ---------------------------------------------------------------------------
funnel_writer = (
    silver_funnel.write
    .mode("overwrite")
    .option("mergeSchema", "true")
)
funnel_writer.saveAsTable(f"{schema}.silver_ad_funnel")

print("Silver Layer Funnel created successfully!")