In [0]:
from pyspark.sql.functions import col, when, count, avg

schema = "workspace.ad_tables"

# 1. Load the Silver funnel and the user/app metadata tables.
df_funnel = spark.read.table(f"{schema}.silver_ad_funnel")
df_app = spark.read.table(f"{schema}.user_app_genre")
df_profile = spark.read.table(f"{schema}.user_profile") # Clean version from Silver if you saved it

# 2. Join User Interests (The Matchmaking logic)
# The funnel is left-joined to app genres and to the user profile on user_id,
# so every funnel row is kept even when no metadata row matches (the metadata
# columns are then NULL).
# NOTE(review): if user_app_genre or user_profile can hold more than one row
# per user_id, these joins duplicate funnel rows -- confirm key uniqueness.
gold_features = (df_funnel.alias("f")
    .join(df_app.alias("a"), "user_id", "left")
    .join(df_profile.alias("p"), "user_id", "left")
    .select(
        "f.id_md5",
        "f.user_id",
        "f.timestamp",
        "f.hour_of_day",
        "f.day_of_week",
        "f.device_type",
        "f.network_type",
        "f.ad_position",
        "f.campaign_id",
        "f.campaign_type",
        "a.app_cat",
        "a.primary_genre",
        "p.age_range",
        "p.gender",
        "p.state",
        "p.phone_price_range",
        "f.is_impression",
        "f.is_click"  # This is our LABEL (Target Variable)
    ))

# 3. Create a 'Match' Feature (Innovation Insight)
# Flags rows where the campaign type pairs with the user's primary genre
# (Videos+CPM or Shopping+CPC). Any other combination, including a NULL
# genre or campaign type, falls through to 0.
gold_features = gold_features.withColumn("is_affinity_match", 
    when((col("primary_genre") == "Videos") & (col("campaign_type") == "CPM"), 1)
    .when((col("primary_genre") == "Shopping") & (col("campaign_type") == "CPC"), 1)
    .otherwise(0))

# 4. Save to Gold Table
# overwriteSchema lets the overwrite replace the table's schema as well as its
# data. Later cells in this notebook write a different column set to the same
# table, so without it a re-run of this cell can be rejected by schema
# enforcement (presumably a Delta table, the Databricks default -- confirm).
(gold_features.write
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable(f"{schema}.gold_ml_features"))

print("Gold Layer Table 'gold_ml_features' is ready for ML Training!")

In [0]:
from pyspark.sql.functions import col, lower, when, coalesce, lit

schema = "workspace.ad_tables"

# 1. Load Silver Funnel and Metadata
df_funnel = spark.read.table(f"{schema}.silver_ad_funnel")
df_app = spark.read.table(f"{schema}.user_app_genre")
df_profile = spark.read.table(f"{schema}.user_profile")

# 2. Fix Categorical Noise in User Profile (Crucial for ML)
# gender: case-insensitive m/male -> "Male", f/female -> "Female", anything
# else (including NULL) -> "Unknown". state: lower-cased, NULL -> "unknown".
df_profile_clean = df_profile.withColumn("gender", 
    when(lower(col("gender")).isin("m", "male"), "Male")
    .when(lower(col("gender")).isin("f", "female"), "Female")
    .otherwise("Unknown")
).withColumn("state", coalesce(lower(col("state")), lit("unknown"))) # Handle null states

# 3. Master Join for ML Features
# Both joins are left joins, so funnel rows without a matching profile get
# NULL profile columns even though the profile table itself was cleaned.
# The gender/state defaults are therefore re-applied after the join so the
# output never carries NULL for those two features.
# NOTE(review): if user_app_genre or user_profile can hold more than one row
# per user_id, these joins duplicate funnel rows -- confirm key uniqueness.
gold_ml_features = (df_funnel.alias("f")
    .join(df_app.alias("a"), "user_id", "left")
    .join(df_profile_clean.alias("p"), "user_id", "left")
    .select(
        "f.id_md5",
        "f.user_id",
        "f.hour_of_day",
        "f.day_of_week",
        coalesce(col("f.device_type"), lit("Mobile")).alias("device_type"), # Fill nulls
        "f.ad_position",
        "f.campaign_type",
        "a.app_cat",
        "a.primary_genre",
        "p.age_range",
        coalesce(col("p.gender"), lit("Unknown")).alias("gender"),  # Cleaned; default for users with no profile row
        coalesce(col("p.state"), lit("unknown")).alias("state"),    # Standardized; default for users with no profile row
        "p.phone_price_range",
        "f.is_click"  # Target Label
    ))

# 4. Save to Gold ML Table
# This cell writes fewer columns than the previous cell wrote to the same
# table. overwriteSchema makes the overwrite replace the schema too, so no
# stale columns from the earlier write survive (presumably a Delta table, the
# Databricks default -- confirm).
(gold_ml_features.write
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable(f"{schema}.gold_ml_features"))

print("ML Gold Features standardized and saved successfully.")

In [0]:
from pyspark.sql.functions import col, upper, lower, when, coalesce, lit

schema = "workspace.ad_tables"

# 1. Load Tables
df_funnel = spark.read.table(f"{schema}.silver_ad_funnel")
df_app = spark.read.table(f"{schema}.user_app_genre")
df_profile = spark.read.table(f"{schema}.user_profile")

# 2. Pre-Clean User Profile (Standardizing State, Gender, and Age)
# The key is renamed to u_user_id so it does not clash with the funnel's
# user_id in the join below. state is upper-cased; NULLs in state, age_range
# and phone_price_range get fixed defaults; gender is mapped to
# Male/Female/Unknown case-insensitively.
df_profile_clean = df_profile.select(
    col("user_id").alias("u_user_id"),
    coalesce(upper(col("state")), lit("UNKNOWN")).alias("state"),
    coalesce(col("age_range"), lit("Unknown")).alias("age_range"),
    when(lower(col("gender")).isin("m", "male"), "Male")
    .when(lower(col("gender")).isin("f", "female"), "Female")
    .otherwise("Unknown").alias("gender"),
    coalesce(col("phone_price_range"), lit("mid")).alias("phone_price_range")
)

# 3. Create the Gold ML Table with Matchmaking Logic
# Both joins are left joins, so a funnel row whose user has no profile row
# gets NULL for every profile column regardless of the pre-cleaning above.
# The same defaults are therefore applied again after the join, matching how
# app_cat / primary_genre are already defaulted here.
# NOTE(review): if user_app_genre or user_profile can hold more than one row
# per user_id, these joins duplicate funnel rows -- confirm key uniqueness.
gold_features = (df_funnel.alias("f")
    .join(df_app.alias("a"), "user_id", "left")
    .join(df_profile_clean, df_funnel.user_id == df_profile_clean.u_user_id, "left")
    .select(
        "f.id_md5",
        "f.user_id",
        "f.hour_of_day",
        "f.day_of_week",
        coalesce(col("f.device_type"), lit("Mobile")).alias("device_type"),
        "f.ad_position",
        "f.campaign_type",
        coalesce(col("a.app_cat"), lit("Utility")).alias("app_cat"),
        coalesce(col("a.primary_genre"), lit("General")).alias("primary_genre"),
        coalesce(col("state"), lit("UNKNOWN")).alias("state"),          # Cleaned; default when no profile row
        coalesce(col("age_range"), lit("Unknown")).alias("age_range"),  # Cleaned; default when no profile row
        coalesce(col("gender"), lit("Unknown")).alias("gender"),        # Cleaned; default when no profile row
        coalesce(col("phone_price_range"), lit("mid")).alias("phone_price_range"),
        "f.is_click"  # Target Label
    ))

# 4. Innovation Insight: Create the 'Affinity Match' Feature
# 1 when the (already defaulted) primary genre pairs with the campaign type
# (Videos+CPM or Shopping+CPC); every other combination yields 0.
gold_features = gold_features.withColumn("is_affinity_match", 
    when((col("primary_genre") == "Videos") & (col("campaign_type") == "CPM"), 1)
    .when((col("primary_genre") == "Shopping") & (col("campaign_type") == "CPC"), 1)
    .otherwise(0))

# 5. Save and Overwrite
# overwriteSchema replaces the table schema along with the data, since earlier
# cells write a different column set to the same table.
gold_features.write.mode("overwrite").option("overwriteSchema", "true").saveAsTable(f"{schema}.gold_ml_features")

print("Final Gold ML Table ready. Combined Standardisation + Affinity Match Logic!")