In [0]:
# Prints a fixed greeting; nothing later in the notebook reads this value.
_GREETING = "Hello work"
print(_GREETING)

In [0]:
from pyspark.sql import functions as F
from pyspark.sql import types as T
import random


In [0]:
# ---- Generation window (ISO yyyy-MM-dd strings) ------------------------------
START_DATE, END_DATE = "2025-10-01", "2025-12-31"

# ---- Row counts for the generated data sets ----------------------------------
TOTAL_REQUESTS = 10**6        # ad requests
TOTAL_USERS = 250 * 1000      # distinct user ids
TOTAL_APPS = 300              # distinct app ids
TOTAL_CAMPAIGNS = 150         # distinct campaign ids

# ---- Funnel probabilities ----------------------------------------------------
# Each request is kept as an impression when rand() < IMPRESSION_RATE, and each
# impression is kept as a click when rand() < CLICK_RATE.
IMPRESSION_RATE = 0.5         # 50%
CLICK_RATE = 0.015            # 1.5%

In [0]:
# Synthetic user dimension: one row per user, keyed "U<n>" where n comes from
# spark.range (0 .. TOTAL_USERS-1), plus randomly assigned demographic and
# device attributes. The frame is exported as CSV with a header row.

# The volume behind /Volumes/workspace/ad_tech/ad_tech_data must exist before
# the write at the bottom of this cell, otherwise a run on a fresh workspace
# fails here. Both statements use IF NOT EXISTS, so they are no-ops when the
# schema/volume is already present.
spark.sql("CREATE SCHEMA IF NOT EXISTS workspace.ad_tech")
spark.sql("CREATE VOLUME IF NOT EXISTS workspace.ad_tech.ad_tech_data")

# Every attribute uses the same pattern: element_at() is 1-based, so
# int(rand()*N)+1 picks one of the N array entries per row.
# Each column has its own rand() call, so attributes are independent of one
# another (e.g. `city` is not constrained to lie in `state`).
users_df = (
    spark.range(TOTAL_USERS)
    .withColumn("user_id", F.concat(F.lit("U"), F.col("id")))
    .withColumn("age_range", F.expr("element_at(array('18-24','25-34','35-44','45-54'), int(rand()*4)+1)"))
    .withColumn("gender", F.expr("element_at(array('M','F'), int(rand()*2)+1)"))
    .withColumn("state", F.expr("element_at(array('KA','TN','MH','DL','UP'), int(rand()*5)+1)"))
    .withColumn("city", F.expr("element_at(array('Bangalore','Chennai','Mumbai','Delhi','Noida'), int(rand()*5)+1)"))
    .withColumn("phone_price_range", F.expr("element_at(array('low','mid','premium'), int(rand()*3)+1)"))
    .withColumn("phone_model", F.expr("element_at(array('Samsung','Redmi','iPhone','OnePlus'), int(rand()*4)+1)"))
    .withColumn("language", F.expr("element_at(array('en','ta','hi','kn'), int(rand()*4)+1)"))
    .drop("id")  # the numeric seed column is only needed to build user_id
)

users_df.write.mode("overwrite").csv("/Volumes/workspace/ad_tech/ad_tech_data/user_profile", header=True)

In [0]:
%sql
-- Create the schema and the volume that hold the generated CSV data sets.
-- Both statements are guarded by IF NOT EXISTS, so re-running this cell is harmless.
-- The notebook writes under /Volumes/workspace/ad_tech/ad_tech_data/..., so on a
-- fresh workspace the schema and volume must exist before any of those writes run.
CREATE SCHEMA IF NOT EXISTS workspace.ad_tech;
CREATE VOLUME IF NOT EXISTS workspace.ad_tech.ad_tech_data;

In [0]:
# App dimension: one row per app, keyed "APP<n>" (n from spark.range), with a
# randomly chosen category and two randomly chosen genre labels.
# element_at() is 1-based, hence the +1 after int(rand()*N).
apps_df = spark.range(TOTAL_APPS).select(
    F.concat(F.lit("APP"), F.col("id")).alias("app_id"),
    F.expr("element_at(array('OTT','Gaming','News','Finance'), int(rand()*4)+1)").alias("app_category"),
    F.expr("element_at(array('Action','Drama','Sports','Kids'), int(rand()*4)+1)").alias("primary_genre"),
    F.expr("element_at(array('Comedy','Thriller','None'), int(rand()*3)+1)").alias("secondary_genre"),
)

# Export as CSV with a header row, replacing any previous output.
(
    apps_df.write
    .mode("overwrite")
    .option("header", True)
    .csv("/Volumes/workspace/ad_tech/ad_tech_data/app_genre_profile")
)

In [0]:
# Campaign dimension: one row per campaign, keyed "C<n>" (n from spark.range).
#  - advertiser_id cycles through ADV0..ADV19 (id modulo 20)
#  - campaign_type is picked at random from CPM / CPC / CPI
#  - billing_rate is a random double in [50, 250)
#  - targeting_params is the same JSON document for every campaign
campaigns_df = spark.range(TOTAL_CAMPAIGNS).select(
    F.concat(F.lit("C"), F.col("id")).alias("campaign_id"),
    F.concat(F.lit("ADV"), (F.col("id") % 20)).alias("advertiser_id"),
    F.expr("element_at(array('CPM','CPC','CPI'), int(rand()*3)+1)").alias("campaign_type"),
    (F.rand() * 200 + 50).alias("billing_rate"),
    F.to_json(
        F.struct(
            F.array(F.lit("KA"), F.lit("TN")).alias("state"),
            F.array(F.lit("en"), F.lit("ta")).alias("language"),
            F.array(F.lit("premium"), F.lit("mid")).alias("phone_price_range"),
        )
    ).alias("targeting_params"),
)

# Export as CSV with a header row, replacing any previous output.
(
    campaigns_df.write
    .mode("overwrite")
    .option("header", True)
    .csv("/Volumes/workspace/ad_tech/ad_tech_data/campaigns")
)

In [0]:
from pyspark.sql import functions as F

# App/user sample data set: APP_USER_ROWS rows, each pairing an app id with a
# randomly drawn user id and random category/genre labels.
#
# Cell-local sizes are used here instead of rebinding TOTAL_APPS / TOTAL_USERS,
# so running this cell does not silently change the notebook-wide configuration
# that the other generators read.
APP_USER_ROWS = 1000        # number of rows
APP_USER_POOL = 5000        # user population to sample from

# NOTE(review): this rebinds `apps_df`, replacing the app profile frame built
# earlier in the notebook. The name is kept for compatibility with any cell
# that reads it; a dedicated name would be clearer.
apps_df = (
    spark.range(APP_USER_ROWS)
    .withColumn("app_id", F.concat(F.lit("APP"), F.col("id")))
    .withColumn(
        "user_id",
        F.concat(
            # Same "U<n>" key format as the user profile and request data
            # sets, so the ids can be joined. Values fall in U1..U5000, which
            # is a subset of the user profile ids.
            F.lit("U"),
            (F.floor(F.rand() * APP_USER_POOL) + 1).cast("int")
        )
    )
    # element_at() is 1-based, hence the +1 after int(rand()*N).
    .withColumn(
        "app_category",
        F.expr("element_at(array('OTT','Gaming','News','Finance'), int(rand()*4)+1)")
    )
    .withColumn(
        "primary_genre",
        F.expr("element_at(array('Action','Drama','Sports','Kids'), int(rand()*4)+1)")
    )
    .withColumn(
        "secondary_genre",
        F.expr("element_at(array('Comedy','Thriller','None'), int(rand()*3)+1)")
    )
    .drop("id")
)

# Write to the required path
apps_df.write.mode("overwrite").csv(
    "/Volumes/workspace/ad_tech/ad_tech_data/app_genre_user",
    header=True
)


In [0]:
# Configuration for the request / impression / click generators.
# Declared in this cell so the generators below do not depend on whatever
# values earlier cells left bound to these names.

# Generation window (ISO yyyy-MM-dd strings).
START_DATE, END_DATE = "2025-10-01", "2025-12-31"

# Row counts / id ranges.
TOTAL_REQUESTS = 10**6
TOTAL_USERS = 250 * 1000
TOTAL_APPS = 300
TOTAL_CAMPAIGNS = 150

# Funnel probabilities, compared against rand() per row.
IMPRESSION_RATE = 0.5         # 50%
CLICK_RATE = 0.015            # 1.5%
# Synthetic ad requests: one row per request, keyed "R<n>" (n from spark.range).
#  - user_id cycles through the "U<n>" ids (request id modulo TOTAL_USERS)
#  - timestamp is a random date inside [START_DATE, END_DATE]
#  - the remaining attributes are picked at random per row; element_at() is
#    1-based, hence the +1 after int(rand()*N)
#  - has_request is a constant 1 flag
# NOTE(review): the demographic/device columns are drawn independently here, so
# they will not necessarily agree with the same user's row in user_profile.
requests_df = (
    spark.range(TOTAL_REQUESTS)
    .withColumn("request_id", F.concat(F.lit("R"), F.col("id")))
    .withColumn("user_id", F.concat(F.lit("U"), (F.col("id") % TOTAL_USERS)))
    # Span is derived from the configured window (inclusive of END_DATE) rather
    # than a hard-coded 90 days, which stopped two days short of END_DATE.
    .withColumn(
        "timestamp",
        F.expr(
            f"date_add('{START_DATE}', "
            f"int(rand() * (datediff('{END_DATE}', '{START_DATE}') + 1)))"
        ),
    )
    .withColumn("state", F.expr("element_at(array('KA','TN','MH','DL','UP'), int(rand()*5)+1)"))
    .withColumn("city", F.expr("element_at(array('Bangalore','Chennai','Mumbai','Delhi','Noida'), int(rand()*5)+1)"))
    .withColumn("gender", F.expr("element_at(array('M','F'), int(rand()*2)+1)"))
    .withColumn("age_range", F.expr("element_at(array('18-24','25-34','35-44'), int(rand()*3)+1)"))
    .withColumn("phone_price_range", F.expr("element_at(array('low','mid','premium'), int(rand()*3)+1)"))
    .withColumn("phone_model", F.expr("element_at(array('Samsung','iPhone','Redmi'), int(rand()*3)+1)"))
    .withColumn("language", F.expr("element_at(array('en','ta','hi'), int(rand()*3)+1)"))
    .withColumn("device_type", F.expr("element_at(array('android','ios'), int(rand()*2)+1)"))
    .withColumn("network_type", F.expr("element_at(array('wifi','4g','5g'), int(rand()*3)+1)"))
    .withColumn("has_request", F.lit(1))
    .drop("id")  # the numeric seed column is only needed to build the ids
)

# Write the request frame itself. Previously `apps_df` was written to this
# path, so the "requests" data set contained app rows instead of requests.
requests_df.write.mode("overwrite").csv(
    "/Volumes/workspace/ad_tech/ad_tech_data/requests",
    header=True
)

In [0]:
# Impressions: the subset of requests for which rand() < IMPRESSION_RATE.
#  - impression_id comes from monotonically_increasing_id(), assigned after
#    the filter
#  - campaign_id is "C" + (impression_id modulo TOTAL_CAMPAIGNS), i.e. the same
#    "C<n>" key format the campaigns data set uses
#  - ad_position and bid_type are picked at random per row; bid_type is drawn
#    independently of the campaign's own campaign_type
#  - has_impression is a constant 1 flag
_IMPRESSION_COLUMNS = [
    "impression_id",
    "request_id",
    "campaign_id",
    "timestamp",
    "ad_position",
    "bid_type",
    "has_impression",
]

impressions_df = (
    requests_df
    .withColumn("show_ad", F.rand() < IMPRESSION_RATE)
    .where(F.col("show_ad"))
    .withColumn("impression_id", F.monotonically_increasing_id())
    .withColumn(
        "campaign_id",
        F.concat(F.lit("C"), (F.col("impression_id") % TOTAL_CAMPAIGNS)),
    )
    .withColumn("ad_position", F.expr("element_at(array('top','mid','bottom'), int(rand()*3)+1)"))
    .withColumn("bid_type", F.expr("element_at(array('CPM','CPC','CPI'), int(rand()*3)+1)"))
    .withColumn("has_impression", F.lit(1))
    .select(*_IMPRESSION_COLUMNS)
)

# Export as CSV with a header row, replacing any previous output.
(
    impressions_df.write
    .mode("overwrite")
    .option("header", True)
    .csv("/Volumes/workspace/ad_tech/ad_tech_data/impressions")
)

In [0]:
# Clicks: the subset of impressions for which rand() < CLICK_RATE.
# click_id comes from monotonically_increasing_id(), assigned after the filter;
# has_click is a constant 1 flag. Only the click key, the parent impression
# key, the timestamp and the flag are kept.
_CLICK_COLUMNS = ["click_id", "impression_id", "timestamp", "has_click"]

clicks_df = (
    impressions_df
    .withColumn("clicked", F.rand() < CLICK_RATE)
    .where(F.col("clicked"))
    .withColumn("click_id", F.monotonically_increasing_id())
    .withColumn("has_click", F.lit(1))
    .select(*_CLICK_COLUMNS)
)

# Export as CSV with a header row, replacing any previous output.
(
    clicks_df.write
    .mode("overwrite")
    .option("header", True)
    .csv("/Volumes/workspace/ad_tech/ad_tech_data/clicks")
)