In [25]:
# --------------------------------------------
# 01_synthetic_data_generation.ipynb
# --------------------------------------------

import numpy as np
import pandas as pd

np.random.seed(42)

# --------------------------------------------
# 1. Generate synthetic demographic + campaign data
# --------------------------------------------

N = 5000  # number of samples

ages = np.random.randint(18, 60, N)
genders = np.random.choice(["Male", "Female"], N)
cities = np.random.choice(
    ["Mumbai", "Delhi", "Bangalore", "Chennai", "Hyderabad", "Kolkata"],
    N
)

ad_types = np.random.choice(
    ["Video", "Image", "Carousel", "Shorts"],
    N
)

ad_budget = np.random.normal(50000, 15000, N).clip(5000, 100000)
duration = np.random.randint(5, 60, N)  # seconds

# Engagement metrics (realistic correlations)
impressions = (ad_budget * np.random.uniform(20, 40, N)).astype(int)
click_through_rate = np.round(np.random.uniform(0.1, 8.0, N), 2)
clicks = (impressions * (click_through_rate / 100)).astype(int)
conversion_rate = np.round(np.random.uniform(0.1, 5.0, N), 2)
conversions = (clicks * (conversion_rate / 100)).astype(int)

# Target variable: performance (Low / Medium / High)
performance = []
for imp, clk, conv, budget in zip(impressions, clicks, conversions, ad_budget):
    score = (0.4 * imp) + (1.2 * clk) + (4 * conv) + (0.0005 * budget)
    if score < 20000:
        performance.append("Low")
    elif score < 60000:
        performance.append("Medium")
    else:
        performance.append("High")

# Create DataFrame
df = pd.DataFrame({
    "Age": ages,
    "Gender": genders,
    "City": cities,
    "AdType": ad_types,
    "Budget": ad_budget.astype(int),
    "DurationSec": duration,
    "Impressions": impressions,
    "CTR": click_through_rate,
    "Clicks": clicks,
    "ConversionRate": conversion_rate,
    "Conversions": conversions,
    "Performance": performance
})

# Save CSV
df.to_csv("ads_synthetic.csv", index=False)
df.head()


Unnamed: 0,Age,Gender,City,AdType,Budget,DurationSec,Impressions,CTR,Clicks,ConversionRate,Conversions,Performance
0,56,Male,Mumbai,Image,48936,11,1808652,5.14,92964,0.93,864,High
1,46,Female,Mumbai,Carousel,68041,7,2344682,4.39,102931,4.92,5064,High
2,32,Female,Delhi,Carousel,30267,33,926874,3.84,35591,2.52,896,High
3,25,Male,Hyderabad,Shorts,50156,40,1735996,3.6,62495,0.16,99,High
4,38,Female,Hyderabad,Shorts,61013,43,1335366,4.04,53948,4.78,2578,High
