<a href="https://colab.research.google.com/github/DataSavvyYT/experiments/blob/main/1_llm_finetune/0_create_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
import json
import os
import random

# --------------------------------------------------------------------------------
# CONFIG: Synthetic Data Generator for Promotion Effectiveness
# --------------------------------------------------------------------------------
NUM_SAMPLES = 500  # Number of examples to generate
TRAIN_RATIO = 0.8  # Fraction of the generated examples written to the training split
SEED = 42  # Fixed seed so a fresh "Restart & Run All" regenerates the same dataset

# Seed the global RNG once, before any random draw below, so that both the
# sampled attributes and the label noise are reproducible.
random.seed(SEED)

# Both splits live in the same directory (a Google Drive path as seen from Colab).
OUTPUT_DIR = "/content/drive/MyDrive/data/promotion"
OUTPUT_FILES = {
    "train": f"{OUTPUT_DIR}/train.jsonl",
    "val": f"{OUTPUT_DIR}/validation.jsonl",
}

# Data attributes to mix and match
campaigns = [
    "Diwali Mega Sale", "End of Season Sale", "Freedom Sale", "Cyber Monday",
    "Holi Splash", "Monsoon Clearance", "Flash Sale (24h)", "Weekend Bonanza",
    "New Arrival Launch", "Clearance Blowout", "Payday Sale", "Student Special"
]
channels = [
    "Email", "Push Notification", "SMS", "WhatsApp", "Instagram Ads",
    "Facebook Ads", "Google Search", "In-App Banner", "Influencer Collab"
]
audiences = [
    "Returning customers", "New users", "Lapsed (>90 days)", "High LTV VIPs",
    "Cart Abandoners", "Gen Z (18-24)", "Parents", "Credit Card Holders"
]
categories = [
    "Electronics", "Fashion", "Home Decor", "Beauty", "Groceries",
    "Footwear", "Accessories", "Kitchenware"
]
discounts = ["5%", "10%", "15%", "20%", "25%", "30%", "50%", "Flat ₹500 off", "Buy 1 Get 1"]

# Logic to determine "Ground Truth" effectiveness (adds realistic signal)
def get_label(campaign, channel, discount, past_ctr):
    """Return a noisy "ground truth" label for one synthetic promotion.

    An integer score is built from three signals and then perturbed:

    * +2 if the discount text contains any of "20%", "25%", "30%", "50%"
      or "Get 1" (awarded at most once).
    * +2 for a "Flash" campaign sent over a "Push" channel, and +1 for a
      "Diwali" campaign sent over an "Email" channel.
    * +2 if ``past_ctr`` is above 2.5, -2 if it is below 1.0.

    Uniform noise drawn from ``random.uniform(-1.5, 1.5)`` is added last.

    Args:
        campaign: Campaign name (substring-matched).
        channel: Delivery channel name (substring-matched).
        discount: Discount description (substring-matched).
        past_ctr: Past click-through rate as a number.

    Returns:
        "effective" when the final score is strictly greater than 1.5,
        otherwise "not effective".
    """
    strong_discount_markers = ("20%", "25%", "30%", "50%", "Get 1")

    points = 0

    # A strong discount counts once, no matter how many markers match.
    for marker in strong_discount_markers:
        if marker in discount:
            points += 2
            break

    # Campaign/channel pairings that are assumed to work well together.
    is_flash_push = "Flash" in campaign and "Push" in channel
    is_diwali_email = "Diwali" in campaign and "Email" in channel
    if is_flash_push:
        points += 2
    if is_diwali_email:
        points += 1

    # Historical performance pushes the score up or down; the middle band is neutral.
    if past_ctr > 2.5:
        points += 2
    elif past_ctr < 1.0:
        points -= 2

    # Noise is added to the exact integer score in a single step.
    noisy_score = points + random.uniform(-1.5, 1.5)

    if noisy_score > 1.5:
        return "effective"
    return "not effective"

# --------------------------------------------------------------------------------
# GENERATOR LOOP
# --------------------------------------------------------------------------------
def _make_example():
    """Draw one random promotion and return it as an instruction-tuning record.

    The attribute draws happen in a fixed order (campaign, channel, audience,
    category, discount, duration, past CTR, AOV, budget) and the label is
    computed last, so the sequence of RNG calls per example is stable.
    """
    campaign = random.choice(campaigns)
    channel = random.choice(channels)
    audience = random.choice(audiences)
    category = random.choice(categories)
    discount = random.choice(discounts)
    duration_days = random.randint(1, 14)
    past_ctr = round(random.uniform(0.5, 4.5), 2)
    avg_order_value = random.choice([800, 1200, 1500, 2500, 5000, 12000])
    budget_lakh = random.randint(1, 20)

    # One "Key: value" line per attribute; the final line has no trailing newline.
    prompt = "".join((
        f"Campaign: {campaign}\n",
        f"Channel: {channel}\n",
        f"Budget: {budget_lakh} Lakh INR\n",
        f"Audience: {audience}\n",
        f"Discount: {discount}\n",
        f"Duration: {duration_days} days\n",
        f"Past CTR: {past_ctr}%\n",
        f"Avg AOV: {avg_order_value} INR\n",
        f"Category: {category}",
    ))

    # Only campaign, channel, discount and past CTR feed the label.
    return {
        "instruction": "Predict promotion effectiveness.",
        "input": prompt,
        "output": get_label(campaign, channel, discount, past_ctr),
    }


data = [_make_example() for _ in range(NUM_SAMPLES)]

# --------------------------------------------------------------------------------
# SAVE TO JSONL
# --------------------------------------------------------------------------------
def _write_jsonl(path, records):
    """Write ``records`` to ``path`` as JSON Lines (one JSON object per line).

    The parent directory is created if it does not exist yet, so this cell no
    longer depends on a separately run ``mkdir`` cell.
    """
    parent_dir = os.path.dirname(path)
    if parent_dir:
        # NOTE: on Colab, Google Drive must already be mounted; otherwise this
        # creates a plain local directory under the same path.
        os.makedirs(parent_dir, exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        for entry in records:
            f.write(json.dumps(entry) + "\n")


# Split on the number of examples actually generated rather than the
# configured target, so the two can never drift apart.
split_idx = int(len(data) * TRAIN_RATIO)
train_data = data[:split_idx]
val_data = data[split_idx:]

_write_jsonl(OUTPUT_FILES["train"], train_data)
_write_jsonl(OUTPUT_FILES["val"], val_data)

print(f"✅ Success! Generated {len(train_data)} training and {len(val_data)} validation examples.")
print(f"Files saved: {OUTPUT_FILES['train']}, {OUTPUT_FILES['val']}")


✅ Success! Generated 400 training and 100 validation examples.
Files saved: /content/drive/MyDrive/data/promotion/train.jsonl, /content/drive/MyDrive/data/promotion/validation.jsonl


In [7]:
# List the contents of ./drive/MyDrive/ in long format, sorted by modification time, oldest first.
!ls -ltr ./drive/MyDrive/

total 3833
-rw------- 1 root root 3919961 Nov 28 10:16  Scanned_20251128-1546.pdf
drwx------ 2 root root    4096 Dec  1 06:47 'Colab Notebooks'


In [12]:
!mkdir ./drive/MyDrive/data/promotion