In [5]:
import pandas as pd
import numpy as np 
from datetime import timedelta
import os

# Seed NumPy's global RNG so the generated tables are reproducible, and make
# sure the output directory exists before any CSV is written.
np.random.seed(42)
os.makedirs("data/raw", exist_ok=True)


n_users = 100000

# Zero-pad ids to the width of the largest id so that every user_id has the
# same length. A fixed 5-digit pad leaves "U100000" one character longer than
# "U00001".."U99999", which breaks lexicographic ordering of the ids.
id_width = max(5, len(str(n_users)))

# Synthetic user table. The random columns are drawn in the order listed here
# (signup_date, country, persona, acquisition_channel).
users = pd.DataFrame({
    "user_id": [f"U{i:0{id_width}d}" for i in range(1, n_users + 1)],
    # Signup day: 2024-01-01 plus a uniform offset of 0..59 days.
    "signup_date": pd.to_datetime("2024-01-01")
                    + pd.to_timedelta(np.random.randint(0, 60, n_users), unit="D"),
    "country": np.random.choice(
        ["USA", "India", "UK", "Germany"], n_users, p=[0.4, 0.3, 0.2, 0.1]
    ),
    "persona": np.random.choice(
        ["student", "hobbyist", "professional", "team"], n_users,
        p=[0.35, 0.25, 0.25, 0.15]
    ),
    "acquisition_channel": np.random.choice(
        ["organic", "paid_ads", "email", "referral"], n_users,
        p=[0.4, 0.3, 0.2, 0.1]
    )
})

users.to_csv("data/raw/users.csv", index=False)


In [9]:
# Simulate a per-user event funnel: signup, then optionally code_run, deploy,
# collaborate and upgrade. All events are on platform "web".

# (deploy_prob, collab_prob, upgrade_prob) for each persona. Any persona not
# listed falls back to the "team" probabilities.
persona_probs = {
    "student": (0.30, 0.10, 0.08),
    "hobbyist": (0.45, 0.15, 0.12),
    "professional": (0.65, 0.30, 0.25),
    "team": (0.80, 0.55, 0.45),
}
fallback_probs = persona_probs["team"]

# Channel quality adjustment applied to the upgrade probability only.
# Channels not listed (e.g. "organic") leave it unchanged (multiplier 1.0).
channel_upgrade_multiplier = {"referral": 1.3, "email": 1.1, "paid_ads": 0.8}

events = []

# Iterate over the needed columns directly instead of DataFrame.iterrows(),
# which builds a Series per row and is slow for large frames. The random
# draws below happen in a fixed order per user: code_run, deploy, collaborate,
# upgrade, and (only when upgrading) the upgrade day offset.
for uid, signup_time, persona, channel in zip(
    users["user_id"],
    users["signup_date"],
    users["persona"],
    users["acquisition_channel"],
):
    deploy_prob, collab_prob, upgrade_prob = persona_probs.get(persona, fallback_probs)
    upgrade_prob = upgrade_prob * channel_upgrade_multiplier.get(channel, 1.0)

    # Base event: every user signs up (session 1).
    events.append([uid, signup_time, "signup", "web", f"S_{uid}_1"])

    # Most users run code one hour after signup, in the same session.
    if np.random.rand() < 0.85:
        events.append([uid, signup_time + timedelta(hours=1),
                       "code_run", "web", f"S_{uid}_1"])

    # Deploy event, one day after signup (session 2).
    if np.random.rand() < deploy_prob:
        events.append([uid, signup_time + timedelta(days=1),
                       "deploy", "web", f"S_{uid}_2"])

    # Collaborate event, two days after signup (session 3).
    if np.random.rand() < collab_prob:
        events.append([uid, signup_time + timedelta(days=2),
                       "collaborate", "web", f"S_{uid}_3"])

    # Upgrade event, 5..14 days after signup (session 4).
    if np.random.rand() < upgrade_prob:
        upgrade_delay_days = int(np.random.randint(5, 15))
        events.append([uid, signup_time + timedelta(days=upgrade_delay_days),
                       "upgrade", "web", f"S_{uid}_4"])

events_df = pd.DataFrame(
    events,
    columns=["user_id", "event_time", "event_name", "platform", "session_id"]
)

events_df.to_csv("data/raw/events.csv", index=False)


In [7]:
# Build the subscriptions table: one row per user that has an "upgrade" event.

# First upgrade event per user, in order of first appearance in events_df
# (the same order Series.unique() would give for the user ids).
upgrade_events = (
    events_df.loc[events_df["event_name"] == "upgrade", ["user_id", "event_time"]]
    .drop_duplicates("user_id")
    .reset_index(drop=True)
)
upgraded_users = upgrade_events["user_id"].to_numpy()

# Monthly recurring revenue is determined by the plan, so it is looked up from
# plan_type rather than drawn independently (independent draws could produce
# a "pro" row with the 40 price or a "team" row with the 20 price).
plan_mrr = {"pro": 20, "team": 40}
plan_types = np.random.choice(["pro", "team"], len(upgraded_users), p=[0.7, 0.3])

subscriptions = pd.DataFrame({
    "user_id": upgraded_users,
    "plan_type": plan_types,
    # The subscription starts on the day of the user's upgrade event, so it can
    # never precede that user's signup.
    "start_date": pd.to_datetime(upgrade_events["event_time"]).dt.normalize().to_numpy(),
    "mrr": pd.Series(plan_types, dtype="object").map(plan_mrr).to_numpy(),
    "churned": np.random.choice([True, False], len(upgraded_users), p=[0.15, 0.85])
})

subscriptions.to_csv("data/raw/subscriptions.csv", index=False)


In [8]:
# Daily marketing spend: one row per (date, channel) pair, with dates running
# from 2024-01-01 through 2024-03-01 inclusive.
dates = pd.date_range(start="2024-01-01", end="2024-03-01")

channel_names = ["organic", "paid_ads", "email", "referral"]
n_days = len(dates)
n_channels = len(channel_names)

# Each date is repeated once per channel, and the channel list is cycled once
# per date, so the two columns line up row by row.
date_column = dates.repeat(n_channels)
channel_column = channel_names * n_days
# Integer spend drawn uniformly from 50..599 for every row.
spend_column = np.random.randint(50, 600, n_days * n_channels)

marketing_spend = pd.DataFrame(
    {"date": date_column, "channel": channel_column, "spend": spend_column}
)

marketing_spend.to_csv("data/raw/marketing_spend.csv", index=False)
