In [29]:
import numpy as np
import pandas as pd
from scipy.special import expit
import matplotlib.pyplot as plt

# Seed NumPy's global (legacy) RNG. Every np.random.* call further down draws
# from this one stream, so results are reproducible only when the cells are run
# top to bottom from a fresh kernel.
np.random.seed(42)


In [30]:
# Size of the simulated population.
N_USERS = 50_000
# Observation window, in months.
MAX_MONTHS = 24

# Acquisition channels, each with the base cost that exposure costs are
# scaled from. Insertion order matters: other cells iterate over these keys.
CHANNELS = {
    name: {"base_cost": cost}
    for name, cost in (
        ("paid_search", 3.0),
        ("social_ads", 2.0),
        ("referral", 1.0),
        ("affiliate", 2.5),
        ("organic", 0.2),
    )
}


In [31]:
# Draw the user attributes. The np.random calls below all consume the global
# seeded stream, so their relative order determines the simulated values;
# do not reorder them.
ages = np.random.randint(18, 65, N_USERS)  # upper bound is exclusive: 18..64
income_bands = np.random.choice(
    ["low", "mid", "high"], size=N_USERS, p=[0.4, 0.4, 0.2]
)

users = pd.DataFrame({
    "user_id": np.arange(N_USERS),
    "age": ages,
    "income_band": income_bands,
})

# Latent purchase intent: standard normal.
users["baseline_intent"] = np.random.normal(0, 1, N_USERS)

# Risk score in (0, 1): a logistic transform of -0.5 * intent plus N(0, 0.5)
# noise, so higher intent means lower risk on average.
risk_noise = np.random.normal(0, 0.5, N_USERS)
risk_logit = -0.5 * users["baseline_intent"] + risk_noise
users["risk_score"] = expit(risk_logit)


In [32]:
def assign_channel(row, channels=None):
    """Randomly pick an acquisition channel for one user.

    Each channel gets an unnormalised weight: paid_search, referral and
    affiliate weights rise with the user's baseline intent (logistic in
    intent, with offsets 0, -0.5 and -0.3), while social_ads and organic use
    fixed weights of 0.6 and 0.3. The weights are normalised to sum to 1 and
    one channel is drawn with np.random.choice (global NumPy RNG).

    Weights are looked up by channel name, so the pairing between a channel
    and its probability no longer depends on two dicts happening to share the
    same key order.

    Parameters
    ----------
    row : mapping
        Anything indexable by "baseline_intent" (e.g. a DataFrame row).
    channels : iterable of str, optional
        Channel names to choose from, in draw order. Defaults to the keys of
        the module-level CHANNELS dict.

    Returns
    -------
    The selected channel name, as returned by np.random.choice.

    Raises
    ------
    KeyError
        If a requested channel has no weight defined here.
    """
    intent = row["baseline_intent"]

    weight_by_channel = {
        "paid_search": expit(intent),
        "social_ads": 0.6,
        "referral": expit(intent - 0.5),
        "affiliate": expit(intent - 0.3),
        "organic": 0.3,
    }

    names = list(CHANNELS) if channels is None else list(channels)

    # Build the weight vector in the same order as `names` so that each
    # probability is guaranteed to belong to the channel it is drawn for.
    weights = np.array([weight_by_channel[name] for name in names])
    weights /= weights.sum()

    return np.random.choice(names, p=weights)

# Assign a channel to every user. The row-wise apply calls assign_channel once
# per row, and each call makes one np.random.choice draw from the global RNG.
users["channel"] = users.apply(assign_channel, axis=1)


In [33]:
# One exposure record per user: the assigned channel, a constant exposure
# flag of 1, and a cost.
channel_exposure = users[["user_id", "channel"]].assign(exposed=1)

# Cost = the channel's base cost scaled by a uniform factor in [0.8, 1.2),
# drawn separately for each user from the global RNG, in row order.
channel_exposure["cost"] = channel_exposure["channel"].map(
    lambda name: CHANNELS[name]["base_cost"] * np.random.uniform(0.8, 1.2)
)


In [34]:
def channel_uplift(channel, intent):
    """Return the additive uplift a channel applies to a user's intent.

    social_ads and affiliate give a flat uplift (0.3 and 0.25).
    paid_search gives 0.6 only when intent > 0, and referral gives 0.4 only
    when intent > -0.5; otherwise they give 0. Any other channel (including
    organic) gives 0.0.
    """
    flat_uplift = {"social_ads": 0.3, "affiliate": 0.25}
    if channel in flat_uplift:
        return flat_uplift[channel]

    # (uplift size, intent threshold that must be exceeded)
    gated_uplift = {"paid_search": (0.6, 0), "referral": (0.4, -0.5)}
    if channel in gated_uplift:
        size, threshold = gated_uplift[channel]
        # Multiplying by the boolean keeps the result numeric (size or 0.0).
        return size * (intent > threshold)

    return 0.0

# Conversion probability: logistic transform of baseline intent plus the
# uplift of the channel the user was assigned to.
users["p_convert"] = [
    expit(intent + channel_uplift(channel, intent))
    for channel, intent in zip(users["channel"], users["baseline_intent"])
]

# One Bernoulli draw per user (binomial with a single trial) from the global RNG.
users["converted"] = np.random.binomial(n=1, p=users["p_convert"])


In [35]:
# Preview the first rows, now including channel, p_convert and converted.
users.head()

Unnamed: 0,user_id,age,income_band,baseline_intent,risk_score,channel,p_convert,converted
0,0,56,low,0.235973,0.404887,social_ads,0.630875,0
1,1,46,low,1.013807,0.271536,affiliate,0.779681,1
2,2,32,low,1.143814,0.36016,paid_search,0.851171,0
3,3,60,low,-0.261093,0.407633,social_ads,0.509726,1
4,4,25,mid,0.044375,0.556474,organic,0.511092,1


In [36]:
# Baseline churn hazard; used as the rate of an exponential time-to-churn
# (months, judging by MAX_MONTHS) before channel and risk adjustments.
BASE_HAZARD = 0.02

# Per-channel multiplier applied to the baseline hazard
# (> 1 churns faster than baseline, < 1 slower).
CHANNEL_HAZARD_MULTIPLIER = dict([
    ("social_ads", 1.8),
    ("paid_search", 1.1),
    ("affiliate", 1.2),
    ("organic", 1.0),
    ("referral", 0.6),
])


In [37]:
def simulate_time_to_churn(row):
    """Draw the time until churn for one user.

    Users with converted == 0 never become customers and get np.inf without
    consuming a random draw. For everyone else the churn hazard is
    BASE_HAZARD * channel multiplier * (1 + risk_score), and the time to churn
    is drawn from an exponential distribution with mean 1 / hazard using the
    global NumPy RNG.

    `row` must be indexable by "converted", "channel" and "risk_score".
    """
    if row["converted"] != 0:
        channel_factor = CHANNEL_HAZARD_MULTIPLIER[row["channel"]]
        risk_factor = 1 + row["risk_score"]
        hazard = BASE_HAZARD * channel_factor * risk_factor

        # np.random.exponential takes the scale (mean), i.e. 1 / rate.
        return np.random.exponential(1 / hazard)

    return np.inf  # never becomes a customer

# Simulate churn times row by row. Converted users each consume one exponential
# draw from the global RNG (in row order); non-converted users get inf.
users["time_to_churn"] = users.apply(simulate_time_to_churn, axis=1)


In [38]:
# Right-censor churn times at the end of the observation window.
# Users who never converted carry time_to_churn = inf, so they come out with
# observed_months == MAX_MONTHS and churned == 0, the same values as customers
# who are still active at the cutoff.
time_to_churn = users["time_to_churn"]
users["observed_months"] = time_to_churn.clip(upper=MAX_MONTHS)
users["churned"] = time_to_churn.le(MAX_MONTHS).astype(int)


In [39]:
# Preview again with time_to_churn, observed_months and churned added.
users.head()

Unnamed: 0,user_id,age,income_band,baseline_intent,risk_score,channel,p_convert,converted,time_to_churn,observed_months,churned
0,0,56,low,0.235973,0.404887,social_ads,0.630875,0,inf,24.0,0
1,1,46,low,1.013807,0.271536,affiliate,0.779681,1,47.96629,24.0,0
2,2,32,low,1.143814,0.36016,paid_search,0.851171,0,inf,24.0,0
3,3,60,low,-0.261093,0.407633,social_ads,0.509726,1,34.12238,24.0,0
4,4,25,mid,0.044375,0.556474,organic,0.511092,1,30.375622,24.0,0


In [40]:
# Mean monthly revenue per customer, by acquisition channel, before noise.
BASE_MONTHLY_REVENUE = dict(
    paid_search=40,
    social_ads=25,
    referral=60,
    affiliate=35,
    organic=30,
)


In [41]:
# Simulate monthly revenue for every converted user.
#
# Each customer gets one record per whole observed month: observed_months is
# truncated with int(), so a partial final month produces no record. Revenue
# is the channel's base amount plus N(0, 8) noise, floored at 5.
#
# Draws come from the global NumPy RNG one at a time, in user order and then
# month order, so the values depend on every earlier np.random call.
revenue_rows = []

# Only converted users earn revenue. Filter them up front and keep just the
# columns the loop reads; itertuples then avoids building an object-dtype
# Series for each of the rows.
customers = users.loc[
    users["converted"] != 0, ["user_id", "channel", "observed_months"]
]

for customer in customers.itertuples(index=False):
    base = BASE_MONTHLY_REVENUE[customer.channel]
    n_months = int(customer.observed_months)

    for month in range(1, n_months + 1):
        # Kept under its own name so the scalar does not share the `revenue`
        # name with the DataFrame built after the loop.
        monthly_revenue = max(5, base + np.random.normal(0, 8))

        revenue_rows.append({
            "user_id": customer.user_id,
            "month": month,
            "revenue": monthly_revenue,
        })

# Explicit columns keep the schema (and the CSV header) intact even if no user
# converted and revenue_rows is empty.
revenue = pd.DataFrame(revenue_rows, columns=["user_id", "month", "revenue"])



In [42]:
# Column subsets of `users` that get written out. baseline_intent, p_convert
# and time_to_churn are not included in either table.
users_table = users.loc[
    :, ["user_id", "age", "income_band", "risk_score", "channel", "converted"]
]
churn_table = users.loc[:, ["user_id", "observed_months", "churned"]]


In [43]:
# Write each table to a CSV file in the current working directory, without
# the DataFrame index.
for filename, frame in {
    "users.csv": users_table,
    "channel_exposure.csv": channel_exposure,
    "revenue.csv": revenue,
    "churn.csv": churn_table,
}.items():
    frame.to_csv(filename, index=False)


In [44]:
# Preview the user table that was written to users.csv.
users_table.head()

Unnamed: 0,user_id,age,income_band,risk_score,channel,converted
0,0,56,low,0.404887,social_ads,0
1,1,46,low,0.271536,affiliate,1
2,2,32,low,0.36016,paid_search,0
3,3,60,low,0.407633,social_ads,1
4,4,25,mid,0.556474,organic,1


In [45]:
# Preview the exposure table that was written to channel_exposure.csv.
channel_exposure.head()

Unnamed: 0,user_id,channel,exposed,cost
0,0,social_ads,1,1.777098
1,1,affiliate,1,2.191188
2,2,paid_search,1,3.052957
3,3,social_ads,1,1.65055
4,4,organic,1,0.175349


In [46]:
# Preview the revenue table (one row per customer per month; converted users only).
revenue.head()

Unnamed: 0,user_id,month,revenue
0,1,1,35.01612
1,1,2,29.664867
2,1,3,37.167168
3,1,4,49.424365
4,1,5,45.078892


In [47]:
# Preview the churn table; it has a row for every user, converted or not.
churn_table.head()

Unnamed: 0,user_id,observed_months,churned
0,0,24.0,0
1,1,24.0,0
2,2,24.0,0
3,3,24.0,0
4,4,24.0,0
