In [1]:
# Libraries
import numpy as np
import pandas as pd

## Experiment parameters and segment proportions

In [2]:
# --------------------------------------
# Set seed and basic sizes
# --------------------------------------
# Seeded generator so that re-running the notebook reproduces the same draws
rng = np.random.default_rng(42)

# Total sample size, split between groups A and B
n_total = 40_000
n_A = n_total // 2
# B takes the remainder so that n_A + n_B == n_total even if n_total is odd
n_B = n_total - n_A

# --------------------------------------
# Experiment parameters
# --------------------------------------
# Base conversion for A
pA = 0.030  # 3.0%

# Absolute uplift for B (in probability points)
uplift_B = 0.006  # +0.6 p.p. => 3.6% average target for B

# Segment share (heterogeneity): proportion of new users
p_new = 0.60  # 60% new, 40% returning

# Make new users convert less by this absolute difference
delta_new = 0.010  # new users convert 1.0 p.p. less than the average

# Compensating offset for returning users, chosen so that A's overall average stays at pA:
#   pA_overall = p_new*(pA - delta_new) + (1 - p_new)*(pA + offset_returning)
# Setting pA_overall = pA and solving for the offset gives
#   offset_returning = p_new*delta_new / (1 - p_new)
offset_returning = (p_new * delta_new) / (1 - p_new)

## Creation of groups - A & B

In [3]:
# --------------------------------------
# Build base user frame
# --------------------------------------
# Each group gets its own contiguous block of user ids, so group sizes are
# fixed by construction rather than left to random assignment.
ids_A = np.arange(1, n_A + 1)
ids_B = np.arange(n_A + 1, n_total + 1)

df_A = pd.DataFrame({"user_id": ids_A}).assign(group="A")
df_B = pd.DataFrame({"user_id": ids_B}).assign(group="B")

# Stack A on top of B and renumber the index from 0
df = pd.concat([df_A, df_B], ignore_index=True)
display(df)

Unnamed: 0,user_id,group
0,1,A
1,2,A
2,3,A
3,4,A
4,5,A
...,...,...
39995,39996,B
39996,39997,B
39997,39998,B
39998,39999,B


## Assigning a random date and a new/returning segment to each user

In [5]:
# Experiment window: one entry per calendar day, both endpoints included
start_date = pd.Timestamp("2025-07-01")
end_date = pd.Timestamp("2025-07-30")
dates = pd.date_range(start=start_date, end=end_date, freq="D")

# Pick a day for every user uniformly at random, then store it as a plain date
date_idx = rng.integers(low=0, high=len(dates), size=n_total)
df["date"] = dates[date_idx].date

# Segment flag: 1 = new user (probability p_new), 0 = returning user
df["is_new_user"] = rng.binomial(n=1, p=p_new, size=n_total)

display(df.head())

Unnamed: 0,user_id,group,date,is_new_user
0,1,A,2025-07-01,0
1,2,A,2025-07-25,1
2,3,A,2025-07-04,0
3,4,A,2025-07-23,0
4,5,A,2025-07-01,1


## Build per-user conversion probability

In [6]:
# Day-level noise: every calendar day gets one shared additive shift in
# conversion probability, drawn once from a zero-mean normal.
daily_noise_sd = 0.0015  # standard deviation in absolute probability (~0.15 p.p.)

daily_noise_map = {}
for day in dates:
    daily_noise_map[day.date()] = rng.normal(loc=0.0, scale=daily_noise_sd)

# Group effect: A converts at pA, B at pA + uplift_B
p_base = np.where(df["group"] == "A", pA, pA + uplift_B)

# Segment effect: new users are shifted down by delta_new,
# returning users are shifted up by offset_returning
segment_adjustment = np.where(df["is_new_user"] == 1, -delta_new, offset_returning)

# Date effect: look up each user's day in the noise map
date_noise = df["date"].map(daily_noise_map).to_numpy()

# Sum the three effects and keep the result inside [0.0001, 0.9999]
# so that no user ends up with a degenerate probability
p_conv = np.clip(p_base + segment_adjustment + date_noise, 0.0001, 0.9999)

## Simulate conversion and revenue

In [7]:
# One Bernoulli trial per user, each with that user's own probability p_conv
conversion_draws = rng.binomial(n=1, p=p_conv, size=n_total)
df["converted"] = conversion_draws

In [8]:
# Revenue model:
# - non-converters contribute exactly 0
# - converters get a positive, long-tailed amount drawn from a lognormal
#
# For a lognormal X, E[X] = exp(mu + sigma^2 / 2), so picking
# mu = log(target_mean) - sigma^2 / 2 makes the expected revenue of a
# converter equal to target_mean.
target_mean = 60.0
sigma = 0.9  # dispersion in log space (heavy tail)
mu = np.log(target_mean) - 0.5 * sigma**2

# Draw exactly one amount per converted user
n_converted = df["converted"].sum()
revenue_if_converted = rng.lognormal(mean=mu, sigma=sigma, size=n_converted)

# Start from all zeros and fill in the converters' rows in frame order
revenue = np.zeros(len(df))
revenue[df["converted"].eq(1).to_numpy()] = revenue_if_converted
df["revenue"] = revenue

In [9]:
# Compact dtypes: categorical group label, 8-bit integer segment flag
for col_name, col_dtype in {"group": "category", "is_new_user": "int8"}.items():
    df[col_name] = df[col_name].astype(col_dtype)
display(df.head())

Unnamed: 0,user_id,group,date,is_new_user,converted,revenue
0,1,A,2025-07-01,0,1,26.122426
1,2,A,2025-07-25,1,0,0.0
2,3,A,2025-07-04,0,0,0.0
3,4,A,2025-07-23,0,0,0.0
4,5,A,2025-07-01,1,0,0.0


## Quick sanity checks

In [None]:
def summarize(_df: pd.DataFrame) -> pd.Series:
    """Return headline metrics for a frame of simulated users.

    Parameters
    ----------
    _df : pd.DataFrame
        Frame with a ``converted`` column and a ``revenue`` column.

    Returns
    -------
    pd.Series
        ``conversion_rate`` (mean of ``converted``),
        ``Average Revenue Per User`` (mean of ``revenue`` over all rows,
        so rows with zero revenue count too) and ``n`` (row count).
    """
    metrics = {
        "conversion_rate": _df["converted"].mean(),
        "Average Revenue Per User": _df["revenue"].mean(),
        "n": len(_df),
    }
    return pd.Series(metrics)

# `summarize` only reads these two columns. Selecting them before `apply`
# keeps the grouping keys out of the per-group frames, which avoids the pandas
# deprecation warning about `apply` operating on the grouping columns.
metric_cols = ["converted", "revenue"]

# Whole sample, then per experiment group, then per group x new/returning segment
overall = summarize(df)
by_group = df.groupby("group", observed=True)[metric_cols].apply(summarize)
by_group_segment = (
    df.groupby(["group", "is_new_user"], observed=True)[metric_cols].apply(summarize)
)

print("=== Overall ===")
print(overall, "\n")

print("=== By group ===")
print(by_group, "\n")

print("=== By group & segment ===")
print(by_group_segment, "\n")

=== Overall ===
conversion_rate        0.032050
ARPU                   1.849004
n                  40000.000000
dtype: float64 

=== By group ===
       conversion_rate      ARPU        n
group                                    
A               0.0278  1.545595  20000.0
B               0.0363  2.152412  20000.0 

=== By group & segment ===
                   conversion_rate      ARPU        n
group is_new_user                                    
A     0                   0.041241  2.224682   7929.0
      1                   0.018971  1.099528  12071.0
B     0                   0.053623  3.223764   7963.0
      1                   0.024840  1.443666  12037.0 



  by_group = df.groupby("group", observed=True).apply(summarize)
  by_group_segment = df.groupby(["group", "is_new_user"], observed=True).apply(summarize)


In [11]:
# --------------------------------------
# Save to CSV
# --------------------------------------
# Write the simulated dataset without the positional index column
output_path = "ab_data_simulated.csv"
df.to_csv(output_path, index=False)