Introduction

This notebook constructs a synthetic dataset reflecting realistic customer behavior in subscription-based digital services. The objective is not to simulate exact reality, but to create a defensible environment for decision-oriented analysis.

In [9]:
import numpy as np
import pandas as pd
import os

Assumption Definitions

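>Why tenure is short-skewed: customers in their first months are assumed to be the most likely to leave (the churn rule adds its weight for tenure under 6 months), so short tenures should dominate the population. Note that the generator below currently draws `tenure_months` uniformly from 1–24, which does not yet reflect this assumption.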

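>Why complaints are sparse: most customers are assumed never to file a complaint, so counts are drawn from a Poisson distribution with mean 0.8, which puts most of the probability mass on 0 and 1.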

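>Why discounts affect churn: heavy discount users are assumed to be price-sensitive, so customers with `discount_used_pct` above 40 receive an additional 0.3 churn probability.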

In [10]:
# Seed NumPy's legacy global RNG so every np.random draw that follows is
# reproducible after Restart Kernel -> Run All.
np.random.seed(42)

# Number of synthetic customers to generate.
n = 1000

# Monthly price attached to each plan tier. Charges are derived from the plan
# through this table instead of being sampled on their own, so a row can never
# pair e.g. 'Premium' with 99 or 'Free' with 299.
# NOTE(review): 'Free' maps to 99 only to keep the original set of charge
# values (99/199/299) -- confirm whether a free tier should be billed 0.
plan_prices = {'Free': 99, 'Basic': 199, 'Premium': 299}

data = pd.DataFrame({
    # Integer months in 1..24 (randint's upper bound is exclusive).
    # NOTE(review): this is a uniform draw, while the assumptions section
    # describes tenure as short-skewed -- confirm the intended distribution.
    'tenure_months': np.random.randint(1, 25, n),
    # Usage counts, Poisson-distributed with mean 8.
    'usage_frequency': np.random.poisson(8, n),
    # Categorical payment channel with fixed population shares.
    'payment_method': np.random.choice(
        ['UPI', 'Card', 'Wallet', 'COD'], n, p=[0.45, 0.25, 0.2, 0.1]
    ),
    # Complaint counts, Poisson with mean 0.8, so mostly 0 or 1.
    'complaints': np.random.poisson(0.8, n),
    # Integer percentage in 0..60 (upper bound 61 is exclusive).
    'discount_used_pct': np.random.randint(0, 61, n),
    # Plan tier: 40% Free, 40% Basic, 20% Premium.
    'plan_type': np.random.choice(
        ['Free', 'Basic', 'Premium'], n, p=[0.4, 0.4, 0.2]
    ),
})

# Deterministic lookup: every customer is charged the price of their own plan.
# The column stays last, matching the original column order.
data['monthly_charges'] = data['plan_type'].map(plan_prices)


In [11]:
# Risk rules: each boolean mask marks the customers a rule applies to.
is_new_customer = data['tenure_months'].lt(6)
is_light_user = data['usage_frequency'].lt(5)
has_many_complaints = data['complaints'].gt(2)
is_heavy_discounter = data['discount_used_pct'].gt(40)

# Additive score: every rule that fires contributes its weight.
# A customer matching no rule gets probability 0.
churn_prob = (
    is_new_customer * 0.4
    + is_light_user * 0.3
    + has_many_complaints * 0.5
    + is_heavy_discounter * 0.3
)

# The weights sum to 1.5, so the score is clamped into [0, 1] before being
# used as a probability.
churn_prob = churn_prob.clip(lower=0, upper=1)

# One Bernoulli draw per customer (binomial with a single trial), using the
# global NumPy RNG state left by the preceding cells.
data['churn'] = np.random.binomial(n=1, p=churn_prob)


In [12]:
# Output folder, given as a relative path (resolved against the current
# working directory), and the CSV file to write inside it.
save_dir = "../data"
csv_path = f"{save_dir}/synthetic_customer_churn.csv"

# Create the folder when missing; exist_ok=True keeps re-runs from failing.
os.makedirs(save_dir, exist_ok=True)

# index=False leaves the DataFrame's row index out of the CSV.
data.to_csv(csv_path, index=False)