# In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

# Seed the global NumPy RNG once so all three datasets are reproducible.
# The scenarios share this single random stream, so their order matters.
np.random.seed(42)
n_customers = 1000


def _simulate_user_activity(n_rows, churn_rate):
    """Return a simulated user-activity DataFrame with ``n_rows`` customers.

    Args:
        n_rows: Number of customer rows to generate.
        churn_rate: Probability that a row's ``churned`` flag is 1; the
            flag is 0 with probability ``1 - churn_rate``.

    Returns:
        A DataFrame with the columns ``customer_id`` (1..n_rows),
        ``signup_days_ago`` (integers in [30, 1000)),
        ``last_active_days_ago`` (integers in [0, 300)),
        ``num_logins_last_month`` (Poisson, mean 5),
        ``avg_session_length`` (normal, mean 15, std 5, rounded to 2 dp),
        ``num_support_tickets`` (Poisson, mean 0.5),
        ``is_premium`` (1 with probability 0.3) and ``churned``.
    """
    # Dict entries are evaluated top to bottom, so the draws below consume
    # the global RNG in a fixed sequence. Reordering the columns would
    # change the generated values for a given seed.
    #
    # NOTE(review): a normal(15, 5) draw is unbounded, so
    # 'avg_session_length' may occasionally be negative; it is left
    # unclipped here to keep the generated data unchanged.
    # NOTE(review): 'churned' is sampled independently of every other
    # column, so the features carry no signal about the label.
    return pd.DataFrame({
        'customer_id': np.arange(1, n_rows + 1),
        'signup_days_ago': np.random.randint(30, 1000, size=n_rows),
        'last_active_days_ago': np.random.randint(0, 300, size=n_rows),
        'num_logins_last_month': np.random.poisson(5, size=n_rows),
        'avg_session_length': np.round(
            np.random.normal(15, 5, size=n_rows), 2
        ),
        'num_support_tickets': np.random.poisson(0.5, size=n_rows),
        'is_premium': np.random.choice([0, 1], size=n_rows, p=[0.7, 0.3]),
        'churned': np.random.choice(
            [0, 1], size=n_rows, p=[1 - churn_rate, churn_rate]
        ),
    })


def _write_csv(frame, path):
    """Write ``frame`` to ``path`` as CSV without the index column.

    The parent directory is created first, because ``DataFrame.to_csv``
    raises ``OSError`` when the target directory does not exist.
    """
    Path(path).parent.mkdir(parents=True, exist_ok=True)
    frame.to_csv(path, index=False)


# Scenario 1: simulated training dataset with moderate churn
# (expected churn rate of 20%).
df1 = _simulate_user_activity(n_customers, churn_rate=0.2)
_write_csv(df1, "../data/user_activity.csv")

# Scenario 2: simulated training dataset with low churn
# (expected churn rate of 9%, i.e. below 10%).
df2 = _simulate_user_activity(n_customers, churn_rate=0.09)
_write_csv(df2, "../data/user_activity_lowchurn.csv")

# Scenario 3: simulated training dataset with rare churn
# (expected churn rate of 5%).
df3 = _simulate_user_activity(n_customers, churn_rate=0.05)
_write_csv(df3, "../data/user_activity_rarechurn.csv")