# In [None]:
# 📁 01_generate_churn_data.ipynb

# ## 1. Introduction
"""
This notebook generates a realistic synthetic dataset for customer churn prediction.
The dataset simulates a telco-like business with features like usage, complaints, and promotions.
"""

# ## 2. Imports
import pandas as pd
import numpy as np

# ## 3. Configuration
# Seed NumPy's global (legacy) generator so repeated runs produce the same data.
# Every draw below consumes that one stream in sequence, so the order of the
# np.random calls is part of the result.
np.random.seed(42)
n = 1000  # number of customers

# ## 4. Feature Generation
# Sequential identifiers, zero-padded to four digits: CUST0000, CUST0001, ...
customer_id = ["CUST" + str(idx).zfill(4) for idx in range(n)]

# Tenure in whole months. `high` is exclusive, so values fall in 1..59.
tenure_months = np.random.randint(low=1, high=60, size=n)

# Spend drawn from Normal(mean=50, sd=15), rounded to two decimals and then
# limited to the range [10, 150].
monthly_spend = np.random.normal(loc=50, scale=15, size=n).round(2).clip(10, 150)

# Complaint counts drawn from a Poisson distribution with mean 1.5.
num_complaints = np.random.poisson(lam=1.5, size=n)

# 0/1 flags: 1 with probability 0.3 (promo plan) and 0.6 (app usage).
is_on_promo_plan = np.random.choice(a=[0, 1], p=[0.7, 0.3], size=n)
used_app_this_week = np.random.choice(a=[0, 1], p=[0.4, 0.6], size=n)

# Segment label per customer, sampled with the listed probabilities.
segment_mix = {'Basic': 0.5, 'Plus': 0.3, 'Premium': 0.2}
segments = np.random.choice(
    a=list(segment_mix),
    p=list(segment_mix.values()),
    size=n,
)

# ## 5. Churn Label Logic
# Each risk factor that applies to a customer adds a fixed weight to that
# customer's churn probability; the weights total roughly 0.9 when all five apply.
risk_factors = [
    (0.2, tenure_months < 12),       # tenure under 12 months
    (0.2, monthly_spend < 30),       # spend below 30
    (0.3, num_complaints > 2),       # more than two complaints
    (0.1, is_on_promo_plan == 0),    # not on a promo plan
    (0.1, used_app_this_week == 0),  # did not use the app this week
]
# sum() folds left to right, so the terms are added in the order listed above.
churn_prob = sum(weight * applies.astype(int) for weight, applies in risk_factors)

# One Bernoulli trial per customer; the clip keeps every probability in [0, 1].
churned = np.random.binomial(n=1, p=np.clip(churn_prob, 0, 1))

# ## 6. Assemble DataFrame
# Pair each output column name with its data; the order here is the column
# order of the resulting table.
column_names = [
    'customer_id',
    'tenure_months',
    'monthly_spend',
    'num_complaints',
    'is_on_promo_plan',
    'used_app_this_week',
    'segment',
    'churned',
]
column_values = [
    customer_id,
    tenure_months,
    monthly_spend,
    num_complaints,
    is_on_promo_plan,
    used_app_this_week,
    segments,  # stored under the singular column name 'segment'
    churned,
]
df_churn = pd.DataFrame(dict(zip(column_names, column_values)))

# ## 7. Save Dataset
# Write the table as CSV without the index column. The path is relative, so
# the file lands in the current working directory.
output_csv = "retention_insights_fake_customers.csv"
df_churn.to_csv(output_csv, index=False)

# Preview of the first rows: displayed as cell output when this is the last
# expression of a notebook cell; has no visible effect when run as a script.
df_churn.head()