In [1]:
# data_simulation.py
"""
Script to generate a synthetic customer churn dataset for proof-of-concept.
This dataset mimics web analytics and CRM data for a subscription-based business.
"""

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# Set random seed for reproducibility
np.random.seed(42)

# Define the number of samples
n_samples = 10000

print("Generating synthetic customer data...")

# 1. Generate basic customer demographics and static features
df = pd.DataFrame({
    'customer_id': range(1, n_samples + 1),
    'tenure': np.random.randint(1, 72, n_samples),  # tenure in months (1-72 months)
    'age': np.random.normal(45, 15, n_samples).astype(int),  # customer age
    'support_calls': np.random.poisson(0.5, n_samples),  # number of support calls
    'subscription_type': np.random.choice(['Basic', 'Premium', 'Enterprise'], n_samples, p=[0.6, 0.3, 0.1]),
    'country': np.random.choice(['USA', 'UK', 'Germany', 'France', 'India'], n_samples, p=[0.5, 0.2, 0.1, 0.1, 0.1])
})

# 2. Generate behavioral features (Web Analytics Metrics)
# These features are designed to be predictive of churn
df['monthly_usage'] = np.random.normal(15, 5, n_samples)  # average sessions per month
df['avg_session_duration'] = np.random.gamma(5, 5, n_samples)  # in minutes
df['feature_1_usage'] = np.random.exponential(2, n_samples)
df['feature_2_usage'] = np.random.exponential(3, n_samples)

# 3. Introduce correlations: Make behavior depend on tenure and subscription type
df['monthly_usage'] = df['monthly_usage'] * (1 + 0.02 * df['tenure'])  # usage increases with tenure
df['avg_session_duration'] = df['avg_session_duration'] * (1 + 0.01 * df['tenure'])  # session duration increases with tenure

# Premium users use more features
df.loc[df['subscription_type'] == 'Premium', 'feature_1_usage'] *= 1.5
df.loc[df['subscription_type'] == 'Enterprise', 'feature_1_usage'] *= 2.0
df.loc[df['subscription_type'] == 'Premium', 'feature_2_usage'] *= 1.3
df.loc[df['subscription_type'] == 'Enterprise', 'feature_2_usage'] *= 1.8

# 4. Create a target variable 'churn' based on a logical function of the features
# We create a "risk_score" that is a linear combination of features
risk_score = (
    -0.1 * df['tenure']  # Higher tenure -> lower risk
    + 0.05 * df['age']  # Slight increase with age
    + 0.8 * df['support_calls']  # More support calls -> much higher risk
    - 0.5 * df['monthly_usage']  # Higher usage -> lower risk
    - 0.3 * df['avg_session_duration']  # Longer sessions -> lower risk
    - 0.2 * df['feature_1_usage']  # More feature use -> lower risk
    - 0.2 * df['feature_2_usage']  # More feature use -> lower risk
    + np.random.normal(0, 1, n_samples)  # Add some randomness
)

# Convert the risk_score to a probability using the logistic function
churn_probability = 1 / (1 + np.exp(-risk_score))

# Generate binary churn labels (0/1) based on the probability
df['churn'] = np.random.binomial(1, churn_probability)

# Ensure a minimum number of churners for the class imbalance (roughly 20%)
# This is a common ratio in churn datasets
if df['churn'].mean() < 0.15:
    adjustment = risk_score > np.percentile(risk_score, 75)
    df.loc[adjustment, 'churn'] = 1
elif df['churn'].mean() > 0.35:
    adjustment = risk_score < np.percentile(risk_score, 25)
    df.loc[adjustment, 'churn'] = 0

print(f"Final churn rate: {df['churn'].mean():.2%}")

# 5. Perform final feature engineering
# Create a key predictive feature: usage intensity (usage per month of tenure)
df['usage_intensity'] = df['monthly_usage'] / df['tenure'].clip(lower=1)  # Avoid division by zero

# 6. Save the synthetic dataset to a CSV file
output_file = 'synthetic_churn_data.csv'
df.to_csv(output_file, index=False)

print(f"Synthetic data generation complete!")
print(f"Dataset saved to '{output_file}'.")
print(f"Dataset shape: {df.shape}")
print("\nFirst 5 rows of the dataset:")
print(df.head())

Generating synthetic customer data...
Final churn rate: 25.00%
Synthetic data generation complete!
Dataset saved to 'synthetic_churn_data.csv'.
Dataset shape: (10000, 12)

First 5 rows of the dataset:
   customer_id  tenure  age  support_calls subscription_type country  \
0            1      52   75              0             Basic      UK   
1            2      15   37              1           Premium  France   
2            3      61   53              1        Enterprise      UK   
3            4      21   65              0             Basic  France   
4            5      24   21              0           Premium      UK   

   monthly_usage  avg_session_duration  feature_1_usage  feature_2_usage  \
0      32.374618             16.473425         1.157110         6.040795   
1      19.353461             31.269014         2.707515         1.549471   
2      30.092976             78.577621         4.528766        12.918040   
3      32.243516             10.020259         0.949944       