In [2]:
!pip install faker pandas numpy


Collecting faker
  Using cached faker-37.6.0-py3-none-any.whl.metadata (15 kB)
Using cached faker-37.6.0-py3-none-any.whl (1.9 MB)
Installing collected packages: faker
Successfully installed faker-37.6.0


In [6]:
# scripts/01_synthetic_data_generation.py
import pandas as pd
import numpy as np
from faker import Faker
from datetime import datetime, timedelta

# Initialize Faker and seed every random source for reproducibility.
# Faker draws from its own generator, so seeding NumPy alone would leave
# the signup dates and regions different on every run.
fake = Faker()
Faker.seed(42)
np.random.seed(42)

# Define parameters
num_customers = 1000
start_date = datetime(2022, 1, 1)
end_date = datetime(2024, 12, 31)

# Generate a base customer dataframe
customer_ids = [f"CUST_{i:04d}" for i in range(1, num_customers+1)]
# Signups stop 180 days before end_date so that every customer has been
# a customer for at least ~6 months by the end of the window.
latest_signup = end_date - timedelta(days=180)
signup_dates = [
    fake.date_between(start_date=start_date, end_date=latest_signup)
    for _ in range(num_customers)
]

# One row per customer with static attributes:
#   region            - random country name from Faker
#   subscription_tier - Basic / Pro / Enterprise drawn with 50% / 35% / 15% probability
#   initial_mrr       - uniform in [50, 1000), rounded to 2 decimals
df_customers = pd.DataFrame({
    'customer_id': customer_ids,
    'signup_date': signup_dates,
    'region': [fake.country() for _ in range(num_customers)],
    'subscription_tier': np.random.choice(['Basic', 'Pro', 'Enterprise'], size=num_customers, p=[0.5, 0.35, 0.15]),
    'initial_mrr': np.random.uniform(50, 1000, size=num_customers).round(2)
})

# Generate monthly snapshot data for each customer.
# Every customer gets one row per active month that falls inside the
# observation window (up to end_date), plus a final churn row when the
# simulated churn itself happens inside that window.
max_tenure_months = 36  # cap on simulated active months per customer
observation_end = pd.Timestamp(end_date)

records = []
for cust in df_customers.itertuples(index=False):
    # Simulate churn: number of active months is geometric with p=0.05,
    # so most customers stay many months and some churn early.
    months_active = np.random.geometric(p=0.05)
    # Month-start ('MS') snapshot dates beginning at the signup date, capped at max_tenure_months.
    active_months = list(pd.date_range(cust.signup_date, periods=min(months_active, max_tenure_months), freq='MS'))
    # Only months up to end_date are actually observed.
    observed_months = [month for month in active_months if month <= observation_end]

    for snapshot_date in observed_months:
        # Simulate metrics that might indicate churn risk
        login_count = np.random.negative_binomial(30, 0.7)  # Most will have decent login counts, some low
        support_tickets = np.random.poisson(lam=1.5)  # Some users generate more tickets
        # MRR fluctuates around the initial value (mean +2%, sd 10%), floored at 10
        mrr_change_pct = np.random.normal(0.02, 0.1)
        current_mrr = round(max(10, cust.initial_mrr * (1 + mrr_change_pct)), 2)

        records.append({
            'customer_id': cust.customer_id,
            'snapshot_yearmonth': snapshot_date.strftime('%Y-%m'),
            'snapshot_date': snapshot_date,
            'login_count_last_30d': login_count,
            'support_tickets_last_30d': support_tickets,
            'current_mrr': current_mrr,
            'is_churned': 0  # active snapshot; churn is recorded as a separate row below
        })

    # After the active months, the customer churns -- but only when the whole
    # simulated tenure was observed. If the window cut the tenure short, the
    # customer is still active at end_date and must not get a churn row dated
    # in the unobserved future.
    tenure_fully_observed = len(observed_months) == len(active_months)
    if months_active < max_tenure_months and observed_months and tenure_fully_observed:
        # NOTE(review): MonthEnd puts the churn row in the same calendar month as
        # the last active snapshot, so that customer has two rows sharing one
        # snapshot_yearmonth -- confirm this is intended.
        churn_date = observed_months[-1] + pd.offsets.MonthEnd()
        records.append({
            'customer_id': cust.customer_id,
            'snapshot_yearmonth': churn_date.strftime('%Y-%m'),
            'snapshot_date': churn_date,
            'login_count_last_30d': 0,
            'support_tickets_last_30d': 0,
            'current_mrr': 0,
            'is_churned': 1
        })

df_monthly_snapshots = pd.DataFrame(records)

# Derive a simple behavioural "Churn Risk Score" from three binary flags:
#   0.4 -> fewer than 5 logins in the last 30 days
#   0.3 -> more than 5 support tickets in the last 30 days
#   0.3 -> current MRR below 80% of the customer's highest recorded MRR
peak_mrr_per_customer = df_monthly_snapshots.groupby('customer_id')['current_mrr'].transform('max')
low_login_flag = (df_monthly_snapshots['login_count_last_30d'] < 5).astype(int)
high_ticket_flag = (df_monthly_snapshots['support_tickets_last_30d'] > 5).astype(int)
mrr_drop_flag = (df_monthly_snapshots['current_mrr'] < peak_mrr_per_customer * 0.8).astype(int)

weighted_score = low_login_flag * 0.4 + high_ticket_flag * 0.3 + mrr_drop_flag * 0.3
df_monthly_snapshots['churn_risk_score'] = weighted_score.round(2)

# Attach the static customer attributes to every snapshot row (left join on customer_id)
final_df = df_monthly_snapshots.merge(df_customers, on='customer_id', how='left')

# Persist the merged dataset as CSV, omitting the DataFrame index.
# NOTE(review): this is an absolute, machine-specific path; the target
# directory must already exist or to_csv will raise.
output_csv_path = r"C:\Users\LOQ\saas_customer_churn_analysis\data\processed\synthetic_churn_data.csv"
final_df.to_csv(output_csv_path, index=False)
print("Synthetic dataset generated successfully!")

Synthetic dataset generated successfully!
