In [1]:
!pip install faker

Collecting faker
  Downloading faker-37.8.0-py3-none-any.whl.metadata (15 kB)
Downloading faker-37.8.0-py3-none-any.whl (2.0 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 2.0/2.0 MB 31.6 MB/s eta 0:00:00
Installing collected packages: faker
Successfully installed faker-37.8.0


In [4]:

import pandas as pd
import numpy as np
from faker import Faker
import os
from tqdm.auto import tqdm

# Shared Faker instance; create_synthetic_data() draws both the client UUIDs
# and the realistic-looking company names from it.
fake = Faker()

def _churn_probability(nps, avg_rating, communication_freq, tenure, contract_value):
    """Return the churn probability implied by one client's metrics, capped at 1.0."""
    churn_prob = 0.05  # Start with a base probability

    # NPS is a very strong indicator: every point below 7 adds 0.08
    if nps < 7:
        churn_prob += (7 - nps) * 0.08

    # Poor project ratings increase risk significantly
    if avg_rating < 3.5:
        churn_prob += (3.5 - avg_rating) * 0.15

    # Lack of communication is a major red flag
    if communication_freq < 5:
        churn_prob += 0.15

    # Newer clients are less "sticky"
    if tenure < 12:
        churn_prob += 0.10

    # High-value clients who are not delighted are a flight risk
    if contract_value > 500000 and nps < 8:
        churn_prob += 0.10

    # The penalties can add up to slightly more than 1.0 in the worst case.
    # Capping keeps the value a valid probability and does not change the
    # outcome of a `uniform[0, 1) < prob` draw.
    return min(churn_prob, 1.0)


def create_synthetic_data(records=100000, seed=None):
    """
    Generates a DataFrame with synthetic client data.

    The churn logic is intentionally multi-factorial. Churn is influenced by a
    combination of factors like low satisfaction (NPS), poor project outcomes,
    lack of communication, and being a relatively new client.

    Parameters
    ----------
    records : int, default 100000
        Number of client rows to generate. ``0`` (or a negative value) yields
        an empty frame that still carries every column.
    seed : int or None, default None
        When given, the numeric draws come from a dedicated
        ``np.random.RandomState(seed)`` and the module-level ``fake`` instance
        is re-seeded with the same value, so repeated calls with the same seed
        produce the same frame. When ``None``, the global ``np.random`` state
        and the current state of ``fake`` are used, exactly as before.

    Returns
    -------
    pandas.DataFrame
        One row per client with the columns listed in ``columns`` below;
        ``churn`` is 0 or 1.
    """
    # Fixed column order; also guarantees the schema when no rows are generated.
    columns = [
        'client_id',
        'company_name',
        'industry',
        'company_size',
        'contract_value',
        'contract_tenure_months',
        'service_level',
        'projects_completed',
        'avg_project_rating',
        'communication_freq_monthly',
        'nps_score',
        'payment_history',
        'churn',
    ]

    if seed is None:
        # Keep using the global NumPy state so an outer np.random.seed() call
        # is still honoured.
        rng = np.random
    else:
        rng = np.random.RandomState(seed)
        fake.seed_instance(seed)

    data = []
    print(f"Generating {records} client records...")

    # Use tqdm for a progress bar
    for _ in tqdm(range(records)):
        # --- Core Client Metrics ---
        # randint's upper bound is exclusive.
        contract_value = rng.randint(10000, 750000)
        tenure = rng.randint(3, 72)  # Contract tenure in months (3-71)
        nps = rng.randint(1, 11)  # Net Promoter Score (1-10)
        communication_freq = rng.randint(1, 40)  # Communications per month
        avg_rating = round(rng.uniform(2.0, 5.0), 1)

        # --- Multi-Factor Churn Logic ---
        churn_prob = _churn_probability(
            nps, avg_rating, communication_freq, tenure, contract_value
        )

        # Determine final churn based on the calculated probability
        churn = 1 if rng.rand() < churn_prob else 0

        data.append({
            'client_id': fake.uuid4(),
            'company_name': fake.company(),
            'industry': rng.choice(['Tech', 'Finance', 'Healthcare', 'Retail', 'Manufacturing', 'Logistics']),
            'company_size': rng.choice(['<50', '50-250', '250-1000', '1000+']),
            'contract_value': contract_value,
            'contract_tenure_months': tenure,
            'service_level': rng.choice(['Basic', 'Premium', 'Enterprise']),
            'projects_completed': rng.randint(1, 50),
            'avg_project_rating': avg_rating,
            'communication_freq_monthly': communication_freq,
            'nps_score': nps,
            'payment_history': rng.choice(['On-time', 'Late', 'Consistently Late']),
            'churn': churn
        })

    return pd.DataFrame(data, columns=columns)

# --- Main execution block ---
if __name__ == "__main__":
    # Build the synthetic client table.
    df = create_synthetic_data(records=100000)

    # Work out where the CSV goes, then create the folder if it is missing.
    output_dir = 'data'
    output_path = os.path.join(output_dir, 'consulting_churn_data.csv')
    os.makedirs(output_dir, exist_ok=True)

    # Write the rows without the DataFrame index column.
    df.to_csv(output_path, index=False)

    print(f"\n✅ Dataset successfully generated and saved to: {output_path}")

Generating 100000 client records...


  0%|          | 0/100000 [00:00<?, ?it/s]


✅ Dataset successfully generated and saved to: data/consulting_churn_data.csv
