In [4]:
pip install pandas numpy scikit-learn matplotlib seaborn joblib


Defaulting to user installation because normal site-packages is not writeable
Collecting pandas
  Using cached pandas-2.2.3-cp312-cp312-win_amd64.whl.metadata (19 kB)
Collecting matplotlib
  Downloading matplotlib-3.10.1-cp312-cp312-win_amd64.whl.metadata (11 kB)
Collecting seaborn
  Using cached seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2025.1-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Using cached tzdata-2025.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting contourpy>=1.0.1 (from matplotlib)
  Downloading contourpy-1.3.1-cp312-cp312-win_amd64.whl.metadata (5.4 kB)
Collecting cycler>=0.10 (from matplotlib)
  Using cached cycler-0.12.1-py3-none-any.whl.metadata (3.8 kB)
Collecting fonttools>=4.22.0 (from matplotlib)
  Downloading fonttools-4.56.0-cp312-cp312-win_amd64.whl.metadata (103 kB)
Collecting kiwisolver>=1.3.1 (from matplotlib)
  Downloading kiwisolver-1.4.8-cp312-cp312-win

In [5]:
import pandas as pd
import numpy as np
import random

# Seed NumPy's global RNG so every run regenerates the same dataset.
np.random.seed(42)

# Parameters for dataset generation
num_customers = 1000  # Number of customers to simulate

# Customer IDs: "C" followed by a zero-padded 4-digit counter, starting at C0001
customer_ids = [f"C{str(i).zfill(4)}" for i in range(1, num_customers + 1)]

# Customer demographics
# NOTE: np.random.randint excludes the upper bound, so ages fall in 18..79.
ages = np.random.randint(18, 80, size=num_customers)
locations = np.random.choice(['Urban', 'Rural', 'Suburban'], size=num_customers, p=[0.5, 0.3, 0.2])
plan_types = np.random.choice(['Basic', 'Premium'], size=num_customers, p=[0.7, 0.3])

# Usage patterns
average_call_duration = np.random.uniform(1, 30, size=num_customers)  # Average call duration in minutes
data_usage = np.random.uniform(0.5, 50, size=num_customers)  # Data usage in GB
number_of_calls = np.random.randint(10, 300, size=num_customers)  # Calls per month, 10..299 (upper bound exclusive)

# Billing information
monthly_charges = np.random.uniform(20, 150, size=num_customers)  # Monthly charges in dollars
payment_methods = np.random.choice(['CreditCard', 'BankTransfer'], size=num_customers, p=[0.6, 0.4])

# Churn status
# Each customer gets an individual churn probability built from additive risk
# factors; the maximum possible value is 0.3 + 0.2 + 0.1 = 0.6.
churn_probabilities = (
    0.3 * (monthly_charges > 100) +  # Higher churn probability for high charges
    0.2 * (data_usage < 5) +        # Higher churn probability for low data usage
    0.1 * (plan_types == 'Basic')   # Higher churn probability for Basic plans
)

# Guard against future edits to the weights pushing a probability out of range.
assert ((churn_probabilities >= 0) & (churn_probabilities <= 1)).all(), \
    "churn probabilities must lie in [0, 1]"

# Draw one Bernoulli outcome per customer from that customer's own probability.
# (Previously the label was sampled with a fixed 20/80 split, which ignored
# churn_probabilities and made ChurnStatus independent of every feature.)
churn_draws = np.random.random(size=num_customers)
churn_status = np.where(churn_draws < churn_probabilities, 'Yes', 'No')

# Assemble all generated columns into a single DataFrame
data = pd.DataFrame({
    'CustomerID': customer_ids,
    'Age': ages,
    'Location': locations,
    'PlanType': plan_types,
    'AverageCallDuration': average_call_duration,
    'DataUsage': data_usage,
    'NumberOfCalls': number_of_calls,
    'MonthlyCharges': monthly_charges,
    'PaymentMethod': payment_methods,
    'ChurnStatus': churn_status
})

# Save to CSV (written to the current working directory, without the index column)
data.to_csv('customer_churn_data.csv', index=False)

print("Synthetic dataset generated and saved as 'customer_churn_data.csv'.")

Synthetic dataset generated and saved as 'customer_churn_data.csv'.
