DATASET CREATION

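Create a synthetic dataset that simulates customer data for 1,000 customers, including features such as age, gender, tenure, monthly data usage, call duration, and churn indicators. Every column is drawn at random with a fixed seed (42), so the table is reproducible; the columns are generated independently of one another, so the churn indicator is not driven by any of the other features. The resulting table is saved to `Safaricom_churn_data.csv`.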

In [6]:
import pandas as pd
import numpy as np

# Step 1: Size of the synthetic customer base
num_customers = 1000

# Step 2: Seed NumPy's global generator so every run reproduces the same table.
# NOTE: the draws below consume the global random stream in column order, so
# reordering, adding or removing a random column changes every later column.
np.random.seed(42)


def _uniform_ints(low, high):
    """Return one integer per customer, drawn uniformly from low..high inclusive."""
    # np.random.randint excludes its upper bound, hence the +1.
    return np.random.randint(low, high + 1, num_customers)


def _pick(options, **kwargs):
    """Return one randomly chosen element of `options` per customer."""
    return np.random.choice(options, num_customers, **kwargs)


# Current-period profile and usage columns.
data = {
    'Customer ID': [f'CUST{idx:04d}' for idx in range(1, num_customers + 1)],
    'Age': _uniform_ints(18, 70),
    'Gender': _pick(['Male', 'Female']),
    'Tenure (Years)': _uniform_ints(1, 10),
    'Region': _pick(['Nairobi', 'Mombasa', 'Kisumu', 'Nakuru']),
    'Monthly Data Usage (MB)': _uniform_ints(500, 10000),
    'Call Duration (Minutes)': _uniform_ints(0, 1000),
    'SMS Sent': _uniform_ints(0, 500),
    'M-Pesa Transactions': _uniform_ints(0, 200),
    'Customer Service Interactions': _uniform_ints(0, 10),
    'Subscription Type': _pick(['Prepaid', 'Postpaid']),
    # 1 = churned; drawn with probability 0.2, i.e. roughly a 20% churn rate.
    'Churn Indicator': _pick([0, 1], p=[0.8, 0.2]),
}

# Three months of history per usage metric, using the same value ranges as the
# current-period columns. Metric is the outer loop and month the inner one, so
# the columns come out as data x3, then calls x3, then SMS x3.
_HISTORY_METRICS = (
    ('Data Usage (MB)', 500, 10000),
    ('Call Duration (Minutes)', 0, 1000),
    ('SMS Sent', 0, 500),
)
data.update({
    f'{month} Month {metric}': _uniform_ints(low, high)
    for metric, low, high in _HISTORY_METRICS
    for month in ('Last', 'Second Last', 'Third Last')
})

# Marketing, billing and pricing columns.
data.update({
    'Promotions': _uniform_ints(0, 5),  # Number of promotions received
    'Payment Method': _pick(['Credit Card', 'Mobile Money', 'Debit Card', 'Cash']),
    # The three rates are scalars; pandas repeats them on every row.
    'Data Rate per MB': 0.05,
    'Call Rate per Minute': 0.02,
    'SMS Rate per Message': 0.01,
    'Churn History': _uniform_ints(0, 1),  # Prior churn count; this draw only yields 0 or 1
})

# Step 3: Assemble the columns into a DataFrame
Safaricom_churn_data = pd.DataFrame(data)

# Step 4: Persist the table as CSV (without the positional index)
Safaricom_churn_data.to_csv('Safaricom_churn_data.csv', index=False)

# Preview the first five rows
Safaricom_churn_data.head()


Unnamed: 0,Customer ID,Age,Gender,Tenure (Years),Region,Monthly Data Usage (MB),Call Duration (Minutes),SMS Sent,M-Pesa Transactions,Customer Service Interactions,...,Third Last Month Call Duration (Minutes),Last Month SMS Sent,Second Last Month SMS Sent,Third Last Month SMS Sent,Promotions,Payment Method,Data Rate per MB,Call Rate per Minute,SMS Rate per Message,Churn History
0,CUST0001,56,Male,2,Mombasa,6768,434,107,35,9,...,440,37,335,92,1,Credit Card,0.05,0.02,0.01,1
1,CUST0002,69,Male,10,Mombasa,1410,383,331,17,8,...,946,42,145,459,5,Credit Card,0.05,0.02,0.01,0
2,CUST0003,46,Female,9,Nakuru,4840,559,422,89,9,...,780,116,309,64,1,Debit Card,0.05,0.02,0.01,0
3,CUST0004,32,Female,8,Nakuru,4509,193,89,180,9,...,895,404,126,320,4,Mobile Money,0.05,0.02,0.01,0
4,CUST0005,60,Female,9,Nairobi,7128,735,204,198,0,...,919,474,174,9,5,Mobile Money,0.05,0.02,0.01,0
