**Assignment: Customer Churn Prediction**

In [1]:
## Generating a synthetic dataset of customer records with specific characteristics and attributes

# Importing the required libraries
import pandas as pd
import numpy as np
from faker import Faker
import random

In [2]:
# Number of synthetic customer records to generate
num_records = 5000
# Single seed shared by every random source so the dataset is reproducible
SEED = 64
fake = Faker()
# Faker draws from its own random.Random instance, so seeding numpy and the
# stdlib `random` module alone would leave every Faker-generated column
# (and the UUIDs) different on each run. Seed all three explicitly.
np.random.seed(SEED)
random.seed(SEED)
fake.seed_instance(SEED)

In [3]:
# Value pools for the categorical columns, declared up front so each
# column's domain can be read at a glance.
gender_options = ('Male', 'Female')
contract_options = ('Month-to-month', 'One year', 'Two year')
yes_no_options = ('Yes', 'No')
internet_options = ('DSL', 'Fiber optic', 'No')
payment_options = (
    'Electronic check',
    'Mailed check',
    'Bank transfer (automatic)',
    'Credit card (automatic)',
)
record_range = range(num_records)

# NOTE: the columns are drawn from `fake` one after another in the order
# below; reordering them changes which random draws land in which column.

# Age: integer drawn with min=18, max=70
ages = [fake.random_int(min=18, max=70) for _ in record_range]

# Gender
genders = [fake.random_element(elements=gender_options) for _ in record_range]

# ContractType
contract_types = [fake.random_element(elements=contract_options) for _ in record_range]

# MonthlyCharges: a fixed-length two-digit whole part plus a cents part
# (a number of up to two digits divided by 100), rounded to 2 decimals
monthly_charges = []
for _ in record_range:
    whole_part = fake.random_number(digits=2, fix_len=True)
    cents_part = fake.random_number(digits=2, fix_len=False)
    monthly_charges.append(round(whole_part + cents_part / 100, 2))

# Tenure: integer drawn with min=1, max=72
tenures = [fake.random_int(min=1, max=72) for _ in record_range]

# TotalCharges: monthly charge multiplied by tenure, rounded to 2 decimals
total_charges = [
    round(charge * months, 2)
    for charge, months in zip(monthly_charges, tenures)
]

# TechSupport
tech_support = [fake.random_element(elements=yes_no_options) for _ in record_range]

# InternetService
internet_services = [fake.random_element(elements=internet_options) for _ in record_range]

# PaperlessBilling
paperless_billing = [fake.random_element(elements=yes_no_options) for _ in record_range]

# PaymentMethod
payment_methods = [fake.random_element(elements=payment_options) for _ in record_range]

# Churn: 0/1 labels sampled with probabilities 0.7 and 0.3 respectively
churn = np.random.choice([0, 1], size=num_records, p=[0.7, 0.3])

In [4]:
# Derived features, computed record by record from total charges and tenure

# Average charge per month of tenure; falls back to 0 when tenure is 0
average_monthly_charges = [
    round(total / months, 2) if months != 0 else 0
    for total, months in zip(total_charges, tenures)
]

# Total charges scaled by (1 + tenure expressed in years)
customer_lifetime_value = [
    round(total * (1 + (months / 12)), 2)
    for total, months in zip(total_charges, tenures)
]


In [5]:
# One UUID4 per record, used as the customer identifier
customer_ids = [fake.uuid4() for _ in range(num_records)]

# Map each output column name to its list of values; the insertion order
# here is the column order of the resulting DataFrame.
column_values = {}
column_values['CustomerID'] = customer_ids
column_values['Age'] = ages
column_values['Gender'] = genders
column_values['ContractType'] = contract_types
column_values['MonthlyCharges'] = monthly_charges
column_values['TotalCharges'] = total_charges
column_values['TechSupport'] = tech_support
column_values['InternetService'] = internet_services
column_values['Tenure'] = tenures
column_values['PaperlessBilling'] = paperless_billing
column_values['PaymentMethod'] = payment_methods
column_values['Churn'] = churn
column_values['AverageMonthlyCharges'] = average_monthly_charges
column_values['CustomerLifetimeValue'] = customer_lifetime_value

# Assemble the full dataset
df = pd.DataFrame(column_values)

In [6]:
# Introduce outliers: inflate the charges of a small random subset of rows.
# NOTE: this mutates `df` in place, so re-running the cell picks new rows and
# scales them again; re-run the notebook from the top to regenerate the data.
outlier_fraction = 0.01  # share of rows turned into outliers
outlier_factor = 3       # multiplier applied to the charges of those rows
outlier_indices = np.random.choice(df.index, size=int(num_records * outlier_fraction), replace=False)

# Scale both charge columns together and re-round so the values keep the
# same 2-decimal precision as the rest of the column (avoids float noise
# such as 89.97000000000001 ending up in the CSV).
charge_columns = ['MonthlyCharges', 'TotalCharges']
df.loc[outlier_indices, charge_columns] = (
    df.loc[outlier_indices, charge_columns] * outlier_factor
).round(2)

# The derived columns were computed from TotalCharges before the scaling, so
# recompute them for the affected rows with the same formulas; otherwise they
# would contradict the inflated charges they are derived from.
outlier_rows = df.loc[outlier_indices]
outlier_tenure = outlier_rows['Tenure']
df.loc[outlier_indices, 'AverageMonthlyCharges'] = (
    (outlier_rows['TotalCharges'] / outlier_tenure)
    .round(2)
    .where(outlier_tenure != 0, 0)  # same zero-tenure fallback as the original formula
)
df.loc[outlier_indices, 'CustomerLifetimeValue'] = (
    outlier_rows['TotalCharges'] * (1 + (outlier_tenure / 12))
).round(2)


In [7]:
# Write the dataset to a CSV file (path relative to the working directory),
# leaving the DataFrame index out of the file
output_path = 'customer_data.csv'
df.to_csv(output_path, index=False)