In [10]:
import numpy as np
import pandas as pd

# Number of samples
num_customers = 1000

# Set random seed for reproducibility
np.random.seed(42)

# 1. Customer ID
customer_id = np.arange(1, num_customers + 1)

# 2. Age
age = np.random.randint(18, 70, size=num_customers)

# 3. Gender
gender = np.random.choice(['Male', 'Female', 'Other'], size=num_customers, p=[0.48, 0.48, 0.04])

# 4. Annual Income (in USD)
income = np.random.normal(loc=60000, scale=15000, size=num_customers)
income = np.clip(income, 20000, 120000)  # Limiting income between 20k and 120k

# 5. Spending Score (1-100)
spending_score = np.random.uniform(1, 100, size=num_customers)

# 6. Membership Status
membership_status = np.random.choice(
    ['Bronze', 'Silver', 'Gold', 'Platinum'],
    size=num_customers,
    p=[0.5, 0.3, 0.15, 0.05]
)

# Convert membership_status to pandas Series
membership_status_series = pd.Series(membership_status)

# 7. Product Category Preference
product_category = np.random.choice(
    ['Electronics', 'Fashion', 'Grocery', 'Home & Living', 'Sports', 'Health & Beauty'],
    size=num_customers
)

# 8. Purchase Frequency (per year)
# Map membership statuses to multipliers
membership_multiplier = membership_status_series.map({
    'Bronze': 1.0,
    'Silver': 1.2,
    'Gold': 1.5,
    'Platinum': 2.0
}).values

# Base frequency influenced by spending score
base_frequency = (spending_score / 100) * np.random.uniform(1, 12, size=num_customers)

# Calculate purchase frequency
purchase_frequency = base_frequency * membership_multiplier
purchase_frequency = np.clip(purchase_frequency, 1, 50).astype(int)

# 9. Average Transaction Value
average_transaction_value = (income / 1000) * (spending_score / 100) * np.random.uniform(10, 100)
average_transaction_value = np.clip(average_transaction_value, 10, 1000)

# 10. Churn Risk Score (0 to 1)
# Map membership statuses to churn factors
churn_factor = membership_status_series.map({
    'Bronze': 0.9,
    'Silver': 0.7,
    'Gold': 0.5,
    'Platinum': 0.3
}).values

# Calculate churn risk
churn_risk = 1 - (spending_score / 100) * churn_factor
churn_risk += np.random.normal(0, 0.05, num_customers)
churn_risk = np.clip(churn_risk, 0, 1)

# D. Create Target Variables
# 1. Purchase in Next Month (Yes/No)
purchase_prob = (1 - churn_risk) * (spending_score / 100)
purchase_next_month = np.random.binomial(1, purchase_prob)

# 2. Expected Spend Next Month
expected_spend = purchase_next_month * average_transaction_value * np.random.uniform(0.5, 1.5, size=num_customers)

# E. Assemble the DataFrame
data = pd.DataFrame({
    'CustomerID': customer_id,
    'Age': age,
    'Gender': gender,
    'AnnualIncome': income.astype(int),
    'SpendingScore': spending_score,
    'MembershipStatus': membership_status,
    'ProductCategory': product_category,
    'PurchaseFrequency': purchase_frequency,
    'AvgTransactionValue': average_transaction_value,
    'ChurnRisk': churn_risk,
    'PurchaseNextMonth': purchase_next_month,
    'ExpectedSpend': expected_spend
})


In [11]:
data

Unnamed: 0,CustomerID,Age,Gender,AnnualIncome,SpendingScore,MembershipStatus,ProductCategory,PurchaseFrequency,AvgTransactionValue,ChurnRisk,PurchaseNextMonth,ExpectedSpend
0,1,56,Female,43615,1.001152,Gold,Health & Beauty,1,28.254951,1.000000,0,0.000000
1,2,69,Male,55253,42.240017,Bronze,Electronics,4,1000.000000,0.574014,0,0.000000
2,3,46,Male,78196,52.728508,Gold,Home & Living,6,1000.000000,0.760902,0,0.000000
3,4,32,Male,62125,6.408808,Bronze,Electronics,1,257.634820,0.918445,0,0.000000
4,5,60,Male,94789,97.334738,Silver,Electronics,13,1000.000000,0.335050,1,885.421100
...,...,...,...,...,...,...,...,...,...,...,...,...
995,996,60,Male,64794,83.460042,Bronze,Health & Beauty,5,1000.000000,0.309600,1,1001.712817
996,997,64,Female,52437,54.489357,Bronze,Sports,1,1000.000000,0.519148,0,0.000000
997,998,62,Female,58777,84.621925,Gold,Fashion,5,1000.000000,0.545572,0,0.000000
998,999,35,Female,65215,43.698869,Bronze,Grocery,4,1000.000000,0.609473,0,0.000000


In [12]:
data.to_csv('customer_behavior_dataset.csv', index=False)
