In [1]:
import numpy as np
import pandas as pd
import random
from datetime import datetime, timedelta

In [2]:
# Dataset size: how many synthetic customer rows the notebook generates.
NUM_CUSTOMERS = 1_000

# Pin both random sources (the stdlib `random` module and NumPy's global
# generator) to a fixed seed so repeated runs draw the same values.
random.seed(42)
np.random.seed(42)

In [3]:
def generate_customer_id(num_customers):
    """Return sequential customer identifiers as a list of strings.

    Identifiers start at ``CUST_0001`` and the numeric part is zero-padded to
    at least four digits (larger numbers simply use more digits).
    """
    customer_ids = []
    for sequence_number in range(1, num_customers + 1):
        customer_ids.append(f"CUST_{sequence_number:04d}")
    return customer_ids

def generate_age(num_customers, min_age=18, max_age=90):
    """Return an integer array of synthetic customer ages.

    Ages are drawn from a normal distribution (mean 40, standard deviation 12)
    using NumPy's global random state, then limited to ``[min_age, max_age]``
    and truncated to whole years.

    Without the limits the normal tail produces implausible customers
    (children, and occasionally zero or negative ages). Values outside the
    range are clipped to the nearest bound, so a small share of the samples
    sits exactly on ``min_age``.

    Parameters
    ----------
    num_customers : int
        Number of ages to generate.
    min_age, max_age : int, optional
        Inclusive bounds applied to the generated ages.

    Returns
    -------
    numpy.ndarray
        Integer array of length ``num_customers``.
    """
    raw_ages = np.random.normal(loc=40, scale=12, size=num_customers)
    # Clip before the integer cast so out-of-range draws land on the bounds;
    # in-range draws are truncated exactly as before.
    return np.clip(raw_ages, min_age, max_age).astype(int)

def generate_gender(num_customers):
    """Return an array of gender labels, one per customer.

    Each entry is picked uniformly at random from 'Male' and 'Female' using
    NumPy's global random state.
    """
    gender_labels = ['Male', 'Female']
    return np.random.choice(gender_labels, size=num_customers)

def generate_annual_income(num_customers):
    """Return an integer array of synthetic annual incomes.

    Values are sampled from a normal distribution centred on 70000 with a
    standard deviation of 15000 (NumPy's global random state) and then
    truncated to integers.
    """
    mean_income, income_spread = 70000, 15000
    sampled = np.random.normal(mean_income, income_spread, num_customers)
    return sampled.astype(int)

def generate_spending_score(num_customers):
    """Return an integer array of spending scores between 1 and 100 inclusive.

    Scores are drawn uniformly with NumPy's global random state.
    """
    lowest_score, highest_score = 1, 100
    # randint's upper bound is exclusive, hence the +1.
    return np.random.randint(lowest_score, highest_score + 1, size=num_customers)

def generate_purchase_history(num_customers):
    """Return one list of purchase amounts per customer.

    For every customer a purchase count between 1 and 10 (inclusive) is drawn
    first, then that many integer amounts are sampled with replacement from
    100..9999 using the stdlib ``random`` module.
    """
    possible_amounts = range(100, 10000)
    histories = []
    for _ in range(num_customers):
        purchase_count = random.randint(1, 10)
        histories.append(random.choices(possible_amounts, k=purchase_count))
    return histories

def generate_last_purchase_date(num_customers):
    """Return a list of random last-purchase dates within calendar year 2023.

    Each date is 1 January 2023 plus a uniformly drawn whole-day offset
    (stdlib ``random`` module). The offset is limited to the days of that
    year, so the latest possible date is 31 December 2023; an inclusive
    upper bound of 365 would spill over into 1 January 2024.

    Parameters
    ----------
    num_customers : int
        Number of dates to generate.

    Returns
    -------
    list[datetime.datetime]
        ``num_customers`` datetimes at midnight.
    """
    start_date = datetime(2023, 1, 1)
    # Days from 1 Jan of the start year up to (not including) 1 Jan of the next year.
    days_in_year = (datetime(start_date.year + 1, 1, 1) - start_date).days
    return [
        # randrange excludes its upper bound, giving offsets 0 .. days_in_year - 1.
        start_date + timedelta(days=random.randrange(days_in_year))
        for _ in range(num_customers)
    ]

def generate_membership_status(num_customers):
    """Return an array of membership tiers, one per customer.

    Tiers are sampled with NumPy's global random state using fixed
    probabilities: 'Regular' 0.7, 'Silver' 0.2, 'Gold' 0.1.
    """
    tiers = ['Regular', 'Silver', 'Gold']
    tier_probabilities = [0.7, 0.2, 0.1]
    return np.random.choice(tiers, size=num_customers, p=tier_probabilities)

In [4]:
# Build every column for NUM_CUSTOMERS rows. The generators are invoked one
# after another in the listed order, so the random streams are consumed in a
# fixed sequence.
(
    customer_id,
    age,
    gender,
    annual_income,
    spending_score,
    purchase_history,
    last_purchase_date,
    membership_status,
) = (
    build_column(NUM_CUSTOMERS)
    for build_column in (
        generate_customer_id,
        generate_age,
        generate_gender,
        generate_annual_income,
        generate_spending_score,
        generate_purchase_history,
        generate_last_purchase_date,
        generate_membership_status,
    )
)

In [5]:
# Assemble the generated columns into a single table; keyword names become
# the column labels, in the order given.
df = pd.DataFrame(
    dict(
        CustomerID=customer_id,
        Age=age,
        Gender=gender,
        AnnualIncome=annual_income,
        SpendingScore=spending_score,
        PurchaseHistory=purchase_history,
        LastPurchaseDate=last_purchase_date,
        MembershipStatus=membership_status,
    )
)


In [6]:
# Introduce flaws: missing values.
# One randomly chosen (row, column) cell is blanked per iteration, and the
# number of iterations is 5% of the row count. Rows and columns are drawn with
# replacement, so the same cell can be hit more than once; any column may be
# selected, including 'CustomerID'.
# The column labels are copied into a plain list because random.choice()
# expects a real sequence; some Python 3.11 releases truth-test the argument,
# which raises for a pandas Index.
column_names = list(df.columns)
for _ in range(int(NUM_CUSTOMERS * 0.05)):
    missing_row = random.randint(0, NUM_CUSTOMERS - 1)
    missing_column = random.choice(column_names)
    df.loc[missing_row, missing_column] = np.nan

# Introduce flaws: outliers.
# Overwrite 'AnnualIncome' with a value between 200000 and 500000 (inclusive)
# for as many randomly drawn rows as 1% of the row count. Rows are drawn with
# replacement and a row blanked above may be overwritten here.
for _ in range(int(NUM_CUSTOMERS * 0.01)):
    outlier_row = random.randint(0, NUM_CUSTOMERS - 1)
    df.loc[outlier_row, 'AnnualIncome'] = random.randint(200000, 500000)


In [7]:
# Write the table to CSV in the working directory (row index omitted), then
# report completion on stdout.
df.to_csv(path_or_buf='customer_behavior_data.csv', index=False)

print("Data generation complete. File saved as 'customer_behavior_data.csv'")

Data generation complete. File saved as 'customer_behavior_data.csv'
