In [1]:
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
import random
import os

In [2]:
# Set random seed for reproducibility
np.random.seed(42)
random.seed(42)

In [3]:
# Define constants
NUM_CUSTOMERS = 500000
NUM_CAMPAIGNS = 10
NUM_PRODUCTS = 50
NUM_ZIPCODES = 1000
START_DATE = datetime(2022, 1, 1)
END_DATE = datetime(2023, 12, 31)
DAYS_RANGE = (END_DATE - START_DATE).days

In [4]:
# Helper functions
def random_date(start_date, end_date):
    time_between_dates = end_date - start_date
    days_between_dates = time_between_dates.days
    random_number_of_days = random.randrange(days_between_dates)
    return start_date + timedelta(days=random_number_of_days)

In [5]:
# 1. Generate Customer_Profile data
print("Generating Customer_Profile data...")
customer_ids = list(range(1, NUM_CUSTOMERS + 1))
ages = np.random.normal(42, 15, NUM_CUSTOMERS).astype(int)
# Ensure ages are within reasonable range (18-90)
ages = np.clip(ages, 18, 90)

# Income with higher probability for middle-income ranges
income_base = np.random.lognormal(mean=11, sigma=0.7, size=NUM_CUSTOMERS)
incomes = np.round(income_base).astype(int)
# Clip to reasonable income range
incomes = np.clip(incomes, 20000, 250000)

genders = np.random.choice(['M', 'F', 'O'], NUM_CUSTOMERS, p=[0.48, 0.48, 0.04])
zip_codes = [f"{random.randint(10000, 99999):05d}" for _ in range(NUM_CUSTOMERS)]
homeowner_flags = np.random.choice([0, 1], NUM_CUSTOMERS, p=[0.35, 0.65])

customer_profile = pd.DataFrame({
    'customer_id': customer_ids,
    'age': ages,
    'income': incomes,
    'gender': genders,
    'zip_code': zip_codes,
    'homeowner_flag': homeowner_flags
})

# 2. Generate Geographic_Data
print("Generating Geographic_Data...")
unique_zip_codes = list(set(zip_codes))
regions = np.random.choice(['North', 'South', 'East', 'West', 'Central'], len(unique_zip_codes))
# Population density with higher values for urban areas
population_density = np.random.exponential(scale=2000, size=len(unique_zip_codes)).astype(int)
population_density = np.clip(population_density, 50, 25000)

geographic_data = pd.DataFrame({
    'zip_code': unique_zip_codes,
    'region': regions,
    'population_density': population_density
})

# 3. Generate Campaign_Responses data
print("Generating Campaign_Responses data...")
# Create multiple campaigns with different response rates
campaign_ids = list(range(1, NUM_CAMPAIGNS + 1))
campaign_response_rates = {
    campaign_id: np.random.uniform(0.05, 0.25) for campaign_id in campaign_ids
}

# Factors that influence response probability
def response_probability(customer_row, campaign_id):
    # Base probability from campaign
    prob = campaign_response_rates[campaign_id]
    
    # Age factor - middle-aged customers respond more
    age = customer_row['age']
    if 30 <= age <= 55:
        prob *= 1.3
    elif age > 70:
        prob *= 0.7
    
    # Income factor - higher income increases response
    income = customer_row['income']
    if income > 100000:
        prob *= 1.4
    elif income < 40000:
        prob *= 0.8
    
    # Homeowner factor
    if customer_row['homeowner_flag'] == 1:
        prob *= 1.2
    
    return min(prob, 0.95)  # Cap at 95% probability

campaign_responses = []
for customer_idx, customer_row in customer_profile.iterrows():
    # Each customer gets exposed to a random subset of campaigns
    num_campaigns_per_customer = random.randint(1, NUM_CAMPAIGNS)
    customer_campaigns = random.sample(campaign_ids, num_campaigns_per_customer)
    
    for campaign_id in customer_campaigns:
        # Determine if customer responded
        prob = response_probability(customer_row, campaign_id)
        response = np.random.choice([0, 1], p=[1-prob, prob])
        
        # If responded, generate response date
        response_date = None
        if response == 1:
            campaign_date = random_date(START_DATE, END_DATE)
            # Response happens within 30 days of campaign
            response_date = campaign_date + timedelta(days=random.randint(1, 30))
            if response_date > END_DATE:
                response_date = END_DATE
        
        campaign_responses.append({
            'customer_id': customer_row['customer_id'],
            'campaign_id': campaign_id,
            'response_flag': response,
            'response_date': response_date
        })

campaign_responses_df = pd.DataFrame(campaign_responses)

# 4. Generate Product_Interactions data
print("Generating Product_Interactions data...")
product_ids = list(range(1, NUM_PRODUCTS + 1))
interaction_types = ['viewed', 'added_to_cart', 'purchased', 'reviewed']

# Probability distribution for interaction types
interaction_probs = {
    'viewed': 0.6,
    'added_to_cart': 0.2,
    'purchased': 0.15,
    'reviewed': 0.05
}

product_interactions = []
# Generate more interactions for customers who responded to campaigns
responsive_customers = set(campaign_responses_df[campaign_responses_df['response_flag'] == 1]['customer_id'])

for customer_id in customer_ids:
    # Determine number of interactions based on responsiveness
    if customer_id in responsive_customers:
        num_interactions = random.randint(3, 15)
    else:
        num_interactions = random.randint(0, 8)
    
    # Generate interactions
    for _ in range(num_interactions):
        product_id = random.choice(product_ids)
        
        # Weighted random choice for interaction type
        interaction_type = random.choices(
            list(interaction_probs.keys()),
            weights=list(interaction_probs.values())
        )[0]
        
        interaction_date = random_date(START_DATE, END_DATE)
        
        product_interactions.append({
            'customer_id': customer_id,
            'product_id': product_id,
            'interaction_type': interaction_type,
            'interaction_date': interaction_date
        })

product_interactions_df = pd.DataFrame(product_interactions)

# 5. Generate Customer_Revenue data
print("Generating Customer_Revenue data...")
# Revenue is influenced by income, age, and product interactions
customer_revenue = []

# Get purchase interactions to correlate with revenue
purchase_interactions = product_interactions_df[product_interactions_df['interaction_type'] == 'purchased']
purchase_customers = set(purchase_interactions['customer_id'])

for customer_id in customer_ids:
    customer_row = customer_profile[customer_profile['customer_id'] == customer_id].iloc[0]
    
    # Determine number of revenue entries
    if customer_id in purchase_customers:
        num_entries = random.randint(1, 10)
    else:
        # Some customers might have revenue without recorded purchases
        num_entries = random.randint(0, 2)
    
    for _ in range(num_entries):
        # Base revenue influenced by income
        income = customer_row['income']
        base_revenue = np.random.lognormal(mean=4, sigma=1)
        
        # Scale by income factor
        income_factor = (income / 50000) ** 0.5
        scaled_revenue = base_revenue * income_factor
        
        # Add randomness
        final_revenue = max(10, int(scaled_revenue * random.uniform(0.7, 1.3)))
        
        # Cap at reasonable amount
        final_revenue = min(final_revenue, 5000)
        
        revenue_date = random_date(START_DATE, END_DATE)
        
        customer_revenue.append({
            'customer_id': customer_id,
            'revenue_amount': final_revenue,
            'date': revenue_date
        })

customer_revenue_df = pd.DataFrame(customer_revenue)

# Save all dataframes to CSV
print("Saving data to CSV files...")
output_dir = os.path.join(os.getcwd(), 'data')
os.makedirs(output_dir, exist_ok=True)

customer_profile.to_csv(f"{output_dir}/customer_profile.csv", index=False)
geographic_data.to_csv(f"{output_dir}/geographic_data.csv", index=False)
campaign_responses_df.to_csv(f"{output_dir}/campaign_responses.csv", index=False)
product_interactions_df.to_csv(f"{output_dir}/product_interactions.csv", index=False)
customer_revenue_df.to_csv(f"{output_dir}/customer_revenue.csv", index=False)

# Print summary statistics
print("\nData Generation Complete!")
print(f"Customer_Profile: {len(customer_profile)} rows")
print(f"Geographic_Data: {len(geographic_data)} rows")
print(f"Campaign_Responses: {len(campaign_responses_df)} rows")
print(f"Product_Interactions: {len(product_interactions_df)} rows")
print(f"Customer_Revenue: {len(customer_revenue_df)} rows")

# Create a data summary file
with open(f"{output_dir}/data_summary.txt", 'w') as f:
    f.write("Marketing Analytics Synthetic Data Summary\n")
    f.write("========================================\n\n")
    
    f.write(f"Total Customers: {NUM_CUSTOMERS}\n")
    f.write(f"Total Campaigns: {NUM_CAMPAIGNS}\n")
    f.write(f"Total Products: {NUM_PRODUCTS}\n")
    f.write(f"Date Range: {START_DATE.strftime('%Y-%m-%d')} to {END_DATE.strftime('%Y-%m-%d')}\n\n")
    
    f.write("Table Sizes:\n")
    f.write(f"- Customer_Profile: {len(customer_profile)} rows\n")
    f.write(f"- Geographic_Data: {len(geographic_data)} rows\n")
    f.write(f"- Campaign_Responses: {len(campaign_responses_df)} rows\n")
    f.write(f"- Product_Interactions: {len(product_interactions_df)} rows\n")
    f.write(f"- Customer_Revenue: {len(customer_revenue_df)} rows\n\n")
    
    f.write("Data Characteristics:\n")
    f.write(f"- Age range: {customer_profile['age'].min()} to {customer_profile['age'].max()}\n")
    f.write(f"- Income range: ${customer_profile['income'].min()} to ${customer_profile['income'].max()}\n")
    f.write(f"- Gender distribution: {dict(customer_profile['gender'].value_counts())}\n")
    f.write(f"- Homeowner percentage: {customer_profile['homeowner_flag'].mean() * 100:.1f}%\n")
    f.write(f"- Average campaign response rate: {campaign_responses_df['response_flag'].mean() * 100:.2f}%\n")
    f.write(f"- Average revenue per transaction: ${customer_revenue_df['revenue_amount'].mean():.2f}\n")

print(f"Data summary saved to {output_dir}/data_summary.txt")

Generating Customer_Profile data...
Generating Geographic_Data...
Generating Campaign_Responses data...
Generating Product_Interactions data...
Generating Customer_Revenue data...
Saving data to CSV files...

Data Generation Complete!
Customer_Profile: 500000 rows
Geographic_Data: 89627 rows
Campaign_Responses: 2752331 rows
Product_Interactions: 3599059 rows
Customer_Revenue: 1893603 rows
Data summary saved to c:\Users\ayoad\OneDrive\Desktop\project\homeserve\data/data_summary.txt
