In [94]:
import pandas as pd
import numpy as np
import random
from faker import Faker

# Shared Faker instance used to generate fake values (e.g. survey dates).
fake = Faker()
# Seed all three random sources (Faker, NumPy, stdlib `random`) with the same
# fixed value so repeated runs can reproduce the same synthetic data.
Faker.seed(123)
np.random.seed(123)
random.seed(123)

# US states from which a respondent's state is drawn.
states_list = ['Maine', 'New Hampshire', 'Vermont', 'Massachusetts', 'Rhode Island', 'Connecticut', 'New York', 'New Jersey', 'Pennsylvania']

# Map survey attributes to the probability that a respondent would purchase again
def gen_purchase_again_prob(product_category, collection, income, different_media, where_learn):
    """Draw a simulated 'Purchase Again' answer for one survey respondent.

    Starts from a base 'Yes' probability of 0.35, adds a fixed bonus for each
    attribute value listed below, clamps the total to [0, 1] and then samples
    the answer with ``np.random.choice``.

    ``product_category`` and ``income`` are compared after stripping
    surrounding whitespace, and both spellings of the affected labels are
    accepted ('Action Figure' / 'Action Figures', '50-100k' / '50k-100k').
    Previously only the first spelling of each was checked, so labels such as
    'Action Figures', '50k-100k' or ' 100k+' silently received no bonus.

    Args:
        product_category: Product label; 'Action Figure(s)' adds 0.05,
            'Plush' adds 0.03, anything else adds nothing.
        collection: 'Yes' adds 0.20.
        income: Income bracket label; '100k+' adds 0.08, '50-100k' /
            '50k-100k' adds 0.4.
        different_media: 'Yes' adds 0.15.
        where_learn: 'Commercial' adds 0.10.

    Returns:
        'Yes' or 'No', as returned by ``np.random.choice``.
    """
    # Normalize the two labels whose spelling/whitespace varies between callers.
    product_category = str(product_category).strip()
    income = str(income).strip()

    base_probability = 0.350  # Base probability of purchase_again = Yes

    if product_category in ('Action Figure', 'Action Figures'):
        base_probability += 0.05
    elif product_category == 'Plush':
        base_probability += 0.03

    if collection == 'Yes':
        base_probability += 0.20

    if income == '100k+':
        base_probability += 0.08
    elif income in ('50-100k', '50k-100k'):
        # NOTE(review): 0.4 is far larger than the 0.08 bonus for '100k+' and
        # may be a typo for 0.04 -- value kept as written; confirm intent.
        base_probability += 0.4

    if different_media == 'Yes':
        base_probability += 0.15

    if where_learn == 'Commercial':
        base_probability += 0.10

    # The bonuses can push the sum above 1, so clamp to a valid probability.
    yes_probability = max(0, min(1, base_probability))

    return np.random.choice(['Yes', 'No'], p=[yes_probability, 1 - yes_probability])

def create_survey_df(rows = 100):
    """Build a synthetic customer-survey DataFrame with ``rows`` records.

    Each record is generated independently: a random date between 2020-01-01
    and 2023-12-31 from Faker, categorical answers drawn with
    ``np.random.choice`` using the fixed weights below, and a 'Purchase Again'
    answer derived from the other attributes by ``gen_purchase_again_prob``.

    Args:
        rows: Number of survey records to generate.

    Returns:
        A ``pandas.DataFrame`` with the columns 'Date', 'State',
        'Product Category', 'Gender', 'Gift', 'Collection', 'Income',
        'Different Media', 'Where Learn' and 'Purchase Again'.
    """
    # Local import: `datetime` is not among the imports visible at the top of
    # this file, so the original code raised NameError here.
    from datetime import datetime

    # Loop-invariant bounds for the survey date.
    start_date = datetime(2020, 1, 1)
    end_date = datetime(2023, 12, 31)

    dataset = []
    # `_` instead of `rows`: the original loop variable shadowed the parameter.
    for _ in range(rows):
        # The order of the random draws is kept as-is so a seeded run yields
        # the same sequence of values.
        date = fake.date_time_between(start_date=start_date, end_date=end_date).date()
        state = np.random.choice(states_list)
        product_category = np.random.choice(['Action Figures', 'Plush', 'Puzzle'], p=[0.5, 0.3, 0.2])
        gender = np.random.choice(['Male', 'Female', 'Other'], p=[0.52, 0.46, 0.02])
        gift = np.random.choice(['Yes', 'No'], p=[0.35, 0.65])
        collection = np.random.choice(['Yes', 'No', "Don't know"], p=[0.25, 0.40, 0.35])
        # Label fixed from ' 100k+' (stray leading space), which leaked into
        # the data and never matched the '100k+' check in the probability model.
        income = np.random.choice(['<50k', '50k-100k', '100k+'], p=[0.30, 0.35, 0.35])
        different_media = np.random.choice(['Yes', 'No', "Don't know"], p=[0.30, 0.50, 0.20])
        where_learn = np.random.choice(['Browsing store', 'Commercial', 'Other'], p=[0.40, 0.40, 0.20])
        purchase_again = gen_purchase_again_prob(product_category, collection, income, different_media, where_learn)

        dataset.append({
            'Date': date,
            'State': state,
            'Product Category': product_category,
            'Gender': gender,
            'Gift': gift,
            'Collection': collection,
            'Income': income,
            'Different Media': different_media,
            'Where Learn': where_learn,
            'Purchase Again': purchase_again,
        })

    return pd.DataFrame(dataset)
        

In [100]:
# Generate a 10,000-row synthetic survey dataset.
df = create_survey_df(rows = 10_000)
# Bare expression: in a notebook cell this renders the DataFrame as output.
df