In [2]:
from pathlib import Path

import numpy as np
import pandas as pd


## Create users

In [5]:
# ========================
# 1. User Demographics
# ========================
def assign_demographics(num_users=100, seed=None):
    """Generate a synthetic user table with randomly drawn demographics.

    For every user, an age group, gender, income bracket and customer
    segment are each drawn independently from the weighted category lists
    defined in the function body.

    Parameters
    ----------
    num_users : int, default 100
        Number of users to create. IDs run from ``user_1`` to
        ``user_<num_users>``; a value of 0 yields an empty frame that still
        has all five columns.
    seed : int or None, default None
        ``None`` draws from NumPy's global legacy generator (``np.random``),
        exactly as before this parameter existed. Any other value seeds a
        private ``np.random.RandomState`` so the result is reproducible and
        the global random state is left untouched.

    Returns
    -------
    pandas.DataFrame
        One row per user with columns ``user_id``, ``age_group``,
        ``gender``, ``income_bracket`` and ``customer_type``.
    """
    age_groups = ['18-24', '25-34', '35-44', '45-54', '55-64', '65+']
    age_probs = [0.12, 0.15, 0.14, 0.15, 0.18, 0.26]
    genders = ['Male', 'Female']
    gender_probs = [0.49, 0.51]
    income_brackets = ['<50K', '50-100K', '>100K']
    income_probs = [0.3, 0.4, 0.3]
    customer_segments = ['Loyal', 'Frequent', 'Occasional']
    segment_probs = [0.3, 0.2, 0.5]

    # The np.random module and a RandomState instance expose the same
    # `choice` API, so both can be used interchangeably below.
    rng = np.random if seed is None else np.random.RandomState(seed)

    columns = ['user_id', 'age_group', 'gender', 'income_bracket', 'customer_type']
    records = []
    for i in range(1, num_users + 1):
        # Draw order (age, gender, income, segment) is kept per user so a
        # seeded run yields the same rows as seeding the global generator
        # and calling the function without `seed`.
        records.append([
            f"user_{i}",
            rng.choice(age_groups, p=age_probs),
            rng.choice(genders, p=gender_probs),
            rng.choice(income_brackets, p=income_probs),
            rng.choice(customer_segments, p=segment_probs),
        ])
    return pd.DataFrame(records, columns=columns)


In [12]:
# ========================
# 2. Regional Location & Age Preferences
# ========================
def assign_location_and_preferences(user_df, seed=None):
    """Add a random Australian state and age-based tier preferences to users.

    Three columns are written onto ``user_df`` itself (the caller's frame is
    modified in place) and the same frame is returned:

    * ``state`` - drawn per row from the weighted list of states below.
    * ``age_cat`` - ``age_group`` collapsed to ``'18-34'``, ``'35-54'`` or
      ``'55+'``. Any value outside the four younger bands, including
      unexpected ones, falls into ``'55+'``.
    * ``preferred_tiers`` - a two-element list of tier names chosen by
      ``age_cat``. Every row receives its own list object.

    Parameters
    ----------
    user_df : pandas.DataFrame
        Must contain an ``age_group`` column.
    seed : int or None, default None
        ``None`` draws states from NumPy's global legacy generator, exactly
        as before this parameter existed. Any other value seeds a private
        ``np.random.RandomState`` for a reproducible draw.

    Returns
    -------
    pandas.DataFrame
        The same object as ``user_df``, with the three columns added.
    """
    # Assign Australian states
    states = ['NSW', 'VIC', 'QLD', 'WA', 'SA', 'TAS', 'ACT', 'NT']
    state_probs = [0.32, 0.26, 0.2, 0.1, 0.06, 0.03, 0.02, 0.01]
    rng = np.random if seed is None else np.random.RandomState(seed)
    user_df['state'] = rng.choice(states, size=len(user_df), p=state_probs)

    def age_group_to_category(age_group):
        if age_group in ('18-24', '25-34'):
            return '18-34'
        if age_group in ('35-44', '45-54'):
            return '35-54'
        return '55+'

    user_df['age_cat'] = user_df['age_group'].apply(age_group_to_category)

    # Map age category to preferred tiers
    age_preferences = {
        '18-34': ['high', 'medium'],
        '35-54': ['medium', 'low'],
        '55+': ['high', 'low']
    }
    # Mapping with the dict directly would place the *same* list object in
    # every row of a category, so editing one user's tiers would silently
    # edit them all. Copy the list per row instead.
    user_df['preferred_tiers'] = user_df['age_cat'].map(
        lambda cat: list(age_preferences[cat])
    )
    return user_df


In [13]:
# 1. Generate demographics
# Seed NumPy's global generator before the first random draw so that
# Restart Kernel -> Run All regenerates the same synthetic users every time.
np.random.seed(42)
user_df = assign_demographics(num_users=50)

In [14]:
# 2. Assign regional location & age preferences
# Adds the 'state', 'age_cat' and 'preferred_tiers' columns; the function
# writes them onto the frame it receives and returns that same frame.
user_df = assign_location_and_preferences(user_df)

In [15]:
# Preview the first five rows to sanity-check the generated columns.
user_df.head()

Unnamed: 0,user_id,age_group,gender,income_bracket,customer_type,state,age_cat,preferred_tiers
0,user_1,55-64,Male,>100K,Frequent,SA,55+,"[high, low]"
1,user_2,35-44,Male,50-100K,Occasional,VIC,35-54,"[medium, low]"
2,user_3,45-54,Male,>100K,Frequent,VIC,35-54,"[medium, low]"
3,user_4,65+,Male,<50K,Occasional,NSW,55+,"[high, low]"
4,user_5,25-34,Male,<50K,Loyal,QLD,18-34,"[high, medium]"


In [16]:
# Persist the generated users. Create the output folder first so the write
# does not fail on a fresh checkout where data/ does not exist yet.
# The DataFrame index is still written as the first (unnamed) CSV column.
output_path = Path('data/users.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
user_df.to_csv(output_path)