In [None]:
# Data Ingestion & Cleaning
import pandas as pd
import numpy as np
import uuid
from datetime import datetime
from datetime import timedelta
import random

In [None]:
def simulate_transactions(products_df, user_df, product_tiers, start_date="2022-08-01", end_date="2025-08-01",
                          transaction_params=None, seed=None):
    """Simulate item-level purchase history for every user in ``user_df``.

    For each user a transaction count is drawn from a normal distribution whose
    mean/std depend on the user's ``customer_type``. Every transaction gets a
    random date in ``[start_date, end_date]`` and a basket of distinct products
    drawn from the user's preferred tiers. One output row is produced per
    purchased item.

    Pricing: 'Loyal' customers pay ``best_price`` reduced by a random 0-10%;
    everyone else pays ``item_price`` increased by a random 0-5%.

    Parameters
    ----------
    products_df : pandas.DataFrame
        Must provide ``product_code``, ``category``, ``item_name``,
        ``discount_percentage``, ``best_price`` and ``item_price``. When a
        product code occurs more than once, its first row is used.
    user_df : pandas.DataFrame
        Must provide ``user_id``, ``customer_type``, ``preferred_tiers``,
        ``age_group``, ``gender``, ``income_bracket`` and ``state``.
        ``preferred_tiers`` may be a list of tier labels or the string form of
        such a list (e.g. ``"['high', 'low']"``).
    product_tiers : tuple
        ``(high, medium, low)`` sequences of product codes. A tier label other
        than ``'high'`` or ``'medium'`` is treated as ``'low'``.
    start_date, end_date : str or datetime-like
        Inclusive range (whole days) from which transaction dates are drawn.
    transaction_params : dict, optional
        ``{customer_type: {'mean': ..., 'std': ...}}`` controlling how many
        transactions each user receives. Defaults to the built-in values for
        'Loyal', 'Frequent' and 'Occasional'.
    seed : int, optional
        When given, seeds both ``random`` and ``numpy.random`` (global state)
        and derives transaction ids from the seeded generator, so the whole
        result is reproducible. When omitted, ids come from ``uuid.uuid4``.

    Returns
    -------
    pandas.DataFrame
        One row per purchased item with the columns ``transaction_id``,
        ``user_id``, ``product_code``, ``category``, ``item_name``,
        ``discount_percentage``, ``transaction_date``, ``transaction_price``,
        ``age_group``, ``gender``, ``income_bracket``, ``customer_type``,
        ``state``.

    Raises
    ------
    KeyError
        If a ``customer_type`` is missing from ``transaction_params`` or a
        tier contains a product code absent from ``products_df``.
    ValueError
        If none of a user's preferred tiers contains any product.
    """
    import ast  # local import: keeps this cell independent of the notebook's import cell

    def _parse_tiers(value):
        """Return ``preferred_tiers`` as a list, decoding the string form a CSV round-trip produces."""
        if isinstance(value, str):
            try:
                value = ast.literal_eval(value)
            except (ValueError, SyntaxError, TypeError):
                # Not a Python literal; presumably a plain comma-separated
                # list such as "high,medium" -- TODO confirm against users.csv.
                value = [part.strip() for part in value.split(',') if part.strip()]
            if isinstance(value, str):
                value = [value]
        return list(value)

    def _pool_name(tier):
        """Map a tier label to its pool; anything but 'high'/'medium' falls back to 'low'."""
        if tier == 'high':
            return 'high'
        if tier == 'medium':
            return 'medium'
        return 'low'

    def _new_transaction_id():
        """Return a 10-hex-character id, reproducible when a seed was supplied."""
        if seed is None:
            return uuid.uuid4().hex[:10]
        return f"{random.getrandbits(40):010x}"

    if seed is not None:
        random.seed(seed)
        np.random.seed(seed)

    start_date = pd.to_datetime(start_date)
    end_date = pd.to_datetime(end_date)
    span_days = (end_date - start_date).days  # loop-invariant, computed once

    high, med, low = product_tiers
    pools = {'high': high, 'medium': med, 'low': low}
    distinct_counts = {}  # frozenset of pool names -> number of distinct products available

    # Mean and std of the transaction count per customer type (can adjust for realism)
    if transaction_params is None:
        transaction_params = {
            'Loyal': {'mean': 10000, 'std': 1500},
            'Frequent': {'mean': 6500, 'std': 1000},
            'Occasional': {'mean': 3000, 'std': 800}
        }

    # Build the product lookup once instead of scanning products_df for every
    # purchased item; drop_duplicates keeps the first row per code.
    product_lookup = (
        products_df.drop_duplicates(subset='product_code')
        .set_index('product_code')
        .to_dict('index')
    )

    rows = []
    for _, user in user_df.iterrows():
        customer_type = user['customer_type']
        is_loyal = customer_type == 'Loyal'

        # Ignore tiers whose pool is empty: drawing from one would raise
        # inside random.choice part-way through the simulation.
        tiers = _parse_tiers(user['preferred_tiers'])
        usable_tiers = [tier for tier in tiers if len(pools[_pool_name(tier)]) > 0]
        if not usable_tiers:
            raise ValueError(
                f"user {user['user_id']!r} has no preferred tier containing products"
            )

        # A basket holds distinct products, so it can never be larger than the
        # number of distinct products reachable through the user's tiers.
        pool_key = frozenset(_pool_name(tier) for tier in usable_tiers)
        if pool_key not in distinct_counts:
            distinct_counts[pool_key] = len(set().union(*(pools[name] for name in pool_key)))
        max_basket = distinct_counts[pool_key]

        # Variable transaction count per user, never less than one
        params = transaction_params[customer_type]
        transaction_count = max(1, int(np.random.normal(params['mean'], params['std'])))

        for _ in range(transaction_count):
            transaction_id = _new_transaction_id()
            date = start_date + timedelta(days=random.randint(0, span_days))
            num_items = random.randint(4, 7) if is_loyal else random.randint(3, 6)
            # Without this cap the rejection loop below would spin forever
            # whenever fewer distinct products exist than were requested.
            num_items = min(num_items, max_basket)

            basket = set()
            while len(basket) < num_items:
                tier_choice = np.random.choice(usable_tiers)
                product = random.choice(pools[_pool_name(tier_choice)])
                if product in basket:
                    continue  # already bought in this transaction; redraw
                basket.add(product)

                prod_info = product_lookup[product]
                if is_loyal:
                    price = prod_info['best_price'] * (1 - random.uniform(0, 0.1))
                else:
                    price = prod_info['item_price'] * (1 + random.uniform(0, 0.05))

                rows.append([
                    transaction_id, user['user_id'], product,
                    prod_info['category'], prod_info['item_name'], prod_info['discount_percentage'],
                    date, price,
                    user['age_group'], user['gender'], user['income_bracket'], customer_type, user['state'],
                ])

    return pd.DataFrame(rows, columns=['transaction_id','user_id','product_code','category', 'item_name', 'discount_percentage', 'transaction_date','transaction_price','age_group','gender','income_bracket','customer_type','state'])


In [9]:
# Load the two simulation inputs from the local data directory:
# the product catalogue and the user table.
products_df = pd.read_csv(filepath_or_buffer='data/products.csv')
user_df = pd.read_csv(filepath_or_buffer='data/users.csv')

In [10]:
def gen_product_tiers(products_df):
    """Randomly split the catalogue's product codes into three tiers.

    The unique values of ``products_df['product_code']`` are shuffled with
    ``np.random.shuffle`` and cut into consecutive slices: the first 20% form
    the high tier, the next 30% the medium tier and the remaining 50% the low
    tier (slice boundaries are truncated to whole items).

    The shuffle uses NumPy's global random state; seed it beforehand
    (``np.random.seed``) if the assignment must be reproducible.

    Parameters
    ----------
    products_df : pandas.DataFrame
        Must contain a ``product_code`` column.

    Returns
    -------
    tuple
        ``(high, med, low)`` arrays of product codes, i.e. the 3-tuple that
        ``simulate_transactions`` unpacks as ``product_tiers``.
    """
    codes = products_df['product_code'].unique()
    np.random.shuffle(codes)

    high_end = int(0.2 * len(codes))
    med_end = int(0.5 * len(codes))
    high, med, low = codes[:high_end], codes[high_end:med_end], codes[med_end:]

    # Return only the tiers. The previous version returned
    # ``user_df, (high, med, low)``, which read an undeclared global and broke
    # the three-way unpacking done by simulate_transactions.
    return high, med, low

In [11]:
# Assign the catalogue's product codes to tiers. The assignment is shuffled
# with NumPy's global random state, so re-running this cell reshuffles it.
product_tiers = gen_product_tiers(products_df=products_df)

In [None]:
# Generate the synthetic transaction log for every user over the three-year window.
trans_df = simulate_transactions(
    products_df,
    user_df,
    product_tiers,
    start_date="2022-08-01",
    end_date="2025-08-01",
)

In [32]:
# Persist the simulated transactions. With to_csv's default index=True the row
# index is written as an unnamed first column, so readers of this file must
# account for it (e.g. index_col=0).
trans_df.to_csv(path_or_buf='data/trans.csv')

In [31]:
# Display the column labels of the simulated transactions frame.
trans_df.columns

Index(['transaction_id', 'user_id', 'product_code', 'category', 'item_name',
       'discount_percentage', 'transaction_date', 'transaction_price',
       'age_group', 'gender', 'income_bracket', 'customer_type', 'state'],
      dtype='object')

In [33]:
# Preview the first five simulated rows (one row per purchased item).
trans_df.head()

Unnamed: 0,transaction_id,user_id,product_code,category,item_name,discount_percentage,transaction_date,transaction_price,age_group,gender,income_bracket,customer_type,state
0,452a413df6,user_1,5355182,MENS DEOS & GROOMING,Deo Roll On Men Intense Protection Fresh,0.5,2023-08-21,5.168301,25-34,Male,50-100K,Occasional,NSW
1,452a413df6,user_1,9050664,SNACKS,Original Multipack Potato Chips,0.416667,2023-08-21,6.082251,25-34,Male,50-100K,Occasional,NSW
2,452a413df6,user_1,5055940,INFANT FOOD,Puffcorn BBQ,0.157895,2023-08-21,4.819962,25-34,Male,50-100K,Occasional,NSW
3,dd22f95cce,user_1,3994635,BAKING MIXES,Deluxe Chocolate Layer Cake Mix,0.309091,2023-07-22,5.577268,25-34,Male,50-100K,Occasional,NSW
4,dd22f95cce,user_1,4842440,DENTAL HEALTH,Advanced Whitening Charcoal Toothpaste,0.5,2023-07-22,10.193634,25-34,Male,50-100K,Occasional,NSW
