In [1]:
import pandas as pd
import random
from datetime import datetime, timedelta

# Lookup tables used by the synthetic-data generator.

# Each product category maps to the brands that can appear under it.
category_brands = dict(
    Beverages=["Coca-Cola", "Pepsi", "Schweppes"],
    Bakery=["Tip Top", "Wonder White", "Helga's"],
    Snacks=["Smith's", "Doritos", "Pringles"],
    Dairy=["Paul's", "Devondale", "Bulla"],
    Frozen=["Birds Eye", "McCain", "Ingham"],
    Produce=["Fresh Farms", "Aussie Greens"],
    Meat=["Coles Brand", "Woolworths Brand", "MeatMaster"],
    Pantry=["Heinz", "Uncle Tobys", "Leggo's"],
)

# Ways a customer can pay at the register.
payment_types = [
    "Credit Card",
    "Cash",
    "EFTPOS",
    "Gift Card",
]

# Sub-category labels. There is one per category, listed in the same order as
# the keys of `category_brands` (Beverages -> Soft Drinks ... Pantry -> Sauces).
subcategories = [
    "Soft Drinks",
    "Bread",
    "Chips",
    "Milk",
    "Ice Cream",
    "Veggies",
    "Chicken",
    "Sauces",
]

# realistic transaction IDs
def generate_transaction_id(store_code, store_number, register, date, receipt_no):
    """Build a receipt-style transaction identifier.

    The result is five hyphen-separated fields: store code, store number,
    register, the date formatted as YYYYMMDD, and the receipt number
    left-padded with zeros to at least four characters.

    Args:
        store_code: Short code of the store chain (e.g. "COL").
        store_number: Identifier of the individual store.
        register: Identifier of the register.
        date: Object with a ``strftime`` method (e.g. ``datetime``).
        receipt_no: Receipt counter; converted to text and zero-padded.

    Returns:
        The assembled identifier as a string.
    """
    date_stamp = date.strftime("%Y%m%d")
    padded_receipt = str(receipt_no).zfill(4)
    return "{}-{}-{}-{}-{}".format(
        store_code, store_number, register, date_stamp, padded_receipt
    )

def generate_data(store_name="Coles", store_code="COL", store_number="601", register="117",
                  num_rows=100, seed=None, output_path=None):
    """Generate synthetic single-item retail transactions and save them as CSV.

    Every row is one transaction dated between 1 and 31 March 2025, with a
    random category/brand pair, a unit price in [2.00, 20.00], a discount of
    up to 30% of that price, and a quantity from 1 to 5.

    Args:
        store_name: Value written to the "Store name" column; also used
            (lower-cased) in the default file name and the status message.
        store_code: Store-chain code embedded in the transaction ID.
        store_number: Store number embedded in the transaction ID.
        register: Register number embedded in the transaction ID.
        num_rows: Number of rows to generate; receipt numbers start at 5000.
        seed: Optional seed. When given, a private ``random.Random(seed)`` is
            used so the same seed yields the same data. When ``None`` the
            module-level ``random`` generator is used, as before.
        output_path: Destination CSV. Defaults to
            ``"<store_name lower-cased>_realistic_data.csv"``, which is
            ``"coles_realistic_data.csv"`` for the default store.

    Returns:
        pandas.DataFrame: the generated rows (the same data written to disk).
    """
    # A private generator keeps seeded runs reproducible without touching the
    # global random state; without a seed, fall back to the shared module.
    rng = random.Random(seed) if seed is not None else random

    if output_path is None:
        output_path = f"{store_name.lower()}_realistic_data.csv"

    columns = [
        "Unit price", "Best price", "Date", "Transaction ID", "Quantity",
        "Store name", "Total", "Sub-category", "Discount", "Item ID",
        "Category", "Brand", "Customer ID", "Payment type"
    ]

    categories = list(category_brands)  # built once, not on every iteration

    # `subcategories` is listed in the same order as the keys of
    # `category_brands`, so pair them by position. This keeps each row
    # internally consistent (e.g. "Beverages" never gets "Chicken").
    # If the two ever stop lining up, fall back to a random pick.
    if len(subcategories) == len(categories):
        subcategory_by_category = dict(zip(categories, subcategories))
    else:
        subcategory_by_category = {}

    start_date = datetime(2025, 3, 1)
    records = []

    for i in range(num_rows):
        txn_date = start_date + timedelta(days=rng.randint(0, 30))
        receipt_no = 5000 + i
        txn_id = generate_transaction_id(store_code, store_number, register, txn_date, receipt_no)

        category = rng.choice(categories)
        brand = rng.choice(category_brands[category])

        unit_price = round(rng.uniform(2.0, 20.0), 2)
        discount = round(rng.uniform(0.0, unit_price * 0.3), 2)
        best_price = round(unit_price - discount, 2)

        quantity = rng.randint(1, 5)
        total = round(best_price * quantity, 2)

        item_id = f"ITEM{rng.randint(1000, 9999)}"
        customer_id = f"CUST{rng.randint(1000, 9999)}"
        # Weights are applied positionally to `payment_types`
        # (this assumes the list has exactly four entries).
        payment_type = rng.choices(payment_types, weights=[0.5, 0.2, 0.25, 0.05])[0]

        sub_category = subcategory_by_category.get(category)
        if sub_category is None:
            sub_category = rng.choice(subcategories)

        # Keyed records cannot drift out of alignment with the column list.
        records.append({
            "Unit price": unit_price,
            "Best price": best_price,
            "Date": txn_date.strftime("%Y-%m-%d"),
            "Transaction ID": txn_id,
            "Quantity": quantity,
            "Store name": store_name,
            "Total": total,
            "Sub-category": sub_category,
            "Discount": discount,
            "Item ID": item_id,
            "Category": category,
            "Brand": brand,
            "Customer ID": customer_id,
            "Payment type": payment_type,
        })

    # Passing `columns` fixes the column order and keeps the header even
    # when no rows were generated.
    df = pd.DataFrame(records, columns=columns)
    df.to_csv(output_path, index=False)
    print(f"realistic {store_name.lower()} dataframe created.")
    return df

# Build the default 100-row Coles dataset and save it as coles_realistic_data.csv.
generate_data()


realistic coles dataframe created.


In [2]:
# Add synthetic "Location" and "Gender" columns to the sample dataset in place.
#
# The file is addressed relative to the notebook's working directory, so the
# cell is not tied to one user's home folder and always rewrites the same file
# it read. NOTE(review): this assumes the notebook runs from the folder that
# holds the CSV — confirm.
sample_csv_path = "coles_realistic_Sample_data.csv"

# Loading the existing dataset
df = pd.read_csv(sample_csv_path)

# Sample location list
locations = ["Melbourne", "Sydney", "Brisbane", "Adelaide", "Perth", "Hobart", "Canberra", "Darwin"]

# Sample gender list
genders = ["Male", "Female"]

# Adding new columns: one independent, uniformly random draw per row.
# Re-running this cell re-draws both columns.
df["Location"] = random.choices(locations, k=len(df))
df["Gender"] = random.choices(genders, k=len(df))

df.to_csv(sample_csv_path, index=False)

print("csv file updated successfully.")

csv file updated successfully.
