Current

In [7]:
import pandas as pd
import random
import re
from faker import Faker
import datetime

faker = Faker()

#load cleaned Woolworths product data
product_data = pd.read_csv("Woolworths_cleaned_dated_sem1.csv")

#include necessary columns, including 'date'
product_catalog = product_data[[
    "product_code", "category", "item_name",
    "best_price", "best_unit_price", "item_price",
    "unit_price", "date"
]].dropna()

#parameters
num_transactions = 10000
num_customers = 10000
payment_methods = ["Credit card", "Cash", "Gift card", "EFTPOS"]
genders = ["Male", "Female"]
locations = ["Sydney", "Melbourne", "Brisbane", "Perth", "Adelaide", "Canberra", "Hobart"]

#keywords for filtering items
wine_keywords = r'\b(wine|cabernet|merlot|shiraz|chardonnay|pinot|riesling)\b'
christmas_keywords = r'\b(chocolate|ham|turkey|pudding|snack|gift|decoration|cake|treat|drink)\b'
new_year_keywords = r'\b(champagne|prosecco|sparkling|bubbly|vodka|rum|whiskey|bourbon|tequila|gin|brut|cocktail|lager|beer|cider)\b'
easter_keywords = r'\b(easter|egg|eggs|bunny|bunnies|hot cross bun|hot cross buns|marshmallow|chocolate|easter basket|easter pack)\b'


synthetic_transactions = []

for _ in range(num_transactions):
    product = product_catalog.sample(1).iloc[0]
    quantity = random.randint(1, 10)

    #format the transaction date
    random_day = faker.date_between(
    start_date=datetime.date(2024, 1, 1),
    end_date=datetime.date(2024, 12, 31)
    )
    date_obj = pd.to_datetime(random_day)
    formatted_date = date_obj.strftime("%-m/%-d/%Y")
    month = date_obj.month

    #extract original unit prices
    unit_price = product["unit_price"]
    best_unit_price = product["best_unit_price"]
    item_name_lower = product["item_name"].lower()
    
    #limit discount for wine
    if re.search(wine_keywords, item_name_lower):
        best_unit_price = unit_price * random.uniform(0.95, 1.00)

    #boost discount for christmas
    if month == 12 and re.search(christmas_keywords, item_name_lower):
        best_unit_price = unit_price * random.uniform(0.60, 0.85)
        quantity = random.randint(2, 12)  #increase in quantity of these items
        
    #check for Easter season (approx March/April) and matching keywords
    if month in [3, 4] and re.search(easter_keywords, item_name_lower):
        best_unit_price = unit_price * random.uniform(0.65, 0.85)
        quantity = random.randint(3, 6)

    #check for New Year's season (late Dec - early Jan) and alcohol
    if month in [12, 1] and re.search(new_year_keywords, item_name_lower):
        best_unit_price = unit_price * random.uniform(0.80, 0.95)
        quantity = random.randint(6, 15)
        
    #random everyday discount (20% chance if no discount applied yet)
    if best_unit_price == unit_price:
        if random.random() < 0.2:
            best_unit_price = unit_price * random.uniform(0.90, 0.98)
            discount_reason = "random"

    #format prices as strings for display
    unit_price_str = f"{unit_price:.2f}"
    best_unit_price_str = f"{best_unit_price:.2f}"

    #calculate totals and discounts
    discount = round((unit_price - best_unit_price) * quantity, 2)
    total_price = round(product["best_price"] * quantity, 2)

    synthetic_transactions.append({
        "unit price": round(unit_price, 2),
        "best price": round(best_unit_price, 2),
        "date": formatted_date,
        "transaction ID": f"WOOL-{random.randint(1000000, 9999999)}",
        "quantity": quantity,
        "store name": "Woolworths",
        "total": f"{total_price:.2f}",
        "item name": product["item_name"],
        "discount": f"{discount:.2f}",
        "product code": int(product["product_code"]),
        "category": product["category"],
        "customer ID": f"WOOL-{faker.sha1(raw_output=False)[:10].upper()}",
        "payment types": random.choice(payment_methods),
        "gender": random.choice(genders),
        "location": random.choice(locations)
    })

#create DataFrame
synthetic_df = pd.DataFrame(synthetic_transactions)

#save to CSV
#synthetic_df.to_csv("/users/liamedmunds/Desktop/synthetic_woolworths_10k.csv", index=False)

#show preview
synthetic_df.head()


Unnamed: 0,unit price,best price,date,transaction ID,quantity,store name,total,item name,discount,product code,category,customer ID,payment types,gender,location
0,1.24,1.13,4/16/2024,WOOL-7679865,4,Woolworths,6.2,Mount Franklin Lightly Sparkling Water Raspber...,0.46,178132,Drinks,WOOL-FF67279C72,Credit card,Male,Canberra
1,0.1,0.1,8/9/2024,WOOL-4160002,7,Woolworths,17.85,Hercules Click Zip Large Resealable Twinzip Sa...,0.04,252072,Cleaning & Maintenance,WOOL-69EAF99D6A,Gift card,Female,Brisbane
2,9.33,9.33,2/23/2024,WOOL-1589747,3,Woolworths,16.8,Birds Eye Deli Sweet Potato Chips 600g,0.0,822847,Freezer,WOOL-2240004D78,Credit card,Male,Perth
3,21.25,21.25,3/2/2024,WOOL-5024436,8,Woolworths,68.0,Woolworths Cashews Roasted 400g,0.0,804454,Snacks & Confectionery,WOOL-BBB8DEC4E4,EFTPOS,Male,Hobart
4,0.5,0.5,8/1/2024,WOOL-6010408,4,Woolworths,11.2,Heinz Beanz Baked Beans In Bbq Sauce Barbecue ...,0.0,657431,Pantry,WOOL-4CA97EE374,Cash,Male,Brisbane
