In [17]:
import pandas as pd
import numpy as np
import re
import hashlib
import datetime

# Seed NumPy's global random generator so the np.random-based columns created
# later in this notebook come out the same on every top-to-bottom run.
np.random.seed(263)

In [19]:
# Load the cleaned Woolworths data; the synthetic columns are added to this frame.
woolworths = pd.read_csv("woolworths_cleaned.csv")

In [21]:
# Build a hashed, pseudo-unique transaction identifier
def generate_transaction_id(index):
    """Return a transaction ID of the form ``WOOL-`` + 10 uppercase hex characters.

    The hex part is the start of the SHA-256 digest of ``txn_<index>_<r>``,
    where ``r`` is a single draw from NumPy's global random generator.
    """
    salt = np.random.random()
    digest = hashlib.sha256(f"txn_{index}_{salt}".encode()).hexdigest()
    return "WOOL-" + digest[:10].upper()

# Transaction ID: one hashed ID per row, generated from the row position
woolworths["Transaction ID"] = list(map(generate_transaction_id, range(woolworths.shape[0])))

# Discount Amount = Original Price - Discount Price
woolworths["Discount Amount"] = woolworths["Original Price"].sub(woolworths["Discount Price"])

# Quantity: random integer per row from 1 to 10 (the upper bound 11 is exclusive)
woolworths["Quantity"] = np.random.randint(low=1, high=11, size=woolworths.shape[0])

# Store Name: randomly pick one of the two store labels for each row
woolworths["Store Name"] = np.random.choice(["Woolworths", "Metro"], size=woolworths.shape[0])

# Total = Discount Price x Quantity
woolworths["Total"] = woolworths["Discount Price"].mul(woolworths["Quantity"])

# Build a hashed, pseudo-unique item identifier
def generate_item_id(index):
    """Return 10 uppercase hex characters identifying an item.

    The value is the start of the SHA-256 digest of ``item_<index>_<r>``,
    where ``r`` is a single draw from NumPy's global random generator.
    """
    salt = np.random.random()
    digest = hashlib.sha256(f"item_{index}_{salt}".encode()).hexdigest()
    return digest[:10].upper()

# Generate and assign the Item ID column (one hashed ID per row)
woolworths["Item ID"] = [generate_item_id(i) for i in range(len(woolworths))]

# Create the "Brand" column from the first whitespace-separated word of "Item Name".
# The missing-value check is done on the raw cell rather than after astype(str):
# converting first can turn NaN into the literal string "nan", which defeats the
# null check and yields a bogus "nan" brand. Blank names are also guarded, because
# split() returns an empty list for them and indexing [0] would raise IndexError.
woolworths["Brand"] = woolworths["Item Name"].apply(
    lambda name: (str(name).split() or [None])[0] if pd.notnull(name) else None
)

# Build a hashed, pseudo-unique customer identifier
def generate_customer_id(index):
    """Return a customer ID of the form ``CUST-`` + 8 uppercase hex characters.

    The hex part is the start of the SHA-256 digest of ``txn_<index>_<r>``,
    where ``r`` is a single draw from NumPy's global random generator.
    Note that the hashed seed string uses the ``txn_`` prefix.
    """
    salt = np.random.random()
    digest = hashlib.sha256(f"txn_{index}_{salt}".encode()).hexdigest()
    return "CUST-" + digest[:8].upper()

# Generate the Customer ID column with generate_customer_id so the values read
# "CUST-XXXXXXXX". This line previously called generate_transaction_id, which
# filled the column with "WOOL-"-prefixed transaction-style IDs and left
# generate_customer_id unused. Both generators draw exactly one random number
# per call, so the random columns created after this line are unaffected.
woolworths["Customer ID"] = [generate_customer_id(i) for i in range(len(woolworths))]

# Payment Type: one of five payment methods, picked at random for each row
woolworths["Payment Type"] = np.random.choice(
    ["Credit card", "Gift card", "Store card", "EFTPOS", "Cash"],
    size=woolworths.shape[0],
)

# Location: one of eight cities, picked at random for each row
woolworths["Location"] = np.random.choice(
    ["Brisbane", "Sydney", "Melbourne", "Hobart", "Adelaide", "Perth", "Darwin", "Canberra"],
    size=woolworths.shape[0],
)

# Gender: one of three values, picked at random for each row
woolworths["Gender"] = np.random.choice(
    ["Male", "Female", "Other"],
    size=woolworths.shape[0],
)

# Purchase Date: a random date per row, sampled from the range of dates
# running from start_date to end_date
start_date = datetime.datetime(year=2024, month=8, day=4)
end_date = datetime.datetime(year=2025, month=3, day=4)

date_range = pd.date_range(start_date, end_date)
woolworths["Purchase Date"] = np.random.choice(date_range, size=woolworths.shape[0])

# Remove the columns that are not part of the exported dataset
woolworths = woolworths.drop(
    columns=["Date", "Best Unit Price", "Item Price", "Unit Price", "Discount Price"]
)

# Rename the Original Price column to Unit Price
woolworths = woolworths.rename(columns={"Original Price": "Unit Price"})

# Column layout of the exported file
new_column_order = [
    'Transaction ID', 'Purchase Date', 'Store Name', 'Location',
    'Customer ID', 'Gender', 'Item ID', 'Brand', 'Item Name', 'Category',
    'Product Code', 'Best Price', 'Unit Price', 'Discount Amount',
    'Quantity', 'Total', 'Payment Type',
]
woolworths = woolworths[new_column_order]

# Write the synthetic dataset to disk without the index column
woolworths.to_csv("synthetic_woolworths_cleaned.csv", index=False)