# Importing packages

In [45]:
import pandas as pd
from faker import Faker
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import random

In [3]:
# Read the source grocery dataset from a CSV file located relative to the
# current working directory.
original_data = pd.read_csv('Australia_Grocery_2022Sep.csv')

In [26]:
# Display the column labels of the loaded dataset.
original_data.columns

Index(['index', 'Postal_code', 'Category', 'Sub_category', 'Product_Group',
       'Product_Name', 'Package_price', 'Price_per_unit', 'package_size',
       'is_estimated', 'is_special', 'in_stock', 'Retail_price', 'Product_Url',
       'Brand', 'Sku', 'RunDate', 'unit_price', 'unit_price_unit', 'state',
       'city', 'tid'],
      dtype='object')

In [62]:
# Faker generator configured with the Australian (en_AU) locale.
fake = Faker('en_AU')

# Product lookup table: the product-describing columns of the raw data,
# keeping only rows without missing values and dropping exact duplicate rows.
sku_product_data = (
    original_data
    .loc[:, ['Sku', 'Product_Name', 'Category', 'Sub_category',
             'Product_Group', 'Brand', 'unit_price']]
    .dropna()
    .drop_duplicates()
)

In [63]:
def generate_ages(mean_age=55, std_dev=15, min_age=10, max_age=100, num_entries=1000):
    """Draw ``num_entries`` ages from a normal distribution, clamped to a range.

    Each age is sampled with ``np.random.normal(loc=mean_age, scale=std_dev)``,
    converted with ``int()`` (which truncates toward zero), and then limited to
    the inclusive interval ``[min_age, max_age]``.

    Returns:
        list: ``num_entries`` clamped ages, in the order they were drawn.
    """
    ages = []
    for _ in range(num_entries):
        sampled = int(np.random.normal(loc=mean_age, scale=std_dev))
        # Clamp: lift to the floor first, then cap at the ceiling.
        if sampled < min_age:
            sampled = min_age
        if sampled > max_age:
            sampled = max_age
        ages.append(sampled)
    return ages

In [None]:
# Population count per city; these counts are later normalised into the
# weights used when assigning a city to each synthetic transaction.
city_population = {
    'Sydney': 5_312_000,
    'Melbourne': 5_078_000,
    'Brisbane': 2_515_000,
    'Perth': 2_117_000,
    'Adelaide': 1_372_000,
    'Gold Coast': 683_000,
    'Canberra': 453_000,
    'Hobart': 232_000,
    'Darwin': 148_000,
}

def get_population_probabilities(population_data):
    """Turn a mapping of counts into a mapping of relative shares.

    Args:
        population_data: mapping of key -> numeric count.

    Returns:
        dict: the same keys, in the same order, each mapped to its count
        divided by the sum of all counts.
    """
    total_population = sum(population_data.values())
    shares = {}
    for name, count in population_data.items():
        shares[name] = count / total_population
    return shares

# Normalise the city population counts into selection probabilities.
city_probabilities = get_population_probabilities(population_data=city_population)

In [73]:
def choose_from_probabilities(probabilities):
    """Pick one key of ``probabilities`` at random, weighted by its value.

    Args:
        probabilities: mapping of candidate -> probability; the values are
            passed to ``np.random.choice`` as ``p``.

    Returns:
        The selected candidate, as returned by ``np.random.choice``.
    """
    pairs = probabilities.items()
    # Split the (candidate, weight) pairs into two parallel tuples.
    candidates, weights = zip(*pairs)
    return np.random.choice(candidates, p=weights)

def generate_synthetic_data(num_entries):
    """Build a DataFrame of ``num_entries`` synthetic grocery transactions.

    Every row pairs a product drawn at random from the module-level
    ``sku_product_data`` frame with fake customer and purchase details
    produced by the module-level ``fake`` generator, the ``random`` module and
    ``choose_from_probabilities(city_probabilities)``.

    Parameters
    ----------
    num_entries : int
        Number of transactions to generate. Zero or a negative value yields
        an empty DataFrame.

    Returns
    -------
    pandas.DataFrame
        One row per transaction with the columns ``Transaction_ID``, ``Date``,
        ``SKU``, ``Category``, ``Sub_category``, ``Product_Name``,
        ``unit_price``, ``total_items``, ``is_special``, ``total_cost``,
        ``Customer_Name``, ``Gender``, ``Age``, ``City``, ``payment_method``
        and ``access_url``.

    Notes
    -----
    No random seed is set here, so results differ between runs unless the
    caller seeds ``random``, ``numpy.random`` and the Faker instance first.
    """
    if num_entries <= 0:
        return pd.DataFrame([])

    age_list = generate_ages(num_entries=num_entries)

    # Draw all products in one call instead of one sample(n=1) per row.
    # Sampling with replacement keeps every draw independent and uniform over
    # the rows of sku_product_data, exactly like repeated single-row samples.
    selected_products = (
        sku_product_data
        .sample(n=num_entries, replace=True)
        .to_dict('records')
    )

    # Loop-invariant inputs, computed once.
    start_date = datetime(2018, 1, 1)
    end_date = datetime.now()
    payment_methods = ["Debit Card", "Credit Card", "Coupons"]

    data = []
    for selected_product in selected_products:
        # Ten-digit identifier; fake.unique prevents repeats for this Faker instance.
        transaction_id = fake.unique.random_int(min=1000000000, max=9999999999)
        date = fake.date_between(start_date=start_date, end_date=end_date)
        age = random.choice(age_list)

        # Determine gender: 'female' with probability 0.7, otherwise 'male'.
        gender = 'female' if random.random() < 0.7 else 'male'
        # Determine whether the purchase is on special (roughly 10% of rows).
        is_special = random.random() > 0.9

        # Generate names based on gender
        first_name = fake.first_name_female() if gender == 'female' else fake.first_name_male()
        last_name = fake.last_name()
        customer_name = f"{first_name} {last_name}"

        # Assign city based on population ratios
        city = choose_from_probabilities(city_probabilities)

        payment_method = fake.random_element(elements=payment_methods)
        # 'Mobile App' for roughly 70% of rows, 'Website' otherwise.
        access_url = 'Mobile App' if random.random() > 0.3 else 'Website'

        unit_price = selected_product['unit_price']
        total_items = fake.random_int(min=1, max=10)
        base_cost = unit_price * total_items
        # Items on special are charged at 80% of the regular cost.
        total_cost = base_cost * 0.8 if is_special else base_cost

        data.append({
            "Transaction_ID": transaction_id,
            "Date": date,
            "SKU": selected_product['Sku'],
            "Category": selected_product['Category'],
            "Sub_category": selected_product['Sub_category'],
            # Use the product's own name (not its Product_Group) so the
            # column content matches the column label.
            "Product_Name": selected_product['Product_Name'],
            "unit_price": unit_price,
            "total_items": total_items,
            "is_special": is_special,
            "total_cost": total_cost,
            "Customer_Name": customer_name,
            "Gender": gender,
            "Age": age,
            "City": city,
            "payment_method": payment_method,
            "access_url": access_url
        })

    return pd.DataFrame(data)



In [74]:
# Example usage: generate the dataset, write it to disk, then preview it.
num_entries = 10000
synthetic_data = generate_synthetic_data(num_entries)
synthetic_data.to_csv('synthetic_data.csv', index=False)
# head() must be the cell's last expression for the notebook to render it;
# placed before to_csv() its result was silently discarded.
synthetic_data.head()