## GENERATING SYNTHETIC SALES DATA

The purpose of this notebook is to generate synthetic data for a fictional retail company in the clothing sector. It contains a simple simulation of this company's sales in the international market over a period of about 2 years.

In [1]:
!pip install faker --break-system-packages

Defaulting to user installation because normal site-packages is not writeable


In [2]:
import pandas as pd
import numpy as np
import random
import datetime
from datetime import datetime, timedelta, date
from faker import Faker
import sys
sys.path.append('../../libraries')
import utils

### Definition of records
This section defines the structure of the records that are written to a CSV file and/or sent to a data stream.

- `sales`
  - sku
  - quantity
  - site_code
  - date


#### Simulation to generate sales
The last step simulates the sales process.

In [3]:
def get_current_season(current_date):
    """Maps a date to a season name using fixed day-of-year boundaries.

    Days 80-172 are "Spring", 173-266 are "Summer" and 267-355 are "Fall";
    every other day of the year is "Winter". The same day numbers are used
    for every year.
    """
    day_of_year = current_date.timetuple().tm_yday
    season_ranges = (
        (80, 172, "Spring"),
        (173, 266, "Summer"),
        (267, 355, "Fall"),
    )
    for first_day, last_day, season_name in season_ranges:
        if first_day <= day_of_year <= last_day:
            return season_name
    # Anything outside the three ranges above falls in winter.
    return "Winter"

In [4]:
def get_sales_consumption_weight(sales_consumption_df, country_code, season):
    """Looks up the sales weight of a country for the given season.

    Rows are matched by comparing the upper-cased 'country' column with
    ``country_code``; the value in the column named after ``season`` of the
    first matching row is returned. When no row matches, the neutral weight
    1.0 is returned.
    """
    is_country = sales_consumption_df['country'].str.upper() == country_code
    matching_rows = sales_consumption_df[is_country]
    if matching_rows.empty:
        return 1.0
    return matching_rows[season].values[0]

In [5]:
def generate_sales_quantity(sales_consumption_weight):
    """Draws the number of sales for one day.

    The value is sampled from a normal distribution with mean
    ``50 * sales_consumption_weight`` and standard deviation 10, truncated
    to an integer and never allowed to go below zero.
    """
    expected_sales = 50 * sales_consumption_weight
    drawn_sales = int(random.gauss(expected_sales, 10))
    return drawn_sales if drawn_sales > 0 else 0

In [6]:
def select_product_from_category(products_df, category, available_products):
    """Randomly selects a product of a category among the available products.

    Parameters:
    products_df (pandas.DataFrame): Product catalogue with 'category' and 'productCode' columns
    category (str): Category text; matched as a literal, case-insensitive substring of 'category'
    available_products (list): Product codes that may be selected

    Returns:
    The index label (in ``products_df``) of the chosen product, or None when
    no available product belongs to the category.
    """
    # regex=False: the category is plain text, so characters such as '(' or '+'
    # must be matched literally instead of being interpreted as a pattern.
    # na=False: products without a category never match (and keep the mask boolean).
    in_category = products_df['category'].str.contains(category, case=False, regex=False, na=False)
    is_available = products_df['productCode'].isin(available_products)
    category_products = products_df[in_category & is_available]
    if category_products.empty:
        return None
    # Choose from a plain list so random.choice never has to evaluate the
    # truthiness of a pandas Index.
    return random.choice(category_products.index.tolist())

In [7]:
def generate_date_format(date):
    """Formats a date as a string using one of four formats chosen at random.

    Parameters:
    date (date or datetime): Value to format; only its ``strftime`` method is used

    Returns:
    (str) The date rendered with a randomly selected format, so that the
    generated data contains inconsistent date representations.
    """
    date_formats = [
        "%Y-%m-%d",      # 2023-12-31
        "%d/%m/%Y",      # 31/12/2023
        "%m-%d-%Y",      # 12-31-2023
        "%B %d, %Y",     # December 31, 2023
    ]
    return date.strftime(random.choice(date_formats))

In [8]:
def record_sale(sales_data_list, current_date, store, sku, quantity):
    """
    Appends one sale record to ``sales_data_list``, randomly injecting noise.

    The record is the list ``[sku, quantity, store, date]``, where the date is
    rendered as text by ``generate_date_format``. Noise is applied in this order:
    - with probability 0.25 the store code gets random letter casing;
    - with probability 0.1 the quantity is negated;
    - with probability 0.1 one randomly chosen field is replaced by None.

    Parameters:
    sales_data_list (list): List that receives the new record (modified in place)
    current_date (date or datetime): Date on which the sale occurred
    store (str): site_code of the store based on its unique identifier
    sku (str): unique code of the product
    quantity (int): quantity of the selected product
    """
    sale_record = [sku, quantity, store, generate_date_format(current_date)]

    # Noise 1: inconsistent casing of the store code.
    if random.random() < 0.25:
        sale_record[2] = randomize_case(sale_record[2])

    # Noise 2: negative quantity.
    if random.random() < 0.1:
        sale_record[1] *= -1

    # Noise 3: one missing value in any of the four fields.
    if random.random() < 0.1:
        sale_record[random.randint(0, 3)] = None

    sales_data_list.append(sale_record)

def record_replenishment(soh_data_list, current_date, store, sku, replenishment_quantity):
    """
    Appends one soh row to ``soh_data_list``, randomly injecting noise.

    The row is the list ``[store, sku, replenishment_quantity, date]``, where
    the date is rendered as text by ``generate_date_format``. Noise is applied
    in this order:
    - with probability 0.25 the store code gets random letter casing;
    - with probability 0.1 the quantity is negated;
    - with probability 0.1 one randomly chosen field is replaced by None.

    Parameters:
    soh_data_list (list): List that receives the new row (modified in place)
    current_date (date or datetime): Date of the row
    store (str): site_code of the store based on its identifier
    sku (str): unique code of the product
    replenishment_quantity (int): quantity to record for the selected product
    """
    soh_record = [store, sku, replenishment_quantity, generate_date_format(current_date)]

    # Noise 1: inconsistent casing of the store code.
    if random.random() < 0.25:
        soh_record[0] = randomize_case(soh_record[0])

    # Noise 2: negative quantity.
    if random.random() < 0.1:
        soh_record[2] *= -1

    # Noise 3: one missing value in any of the four fields.
    if random.random() < 0.1:
        soh_record[random.randint(0, 3)] = None

    soh_data_list.append(soh_record)

def randomize_case(store_code):
    """
    Gives every character of the store code a random letter case.

    Parameters:
    store_code (str): The store code to modify.

    Returns:
    (str) A new string in which each character of the input was independently
    replaced by either its upper-case or its lower-case form.
    """
    mixed_chars = []
    for letter in store_code:
        case_options = [letter.upper(), letter.lower()]
        mixed_chars.append(random.choice(case_options))
    return ''.join(mixed_chars)


In [9]:
def get_6month_period(date):
    """Returns ``(year, period)`` for a date: period 1 is January-June, period 2 is July-December."""
    half_of_year = 1 if date.month <= 6 else 2
    return date.year, half_of_year

In [10]:
def simulate_sales(products_df, weights_df, inventory,
                          sales_consumption_df, site_codes, current_date):
    """Generate random sales of a single day for every store.

    For each store a number of sales is drawn; each sale picks a category
    (weighted by the country's consumption), then a random available product
    of that category, sells between 1 and 15 units limited by the stock, and
    may trigger a replenishment of the stock. After the sales of a store, one
    soh row per available product is recorded with its resulting quantity.

    Parameters:
    products_df (pandas.DataFrame): Product catalogue with 'category' and 'productCode' columns
    weights_df (pandas.DataFrame): Rows with 'country', 'category' and 'consumption' columns
    inventory (pandas.DataFrame): Stock indexed by (site_code, sku) with a 'quantity' column;
        it is updated in place
    sales_consumption_df (pandas.DataFrame): Seasonal sales weights by country
    site_codes (iterable of str): Store codes to simulate
    current_date (date or datetime): Day being simulated

    Returns:
    (tuple) ``(sales_data_list, soh_data_list)``; the sales records are shuffled.
    A 2-tuple is always returned, so callers can safely unpack it.
    """

    sales_data_list = []
    soh_data_list = []

    # The season depends only on the date, so it is computed once for all stores.
    season = get_current_season(current_date)

    for store in site_codes:
        # Country prefix of the store code: its first three characters, or only
        # the first two when the third one is '0'.
        country_code = store[:3].upper()
        if country_code[-1] == '0':
            country_code = country_code[:2]
        country = weights_df[weights_df['country'].str.upper().str.startswith(country_code)].iloc[0].country

        sales_consumption_weight = get_sales_consumption_weight(sales_consumption_df, country_code, season)
        num_sales = generate_sales_quantity(sales_consumption_weight)

        country_weights = weights_df[weights_df['country'] == country]
        categories = country_weights['category'].tolist()
        weights = country_weights['consumption'].tolist()

        available_products = inventory.loc[store].index.tolist()

        for _ in range(num_sales):
            chosen_category = random.choices(categories, weights=weights, k=1)[0]
            chosen_product_index = select_product_from_category(products_df, chosen_category, available_products)
            if chosen_product_index is None:
                # No available product in this category: nothing to sell.
                continue

            product = products_df.loc[chosen_product_index]
            sku = product['productCode']

            try:
                current_stock = inventory.loc[(store, sku), 'quantity']
                if current_stock > 0:
                    quantity = min(current_stock, random.randint(1, 15))
                    inventory.loc[(store, sku), 'quantity'] -= quantity
                    record_sale(sales_data_list, current_date, store, sku, quantity)

                    # Replenish when stock is low, or occasionally at random
                    # while the stock is still below 1000 units.
                    current_stock = inventory.loc[(store, sku), 'quantity']
                    if (random.random() < 0.05 and current_stock < 1000) or current_stock < 20:
                        replenishment_quantity = random.randint(150, 500)
                        inventory.loc[(store, sku), 'quantity'] += replenishment_quantity
            except KeyError:
                # Skip only this sale. Returning here would hand None to the
                # caller, which unpacks two values, and would discard every
                # record already generated for the day.
                print(f"KeyError: {(store, sku)}")
                continue

        # Snapshot of the stock of every product of the store after the day's sales.
        for sku in available_products:
            record_replenishment(soh_data_list, current_date, store, sku, inventory.loc[(store, sku), 'quantity'])

    random.shuffle(sales_data_list)
    return sales_data_list, soh_data_list


In [11]:
def simulate_daily_sales(products_df, weights_df, initial_inventory_df,
                          sales_consumption_df, start_date, end_date):
    """Simulates daily sales and inventory management, dividing sales into 6-month files.

    For every day from ``start_date`` to ``end_date`` (both inclusive) the
    sales of all stores are simulated. The soh rows of the day are saved to
    'soh.csv' and the sales rows to a 'sales_<year>_P<period>_<timestamp>.csv'
    file; a new sales file is started on the first simulated day and whenever
    the day belongs to a different 6-month period than the previous day.
    All files are handled in '../../data' through the ``utils`` helpers.

    Parameters:
    products_df (pandas.DataFrame): Product catalogue passed to ``simulate_sales``
    weights_df (pandas.DataFrame): Category consumption by country passed to ``simulate_sales``
    initial_inventory_df (pandas.DataFrame): Starting stock with 'site_code' and 'sku' columns
    sales_consumption_df (pandas.DataFrame): Seasonal sales weights by country
    start_date (date): First day to simulate
    end_date (date): Last day to simulate
    """
    current_date = start_date

    site_codes = initial_inventory_df.site_code.unique()
    inventory = initial_inventory_df.set_index(['site_code', 'sku'])

    sales_filename = None

    while current_date <= end_date:
        sales_data, soh_data = simulate_sales(products_df, weights_df, inventory, sales_consumption_df, site_codes, current_date)

        if len(soh_data) > 0:
            utils.save_row_csv(soh_data, 'soh.csv', '../../data')

        current_period = get_6month_period(current_date)
        previous_period = get_6month_period(current_date - timedelta(days=1))
        # Start a new sales file at every period boundary, and also on the
        # first simulated day: otherwise a start_date in the middle of a
        # period would leave sales_filename as None when saving the rows.
        if sales_filename is None or current_period != previous_period:
            year, period = current_period
            last_record_time = datetime.now().strftime("%Y%m%d_%H%M%S")
            sales_filename = f'sales_{year}_P{period}_{last_record_time}.csv'
            utils.initialize_csv(['sku', 'quantity', 'site_code', 'date'], sales_filename, '../../data')
        utils.save_row_csv(sales_data, sales_filename, '../../data')

        current_date += timedelta(days=1)

In [12]:
# All input files are read from the shared data directory (relative to this notebook).
DATA_DIR = '../../data'

distribution_by_cat = utils.load_data('distribution_by_category.csv', DATA_DIR)
distribution_of_sales = utils.load_data('distribution_of_sales_by_country.csv', DATA_DIR)
products = utils.load_data('products.csv', DATA_DIR)
# NOTE(review): simulate_daily_sales also saves rows to 'soh.csv' in this same
# directory, so this input file appears to change with every run - confirm
# that this is intended.
soh = utils.load_data('soh.csv', DATA_DIR)
sites = distribution_by_cat.country.unique()

# Run the simulation from 2023-01-01 to 2025-01-01 (both days included).
simulate_daily_sales(
    products_df=products,
    weights_df=distribution_by_cat,
    initial_inventory_df=soh,
    sales_consumption_df=distribution_of_sales,
    start_date=date(2023, 1, 1),
    end_date=date(2025, 1, 1),
)

Rows saved to: /mnt/sda2/ICC/pasantia/final-project/data/soh.csv
CSV file created with header: ['sku', 'quantity', 'site_code', 'date']
Rows saved to: /mnt/sda2/ICC/pasantia/final-project/data/sales_2023_P1_20250331_155602.csv
Rows saved to: /mnt/sda2/ICC/pasantia/final-project/data/soh.csv
Rows saved to: /mnt/sda2/ICC/pasantia/final-project/data/sales_2023_P1_20250331_155602.csv
Rows saved to: /mnt/sda2/ICC/pasantia/final-project/data/soh.csv
Rows saved to: /mnt/sda2/ICC/pasantia/final-project/data/sales_2023_P1_20250331_155602.csv
Rows saved to: /mnt/sda2/ICC/pasantia/final-project/data/soh.csv
Rows saved to: /mnt/sda2/ICC/pasantia/final-project/data/sales_2023_P1_20250331_155602.csv
Rows saved to: /mnt/sda2/ICC/pasantia/final-project/data/soh.csv
Rows saved to: /mnt/sda2/ICC/pasantia/final-project/data/sales_2023_P1_20250331_155602.csv
Rows saved to: /mnt/sda2/ICC/pasantia/final-project/data/soh.csv
Rows saved to: /mnt/sda2/ICC/pasantia/final-project/data/sales_2023_P1_20250331_1556