In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
from tqdm import tqdm

# Load data
ordini_per_cliente = json.load(open('data/utils/ordini_per_client.json'))
tot_per_cliente = json.load(open('data/utils/tot_per_client.json')) 
quantity = pd.read_csv('data/utils/quantity_per_client.csv')

# Utils
clienti = list(ordini_per_cliente.keys())
tot = sum(tot_per_cliente.values())
norm = {k: v/tot for k,v in tot_per_cliente.items()}

In [272]:
### Generate synthetic data

## params
num_samples = 10        # Number of samples to generate

# Set to True to generate data starting from today (orders in entrata oggi)
today = True        

# Altrimenti imposta un range
start = '2024-10-01'
end = '2024-12-31'

new_data = []

# Loop to generate synthetic data with progress bar
for _ in tqdm(range(num_samples), desc="Generating data"):

    # Scegli cliente in modo uniforme
    i = np.round(np.random.uniform(0, len(clienti) - 1))
    cliente = clienti[int(i)]
    prob = norm[cliente]
    num = int(np.round(prob * num_samples))

    rows = quantity[quantity['Cliente'] == cliente].shape[0]
    if rows > 0:
        articoli = quantity[quantity['Cliente'] == cliente]
        
        for _ in range(num + 1):
            art_index = int(np.round(np.random.uniform(0, rows - 1)))
            art = articoli.iloc[art_index]

            articolo = art['cod articolo']
            quantity_mean = art['mean']
            quantity_std = art['std']
            giorni = art['mean_differenza_giorni']

            # Ensure kg is always positive
            kg = np.round(np.random.normal(quantity_mean, quantity_std))
            if kg <= 0:
                kg = quantity_mean
            
            if today:
                data_inizio = pd.Timestamp.today().date()
            else:
                date_range = pd.date_range(start=start, end=end)
                data_inizio = np.random.choice(date_range)
                data_inizio = pd.Timestamp(data_inizio).date()

            # Calculate the end date
            data_fine = data_inizio + pd.Timedelta(days=giorni)
            data_fine = pd.Timestamp(data_fine).date()

            # Append the generated data to new_data list
            new_data.append({
                'cliente': cliente,
                'cod_articolo': articolo,
                'quantity': kg,
                'data inserimento': data_inizio,
                'data consegna': data_fine
            })

columns = ['cliente', 'cod_articolo', 'quantity', 'data inserimento', 'data consegna']
res = pd.DataFrame(data=new_data, columns=columns)

res.to_csv('data/new_orders.csv', index=False)
res

Generating data: 100%|██████████| 10/10 [00:00<00:00, 1324.17it/s]


Unnamed: 0,cliente,cod_articolo,quantity,data inserimento,data consegna
0,MANIFATTURA ITALIA CUCIRINI,2221ZN,40.0,2024-10-22,2024-10-23
1,SCORTA,1725DN,400.0,2024-10-22,2024-11-21
2,SCORTA,31004DN220379 dm84,1200.0,2024-10-22,2024-11-17
3,CALZIFICIO CAROL,1724S+ZN,20.0,2024-10-22,2024-11-17
4,MICHELE LETIZIA,X24-7802S+ZN,347.0,2024-10-22,2024-11-11
5,CSP INTERNATIONAL,1724S+ZN,118.0,2024-10-22,2024-11-17
6,GIZETA,4407DN,128.0,2024-10-22,2024-10-27
7,GIZETA,7036ZN,735.0,2024-10-22,2024-11-04
8,CALZIFICIO POLAR,2236S-ZN1,273.0,2024-10-22,2024-11-17
9,JACOB ROHNER,S18-160040DN,0.0,2024-10-22,2024-11-17
