### Carga de dependencias

In [1]:
import pandas as pd
import numpy as np
import uuid
from datetime import datetime, timedelta
import random

#### Se construye una función que permite:

* Calcular cuántos días hay entre start_date y end_date usando la diferencia de fechas. En resumen, el rango de fechas solicitado.
* Tener una lista con todas las fechas
* Poner a disposición unos pesos para cumplir con el incremental de registros hacia fechas más recientes que pide el ejercicio.
* Aplicar un pesos adicional a las fechas de diciembre (estacionalidad de ventas)
* Escoger filas más recientes y de diciembre como filas, asignando mayor probabilidad

In [2]:
def generate_dates_with_seasonality(start_date, end_date, n_rows):
    days = (end_date - start_date).days
    base_dates = [start_date + timedelta(days=x) for x in range(days)]
    weights = np.linspace(1, 2, len(base_dates))
    
    for i, date in enumerate(base_dates):
        if date.month == 12:
            weights[i] *= 1.5
    
    return random.choices(base_dates, weights=weights, k=n_rows)

In [3]:
#Aquí simplemente lo que se pide en el ejercicio, 50 mil registros, con unas semillas que permitan reproducibilidad
n_rows = 50000
np.random.seed(42)
random.seed(42)

In [4]:
# Se aplica la función
dates = generate_dates_with_seasonality(
    datetime(2023, 1, 1),
    datetime(2024, 12, 31),
    n_rows
)

In [5]:
#Aquí se pueden observar las fechas creadas
dates

[datetime.datetime(2024, 6, 7, 0, 0),
 datetime.datetime(2023, 1, 29, 0, 0),
 datetime.datetime(2023, 9, 24, 0, 0),
 datetime.datetime(2023, 8, 11, 0, 0),
 datetime.datetime(2024, 8, 9, 0, 0),
 datetime.datetime(2024, 7, 2, 0, 0),
 datetime.datetime(2024, 11, 13, 0, 0),
 datetime.datetime(2023, 4, 4, 0, 0),
 datetime.datetime(2024, 1, 4, 0, 0),
 datetime.datetime(2023, 2, 3, 0, 0),
 datetime.datetime(2023, 8, 7, 0, 0),
 datetime.datetime(2024, 3, 6, 0, 0),
 datetime.datetime(2023, 1, 30, 0, 0),
 datetime.datetime(2023, 7, 20, 0, 0),
 datetime.datetime(2024, 6, 14, 0, 0),
 datetime.datetime(2024, 4, 3, 0, 0),
 datetime.datetime(2023, 8, 8, 0, 0),
 datetime.datetime(2024, 5, 4, 0, 0),
 datetime.datetime(2024, 9, 24, 0, 0),
 datetime.datetime(2023, 1, 8, 0, 0),
 datetime.datetime(2024, 9, 22, 0, 0),
 datetime.datetime(2024, 7, 16, 0, 0),
 datetime.datetime(2023, 11, 17, 0, 0),
 datetime.datetime(2023, 6, 10, 0, 0),
 datetime.datetime(2024, 12, 14, 0, 0),
 datetime.datetime(2023, 11, 14, 0

In [6]:
#este es el código sugerido en el ejercicio para crear el df
data = {
    'order_id': [str(uuid.uuid4()) for _ in range(n_rows)],
    'customer_id': pd.Series(np.random.randint(1, 10_001, n_rows), dtype='Int64'),
    'product_id': pd.Series(np.random.randint(1, 1_001, n_rows), dtype='Int64'),
    'quantity': pd.Series(np.random.randint(1, 21, n_rows), dtype='Int64'),
    'price': np.random.uniform(1.0, 500.0, n_rows),
    'order_date': dates,
    'region': np.random.choice(['North', 'South', 'East', 'West'], n_rows)
}

In [7]:
#lo chequeamos 
data

{'order_id': ['53fb9b74-d43f-4d20-b56f-e90da2d14d4a',
  'aae5d91c-3037-48a6-8208-05ae64154158',
  'a9c1f967-f9e5-4276-9994-d621957be8a8',
  '8098059e-caf3-4117-93e9-483d89eadb0a',
  '4a46ebfb-eb22-4b93-873c-bca7b8e16306',
  '7d66c463-5397-4550-88b3-75114feb05e3',
  'b080561e-53e4-479d-a5be-21c051ac4ebf',
  '592704ea-c97f-4977-b00b-823045d5e5a3',
  '4ce81750-95a3-461f-9595-c866c692729d',
  '91b349e7-566b-4e70-a62e-05843b07ea89',
  '21664b3e-9ee7-49ca-9c6d-6e7760347221',
  '1e221897-4e21-40d3-a60c-f3e1b2a1ccf2',
  '8b752709-6144-4604-aced-3e8b2a74abdd',
  'e3bb8102-1667-4b48-982b-9e6184b0c264',
  'e7654dc8-abd5-4607-af74-5a3578f7e7db',
  '40cebbd7-e5b6-48e7-8bdf-93b3f4bf8228',
  '6d699ef8-fde1-4ee9-8b87-6236e9dabcf8',
  '953bdeac-e99a-40fb-bfbc-f0e09b47c0ec',
  '0c9f777a-2ade-4825-b800-a932c08593ac',
  '66e73c47-669c-4962-bc8e-8c3cb4a1f867',
  '348c426d-46c2-4ecb-a34c-2a1ff11d6b82',
  'f77e5b5a-5d43-456b-ae27-a428ab921bb6',
  'ee745f1e-05bd-406f-8e0b-a2051eceb46a',
  'ac67e274-cb06-4ab9-

In [8]:
#se crean los descuentos
max_price = max(data['price'])
data['discount'] = [(1 - p/max_price) * 0.3 * random.uniform(0.8, 1.2) for p in data['price']]
data['discount'] = [min(max(d, 0.0), 0.3) for d in data['discount']]

In [9]:
# se crean los shiping priorities para las regiones usando un dict
priorities = {
    'North': ['High', 'High', 'High', 'High', 'High', 'Medium', 'Medium', 'Medium', 'Low', 'Low'],
    'South': ['High', 'High', 'Medium', 'Medium', 'Medium', 'Medium', 'Medium', 'Low', 'Low', 'Low'],
    'East': ['High', 'High', 'High', 'Medium', 'Medium', 'Medium', 'Medium', 'Low', 'Low', 'Low'],
    'West': ['High', 'High', 'Medium', 'Medium', 'Medium', 'Medium', 'Medium', 'Low', 'Low', 'Low']
}

In [10]:
#aquí el random_choice hace que cada valor sea una prioridad seleccionada aleatoriamente de acuerdo con la región correspondiente
data['shipping_priority'] = [random.choice(priorities[region]) for region in data['region']]

In [11]:
#se crea el df
df = pd.DataFrame(data)

In [12]:
#le hago un head para chequear
df.head()

Unnamed: 0,order_id,customer_id,product_id,quantity,price,order_date,region,discount,shipping_priority
0,53fb9b74-d43f-4d20-b56f-e90da2d14d4a,7271,923,10,410.048837,2024-06-07,East,0.06404,Low
1,aae5d91c-3037-48a6-8208-05ae64154158,861,621,20,466.51911,2023-01-29,East,0.02141,Low
2,a9c1f967-f9e5-4276-9994-d621957be8a8,5391,677,3,35.175263,2023-09-24,West,0.243724,Medium
3,8098059e-caf3-4117-93e9-483d89eadb0a,5192,370,9,75.551426,2023-08-11,West,0.272166,Low
4,4a46ebfb-eb22-4b93-873c-bca7b8e16306,5735,771,15,61.812616,2024-08-09,East,0.233102,High


In [13]:
#el dataset se ve bien, ahora un describe
df.describe()

Unnamed: 0,customer_id,product_id,quantity,price,order_date,discount
count,50000.0,50000.0,50000.0,50000.0,50000,50000.0
mean,4989.01144,498.34016,10.48816,250.019629,2024-02-16 22:05:50.207999744,0.149234
min,1.0,1.0,1.0,1.001136,2023-01-01 00:00:00,0.0
25%,2484.75,247.75,5.0,125.028048,2023-09-04 00:00:00,0.074232
50%,4992.0,498.0,11.0,249.041632,2024-03-01 00:00:00,0.148277
75%,7490.0,748.0,15.0,374.700194,2024-08-17 06:00:00,0.221863
max,10000.0,1000.0,20.0,499.992113,2024-12-30 00:00:00,0.3
std,2887.910142,288.798263,5.765095,144.14838,,0.087315


La información hasta aquí cumple con lo solicitado

In [14]:
# se asigna el ruido de manera más directa
noisy_rows = np.random.choice(n_rows, int(n_rows * 0.05), replace=False)

In [15]:
# se ingresan los demás puntos solicitados, NAN aleatoriors y UNKNOWN, también valores extremos o raros
for idx in noisy_rows:
    # se eligen columnas al azar (excepto order_id) 
    cols = list(df.columns)[1:]  # todas menos order_id
    cols_to_noise = random.sample(cols, 3)
    
    for col in cols_to_noise:
        # Ruido más simple y directo
        if col in ['customer_id', 'product_id', 'quantity']:
            df.at[idx, col] = random.choice([pd.NA, -9999])
        elif col in ['price', 'discount']:
            df.at[idx, col] = random.choice([np.nan, -9999.99])
        elif col == 'order_date':
            df.at[idx, col] = pd.NaT
        else:  # region y shipping_priority
            df.at[idx, col] = 'UNKNOWN'

In [16]:
## de nuevo chequeo
df.head()

Unnamed: 0,order_id,customer_id,product_id,quantity,price,order_date,region,discount,shipping_priority
0,53fb9b74-d43f-4d20-b56f-e90da2d14d4a,7271.0,923,10,410.048837,2024-06-07,East,0.06404,Low
1,aae5d91c-3037-48a6-8208-05ae64154158,861.0,621,20,466.51911,2023-01-29,East,0.02141,Low
2,a9c1f967-f9e5-4276-9994-d621957be8a8,5391.0,677,3,35.175263,2023-09-24,West,0.243724,Medium
3,8098059e-caf3-4117-93e9-483d89eadb0a,5192.0,370,9,75.551426,2023-08-11,West,0.272166,Low
4,4a46ebfb-eb22-4b93-873c-bca7b8e16306,,771,15,61.812616,2024-08-09,UNKNOWN,,High


In [20]:
import os

print(os.getcwd())

C:\Users\carlo\Documents\data-science-project\notebooks


In [22]:
os.chdir('C:/Users/carlo/Documents/data-science-project/')

Se guardan los datos sintéticos en carpeta data

In [24]:
df.to_csv('data/raw_sales_data.csv',sep=";", index=False)