# Creando información para el proceso ETL

El propósito de este ejercicio es simular información para ingestarla en un gestor de bases de datos, en este caso PostgreSQL.

Notas relevantes: 

1. El código de generación de las tablas se anexa en la ruta **./files_config/Manage_DB.sql**
2. Se crean usuarios y roles para controlar el acceso a la información; en la ruta **./files_config/Roles.sql** se encuentran el código y los pasos para realizarlo desde PostgreSQL
3. Se genera un archivo de configuración con la información pertinente (secretos) para conectarse a la base de datos
4. La manipulación de la información se hará para simular la limpieza y transformación de datos
5. El diseño propuesto es un esquema de estrella (hechos-dimensiones)

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

# Seed NumPy's global random generator so the simulated values are
# reproducible when the cells are run in order.
np.random.seed(21)

In [2]:
# Salesperson table (sales person): pools of first and last names that the
# simulated employees are drawn from.

fake_names = [
    'Alejandro', 'Daniel', 'Luis', 'Carlos', 'Ricardo',
    'Bruce', 'Jason', 'Thalia', 'Nicki', 'Maria',
    'Sarah', 'Marcos', 'Javier', 'Ramon', 'Steve',
]
fake_lastnames = [
    'Jiang', 'Parker', 'Wayne', 'Monroy', 'Mendoza',
    'Barreto', 'Ramos', 'Cortez', 'Vega', 'Smith',
    'Suarez', 'Sotelo', 'Cervantes', 'Xiao',
]

# Pool sizes, used as the exclusive upper bound of the random index draws.
n_fake_names, n_fake_lastnames = len(fake_names), len(fake_lastnames)

def position_random(random_nb):
    """Translate a random draw into a job title.

    Args:
        random_nb: Number compared against the cut-offs 0.03 and 0.15.

    Returns:
        'Director of sales' when the draw is below 0.03, 'Manager' when it
        is at least 0.03 but below 0.15, and 'Sales Representative' for
        anything else.
    """
    if random_nb < 0.03:
        return 'Director of sales'
    if random_nb < 0.15:
        return 'Manager'
    return 'Sales Representative'


total_sp = 55
columns_sp = ['EmployeeID','EmployeeKey','NameMain','LastName','Position']


def _simulate_salesperson(employee_id):
    """Return one salesperson row, in the order given by columns_sp.

    The random draws happen in a fixed order (key, first name, last name,
    position) so a seeded run always produces the same rows.
    """
    # Random 5-digit key; nothing prevents two employees sharing a key.
    employee_key = np.random.randint(low=10000, high=100000)
    first_name = fake_names[np.random.randint(low=0, high=n_fake_names)]
    last_name = fake_lastnames[np.random.randint(low=0, high=n_fake_lastnames)]
    job_title = position_random(np.random.uniform())
    return [employee_id, employee_key, first_name, last_name, job_title]


# EmployeeID values are consecutive, starting at 500.
list_sp = [_simulate_salesperson(500 + offset) for offset in range(total_sp)]

pd_sp = pd.DataFrame(list_sp, columns=columns_sp)

# E-mail address: lower-cased "<first>_<last>" at the salesmx.com domain.
pd_sp['Mail'] = (
    pd_sp['NameMain'].str.lower()
    + '_'
    + pd_sp['LastName'].str.lower()
    + '@salesmx.com'
)

display(pd_sp.head())

Unnamed: 0,EmployeeID,EmployeeKey,NameMain,LastName,Position,Mail
0,500,90841,Nicki,Mendoza,Sales Representative,nicki_mendoza@salesmx.com
1,501,52104,Carlos,Cervantes,Sales Representative,carlos_cervantes@salesmx.com
2,502,74241,Sarah,Vega,Sales Representative,sarah_vega@salesmx.com
3,503,48316,Ramon,Xiao,Sales Representative,ramon_xiao@salesmx.com
4,504,26118,Sarah,Jiang,Manager,sarah_jiang@salesmx.com


In [3]:
# Quirks of the simulated data that matter when loading it into the database:
# names are drawn independently, so the same full name can appear more than
# once. List the ten most frequent full names.
pd_sp['full_name'] = pd_sp['NameMain'] + ' ' + pd_sp['LastName']
name_counts = pd_sp.groupby('full_name').size().reset_index(name='count')
name_counts.sort_values('count', ascending=False).head(10)

Unnamed: 0,full_name,count
7,Bruce Sotelo,3
25,Maria Barreto,2
21,Luis Vega,2
10,Daniel Monroy,2
38,Sarah Suarez,1
29,Maria Wayne,1
30,Nicki Barreto,1
31,Nicki Mendoza,1
32,Nicki Parker,1
33,Nicki Sotelo,1


In [4]:
# Region table (region)

columns_region = ['TerritoryKey', 'Note', 'Rate']

# One row per territory. 'Note' packs several descriptive fields into a single
# pipe-separated string (it looks like area|country|continent).
list_regions = [
    [1,  'Northwest|United States|North America', 0.15],
    [2,  'Northeast|United States|North America', 0.25],
    [3,  'Central|United States|North America',   0.64],
    [4,  'Southwest|Mexico|North America',        0.11],
    [5,  'South MX|Mexico|North America',         0.25],
    [6,  'North MX|Mexico|North America',         0.32],
    [7,  'Central MX|Mexico|North America',       0.15],
    [8,  'South China|China|Asia',                0.33],
    [9,  'Central China|China|Asia',              0.29],
    [10, 'Canada|Canada|North America',           0.65],
    [11, 'France|France|Europe',                  0.17],
    [12, 'Germany|Germany|Europe',                0.39],
    [13, 'Australia|Australia|Pacific',           0.68],
]

pd_region = pd.DataFrame(data=list_regions, columns=columns_region)

# Number of territories in the table.
n_region = pd_region.shape[0]

display(pd_region.head())

Unnamed: 0,TerritoryKey,Note,Rate
0,1,Northwest|United States|North America,0.15
1,2,Northeast|United States|North America,0.25
2,3,Central|United States|North America,0.64
3,4,Southwest|Mexico|North America,0.11
4,5,South MX|Mexico|North America,0.25


In [5]:
# Product catalogue definition: for every category, the subcategories, colors
# and sizes that get combined into individual products.
dict_products = {
    'Bikes': {
        'subcategory': ['Mountain','Classic','Road'],
        'color': ['Black','White','Navy','Brown','Multi'],
        'size': ['XS','S','M','L','XL'],
    },
    'Clothing': {
        'subcategory': ['Gloves','Short','Glasses','Vests','Hoodie','Shoes'],
        'color': ['Black','White','Navy','Green','Red','Purple','Violet','Pink','Dark Blue','Dark Red'],
        'size': ['S','M','L','XL'],
    },
    'Accessories': {
        'subcategory': ['Lights','Bottles','Helmets','Stamps','Tubes'],
        'color': ['White','Yellow'],
        'size': ['S','M','Custom'],
    },
    'Components': {
        'subcategory': ['Chains','Forks','Wheels','Pedals','Brakes','Headset'],
        'color': ['Bronce','Gold','Silver','Design - Special 2000','Design Classic'],
        'size': ['S','M','XL','Custom'],
    },
    'Other': {
        'subcategory': ['Paint','Cleaner','Wiper','Rings','Watch','Headphones','Batteries','Speaker'],
        'color': ['Standar','Style default','Summer','Autumn','Spring','Winter','Custom'],
        'size': ['Standar','Custom','Combo 1','Combo 2', 'Combo 3', 'Special Edition'],
    },
}

columns_product = ['ProductName','StandarCost','Category','Subcategory','Color','SizeStr']

# One product per (category, subcategory, color, size) combination, each with
# its own random cost drawn uniformly between 100 and 10000.
list_products = []
for category, options in dict_products.items():
    for subcategory in options['subcategory']:
        for color in options['color']:
            for size in options['size']:
                cost = np.random.uniform(low=100, high=10000)
                list_products.append(
                    [category + ' ' + subcategory, cost, category, subcategory, color, size]
                )

pd_products = pd.DataFrame(list_products, columns=columns_product)
n_product = pd_products.shape[0]

# Running row number; it is also the numeric suffix of the product key.
pd_products['IndexStr'] = np.arange(n_product)

# Product key: first three letters of the category and of the subcategory
# (upper-cased) followed by the row number zero-padded to five digits.
pd_products['KeyProduct'] = (
    pd_products['Category'].str[:3].str.upper()
    + pd_products['Subcategory'].str[:3].str.upper()
    + pd_products['IndexStr'].astype(str).str.zfill(5)
)


display(pd_products.head())

Unnamed: 0,ProductName,StandarCost,Category,Subcategory,Color,SizeStr,IndexStr,KeyProduct
0,Bikes Mountain,974.028973,Bikes,Mountain,Black,XS,0,BIKMOU00000
1,Bikes Mountain,2597.699463,Bikes,Mountain,Black,S,1,BIKMOU00001
2,Bikes Mountain,2260.763557,Bikes,Mountain,Black,M,2,BIKMOU00002
3,Bikes Mountain,720.622975,Bikes,Mountain,Black,L,3,BIKMOU00003
4,Bikes Mountain,7799.992997,Bikes,Mountain,Black,XL,4,BIKMOU00004


In [6]:
# Build a date catalogue running from 1 January 2017 to 31 December 2024.

dt_ini = datetime.strptime('2017-01-01', "%Y-%m-%d")
dt_fin = datetime.strptime('2025-01-01', "%Y-%m-%d")

days_diff = (dt_fin - dt_ini).days

columns_dt = ['Date_dt']

# Walk forward one day at a time; dt_fin is an exclusive bound, so the last
# date stored is the day before it.
list_dt = []
current_day = dt_ini
while current_day < dt_fin:
    list_dt.append(current_day)
    current_day += timedelta(days=1)

pd_dates = pd.DataFrame(list_dt, columns=columns_dt)

# Calendar attributes of every date (weekday(): Monday is 0, Sunday is 6).
pd_dates['Year_dt'] = [day.year for day in list_dt]
pd_dates['Month_dt'] = [day.month for day in list_dt]
pd_dates['Day_dt'] = [day.day for day in list_dt]
pd_dates['WeekDay_dt'] = [day.weekday() for day in list_dt]

display(pd_dates.head())


Unnamed: 0,Date_dt,Year_dt,Month_dt,Day_dt,WeekDay_dt
0,2017-01-01,2017,1,1,6
1,2017-01-02,2017,1,2,0
2,2017-01-03,2017,1,3,1
3,2017-01-04,2017,1,4,2
4,2017-01-05,2017,1,5,3


In [7]:
# Sales table

# Column names of the sales records built further down in this cell.
# NOTE(review): the 'EmployeeKey' column is filled from employees_correct,
# which holds EmployeeID values (not pd_sp's EmployeeKey) — confirm which
# identifier the sales table is meant to carry.
columns_sales = ['OrderDate','KeyProduct','TerritoryKey','EmployeeKey','Quantity']

def date_random(dt_ini = "2020-01-01", dt_fin = "2023-12-31"):
    """Return a random date between two dates, both ends included.

    Every day in the range is equally likely. The draw uses NumPy's global
    random generator, so it honours np.random.seed.

    Args:
        dt_ini: First selectable day, as a 'YYYY-MM-DD' string.
        dt_fin: Last selectable day, as a 'YYYY-MM-DD' string.

    Returns:
        A datetime at midnight of the selected day.

    Raises:
        ValueError: If either string is not a valid 'YYYY-MM-DD' date or
            dt_fin falls before dt_ini.
    """
    first_day = datetime.strptime(dt_ini, "%Y-%m-%d")
    last_day = datetime.strptime(dt_fin, "%Y-%m-%d")

    span_days = (last_day - first_day).days
    if span_days < 0:
        raise ValueError(f"dt_fin ({dt_fin}) is earlier than dt_ini ({dt_ini})")

    # randint's upper bound is exclusive: the +1 makes dt_fin reachable and
    # lets a one-day range (dt_ini == dt_fin) work instead of raising.
    offset_days = int(np.random.randint(0, span_days + 1))

    return first_day + timedelta(days=offset_days)


# Only sales representatives are given sales. The EmployeeIDs listed below are
# left out on purpose as well, so those employees end up without any sale.
excluded_employee_ids = [506,507,512,519,522]
is_sales_rep = pd_sp['Position'] == 'Sales Representative'
is_excluded = pd_sp['EmployeeID'].isin(excluded_employee_ids)
employees_correct = pd_sp[is_sales_rep & ~is_excluded]['EmployeeID'].tolist()
len_employees_correct = len(employees_correct)

# Look-up lists built once, outside the loop. IndexStr is the row number of
# pd_products, so position k of product_keys is the product whose IndexStr is
# k; filtering the DataFrame for every sale gave the same key far more slowly.
product_keys = pd_products['KeyProduct'].tolist()
# Draw territories from the actual keys of pd_region. The previous
# np.random.randint(1, n_region) has an exclusive upper bound, so the last
# territory never received a sale.
territory_keys = pd_region['TerritoryKey'].tolist()

# Number of simulated sales records.
n_sales = 350000

# The random draws of each record happen in a fixed order (date, product,
# territory, employee, quantity) so that a seeded run is reproducible.
# Quantity is an integer from 1 to 14.
list_sales = [[date_random(),
               product_keys[np.random.randint(n_product)],
               territory_keys[np.random.randint(n_region)],
               employees_correct[np.random.randint(len_employees_correct)],
               np.random.randint(1,15)] for _ in range(n_sales)]

pd_sales = pd.DataFrame(list_sales,columns=columns_sales)

display(pd_sales.head())

Unnamed: 0,OrderDate,KeyProduct,TerritoryKey,EmployeeKey,Quantity
0,2023-01-20,CLOSHO00129,1,508,5
1,2020-09-13,COMCHA00355,5,545,12
2,2021-08-08,BIKMOU00003,11,509,14
3,2021-11-02,COMPED00419,3,547,7
4,2020-08-18,BIKROA00070,7,531,14


In [8]:
# Save the DataFrames as CSV files

import os

# to_csv does not create missing folders, so make sure the output directory
# exists before writing; an already existing directory is left untouched.
os.makedirs('./files_sales', exist_ok=True)

# persons.csv is written with an explicit column list, so any other column
# present in pd_sp is not exported.
pd_sp[['EmployeeID','EmployeeKey','NameMain','LastName','Position','Mail']].to_csv('./files_sales/persons.csv',index=False)
pd_region.to_csv('./files_sales/region.csv',index=False)
pd_products.to_csv('./files_sales/products.csv',index=False)
pd_dates.to_csv('./files_sales/dates.csv',index=False)
pd_sales.to_csv('./files_sales/sales.csv',index=False)