### This notebook runs the dimension load for all dimension tables sourced from Sybase.

In [None]:
%run SLV_NB_Functions

StatementMeta(, f12487cd-84ed-4c68-b0ab-2929d9eb9ae9, 3, Finished, Available, Finished)



### Dim Store Section

In [None]:
# Read, for each store-hierarchy source, the parquet file that get_latest_file
# resolves for it. The reads happen in the order the source names are listed.
tiendas_df, tiendas_ext_df, distritos_df, regiones_df, zonas_df = (
    spark.read.parquet(get_latest_file(source_name))
    for source_name in ('ftiendas', 'ftiendas_ext', 'distritos', 'regiones', 'zonas')
)

StatementMeta(, bfecfefb-9269-4c8b-a4bf-14463b340311, 4, Finished, Available, Finished)

In [None]:
# Register the store-hierarchy DataFrames as session-scoped temp views so the
# %%sql cells can query them by name.
# createOrReplaceTempView is used instead of createTempView because
# createTempView raises when a view with the same name already exists in the
# session, which made this cell fail whenever it was executed a second time.
tiendas_df.createOrReplaceTempView('temp_stores')
tiendas_ext_df.createOrReplaceTempView('temp_stores_ext')
distritos_df.createOrReplaceTempView('temp_districts')
zonas_df.createOrReplaceTempView('temp_zones')
regiones_df.createOrReplaceTempView('temp_regions')

StatementMeta(, bfecfefb-9269-4c8b-a4bf-14463b340311, 5, Finished, Available, Finished)

In [None]:
%%sql
-- Preview of the store dimension rows built from the temp_* store views.
WITH cte_first_date_sales AS (
    -- Earliest movement_date per store_id_from, restricted to movement_id = 'VT'.
    SELECT
        store_id_from,
        MIN(movement_date) AS mdate
    FROM fmovarti_intermediate_silver
    WHERE movement_id = 'VT'
    GROUP BY store_id_from
)
SELECT
    -- Store attributes (temp_stores)
    t.TCLAVE AS src_store_id,
    t.TNOMBRE AS store_name,
    t.UDDATE AS update_date_stores,
    t.tcolonia AS neighborhood,
    t.tdelmun AS borough,
    t.testado AS state,
    t.tcp AS zipcode,
    t.tcalle AS street,
    t.c_pais AS country_code,
    t.c_estado AS state_code,
    t.c_municipio AS borough_code,
    t.c_colonia AS neighbor_code,
    t.TTAMANOTIPO AS store_classification,
    -- District / zone / region hierarchy
    d.ddesc AS district_manager,
    d.dclave AS src_district_id,
    z.zona_desc AS zone_manager,
    z.zona_id AS src_zone_id,
    z.supervisor AS zone_supervisor,
    r.rdesc AS region_name,
    r.rclave AS src_region_id,
    -- Store type is derived from the store key range.
    CASE
        WHEN t.TCLAVE BETWEEN 991 AND 999 THEN 'Virtual Store'
        WHEN t.TCLAVE BETWEEN 1000 AND 1099 THEN 'Warehouse'
        ELSE 'Store'
    END AS store_type,
    -- First 'VT' movement date when one exists; otherwise the first non-null of
    -- date_6, date_1, date_2 from temp_stores_ext, in that order.
    COALESCE(c.mdate, t2.date_6, t2.date_1, t2.date_2) AS first_sale_or_fallback_date,
    -- Disabled drafts (not part of the result):
    --   * same_store_sales_flag: 1 when the store has >= 365 distinct 'VT'
    --     movement dates within the last 370 days, else 0 (open question noted
    --     in the draft: "do we want to include money").
    --   * store_status from tdatosarrendador = 1 ('Active' / 'Inactive').
    -- Status code from temp_stores_ext; any other value, including NULL when
    -- the LEFT JOIN finds no extension row, yields "Unknown".
    CASE t2.int_1
        WHEN 0 THEN "Closed"
        WHEN 1 THEN "Open"
        WHEN 2 THEN "To Be Opened"
        WHEN 3 THEN "Closure"
        WHEN 4 THEN "Remodeling"
        ELSE "Unknown"
    END AS store_status
FROM temp_stores AS t
LEFT JOIN temp_stores_ext AS t2
    ON t.tclave = t2.tclave
LEFT JOIN temp_districts AS d
    ON d.dclave = t.dclave
LEFT JOIN temp_zones AS z
    ON z.zona_id = d.zona_id
LEFT JOIN temp_regions AS r
    ON r.rclave = z.rclave
LEFT JOIN cte_first_date_sales AS c
    ON c.store_id_from = t.tclave

StatementMeta(, bfecfefb-9269-4c8b-a4bf-14463b340311, 10, Finished, Available, Finished)

<Spark SQL result set with 1000 rows and 23 fields>

In [None]:
%%sql
-- Schema change: adds the first_sale_or_fallback_date (date) column to DimStores.
-- The MERGE INTO DimStores statement in this notebook writes to this column.
-- NOTE(review): this presumably fails if the column already exists, i.e. on any
-- run after the first — confirm, and skip this cell once the column is in place.
ALTER TABLE DimStores
ADD COLUMN first_sale_or_fallback_date date

StatementMeta(, bfecfefb-9269-4c8b-a4bf-14463b340311, 11, Finished, Available, Finished)

<Spark SQL result set with 0 rows and 0 fields>

In [None]:
%%sql
-- Upsert into DimStores, matched on src_store_id. Source rows come from
-- temp_stores left-joined to its extension, district, zone and region views
-- and to the first 'VT' movement date per store.
-- NOTE(review): assumes every join yields at most one row per store key, so
-- that each target row is matched by a single source row — confirm.
MERGE INTO DimStores AS target
USING (
    WITH cte_first_date_sales AS (
        -- Earliest movement_date per store_id_from, restricted to movement_id = 'VT'.
        SELECT
            store_id_from,
            MIN(movement_date) AS mdate
        FROM fmovarti_intermediate_silver
        WHERE movement_id = 'VT'
        GROUP BY store_id_from
    )
    SELECT
        -- Store attributes (temp_stores)
        t.TCLAVE AS src_store_id,
        t.TNOMBRE AS store_name,
        t.UDDATE AS update_date_stores,
        t.tcolonia AS neighborhood,
        t.tdelmun AS borough,
        t.testado AS state,
        t.tcp AS zipcode,
        t.tcalle AS street,
        t.c_pais AS country_code,
        t.c_estado AS state_code,
        t.c_municipio AS borough_code,
        t.c_colonia AS neighbor_code,
        -- NOTE(review): store_classification is selected here but is not
        -- referenced by the UPDATE SET or INSERT lists below, so it is never
        -- written to DimStores — confirm whether the dimension should carry it.
        t.TTAMANOTIPO AS store_classification,
        -- District / zone / region hierarchy
        d.ddesc AS district_manager,
        d.dclave AS src_district_id,
        z.zona_desc AS zone_manager,
        z.zona_id AS src_zone_id,
        z.supervisor AS zone_supervisor,
        r.rdesc AS region_name,
        r.rclave AS src_region_id,
        -- Store type is derived from the store key range.
        CASE
            WHEN t.TCLAVE BETWEEN 991 AND 999 THEN 'Virtual Store'
            WHEN t.TCLAVE BETWEEN 1000 AND 1099 THEN 'Warehouse'
            ELSE 'Store'
        END AS store_type,
        -- First 'VT' movement date when one exists; otherwise the first
        -- non-null of date_6, date_1, date_2 from temp_stores_ext, in that order.
        COALESCE(c.mdate, t2.date_6, t2.date_1, t2.date_2) AS first_sale_or_fallback_date,
        -- Disabled drafts (not part of the merge):
        --   * same_store_sales_flag: 1 when the store has >= 365 distinct 'VT'
        --     movement dates within the last 370 days, else 0 (open question
        --     noted in the draft: "do we want to include money").
        --   * store_status from tdatosarrendador = 1 ('Active' / 'Inactive').
        -- Status code from temp_stores_ext; any other value, including NULL
        -- when the LEFT JOIN finds no extension row, yields "Unknown".
        CASE t2.int_1
            WHEN 0 THEN "Closed"
            WHEN 1 THEN "Open"
            WHEN 2 THEN "To Be Opened"
            WHEN 3 THEN "Closure"
            WHEN 4 THEN "Remodeling"
            ELSE "Unknown"
        END AS store_status
    FROM temp_stores AS t
    LEFT JOIN temp_stores_ext AS t2
        ON t.tclave = t2.tclave
    LEFT JOIN temp_districts AS d
        ON d.dclave = t.dclave
    LEFT JOIN temp_zones AS z
        ON z.zona_id = d.zona_id
    LEFT JOIN temp_regions AS r
        ON r.rclave = z.rclave
    LEFT JOIN cte_first_date_sales AS c
        ON c.store_id_from = t.tclave
) AS source
    ON target.src_store_id = source.src_store_id

-- Existing store: overwrite every persisted attribute with the source value.
WHEN MATCHED THEN UPDATE SET
    target.store_name = source.store_name,
    target.update_date_stores = source.update_date_stores,
    target.neighborhood = source.neighborhood,
    target.borough = source.borough,
    target.state = source.state,
    target.zipcode = source.zipcode,
    target.street = source.street,
    target.country_code = source.country_code,
    target.state_code = source.state_code,
    target.borough_code = source.borough_code,
    target.neighbor_code = source.neighbor_code,
    target.district_manager = source.district_manager,
    target.src_district_id = source.src_district_id,
    target.zone_manager = source.zone_manager,
    target.src_zone_id = source.src_zone_id,
    target.zone_supervisor = source.zone_supervisor,
    target.region_name = source.region_name,
    target.src_region_id = source.src_region_id,
    target.store_type = source.store_type,
    target.first_sale_or_fallback_date = source.first_sale_or_fallback_date,
    target.store_status = source.store_status

-- New store: insert it. The column list and the VALUES list are positional
-- and are kept one-per-line in the same order so the pairing is easy to check.
WHEN NOT MATCHED THEN INSERT (
    src_store_id,
    store_name,
    update_date_stores,
    neighborhood,
    borough,
    state,
    zipcode,
    street,
    country_code,
    state_code,
    borough_code,
    neighbor_code,
    district_manager,
    src_district_id,
    zone_manager,
    src_zone_id,
    zone_supervisor,
    region_name,
    src_region_id,
    store_type,
    store_status,
    first_sale_or_fallback_date
)
VALUES (
    source.src_store_id,
    source.store_name,
    source.update_date_stores,
    source.neighborhood,
    source.borough,
    source.state,
    source.zipcode,
    source.street,
    source.country_code,
    source.state_code,
    source.borough_code,
    source.neighbor_code,
    source.district_manager,
    source.src_district_id,
    source.zone_manager,
    source.src_zone_id,
    source.zone_supervisor,
    source.region_name,
    source.src_region_id,
    source.store_type,
    source.store_status,
    source.first_sale_or_fallback_date
);


StatementMeta(, bfecfefb-9269-4c8b-a4bf-14463b340311, 14, Finished, Available, Finished)

<Spark SQL result set with 1 rows and 4 fields>

# Dim Products Section

In [2]:
# Read, for each product-hierarchy source, the parquet file that
# get_latest_file resolves for it. The reads happen in the listed order.
(
    farticulos_df,      # product
    fgrupos_df,         # group
    flineas_df,         # line
    tipo_articulos_df,  # product_type
    estados_articulos,  # product_status
) = (
    spark.read.parquet(get_latest_file(source_name))
    for source_name in ('farticulos', 'fgrupos', 'flineas', 'tipo_articulos', 'estado_articulos')
)


StatementMeta(, f12487cd-84ed-4c68-b0ab-2929d9eb9ae9, 4, Finished, Available, Finished)

In [3]:
# Register the product-hierarchy DataFrames as session-scoped temp views so the
# %%sql cells can query them by name.
# createOrReplaceTempView is used instead of createTempView because
# createTempView raises when a view with the same name already exists in the
# session, which made this cell fail whenever it was executed a second time.
farticulos_df.createOrReplaceTempView('temp_products')
fgrupos_df.createOrReplaceTempView('temp_groups')
flineas_df.createOrReplaceTempView('temp_product_lines')
tipo_articulos_df.createOrReplaceTempView('temp_product_types')
estados_articulos.createOrReplaceTempView('temp_product_status')

StatementMeta(, f12487cd-84ed-4c68-b0ab-2929d9eb9ae9, 5, Finished, Available, Finished)

In [None]:
%%sql
-- Ad-hoc preview of the raw product rows exposed through temp_products.
SELECT *
FROM temp_products
-- Disabled exploration filter: CAST(l.LCLAVE AS int) = 1
-- (no alias `l` is defined in this query, so it cannot be enabled as written).

StatementMeta(, f12487cd-84ed-4c68-b0ab-2929d9eb9ae9, 19, Finished, Available, Finished)

<Spark SQL result set with 1000 rows and 65 fields>

In [None]:
%%sql
-- Ad-hoc preview of the product-line rows exposed through temp_product_lines.
SELECT *
FROM temp_product_lines AS l

StatementMeta(, f12487cd-84ed-4c68-b0ab-2929d9eb9ae9, 20, Finished, Available, Finished)

<Spark SQL result set with 287 rows and 7 fields>

In [None]:
%%sql
-- Spot check: the dimproducts row(s) whose src_product_id is 16156.
SELECT *
FROM dimproducts
WHERE src_product_id = 16156

StatementMeta(, f12487cd-84ed-4c68-b0ab-2929d9eb9ae9, 24, Finished, Available, Finished)

<Spark SQL result set with 1 rows and 25 fields>

In [None]:
%%sql
-- Upsert into DimProducts, matched on src_product_id. Source rows come from
-- temp_products left-joined to its line, group, product-type and status views.
-- NOTE(review): assumes every join yields at most one row per product, so that
-- each target row is matched by a single source row — confirm.
MERGE INTO DimProducts AS target
USING (
    SELECT
        -- Product attributes (temp_products)
        p.ICLAVE AS src_product_id,
        p.ICLAVEMADRE AS is_parent,
        p.ICODIGOBARRAS AS barcode,
        p.ICB AS barcode_2,
        p.IDESC AS product_name,
        p.IUNIDAD AS unit_clean,
        p.iunicant AS unit,
        p.UDDATE AS update_date_product,
        p.USERID AS user_id_product,
        p.IV_CLAVE AS iva_tax_id,
        p.IE_CLAVE AS ieps_tax_id,
        p.desc_1 AS iva_tax_rate,
        p.desc_2 AS ieps_tax_rate,
        p.iclave_padre AS parent_src_id,
        t.type_desc AS product_type,
        p.isustn AS substance,
        p.imarca AS product_brand,
        -- Line and group attributes
        l.LCLAVE AS src_line_id,
        l.LDESC AS line_name,
        l.lactivo AS line_status,
        g.GDESC AS group_name,
        g.GCLAVE AS src_group_id,
        g.gactivo AS group_status,
        -- desc carries the status description looked up in temp_product_status;
        -- product_status carries the raw IACTIVO value used for that lookup.
        s.descripcion AS desc,
        p.IACTIVO AS product_status
    FROM temp_products AS p
    -- A line is matched on both its own key and the group key.
    LEFT JOIN temp_product_lines AS l
        ON p.LCLAVE = l.LCLAVE
        AND p.GCLAVE = l.GCLAVE
    -- NOTE(review): the group is resolved through the matched line (l.GCLAVE),
    -- so a product with no matching line gets NULL group columns even when its
    -- own p.GCLAVE is populated — confirm this is intended.
    LEFT JOIN temp_groups AS g
        ON l.GCLAVE = g.GCLAVE
    LEFT JOIN temp_product_types AS t
        ON p.itype = t.itype
    LEFT JOIN temp_product_status AS s
        ON s.id_estado = p.iactivo
) AS source
    ON target.src_product_id = source.src_product_id

-- Existing product: overwrite every attribute with the source value.
WHEN MATCHED THEN UPDATE SET
    target.is_parent = source.is_parent,
    target.barcode = source.barcode,
    target.barcode_2 = source.barcode_2,
    target.product_name = source.product_name,
    target.unit_clean = source.unit_clean,
    target.unit = source.unit,
    target.update_date_product = source.update_date_product,
    target.user_id_product = source.user_id_product,
    target.iva_tax_id = source.iva_tax_id,
    target.ieps_tax_id = source.ieps_tax_id,
    target.iva_tax_rate = source.iva_tax_rate,
    target.ieps_tax_rate = source.ieps_tax_rate,
    target.parent_src_id = source.parent_src_id,
    target.product_type = source.product_type,
    target.substance = source.substance,
    target.product_brand = source.product_brand,
    target.src_line_id = source.src_line_id,
    target.line_name = source.line_name,
    target.line_status = source.line_status,
    target.group_name = source.group_name,
    target.src_group_id = source.src_group_id,
    target.group_status = source.group_status,
    target.desc = source.desc,
    target.product_status = source.product_status

-- New product: insert it. The column list and the VALUES list are positional
-- and are kept one-per-line in the same order so the pairing is easy to check.
WHEN NOT MATCHED THEN INSERT (
    src_product_id,
    is_parent,
    barcode,
    barcode_2,
    product_name,
    unit_clean,
    unit,
    update_date_product,
    user_id_product,
    iva_tax_id,
    ieps_tax_id,
    iva_tax_rate,
    ieps_tax_rate,
    parent_src_id,
    product_type,
    substance,
    product_brand,
    src_line_id,
    line_name,
    line_status,
    group_name,
    src_group_id,
    group_status,
    desc,
    product_status
)
VALUES (
    source.src_product_id,
    source.is_parent,
    source.barcode,
    source.barcode_2,
    source.product_name,
    source.unit_clean,
    source.unit,
    source.update_date_product,
    source.user_id_product,
    source.iva_tax_id,
    source.ieps_tax_id,
    source.iva_tax_rate,
    source.ieps_tax_rate,
    source.parent_src_id,
    source.product_type,
    source.substance,
    source.product_brand,
    source.src_line_id,
    source.line_name,
    source.line_status,
    source.group_name,
    source.src_group_id,
    source.group_status,
    source.desc,
    source.product_status
);


StatementMeta(, f12487cd-84ed-4c68-b0ab-2929d9eb9ae9, 23, Finished, Available, Finished)

<Spark SQL result set with 1 rows and 4 fields>

# Dim Supplier Section

In [None]:
# Read the parquet file that get_latest_file resolves for 'fproveedores'
# (supplier) and expose it to Spark SQL as the temp view "Supplier".
supplier_parquet_path = get_latest_file('fproveedores')
fproveedores_df = spark.read.parquet(supplier_parquet_path)
fproveedores_df.createOrReplaceTempView("Supplier")

StatementMeta(, d66e89fa-c365-40d5-87bc-3b3d61486efc, 6, Finished, Available, Finished)

In [None]:
%%sql 
-- Draft supplier query (currently disabled):
-- SELECT PCLAVE, PRAZON, PDIR, PTEL1, PTEL2, PTEL3, PATENCION, PACTIVO, PPLAZO, PNOMBRE, PEMAIL, UUDDATE,
--        USERID, PTIEMPOENT, MPDIASREC, MPDIASPROV, MPDIASREPARAR, MPDEVOLUCION, MULTA, pcontacto
-- FROM Supplier

In [6]:
# from pyspark.sql import SparkSession
# from pyspark.sql.functions import col, lit, expr, date_format, dayofweek, weekofyear, month, quarter, year, dayofmonth, dayofyear, last_day, when

# # Define the date range
# start_date = "2015-01-01"
# end_date = "2050-12-31"

# # Generate a Date Sequence
# date_df = spark.sql(f"SELECT explode(sequence(to_date('{start_date}'), to_date('{end_date}'), interval 1 day)) as date")

# # Create DimDate attributes
# dim_date_df = date_df.select(
#     col("date").alias("full_date"),
#     year("date").alias("year"),
#     quarter("date").alias("quarter"),
#     month("date").alias("month"),
#     date_format("date", "MMMM").alias("month_name"),
#     date_format("date", "E").alias("day_name"),
#     dayofmonth("date").alias("day_of_month"),
#     dayofweek("date").alias("day_of_week"),  # 1=Sunday, 7=Saturday
#     dayofyear("date").alias("day_of_year"),
#     weekofyear("date").alias("week_of_year"),
#     when((dayofweek("date") == 1) | (dayofweek("date") == 7), lit(1)).otherwise(lit(0)).alias("is_weekend"),
#     expr("CASE WHEN month(date) IN (1, 2, 3) THEN 'Q1' WHEN month(date) IN (4, 5, 6) THEN 'Q2' WHEN month(date) IN (7, 8, 9) THEN 'Q3' ELSE 'Q4' END").alias("quarter_name"),
#     expr("CASE WHEN dayofweek(date) IN (1,7) THEN 'Weekend' ELSE 'Weekday' END").alias("day_type"),
# )

# dim_date_df.write.mode('overwrite').saveAsTable('DimDates')


StatementMeta(, f06bedf1-1a28-4a2b-9697-cf24d681d416, 8, Finished, Available, Finished)

# DimMovements

In [None]:
# Load the movement-description CSV from the OneLake path below into `df`.
# The "header" option makes Spark take the column names from the first line;
# no schema or inferSchema option is set on the reader.
movements_csv_path = "abfss://b2c899fb-e571-4496-aebf-c7a23083635a@onelake.dfs.fabric.microsoft.com/a00cf91a-f92e-498a-9f14-ba10221fb05a/Files/User Created Data/Movement Descriptions/Updated_Movement_Data_Table.csv"
movements_reader = spark.read.format("csv").option("header", "true")
df = movements_reader.load(movements_csv_path)


StatementMeta(, fbf0dc4e-c128-4ac2-8ace-83c4709e8d12, 7, Finished, Available, Finished)

In [None]:
# Save `df` as the DimMovements table; 'overwrite' mode replaces any existing
# table contents.
movements_writer = df.write.mode('overwrite')
movements_writer.saveAsTable('DimMovements')


StatementMeta(, fbf0dc4e-c128-4ac2-8ace-83c4709e8d12, 8, Finished, Available, Finished)