# Extract Pipelines

In [14]:
import duckdb
from deltalake import write_deltalake
import logging
import os

# Logging configuration: INFO level, "timestamp - level - message" line format
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)

def extract_func_sqlite(db_path: str, sqlite_table: str, bronze_delta_path: str, name_table: str, mode: str = "overwrite"):
    """Copy one SQLite table into a Delta table under the bronze directory.

    The SQLite file is attached to a fresh DuckDB connection, the whole table
    is read as an Arrow result and written with ``write_deltalake`` to
    ``<bronze_delta_path>/<name_table>``.

    Args:
        db_path: Path of the SQLite database file.
        sqlite_table: Name of the table to read; treated as a single identifier.
        bronze_delta_path: Directory that holds the bronze Delta tables.
        name_table: Sub-directory (table name) to write inside ``bronze_delta_path``.
        mode: Write mode forwarded to ``write_deltalake``.

    Returns:
        None. Any exception is logged as an error and not re-raised, so a
        failed extraction does not stop the caller.
    """
    # Double embedded quotes so an unusual path or table name cannot break
    # (or alter) the generated SQL.
    db_path_sql = str(db_path).replace("'", "''")
    table_sql = str(sqlite_table).replace('"', '""')

    conn = duckdb.connect()

    try:
        logger.info(f"Iniciando extração da tabela SQLite - {sqlite_table}")
        conn.execute(f"ATTACH '{db_path_sql}' AS sqlite_db;")  # open the SQLite file through DuckDB
        dataframe = conn.sql(f'SELECT * FROM sqlite_db."{table_sql}"').arrow()  # Arrow result of the full table
        table_path = os.path.join(bronze_delta_path, name_table)
        os.makedirs(table_path, exist_ok=True)  # make sure the target directory exists before writing
        write_deltalake(table_path, dataframe, mode=mode)
        logger.info(f"\033[32m[OK]\033[0m Extração da tabela {sqlite_table} concluída e salva em {table_path}.")

    except Exception as e:
        # Best-effort: report the failure and let the remaining extractions run.
        logger.error(f"\033[031m[ERROR]\033[0m Erro ao processar {sqlite_table}: {e}")

    finally:
        conn.close()

def extract_func_csv(source_path_csv: str, bronze_delta_path: str, name_table: str, mode: str = "overwrite"):
    """Load a CSV file with DuckDB and write it as a Delta table under the bronze directory.

    The file is read with ``read_csv_auto`` on a fresh DuckDB connection and the
    Arrow result is written with ``write_deltalake`` to
    ``<bronze_delta_path>/<name_table>``.

    Args:
        source_path_csv: Path of the CSV file to read.
        bronze_delta_path: Directory that holds the bronze Delta tables.
        name_table: Sub-directory (table name) to write inside ``bronze_delta_path``.
        mode: Write mode forwarded to ``write_deltalake``.

    Returns:
        None. Any exception is logged as an error and not re-raised, so a
        failed extraction does not stop the caller.
    """
    # Double embedded single quotes so a path such as "o'brien.csv" cannot
    # terminate the SQL string literal early.
    csv_path_sql = str(source_path_csv).replace("'", "''")

    conn = duckdb.connect()

    try:
        logger.info(f"Iniciando extração do arquivo {source_path_csv}")
        dataframe = conn.sql(f"SELECT * FROM read_csv_auto('{csv_path_sql}')").arrow()
        table_path = os.path.join(bronze_delta_path, name_table)
        os.makedirs(table_path, exist_ok=True)  # make sure the target directory exists before writing
        write_deltalake(table_path, dataframe, mode=mode)
        logger.info(f"\033[32m[OK]\033[0m Processo de extração do .CSV {source_path_csv} foi concluído.")

    except Exception as e:
        # Best-effort: report the failure and let the remaining extractions run.
        logger.error(f"\033[031m[ERROR]\033[0m Erro ao processar {source_path_csv}: {e}")

    finally:
        conn.close()

if __name__ == "__main__":
    bronze_dir = "../delta_lake/bronze"

    # CSV extraction: (source file, bronze table name), processed in this order
    csv_jobs = [
        ("../data/olist_customers_dataset.csv", "customers_bronze"),
        ("../data/olist_geolocation_dataset.csv", "geolocation_bronze"),
        ("../data/olist_order_items_dataset.csv", "order_items_bronze"),
        ("../data/olist_order_payments_dataset.csv", "payments_bronze"),
        ("../data/olist_order_reviews_dataset.csv", "reviews_bronze"),
        ("../data/olist_orders_dataset.csv", "orders_bronze"),
        ("../data/olist_products_dataset.csv", "products_bronze"),
        ("../data/olist_sellers_dataset.csv", "sellers_bronze"),
        ("../data/product_category_name_translation.csv", "product_category_name_translation_bronze"),
    ]
    for csv_file, bronze_table in csv_jobs:
        extract_func_csv(source_path_csv=csv_file, name_table=bronze_table, bronze_delta_path=bronze_dir)

    # SQLite extraction: (source table, bronze table name), both read from the same database file
    sqlite_jobs = [
        ("leads_qualified", "leads_qualified_bronze"),
        ("leads_closed", "leads_closed_bronze"),
    ]
    for source_table, bronze_table in sqlite_jobs:
        extract_func_sqlite(db_path="../data/olist.sqlite", sqlite_table=source_table, name_table=bronze_table, bronze_delta_path=bronze_dir)


2025-04-03 20:32:51,628 - INFO - Iniciando extração do arquivo ../data/olist_customers_dataset.csv
2025-04-03 20:32:52,139 - INFO - [32m[OK][0m Processo de extração do .CSV ../data/olist_customers_dataset.csv foi concluído.
2025-04-03 20:32:52,148 - INFO - Iniciando extração do arquivo ../data/olist_geolocation_dataset.csv
2025-04-03 20:32:52,807 - INFO - [32m[OK][0m Processo de extração do .CSV ../data/olist_geolocation_dataset.csv foi concluído.
2025-04-03 20:32:52,828 - INFO - Iniciando extração do arquivo ../data/olist_order_items_dataset.csv
2025-04-03 20:32:53,182 - INFO - [32m[OK][0m Processo de extração do .CSV ../data/olist_order_items_dataset.csv foi concluído.
2025-04-03 20:32:53,198 - INFO - Iniciando extração do arquivo ../data/olist_order_payments_dataset.csv
2025-04-03 20:32:53,367 - INFO - [32m[OK][0m Processo de extração do .CSV ../data/olist_order_payments_dataset.csv foi concluído.
2025-04-03 20:32:53,377 - INFO - Iniciando extração do arquivo ../data/olist_o

# Transform Pipelines

In [15]:
import pandas as pd
import duckdb

def pandas_sql(query: str) -> pd.DataFrame:
    """Run a SQL query on a throwaway DuckDB connection and return a DataFrame.

    Args:
        query: SQL text passed to ``conn.sql``.

    Returns:
        The query result fetched with ``fetchdf()``.

    Raises:
        Whatever DuckDB raises for the query; the connection is closed before
        the exception propagates.
    """
    conn = duckdb.connect()
    try:
        return conn.sql(query).fetchdf()
    finally:
        # Close even when the query fails, so a failing cell does not leak a connection.
        conn.close()

# Preview the first five rows of the bronze leads_closed Delta table.
pandas_sql("SELECT * FROM delta_scan('../delta_lake/bronze/leads_closed_bronze') LIMIT 5")

Unnamed: 0,mql_id,seller_id,sdr_id,sr_id,won_date,business_segment,lead_type,lead_behaviour_profile,has_company,has_gtin,average_stock,business_type,declared_product_catalog_size,declared_monthly_revenue
0,5420aad7fec3549a85876ba1c529bd84,2c43fb513632d29b3b58df74816f1b06,a8387c01a09e99ce014107505b92388c,4ef15afb4b2723d8f3d81e51ec7afefe,2018-02-26 19:58:54,pet,online_medium,cat,,,,reseller,,0.0
1,a555fb36b9368110ede0f043dfc3b9a0,bbb7d7893a450660432ea6652310ebb7,09285259593c61296eef10c734121d5b,d3d1e91a157ea7f90548eef82f1955e3,2018-05-08 20:17:59,car_accessories,industry,eagle,,,,reseller,,0.0
2,327174d3648a2d047e8940d7d15204ca,612170e34b97004b3ba37eae81836b4c,b90f87164b5f8c2cfa5c8572834dbe3f,6565aa9ce3178a5caf6171827af3a9ba,2018-06-05 17:27:23,home_appliances,online_big,cat,,,,reseller,,0.0
3,f5fee8f7da74f4887f5bcae2bafb6dd6,21e1781e36faf92725dde4730a88ca0f,56bf83c4bb35763a51c2baab501b4c67,d3d1e91a157ea7f90548eef82f1955e3,2018-01-17 13:51:03,food_drink,online_small,,,,,reseller,,0.0
4,ffe640179b554e295c167a2f6be528e0,ed8cb7b190ceb6067227478e48cf8dde,4b339f9567d060bcea4f5136b9f5949e,d3d1e91a157ea7f90548eef82f1955e3,2018-07-03 20:17:45,home_appliances,industry,wolf,,,,manufacturer,,0.0


In [16]:
# Show the column names and types DuckDB reports for the bronze leads_closed table.
pandas_sql("DESCRIBE SELECT * FROM delta_scan('../delta_lake/bronze/leads_closed_bronze')")
# pandas_sql("SELECT COUNT(*) FROM delta_scan('../delta_lake/bronze/leads_closed_bronze') WHERE declared_monthly_revenue >= 0.0 LIMIT 5")

Unnamed: 0,column_name,column_type,null,key,default,extra
0,mql_id,VARCHAR,YES,,,
1,seller_id,VARCHAR,YES,,,
2,sdr_id,VARCHAR,YES,,,
3,sr_id,VARCHAR,YES,,,
4,won_date,VARCHAR,YES,,,
5,business_segment,VARCHAR,YES,,,
6,lead_type,VARCHAR,YES,,,
7,lead_behaviour_profile,VARCHAR,YES,,,
8,has_company,BIGINT,YES,,,
9,has_gtin,BIGINT,YES,,,


In [17]:
import os
import logging
import duckdb
from deltalake import write_deltalake

# Logging configuration (same level and format as the extract cell)
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)

def transform_pipeline_sql(query: str, table_name: str, mode: str = "overwrite", silver_path: str = "../delta_lake/silver/"):
    """Run a SQL query with DuckDB and save the result as a Delta table in the silver layer.

    Args:
        query: SQL text executed on a fresh DuckDB connection.
        table_name: Name of the Delta table (sub-directory) written under ``silver_path``.
        mode: Write mode forwarded to ``write_deltalake``.
        silver_path: Directory that holds the silver Delta tables.

    Returns:
        None. Errors raised while connecting, querying or writing are logged
        and not re-raised, so one failed table does not stop later cells.
    """
    silver_path_delta = os.path.join(silver_path, table_name)
    os.makedirs(silver_path, exist_ok=True)  # create the silver directory if it does not exist

    try:
        # The context manager releases the connection. There is no extra
        # `finally` block: `conn` is unbound if connect() itself fails, and
        # touching it there would raise UnboundLocalError.
        with duckdb.connect() as conn:
            logger.info(f"Executando transformação na tabela '{table_name}'")
            df_transformed = conn.sql(query).arrow()
            write_deltalake(silver_path_delta, df_transformed, mode=mode)  # persist the result as a Delta table
            logger.info(f"\033[32m[OK]\033[0m Tabela '{table_name}' processada com sucesso.")

    except Exception as e:
        logger.error(f"\033[31m[ERROR]\033[0m Erro inesperado ao processar '{table_name}': {e}")

## Transform pipelines SQL

In [18]:
# Silver customers: zip prefix and city trimmed + lower-cased, state trimmed + upper-cased
transform_pipeline_sql(
    table_name="customers_silver",
    query="""
    SELECT
        customer_id,
        customer_unique_id,
        LOWER(TRIM(customer_zip_code_prefix)) AS customer_cep,
        LOWER(TRIM(customer_city)) AS customer_city,
        UPPER(TRIM(customer_state)) AS customer_state,
    FROM delta_scan('../delta_lake/bronze/customers_bronze')
    """,
)

2025-04-03 20:32:56,601 - INFO - Executando transformação na tabela 'customers_silver'
2025-04-03 20:32:56,789 - INFO - [32m[OK][0m Tabela 'customers_silver' processada com sucesso.


In [19]:
# Silver geolocation: latitude/longitude pass through unchanged (NULLs stay NULL);
# missing city/state become 'N/A' before trimming and case normalisation.
# COALESCE(col, NULL) was a no-op, so the coordinates are now selected directly.
transform_pipeline_sql(
    query="""
    SELECT
        geolocation_zip_code_prefix AS geolocation_cep,
        geolocation_lat,  -- Mantém NULL para evitar coordenadas erradas
        geolocation_lng,
        LOWER(TRIM(COALESCE(geolocation_city, 'N/A'))) AS geolocation_city,
        UPPER(TRIM(COALESCE(geolocation_state, 'N/A'))) AS geolocation_state
    FROM delta_scan('../delta_lake/bronze/geolocation_bronze');
    """,
    table_name="geolocation_silver"
)

2025-04-03 20:32:56,806 - INFO - Executando transformação na tabela 'geolocation_silver'
2025-04-03 20:32:56,999 - INFO - [32m[OK][0m Tabela 'geolocation_silver' processada com sucesso.


In [20]:
# Silver order_items: shipping limit cast to TIMESTAMP, price and freight cast to DOUBLE
transform_pipeline_sql(
    table_name="order_items_silver",
    query="""
    SELECT 
        order_id,
        order_item_id,
        product_id,
        seller_id,
        CAST(shipping_limit_date AS TIMESTAMP) AS shipping_limit_date,
        CAST(price AS DOUBLE) AS price,
        CAST(freight_value AS DOUBLE) AS freight_value
    FROM delta_scan('../delta_lake/bronze/order_items_bronze')
    """,
)

2025-04-03 20:32:57,023 - INFO - Executando transformação na tabela 'order_items_silver'
2025-04-03 20:32:57,201 - INFO - [32m[OK][0m Tabela 'order_items_silver' processada com sucesso.


In [21]:
# Silver payments: missing payment_type becomes 'N/A' (then trimmed + lower-cased),
# missing installments / value become 0
transform_pipeline_sql(
    table_name="payments_silver",
    query="""
SELECT 
    order_id,
    payment_sequential,
    LOWER(TRIM(COALESCE(payment_type, 'N/A'))) AS payment_type, 
    COALESCE(payment_installments, 0) AS payment_installments,
    COALESCE(payment_value, 0) AS payment_value
FROM delta_scan('../delta_lake/bronze/payments_bronze');
""",
)

2025-04-03 20:32:57,220 - INFO - Executando transformação na tabela 'payments_silver'
2025-04-03 20:32:57,345 - INFO - [32m[OK][0m Tabela 'payments_silver' processada com sucesso.


In [22]:
# Silver reviews: score cast to INT, missing title/message replaced by placeholders,
# creation and answer dates cast to TIMESTAMP
transform_pipeline_sql(
    table_name="reviews_silver",
    query="""
SELECT 
    review_id,
    order_id,
    CAST(review_score AS INT) AS review_score,
    COALESCE(review_comment_title, 'no_title') AS review_comment_title,
    COALESCE(review_comment_message, 'no_message') AS review_comment_message,
    CAST(review_creation_date AS TIMESTAMP) AS review_creation_date,
    CAST(review_answer_timestamp AS TIMESTAMP) AS review_answer_timestamp
FROM delta_scan('../delta_lake/bronze/reviews_bronze');
""",
)

2025-04-03 20:32:57,365 - INFO - Executando transformação na tabela 'reviews_silver'
2025-04-03 20:32:57,571 - INFO - [32m[OK][0m Tabela 'reviews_silver' processada com sucesso.


In [23]:
# Silver orders (DELIVERED ONLY): typed columns, restricted to order_status = 'delivered'
transform_pipeline_sql(
    table_name="orders_only_delivered_silver",
    query="""
    SELECT
        CAST(order_id AS VARCHAR) AS order_id,
        CAST(customer_id AS VARCHAR) AS customer_id,
        CAST(order_status AS VARCHAR) AS order_status,
        CAST(order_purchase_timestamp AS TIMESTAMP) AS order_purchase_timestamp,
        CAST(order_approved_at AS TIMESTAMP) AS order_approved_at,
        CAST(order_delivered_carrier_date AS TIMESTAMP) AS order_delivered_carrier_date,
        CAST(order_delivered_customer_date AS TIMESTAMP) AS order_delivered_customer_date,
        CAST(order_estimated_delivery_date AS TIMESTAMP) AS order_estimated_delivery_date
    FROM delta_scan('../delta_lake/bronze/orders_bronze')
    WHERE order_status = 'delivered'
    """,
)

2025-04-03 20:32:57,589 - INFO - Executando transformação na tabela 'orders_only_delivered_silver'
2025-04-03 20:32:57,799 - INFO - [32m[OK][0m Tabela 'orders_only_delivered_silver' processada com sucesso.


In [24]:
# Silver orders (ALL ROWS): same typed columns as the delivered-only table, without the status filter
transform_pipeline_sql(
    table_name="orders_full_data_silver",
    query="""
    SELECT
        CAST(order_id AS VARCHAR) AS order_id,
        CAST(customer_id AS VARCHAR) AS customer_id,
        CAST(order_status AS VARCHAR) AS order_status,
        CAST(order_purchase_timestamp AS TIMESTAMP) AS order_purchase_timestamp,
        CAST(order_approved_at AS TIMESTAMP) AS order_approved_at,
        CAST(order_delivered_carrier_date AS TIMESTAMP) AS order_delivered_carrier_date,
        CAST(order_delivered_customer_date AS TIMESTAMP) AS order_delivered_customer_date,
        CAST(order_estimated_delivery_date AS TIMESTAMP) AS order_estimated_delivery_date
    FROM delta_scan('../delta_lake/bronze/orders_bronze')
    """,
)

2025-04-03 20:32:57,818 - INFO - Executando transformação na tabela 'orders_full_data_silver'
2025-04-03 20:32:58,028 - INFO - [32m[OK][0m Tabela 'orders_full_data_silver' processada com sucesso.


In [25]:
# Silver products: missing category becomes 'unknown', numeric attributes are cast to INT
# with NULL replaced by 0, and the misspelled *_lenght source columns are renamed to *_length
transform_pipeline_sql(
    table_name="products_silver",
    query="""
    SELECT
        CAST(product_id AS VARCHAR) AS product_id,
        COALESCE(LOWER(TRIM(product_category_name)), 'unknown') AS product_category,
        COALESCE(CAST(product_name_lenght AS INT), 0) AS product_name_length,
        COALESCE(CAST(product_description_lenght AS INT), 0) AS product_description_length,
        COALESCE(CAST(product_photos_qty AS INT), 0) AS product_photos_qty,
        COALESCE(CAST(product_weight_g AS INT), 0) AS product_weight_g,
        COALESCE(CAST(product_length_cm AS INT), 0) AS product_length_cm,
        COALESCE(CAST(product_height_cm AS INT), 0) AS product_height_cm,
        COALESCE(CAST(product_width_cm AS INT), 0) AS product_width_cm
    FROM delta_scan('../delta_lake/bronze/products_bronze');
    """,
)

2025-04-03 20:32:58,046 - INFO - Executando transformação na tabela 'products_silver'
2025-04-03 20:32:58,151 - INFO - [32m[OK][0m Tabela 'products_silver' processada com sucesso.


In [26]:
# Silver sellers: zip prefix renamed to seller_cep, city trimmed + lower-cased, state trimmed + upper-cased
transform_pipeline_sql(
    table_name="sellers_silver",
    query="""
    SELECT
        seller_id,
        seller_zip_code_prefix AS seller_cep,
        LOWER(TRIM(seller_city)) AS seller_city,
        UPPER(TRIM(seller_state)) AS seller_state
    FROM delta_scan('../delta_lake/bronze/sellers_bronze')
    """,
)

2025-04-03 20:32:58,168 - INFO - Executando transformação na tabela 'sellers_silver'
2025-04-03 20:32:58,228 - INFO - [32m[OK][0m Tabela 'sellers_silver' processada com sucesso.


In [27]:
# Silver leads_closed: won_date cast to TIMESTAMP, text categories trimmed + lower-cased,
# missing flags / sizes / revenue replaced by 0, empty or missing average_stock replaced by 'N/A'
transform_pipeline_sql(
    table_name="leads_closed_silver",
    query="""
    SELECT 
        mql_id,
        seller_id,
        sdr_id,
        sr_id,
        CAST(won_date AS TIMESTAMP) AS won_date,
        LOWER(TRIM(business_segment)) AS business_segment,
        LOWER(TRIM(lead_type)) AS lead_type,
        LOWER(TRIM(lead_behaviour_profile)) AS lead_behaviour_profile,
        COALESCE(has_company, 0) AS has_company,
        COALESCE(has_gtin, 0) AS has_gtin,
        COALESCE(NULLIF(average_stock, ''), 'N/A') AS average_stock,
        LOWER(TRIM(business_type)) AS business_type,
        COALESCE(declared_product_catalog_size, 0.0) AS declared_product_catalog_size,
        COALESCE(declared_monthly_revenue, 0.0) AS declared_monthly_revenue
    FROM delta_scan('../delta_lake/bronze/leads_closed_bronze')
    """,
)

2025-04-03 20:32:58,244 - INFO - Executando transformação na tabela 'leads_closed_silver'
2025-04-03 20:32:58,327 - INFO - [32m[OK][0m Tabela 'leads_closed_silver' processada com sucesso.


## Leads Closed table

In [28]:
# Preview three raw rows of the bronze leads_closed table.
pandas_sql("""
    SELECT *
    FROM delta_scan('../delta_lake/bronze/leads_closed_bronze')
    LIMIT 3
    """)

Unnamed: 0,mql_id,seller_id,sdr_id,sr_id,won_date,business_segment,lead_type,lead_behaviour_profile,has_company,has_gtin,average_stock,business_type,declared_product_catalog_size,declared_monthly_revenue
0,5420aad7fec3549a85876ba1c529bd84,2c43fb513632d29b3b58df74816f1b06,a8387c01a09e99ce014107505b92388c,4ef15afb4b2723d8f3d81e51ec7afefe,2018-02-26 19:58:54,pet,online_medium,cat,,,,reseller,,0.0
1,a555fb36b9368110ede0f043dfc3b9a0,bbb7d7893a450660432ea6652310ebb7,09285259593c61296eef10c734121d5b,d3d1e91a157ea7f90548eef82f1955e3,2018-05-08 20:17:59,car_accessories,industry,eagle,,,,reseller,,0.0
2,327174d3648a2d047e8940d7d15204ca,612170e34b97004b3ba37eae81836b4c,b90f87164b5f8c2cfa5c8572834dbe3f,6565aa9ce3178a5caf6171827af3a9ba,2018-06-05 17:27:23,home_appliances,online_big,cat,,,,reseller,,0.0


In [29]:
# inspect the categories of business_segment
# pandas_sql("SELECT DISTINCT business_segment FROM delta_scan('../delta_lake/bronze/leads_closed_bronze')")

# inspect the categories of lead_behaviour_profile
# pandas_sql("SELECT DISTINCT lead_behaviour_profile FROM delta_scan('../delta_lake/bronze/leads_closed_bronze')")

# inspect rows whose lead_behaviour_profile is exactly the combined value 'eagle, wolf'
pandas_sql("SELECT * FROM delta_scan('../delta_lake/bronze/leads_closed_bronze') WHERE lead_behaviour_profile == 'eagle, wolf' LIMIT 3")

Unnamed: 0,mql_id,seller_id,sdr_id,sr_id,won_date,business_segment,lead_type,lead_behaviour_profile,has_company,has_gtin,average_stock,business_type,declared_product_catalog_size,declared_monthly_revenue
0,1e2a84cd1c6fcd7b7a07f49f1cf7f6cc,f4009c6b30765309a251c26bc458e0e5,e7dff61b78bebffa71678e126ce669ad,9ae085775a198122c5586fa830ff7f2b,2018-08-07 19:47:27,stationery,industry,"eagle, wolf",1.0,1.0,200+,reseller,,0.0
1,bf3e0df27b04abb4c107ab4df9955b29,b1116e9a35b2fa91eb4a0af8e73cdc3a,9d12ef1a7eca3ec58c545c678af7869c,d3d1e91a157ea7f90548eef82f1955e3,2018-06-14 18:51:26,small_appliances,industry,"eagle, wolf",,,,manufacturer,,0.0
2,d5cb61eef98bd237e41bc2225263f823,7e1f0755f1c75e301dfa37c21fd01efe,e4a6222cdb5b34375400904f03d8e6a5,34d40cdaf94010a1d05b0d6212f9e909,2018-11-12 19:17:25,other,other,"eagle, wolf",1.0,1.0,5-20,other,305.0,120000.0


In [30]:
# List the distinct lead_behaviour_profile values present in bronze leads_closed.
pandas_sql("SELECT DISTINCT lead_behaviour_profile FROM delta_scan('../delta_lake/bronze/leads_closed_bronze')")

Unnamed: 0,lead_behaviour_profile
0,"shark, wolf"
1,"eagle, wolf"
2,wolf
3,"eagle, cat"
4,eagle
5,shark
6,"shark, cat"
7,
8,cat
9,"cat, wolf"


In [31]:
# Re-check the column types DuckDB reports for the bronze leads_closed table.
pandas_sql("DESCRIBE SELECT * FROM delta_scan('../delta_lake/bronze/leads_closed_bronze')")

Unnamed: 0,column_name,column_type,null,key,default,extra
0,mql_id,VARCHAR,YES,,,
1,seller_id,VARCHAR,YES,,,
2,sdr_id,VARCHAR,YES,,,
3,sr_id,VARCHAR,YES,,,
4,won_date,VARCHAR,YES,,,
5,business_segment,VARCHAR,YES,,,
6,lead_type,VARCHAR,YES,,,
7,lead_behaviour_profile,VARCHAR,YES,,,
8,has_company,BIGINT,YES,,,
9,has_gtin,BIGINT,YES,,,


In [32]:
# Preview three rows of the silver leads_closed table to check the transformation result.
pandas_sql("SELECT * FROM delta_scan('../delta_lake/silver/leads_closed_silver') LIMIT 3")

Unnamed: 0,mql_id,seller_id,sdr_id,sr_id,won_date,business_segment,lead_type,lead_behaviour_profile,has_company,has_gtin,average_stock,business_type,declared_product_catalog_size,declared_monthly_revenue
0,5420aad7fec3549a85876ba1c529bd84,2c43fb513632d29b3b58df74816f1b06,a8387c01a09e99ce014107505b92388c,4ef15afb4b2723d8f3d81e51ec7afefe,2018-02-26 19:58:54,pet,online_medium,cat,0,0,,reseller,0.0,0.0
1,a555fb36b9368110ede0f043dfc3b9a0,bbb7d7893a450660432ea6652310ebb7,09285259593c61296eef10c734121d5b,d3d1e91a157ea7f90548eef82f1955e3,2018-05-08 20:17:59,car_accessories,industry,eagle,0,0,,reseller,0.0,0.0
2,327174d3648a2d047e8940d7d15204ca,612170e34b97004b3ba37eae81836b4c,b90f87164b5f8c2cfa5c8572834dbe3f,6565aa9ce3178a5caf6171827af3a9ba,2018-06-05 17:27:23,home_appliances,online_big,cat,0,0,,reseller,0.0,0.0


## Leads Qualified Table

In [33]:
# Preview the first five rows of the bronze leads_qualified table.
pandas_sql("SELECT * FROM delta_scan('../delta_lake/bronze/leads_qualified_bronze') LIMIT 5")
# pandas_sql("SELECT COUNT(*) FROM delta_scan('../delta_lake/bronze/leads_qualified_bronze')")

Unnamed: 0,mql_id,first_contact_date,landing_page_id,origin
0,dac32acd4db4c29c230538b72f8dd87d,2018-02-01,88740e65d5d6b056e0cda098e1ea6313,social
1,8c18d1de7f67e60dbd64e3c07d7e9d5d,2017-10-20,007f9098284a86ee80ddeb25d53e0af8,paid_search
2,b4bc852d233dfefc5131f593b538befa,2018-03-22,a7982125ff7aa3b2054c6e44f9d28522,organic_search
3,6be030b81c75970747525b843c1ef4f8,2018-01-22,d45d558f0daeecf3cccdffe3c59684aa,email
4,5420aad7fec3549a85876ba1c529bd84,2018-02-21,b48ec5f3b04e9068441002a19df93c6c,organic_search


In [34]:
# List the distinct origin values present in bronze leads_qualified.
pandas_sql("SELECT DISTINCT origin FROM delta_scan('../delta_lake/bronze/leads_qualified_bronze')")

Unnamed: 0,origin
0,organic_search
1,
2,paid_search
3,social
4,display
5,direct_traffic
6,other_publicities
7,other
8,email
9,unknown


In [35]:
# Prototype of the leads_qualified transformation: define it as a view and preview ten rows.
# The view is created on the connection that pandas_sql opens and closes for this call,
# so it is not available to later cells.
pandas_sql("""
CREATE OR REPLACE VIEW leads_qualified AS 
SELECT
    mql_id,
    landing_page_id,
    CAST(first_contact_date AS TIMESTAMP) AS first_contact_date,
    LOWER(TRIM(COALESCE(origin, 'N/A'))) AS came_from
FROM delta_scan('../delta_lake/bronze/leads_qualified_bronze');
SELECT * FROM leads_qualified LIMIT 10;
""")

Unnamed: 0,mql_id,landing_page_id,first_contact_date,came_from
0,dac32acd4db4c29c230538b72f8dd87d,88740e65d5d6b056e0cda098e1ea6313,2018-02-01,social
1,8c18d1de7f67e60dbd64e3c07d7e9d5d,007f9098284a86ee80ddeb25d53e0af8,2017-10-20,paid_search
2,b4bc852d233dfefc5131f593b538befa,a7982125ff7aa3b2054c6e44f9d28522,2018-03-22,organic_search
3,6be030b81c75970747525b843c1ef4f8,d45d558f0daeecf3cccdffe3c59684aa,2018-01-22,email
4,5420aad7fec3549a85876ba1c529bd84,b48ec5f3b04e9068441002a19df93c6c,2018-02-21,organic_search
5,28bdfd5f057764b54c38770f95c69f2f,22c29808c4f815213303f8933030604c,2018-01-14,organic_search
6,126a0d10becbaafcb2e72ce6848cf32c,6a110e795dd487f1cf8d7583671987af,2018-05-15,email
7,f76136f54d14a3345951f25b7932366b,d51b0d02f063ba1d053db6d97226eec3,2018-05-24,email
8,2f838cade4a6012a6cb1016d1d8d95ed,aeac92c0f5ae22a04ed3b746cce3a1b6,2017-11-10,organic_search
9,7281942387a1a0c3f72a50a8b0bb0920,88740e65d5d6b056e0cda098e1ea6313,2017-12-25,social


In [36]:
# Silver leads_qualified: first_contact_date cast to TIMESTAMP; origin renamed to came_from,
# with missing values replaced by 'N/A' before trimming and lower-casing
transform_pipeline_sql(
    table_name="leads_qualified_silver",
    query="""
    SELECT
        mql_id,
        landing_page_id,
        CAST(first_contact_date AS TIMESTAMP) AS first_contact_date,
        LOWER(TRIM(COALESCE(origin, 'N/A'))) AS came_from
    FROM delta_scan('../delta_lake/bronze/leads_qualified_bronze');
""",
)

2025-04-03 20:32:58,974 - INFO - Executando transformação na tabela 'leads_qualified_silver'
2025-04-03 20:32:59,042 - INFO - [32m[OK][0m Tabela 'leads_qualified_silver' processada com sucesso.


## Silver Layer Transformations for Special Tables

In this case, new tables need to be created within the Silver layer:

1. A customers table enriched with geolocation data.
2. A leads table that combines the qualified leads with the closed leads (leads_closed), producing a new hybrid table with cross-referenced data.

### Leads closed with Leads qualified

In [37]:
# aggregated_leads table: every qualified lead LEFT JOINed with its closed deal, if any.
# won_date is cast to TIMESTAMP so it matches first_contact_date here and won_date in
# leads_closed_silver (the bronze column is VARCHAR).
# NOTE(review): if aggregated_leads_silver already exists from an earlier run with won_date
# stored as text, the overwrite may be rejected as a schema mismatch — remove the old table
# directory before re-running (TODO confirm against the installed deltalake version).
transform_pipeline_sql(query="""
    SELECT
        leads_qualified.mql_id,
        CAST(leads_qualified.first_contact_date AS TIMESTAMP) AS first_contact_date,
        LOWER(TRIM(COALESCE(leads_qualified.origin, 'N/A'))) AS came_from,
        CAST(leads_closed.won_date AS TIMESTAMP) AS won_date,
        LOWER(TRIM(COALESCE(leads_closed.business_segment, 'N/A'))) AS business_segment,
        LOWER(TRIM(COALESCE(leads_closed.lead_type, 'N/A'))) AS lead_type,
        -- calculo de tempo para conversão dos leads (em dias)
        DATEDIFF(
            'day',
            leads_qualified.first_contact_date::TIMESTAMP,
            leads_closed.won_date::TIMESTAMP
        ) AS days_to_convert
    FROM delta_scan('../delta_lake/bronze/leads_qualified_bronze') AS leads_qualified
    LEFT JOIN delta_scan('../delta_lake/bronze/leads_closed_bronze') AS leads_closed
        ON leads_qualified.mql_id = leads_closed.mql_id;
""", table_name="aggregated_leads_silver")

2025-04-03 20:32:59,057 - INFO - Executando transformação na tabela 'aggregated_leads_silver'
2025-04-03 20:32:59,161 - INFO - [32m[OK][0m Tabela 'aggregated_leads_silver' processada com sucesso.


In [38]:
# Preview ten rows of the silver aggregated_leads table.
pandas_sql("SELECT * FROM delta_scan('../delta_lake/silver/aggregated_leads_silver') LIMIT 10;")
# Leads that never converted have no won_date; test with IS NULL (a comparison using `= NULL` never matches):
# pandas_sql("SELECT * FROM delta_scan('../delta_lake/silver/aggregated_leads_silver') WHERE won_date IS NULL LIMIT 10;")

Unnamed: 0,mql_id,first_contact_date,came_from,won_date,business_segment,lead_type,days_to_convert
0,5420aad7fec3549a85876ba1c529bd84,2018-02-21,organic_search,2018-02-26 19:58:54,pet,online_medium,5
1,a555fb36b9368110ede0f043dfc3b9a0,2018-04-04,referral,2018-05-08 20:17:59,car_accessories,industry,34
2,327174d3648a2d047e8940d7d15204ca,2018-04-03,organic_search,2018-06-05 17:27:23,home_appliances,online_big,63
3,f5fee8f7da74f4887f5bcae2bafb6dd6,2018-01-14,paid_search,2018-01-17 13:51:03,food_drink,online_small,3
4,ffe640179b554e295c167a2f6be528e0,2017-10-09,unknown,2018-07-03 20:17:45,home_appliances,industry,267
5,b94fba7670eeb44dce2a0d8eb790e9f5,2018-02-06,organic_search,2018-02-07 18:04:05,health_beauty,online_medium,1
6,c3e30ed7ac989117c7e1e719b4ac128f,2018-02-20,direct_traffic,2018-04-16 18:18:22,computers,online_medium,55
7,b02c89251106e1fdd9d92744be9f94f2,2018-04-13,unknown,2018-04-17 17:01:57,health_beauty,offline,4
8,a90a37898cc5f2718385a2fb981caaff,2018-04-27,social,2018-05-14 18:37:15,household_utilities,offline,17
9,0173e8d8b1d94a355b440fb67388f532,2017-10-18,paid_search,2018-04-24 03:00:00,household_utilities,online_medium,188


### Customers Table with Geolocation

In [39]:
# Preview five rows of the silver customers table.
pandas_sql("SELECT * FROM delta_scan('../delta_lake/silver/customers_silver') LIMIT 5")

Unnamed: 0,customer_id,customer_unique_id,customer_cep,customer_city,customer_state
0,06b8999e2fba1a1fbc88172c00ba8bc7,861eff4711a542e4b93843c6dd7febb0,14409,franca,SP
1,18955e83d337fd6b2def6b18a428ac77,290c77bc529b7ac935b93aa66c333dc3,9790,sao bernardo do campo,SP
2,4e7b3e00288586ebd08712fdd0374a03,060e732b5b29e8181a18229c7b0b2b5e,1151,sao paulo,SP
3,b2b6027bc5c5109e529d4dc6358b12c3,259dac757896d24d7702b9acbbff3f3c,8775,mogi das cruzes,SP
4,4f2d8ab171c80ec8364f7c12e35b23ad,345ecd01c38d18a9036ed96c73b8d066,13056,campinas,SP


In [40]:
# Preview five rows of the silver geolocation table.
pandas_sql("SELECT * FROM delta_scan('../delta_lake/silver/geolocation_silver') LIMIT 5")
# pandas_sql("SELECT * FROM delta_scan('../delta_lake/bronze/geolocation_bronze') LIMIT 5")

Unnamed: 0,geolocation_cep,geolocation_lat,geolocation_lng,geolocation_city,geolocation_state
0,1037,-23.545621,-46.639292,sao paulo,SP
1,1046,-23.546081,-46.64482,sao paulo,SP
2,1046,-23.546129,-46.642951,sao paulo,SP
3,1041,-23.544392,-46.639499,sao paulo,SP
4,1035,-23.541578,-46.641607,sao paulo,SP


In [41]:
# Row count of silver geolocation, for comparison with the customers row count before joining.
pandas_sql("SELECT COUNT(*) FROM delta_scan('../delta_lake/silver/geolocation_silver')")

Unnamed: 0,count_star()
0,1000163


In [42]:
# Row count of silver customers.
pandas_sql("SELECT COUNT(*) FROM delta_scan('../delta_lake/silver/customers_silver')")

Unnamed: 0,count_star()
0,99441


In [43]:
# aggregated_customers table: one row per customer with the average latitude/longitude of all
# geolocation rows sharing the customer's zip prefix (LEFT JOIN keeps customers without a match)
transform_pipeline_sql(
    table_name="aggregated_customers",
    query="""
    SELECT
        c.customer_id,
        c.customer_unique_id,
        c.customer_cep,
        c.customer_city,
        c.customer_state,
        AVG(g.geolocation_lat) AS avg_latitude,
        AVG(g.geolocation_lng) AS avg_longitude
    FROM delta_scan('../delta_lake/silver/customers_silver') AS c
    LEFT JOIN delta_scan('../delta_lake/silver/geolocation_silver') AS g
    ON c.customer_cep = g.geolocation_cep
    GROUP BY
        c.customer_id,
        c.customer_unique_id,
        c.customer_cep,
        c.customer_city,
        c.customer_state;
""",
)

2025-04-03 20:32:59,535 - INFO - Executando transformação na tabela 'aggregated_customers'
2025-04-03 20:32:59,959 - INFO - [32m[OK][0m Tabela 'aggregated_customers' processada com sucesso.


In [44]:
# Spot-check the aggregated customers table for a single city (petrolina).
pandas_sql("SELECT * FROM delta_scan('../delta_lake/silver/aggregated_customers') WHERE customer_city = 'petrolina' LIMIT 10")

Unnamed: 0,customer_id,customer_unique_id,customer_cep,customer_city,customer_state,avg_latitude,avg_longitude
0,799d813f892834ecc22b05e8dea120dc,ed5a19597135c123f7c58790f3293231,56310,petrolina,PE,-9.381291,-40.532324
1,68605f0c6c6811b45cf553519196dd25,4ceb1c0f041a331209154a49b7c2c940,56308,petrolina,PE,-9.394953,-40.513675
2,3816366333bcbe1daabd284b6c4daacc,38ff5c0eeae7cc7b8facb23461e4f7ed,56314,petrolina,PE,-9.369441,-40.536162
3,0e8be41ec287f1d9753fa200dd8741c4,da2edba0f99595361e7dfd281b1d8b11,56308,petrolina,PE,-9.394953,-40.513675
4,c7fb0da4d519d2827af01bf2aece912c,da818bd327280a0da330a412bb7ef1a4,56310,petrolina,PE,-9.381291,-40.532324
5,fc2a9026566306914692682fc68e346a,cdafe6347e24e22b011b587126de0ef7,56330,petrolina,PE,-9.383556,-40.495644
6,84bff9902277d3155c02447484e34dbb,c3916df18d7c45ecc6abda23a7d2b272,56310,petrolina,PE,-9.381291,-40.532324
7,587cd029767db6468eb78ef30fbfef36,9ac6444ace455b3120db45d86abfecb8,56328,petrolina,PE,-9.388985,-40.488999
8,5df03f4f656f9660622fec4e725a599a,51966fd6328b1e4c5fed16ed597ad96d,56320,petrolina,PE,-9.36895,-40.491002
9,f0833bf12c315d3c01f93d334d6846e1,84c3b9f27b3658e76701ebf57851605a,56304,petrolina,PE,-9.393612,-40.50075
