# SILVER — Limpieza estricta y conformado
**Objetivo:** Consumir BRONZE y construir `silver.iowa_clean_v2_strict` aplicando reglas de calidad.

**Entradas**  
- Tabla `bronze.iowa_raw_str` (archivos validados de Bronze/Process)

**Salidas**  
- Tabla Delta: `silver.iowa_clean_v2_strict`

**Reglas clave**  
- `sale_date` válido  
- `pack` en [1..48], `bottle_volume_ml` entre [50..2000] y sin valores “desplazados” típicos  
- `sale_bottles` entero y 1..500  
- `state_bottle_cost` y `state_bottle_retail` > 0 y retail ≥ cost

**Parámetros**  
- `catalog`, `schema=silver`, `bronze_schema`, `silver_schema`, `gold_schema`


In [0]:
# --- Standard widgets (notebook-wide parameters) ---
# (widget name, default value) pairs, registered in the order listed.
for _widget_name, _widget_default in (
    ("catalog", "ct_andresolguin_finalproject"),
    ("schema", "silver"),
    ("bronze_schema", "bronze"),
    ("silver_schema", "silver"),
    ("gold_schema", "gold"),
):
    dbutils.widgets.text(_widget_name, _widget_default)

# Read the current widget values back into notebook variables.
catalog = dbutils.widgets.get("catalog")
schema = dbutils.widgets.get("schema")
bronze = dbutils.widgets.get("bronze_schema")
silver = dbutils.widgets.get("silver_schema")
gold = dbutils.widgets.get("gold_schema")

# Pin the execution context: catalog first, then the working schema.
for _use_stmt in (f"USE CATALOG {catalog}", f"USE SCHEMA {schema}"):
    spark.sql(_use_stmt)

print(f"Contexto activo: {catalog}.{schema}  |  bronze={bronze}, silver={silver}, gold={gold}")


Contexto activo: ct_andresolguin_finalproject.silver  |  bronze=bronze, silver=silver, gold=gold


In [0]:
%sql
-- Total row count of iowa_sales_clean, resolved through the session catalog/schema set below.
USE CATALOG ct_andresolguin_finalproject;
USE SCHEMA silver;

SELECT
  COUNT(*) AS filas_totales
FROM
  iowa_sales_clean;


filas_totales
45897000


In [0]:
%sql
-- Lists the Delta transaction history of iowa_sales_clean (one row per table version),
-- which shows when the table was rewritten and how many rows each write produced.
USE CATALOG ct_andresolguin_finalproject;
USE SCHEMA silver;

DESCRIBE HISTORY iowa_sales_clean;


version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
2,2025-10-21T22:50:57.000Z,70484384416134,andres.olguin@clear-tech.com,CREATE OR REPLACE TABLE AS SELECT,"Map(partitionBy -> [""year"",""month""], clusterBy -> [], description -> null, isManaged -> true, properties -> {""delta.enableDeletionVectors"":""true""}, statsOnLoad -> true)","List(268698224484188, null, 198623758504693, null, null, null)",List(2853194764169833),1021-224856-d7b1xxm6-v2n,1.0,WriteSerializable,False,"Map(numFiles -> 8, numRemovedFiles -> 3, numRemovedBytes -> 518475899, numDeletionVectorsRemoved -> 0, numOutputRows -> 45897000, numOutputBytes -> 1554575723)",,Databricks-Runtime/17.2.x-aarch64-photon-scala2.13
1,2025-10-21T15:57:28.000Z,70484384416134,andres.olguin@clear-tech.com,CREATE OR REPLACE TABLE AS SELECT,"Map(partitionBy -> [""year"",""month""], clusterBy -> [], description -> null, isManaged -> true, properties -> {""delta.enableDeletionVectors"":""true""}, statsOnLoad -> true)",,List(2853194764169833),1021-155615-dpigmdbs-v2n,0.0,WriteSerializable,False,"Map(numFiles -> 3, numRemovedFiles -> 3, numRemovedBytes -> 518475899, numDeletionVectorsRemoved -> 0, numOutputRows -> 15299000, numOutputBytes -> 518475899)",,Databricks-Runtime/17.2.x-aarch64-photon-scala2.13
0,2025-10-20T18:53:18.000Z,70484384416134,andres.olguin@clear-tech.com,CREATE OR REPLACE TABLE AS SELECT,"Map(partitionBy -> [""year"",""month""], clusterBy -> [], description -> null, isManaged -> true, properties -> {""delta.enableDeletionVectors"":""true""}, statsOnLoad -> true)",,List(1375037370621803),1020-183545-t1mv2wgq-v2n,,WriteSerializable,False,"Map(numFiles -> 3, numRemovedFiles -> 0, numRemovedBytes -> 0, numDeletionVectorsRemoved -> 0, numOutputRows -> 15299000, numOutputBytes -> 518475899)",,Databricks-Runtime/17.2.x-aarch64-photon-scala2.13


In [0]:
# Parameters (widgets) for the SILVER stage
from datetime import datetime


def w(name, default):
    """Register a text widget on a best-effort basis.

    Args:
        name: Widget name.
        default: Default value shown in the widget.

    A failure to register is reported and then ignored, so the cell keeps
    running; the subsequent ``dbutils.widgets.get`` calls will surface a
    genuinely missing widget. Only ``Exception`` subclasses are caught, so
    KeyboardInterrupt / SystemExit still propagate.
    """
    try:
        dbutils.widgets.text(name, default)
    except Exception as exc:
        print(f"[SILVER] widget '{name}' could not be registered: {exc}")


w("catalog", "ct_andresolguin_finalproject")
w("schema_silver", "silver")
w("process_date", "20251018")  # YYYYMMDD

catalog      = dbutils.widgets.get("catalog")
schema_silver= dbutils.widgets.get("schema_silver")
process_date = dbutils.widgets.get("process_date").strip()

# Fail fast on a malformed date instead of slicing garbage into yyyy / mm.
if len(process_date) != 8 or not (process_date.isascii() and process_date.isdigit()):
    raise ValueError(f"process_date must be YYYYMMDD (8 digits), got {process_date!r}")
try:
    datetime.strptime(process_date, "%Y%m%d")
except ValueError as exc:
    raise ValueError(f"process_date is not a valid calendar date: {process_date!r}") from exc

# Derived values
yyyy, mm = process_date[:4], process_date[4:6]

# Context
spark.sql(f"USE CATALOG {catalog}")

print(f"[SILVER] catalog={catalog} | schema={schema_silver} | process_date={process_date} (yyyy={yyyy}, mm={mm})")


[SILVER] catalog=ct_andresolguin_finalproject | schema=silver | process_date=20251018 (yyyy=2025, mm=10)


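## Paso 1 — Definición de tabla destino (CREATE OR REPLACE TABLE ... AS SELECT)
- Reconstruye (reemplaza por completo) la tabla Delta `silver.iowa_sales_clean` a partir de `bronze.iowa_raw_str`.  
- Define los tipos mediante `TRY_CAST` y particiona por `year`, `month`.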


In [0]:
%sql
-- Rebuilds silver.iowa_sales_clean from bronze.iowa_raw_str as a Delta table
-- partitioned by (year, month). CREATE OR REPLACE overwrites the whole table,
-- so after this cell it contains only the rows selected by the WHERE clause below.
-- Every typed column goes through TRY_CAST, so an unparseable value becomes NULL
-- instead of failing the statement.
USE CATALOG ct_andresolguin_finalproject;
CREATE SCHEMA IF NOT EXISTS silver;

CREATE OR REPLACE TABLE ct_andresolguin_finalproject.silver.iowa_sales_clean
USING DELTA
PARTITIONED BY (year, month)
AS
SELECT
  invoice_line_no,
  -- sale_date: the first pattern that parses wins; NULL when none of them matches.
  COALESCE(
    TRY_TO_DATE(date, 'yyyy-MM-dd'),
    TRY_TO_DATE(date, 'MM/dd/yyyy'),
    CAST(TRY_TO_TIMESTAMP(date, "yyyy-MM-dd'T'HH:mm:ss.SSS") AS DATE),
    CAST(TRY_TO_TIMESTAMP(date) AS DATE),
    TRY_TO_DATE(SUBSTR(date, 1, 10), 'yyyy-MM-dd')
  ) AS sale_date,

  -- Identifier-like fields: trim, turn empty strings into NULL, then cast to INT.
  TRY_CAST(NULLIF(TRIM(store), '') AS INT)              AS store_id,
  name                                                  AS store_name,
  address,
  city,
  zipcode,
  TRY_CAST(NULLIF(TRIM(county_number), '') AS INT)      AS county_number,
  county,

  TRY_CAST(NULLIF(TRIM(category), '') AS INT)           AS category_id,
  category_name,

  TRY_CAST(NULLIF(TRIM(vendor_no), '') AS INT)          AS vendor_no,
  vendor_name,

  TRY_CAST(NULLIF(TRIM(itemno), '') AS INT)             AS item_no,
  im_desc                                               AS item_desc,

  TRY_CAST(NULLIF(TRIM(pack), '') AS INT)               AS pack,
  TRY_CAST(NULLIF(TRIM(bottle_volume_ml), '') AS INT)   AS bottle_volume_ml,

  -- Amounts: strip '$' and ',' before casting to a fixed-point DECIMAL.
  TRY_CAST(NULLIF(REGEXP_REPLACE(state_bottle_cost,   '[$,]', ''), '') AS DECIMAL(12,2)) AS state_bottle_cost,
  TRY_CAST(NULLIF(REGEXP_REPLACE(state_bottle_retail, '[$,]', ''), '') AS DECIMAL(12,2)) AS state_bottle_retail,

  TRY_CAST(NULLIF(TRIM(sale_bottles), '') AS INT)       AS sale_bottles,
  TRY_CAST(NULLIF(REGEXP_REPLACE(sale_dollars, '[$,]', ''), '') AS DECIMAL(14,2)) AS sale_dollars,
  TRY_CAST(NULLIF(REGEXP_REPLACE(sale_liters,  '[$,]', ''), '') AS DECIMAL(14,3)) AS sale_liters,
  TRY_CAST(NULLIF(REGEXP_REPLACE(sale_gallons, '[$,]', ''), '') AS DECIMAL(14,3)) AS sale_gallons,

  ingestion_ts,

  -- Partition columns of the target table, typed as INT.
  TRY_CAST(year  AS INT) AS year,
  TRY_CAST(month AS INT) AS month
FROM ct_andresolguin_finalproject.bronze.iowa_raw_str
-- NOTE(review): year is compared with a numeric literal and month with a string literal;
-- confirm the bronze column types so neither comparison relies on implicit coercion.
WHERE year = 2025 AND month = '10';



num_affected_rows,num_inserted_rows


In [0]:
%sql
-- Row count of the 2025-10 partition of the silver table.
SELECT
  COUNT(*) AS rows_silver_oct25
FROM
  ct_andresolguin_finalproject.silver.iowa_sales_clean
WHERE
  year = 2025
  AND month = 10;


rows_silver_oct25
15299000


In [0]:
%sql
USE CATALOG ct_andresolguin_finalproject;

-- Diagnostics for rejected rows, read straight from the Volume file.
WITH rejected_raw AS (
  SELECT *
  FROM read_files(
    '/Volumes/ct_andresolguin_finalproject/bronze/flatfiles_managed/Rejected/usa/yyyy=2025/mm=10/dd=18/bad_iowa_dataset_20251018',
    format => 'csv',
    header => true,
    inferSchema => true
  )
),
-- Lenient parsing: TRY_CAST returns NULL for values that do not convert.
parsed AS (
  SELECT
    invoice_line_no,
    TRY_CAST(date AS DATE)                  AS order_date,
    itemno                                  AS item_no,
    im_desc,
    TRY_CAST(pack AS INT)                   AS pack_i,
    TRY_CAST(bottle_volume_ml AS DOUBLE)    AS vol_ml,
    TRY_CAST(state_bottle_cost AS DOUBLE)   AS cost_f,
    TRY_CAST(state_bottle_retail AS DOUBLE) AS retail_f,
    TRY_CAST(sale_bottles AS DOUBLE)        AS bottles_f
  FROM rejected_raw
)
SELECT
  invoice_line_no,
  order_date,
  item_no,
  im_desc,
  pack_i    AS pack,
  vol_ml    AS bottle_volume_ml,
  cost_f    AS state_bottle_cost,
  retail_f  AS state_bottle_retail,
  bottles_f AS sale_bottles,
  -- Rules are evaluated top to bottom; the first one that holds labels the row.
  CASE
    WHEN pack_i IN (375, 750, 1000) OR pack_i >= 100                             THEN 'PACK_DESPLAZADO'
    WHEN bottles_f IS NOT NULL AND bottles_f <> CAST(bottles_f AS INT)           THEN 'SALE_BOTTLES_DECIMAL'
    WHEN vol_ml IS NOT NULL AND vol_ml < 50                                      THEN 'VOLUME_IRRISORIO/O_DESPLAZADO'
    WHEN retail_f IS NOT NULL AND cost_f IS NOT NULL AND retail_f < cost_f       THEN 'RETAIL_MENOR_QUE_COSTO'
    WHEN cost_f <= 0 OR retail_f <= 0                                            THEN 'COST/RETAIL_NO_POSITIVO'
    WHEN order_date IS NULL                                                      THEN 'FECHA_INVALIDA/DESPLAZADA'
    ELSE 'OTRO_CASO'
  END AS rejected_reason
FROM parsed
ORDER BY
  rejected_reason,
  invoice_line_no
LIMIT 300;


invoice_line_no,order_date,item_no,im_desc,pack,bottle_volume_ml,state_bottle_cost,state_bottle_retail,sale_bottles,rejected_reason
"PRAIRIE TRAIL SUITE 107-108""",,37.5,6,225,4.5,1.18,831.0,25.0,PACK_DESPLAZADO
"PRAIRIE TRAIL SUITE 107-108""",,49.5,6,297,4.5,1.18,831.0,25.0,PACK_DESPLAZADO
"PRAIRIE TRAIL SUITE 107-108""",,29.99,12,359,9.0,2.37,831.0,25.0,PACK_DESPLAZADO
"PRAIRIE TRAIL SUITE 107-108""",,50.24,6,301,4.5,1.18,831.0,25.0,PACK_DESPLAZADO
"PRAIRIE TRAIL SUITE 107-108""",,29.99,12,359,9.0,2.37,831.0,25.0,PACK_DESPLAZADO
"PRAIRIE TRAIL SUITE 107-108""",,30.0,1,30,0.75,0.19,831.0,25.0,VOLUME_IRRISORIO/O_DESPLAZADO


In [0]:
%sql
USE CATALOG ct_andresolguin_finalproject;

-- Summary of the rejected file: rows and distinct invoice_line_no per rejection reason.
WITH rejected_raw AS (
  SELECT *
  FROM read_files(
    '/Volumes/ct_andresolguin_finalproject/bronze/flatfiles_managed/Rejected/usa/yyyy=2025/mm=10/dd=18/bad_iowa_dataset_20251018',
    format => 'csv',
    header => true,
    inferSchema => true
  )
),
-- Lenient parsing: TRY_CAST returns NULL for values that do not convert.
parsed AS (
  SELECT
    invoice_line_no,
    TRY_CAST(pack AS INT)                   AS pack_i,
    TRY_CAST(bottle_volume_ml AS DOUBLE)    AS vol_ml,
    TRY_CAST(state_bottle_cost AS DOUBLE)   AS cost_f,
    TRY_CAST(state_bottle_retail AS DOUBLE) AS retail_f,
    TRY_CAST(sale_bottles AS DOUBLE)        AS bottles_f,
    TRY_CAST(date AS DATE)                  AS order_date,
    im_desc,
    itemno
  FROM rejected_raw
),
-- One label per row; rules are evaluated top to bottom and the first match wins.
classified AS (
  SELECT
    invoice_line_no,
    order_date,
    itemno,
    im_desc,
    pack_i,
    vol_ml,
    cost_f,
    retail_f,
    bottles_f,
    CASE
      WHEN pack_i IN (375, 750, 1000) OR pack_i >= 100
        THEN 'PACK_DESPLAZADO'
      WHEN bottles_f IS NOT NULL AND bottles_f <> CAST(bottles_f AS INT)
        THEN 'SALE_BOTTLES_DECIMAL'
      WHEN vol_ml IS NOT NULL AND vol_ml < 50
        THEN 'VOLUME_IRRISORIO/O_DESPLAZADO'
      WHEN retail_f IS NOT NULL AND cost_f IS NOT NULL AND retail_f < cost_f
        THEN 'RETAIL_MENOR_QUE_COSTO'
      WHEN cost_f <= 0 OR retail_f <= 0
        THEN 'COST/RETAIL_NO_POSITIVO'
      WHEN order_date IS NULL
        THEN 'FECHA_INVALIDA/DESPLAZADA'
      ELSE 'OTRO_CASO'
    END AS rejected_reason
  FROM parsed
)
SELECT
  rejected_reason,
  COUNT(*)                        AS rows_cnt,
  COUNT(DISTINCT invoice_line_no) AS invoices_cnt
FROM classified
GROUP BY
  rejected_reason
ORDER BY
  rows_cnt DESC;


rejected_reason,rows_cnt,invoices_cnt
PACK_DESPLAZADO,5,1
VOLUME_IRRISORIO/O_DESPLAZADO,1,1


In [0]:
%sql
USE CATALOG ct_andresolguin_finalproject;

-- Checks whether any invoice_line_no found in the Rejected file (same day/folder)
-- also exists in the clean silver table. A result of 0 means the two sets are disjoint.
-- Only the key column is needed for this check, so no parsing or reason
-- classification is performed here.

-- 1) Distinct invoice_line_no values present in the Rejected file
WITH rejected_raw AS (
  SELECT invoice_line_no
  FROM read_files(
    '/Volumes/ct_andresolguin_finalproject/bronze/flatfiles_managed/Rejected/usa/yyyy=2025/mm=10/dd=18/bad_iowa_dataset_20251018',
    format => 'csv', header => true, inferSchema => true
  )
),
rej_inv AS (
  SELECT DISTINCT invoice_line_no
  FROM rejected_raw
)

-- 2) Number of clean rows whose invoice_line_no also appears in Rejected
--    (NULL keys never match under the equality join)
SELECT
  COUNT(*) AS overlaps_cnt
FROM ct_andresolguin_finalproject.silver.iowa_sales_clean AS c
INNER JOIN rej_inv AS ri
  ON c.invoice_line_no = ri.invoice_line_no;


overlaps_cnt
0


In [0]:
%sql
USE CATALOG ct_andresolguin_finalproject;
USE SCHEMA silver;

-- 1) Create the audit table (if it does not exist yet)
CREATE TABLE IF NOT EXISTS iowa_rejected_audit (
  process_date    DATE,
  country         STRING,
  rejected_reason STRING,
  rows_cnt        BIGINT,
  invoices_cnt    BIGINT
) USING DELTA;

-- 2) Load the rejected-rows summary for this run (same day/folder).
--    REPLACE WHERE atomically removes any rows previously loaded for the same
--    process_date/country before inserting, so re-running this cell does not
--    duplicate audit rows. The predicate must stay in sync with the literals
--    emitted by the agg CTE below.
INSERT INTO iowa_rejected_audit
REPLACE WHERE process_date = DATE '2025-10-18' AND country = 'usa'
WITH r AS (
  SELECT *
  FROM read_files(
    '/Volumes/ct_andresolguin_finalproject/bronze/flatfiles_managed/Rejected/usa/yyyy=2025/mm=10/dd=18/bad_iowa_dataset_20251018',
    format => 'csv', header => true, inferSchema => true
  )
),
-- Lenient parsing: TRY_CAST returns NULL for values that do not convert.
p AS (
  SELECT
    invoice_line_no,
    TRY_CAST(pack AS INT)                AS pack_i,
    TRY_CAST(bottle_volume_ml AS DOUBLE) AS vol_ml,
    TRY_CAST(state_bottle_cost AS DOUBLE)   AS cost_f,
    TRY_CAST(state_bottle_retail AS DOUBLE) AS retail_f,
    TRY_CAST(sale_bottles AS DOUBLE)        AS bottles_f,
    TRY_CAST(date AS DATE) AS order_date
  FROM r
),
-- One label per row; rules are evaluated top to bottom and the first match wins.
diag AS (
  SELECT
    invoice_line_no,
    CASE
      WHEN pack_i IN (375, 750, 1000) OR pack_i >= 100 THEN 'PACK_DESPLAZADO'
      WHEN bottles_f IS NOT NULL AND bottles_f != CAST(bottles_f AS INT) THEN 'SALE_BOTTLES_DECIMAL'
      WHEN vol_ml IS NOT NULL AND vol_ml < 50 THEN 'VOLUME_IRRISORIO/O_DESPLAZADO'
      WHEN retail_f IS NOT NULL AND cost_f IS NOT NULL AND retail_f < cost_f THEN 'RETAIL_MENOR_QUE_COSTO'
      WHEN cost_f <= 0 OR retail_f <= 0 THEN 'COST/RETAIL_NO_POSITIVO'
      WHEN order_date IS NULL THEN 'FECHA_INVALIDA/DESPLAZADA'
      ELSE 'OTRO_CASO'
    END AS rejected_reason
  FROM p
),
agg AS (
  SELECT
    DATE '2025-10-18'                  AS process_date,   -- adjust (here and in REPLACE WHERE) if the day changes
    'usa'                              AS country,
    rejected_reason,
    COUNT(*)                           AS rows_cnt,
    COUNT(DISTINCT invoice_line_no)    AS invoices_cnt
  FROM diag
  GROUP BY rejected_reason
)
SELECT
  process_date,
  country,
  rejected_reason,
  rows_cnt,
  invoices_cnt
FROM agg;

-- 3) Quick look at the audit table
SELECT
  process_date,
  country,
  rejected_reason,
  rows_cnt,
  invoices_cnt
FROM silver.iowa_rejected_audit
ORDER BY process_date DESC, rows_cnt DESC;


process_date,country,rejected_reason,rows_cnt,invoices_cnt
2025-10-18,usa,PACK_DESPLAZADO,5,1
2025-10-18,usa,VOLUME_IRRISORIO/O_DESPLAZADO,1,1


In [0]:
%sql
USE CATALOG ct_andresolguin_finalproject;
USE SCHEMA silver;

-- View listing the rows of silver.iowa_sales_clean that break at least one quality
-- rule, together with the first rule they break (rules are checked in the order written).
CREATE OR REPLACE VIEW iowa_quality_gate AS
WITH typed AS (
  -- Numeric working copies of the measured columns.
  SELECT
    invoice_line_no,
    sale_date,
    item_no,
    item_desc,
    CAST(pack                AS INT)    AS pack,
    CAST(bottle_volume_ml    AS DOUBLE) AS bottle_volume_ml,
    CAST(state_bottle_cost   AS DOUBLE) AS state_bottle_cost,
    CAST(state_bottle_retail AS DOUBLE) AS state_bottle_retail,
    CAST(sale_bottles        AS DOUBLE) AS sale_bottles
  FROM ct_andresolguin_finalproject.silver.iowa_sales_clean
),
classified AS (
  SELECT
    invoice_line_no,
    sale_date,
    item_no,
    item_desc,
    pack,
    bottle_volume_ml,
    state_bottle_cost,
    state_bottle_retail,
    sale_bottles,
    -- No ELSE branch: a row that breaks no rule gets a NULL issue and is filtered out below.
    CASE
      WHEN pack IS NOT NULL
       AND (
            pack IN (375, 750, 1000)
         OR pack >= 100
         OR pack NOT IN (1,2,3,4,6,8,10,12,18,20,24,30,36,44,48)
       )
        THEN 'PACK_DESPLAZADO'
      WHEN sale_bottles IS NOT NULL
       AND sale_bottles <> CAST(sale_bottles AS BIGINT)
        THEN 'SALE_BOTTLES_DECIMAL'
      WHEN bottle_volume_ml IS NOT NULL
       AND bottle_volume_ml < 50
        THEN 'VOLUME_IRRISORIO/O_DESPLAZADO'
      WHEN state_bottle_retail IS NOT NULL
       AND state_bottle_cost IS NOT NULL
       AND state_bottle_retail < state_bottle_cost
        THEN 'RETAIL_MENOR_QUE_COSTO'
      WHEN (state_bottle_cost   IS NOT NULL AND state_bottle_cost   <= 0)
        OR (state_bottle_retail IS NOT NULL AND state_bottle_retail <= 0)
        THEN 'COST/RETAIL_NO_POSITIVO'
    END AS issue
  FROM typed
)
SELECT
  invoice_line_no,
  sale_date,
  item_no,
  item_desc,
  pack,
  bottle_volume_ml,
  state_bottle_cost,
  state_bottle_retail,
  sale_bottles,
  issue
FROM classified
WHERE issue IS NOT NULL;


In [0]:
%sql
-- Number of silver rows currently flagged by the quality-gate view.
SELECT
  COUNT(*) AS issues_in_silver
FROM
  ct_andresolguin_finalproject.silver.iowa_quality_gate;


issues_in_silver
30092482


In [0]:
%sql
USE CATALOG ct_andresolguin_finalproject;
USE SCHEMA silver;

-- Compares the invoice ids (leading digits of invoice_line_no) present in iowa_clean
-- and in iowa_rejected for 2025-10, and counts how many appear in both.
-- regexp_extract returns '' when the value has no leading digits; those rows carry no
-- usable invoice id, so they are excluded instead of being counted as one shared
-- invoice ''. The same predicate also drops NULL and empty invoice_line_no values.
WITH clean_inv AS (
  SELECT DISTINCT
         regexp_extract(invoice_line_no, '^[0-9]+', 0) AS invoice_id
  FROM iowa_clean
  WHERE year = 2025 AND month = 10
    AND regexp_extract(invoice_line_no, '^[0-9]+', 0) <> ''
),
rej_inv AS (
  SELECT DISTINCT
         regexp_extract(invoice_line_no, '^[0-9]+', 0) AS invoice_id
  FROM iowa_rejected
  WHERE year = 2025 AND month = 10
    AND regexp_extract(invoice_line_no, '^[0-9]+', 0) <> ''
),
overlap AS (
  SELECT COUNT(*) AS invoices_in_both
  FROM clean_inv AS c
  INNER JOIN rej_inv AS r USING (invoice_id)
)
SELECT
  (SELECT COUNT(*) FROM clean_inv) AS clean_invoices,
  (SELECT COUNT(*) FROM rej_inv)   AS rejected_invoices,
  (SELECT invoices_in_both FROM overlap) AS invoices_in_both;


clean_invoices,rejected_invoices,invoices_in_both
81,12,5


In [0]:
%sql
USE CATALOG ct_andresolguin_finalproject;
USE SCHEMA silver;

-- Strict view over iowa_clean for 2025-10: drops every clean row whose invoice id
-- (leading digits of invoice_line_no) also appears in iowa_rejected for the same month.
CREATE OR REPLACE VIEW iowa_clean_strict AS
WITH bad_invoices AS (
  SELECT DISTINCT regexp_extract(invoice_line_no, '^[0-9]+', 0) AS invoice_id
  FROM iowa_rejected
  WHERE year = 2025 AND month = 10
    -- regexp_extract returns '' when there are no leading digits. Keeping '' as an id
    -- would make a single such rejected row remove every clean row that also lacks a
    -- numeric prefix, so only real ids take part in the anti-join. The predicate also
    -- discards NULL and empty invoice_line_no values.
    AND regexp_extract(invoice_line_no, '^[0-9]+', 0) <> ''
)
SELECT c.*
FROM iowa_clean AS c
LEFT ANTI JOIN bad_invoices AS b
  ON regexp_extract(c.invoice_line_no, '^[0-9]+', 0) = b.invoice_id
-- NOTE(review): clean rows without a numeric prefix are now kept; confirm whether such
-- rows should instead be rejected outright.
WHERE c.year = 2025 AND c.month = 10;


In [0]:
%sql
USE CATALOG ct_andresolguin_finalproject;
USE SCHEMA silver;

WITH clean_inv AS (
  SELECT DISTINCT regexp_extract(invoice_line_no, '^[0-9]+', 0) AS invoice_id
  FROM iowa_clean_strict
  WHERE invoice_line_no IS NOT NULL AND invoice_line_no <> ''
),
rej_inv AS (
  SELECT DISTINCT regexp_extract(invoice_line_no, '^[0-9]+', 0) AS invoice_id
  FROM iowa_rejected
  WHERE year = 2025 AND month = 10
    AND invoice_line_no IS NOT NULL AND invoice_line_no <> ''
),
overlap AS (
  SELECT COUNT(*) AS invoices_in_both
  FROM clean_inv c
  JOIN rej_inv  r USING (invoice_id)
)
SELECT
  (SELECT COUNT(*) FROM clean_inv) AS clean_invoices,
  (SELECT COUNT(*) FROM rej_inv)   AS rejected_invoices,
  (SELECT invoices_in_both FROM overlap) AS invoices_in_both;


clean_invoices,rejected_invoices,invoices_in_both
76,12,0


In [0]:
%sql
USE CATALOG ct_andresolguin_finalproject;
USE SCHEMA silver;

-- One (metric, n) row per check; every branch is a global aggregate, so it yields exactly one row.

-- Does the view exist? (1 = yes, 0 = no)
SELECT
  'clean_strict_exists' AS metric,
  COUNT(*)              AS n
FROM information_schema.views
WHERE table_catalog = 'ct_andresolguin_finalproject'
  AND table_schema = 'silver'
  AND table_name = 'iowa_clean_strict'

UNION ALL

-- Total rows in the strict view
SELECT
  'clean_strict_rows_total',
  COUNT(*)
FROM iowa_clean_strict

UNION ALL

-- Rows of the strict view for 2025-10
SELECT
  'clean_strict_rows_2025_10',
  COUNT(*)
FROM iowa_clean_strict
WHERE year = 2025
  AND month = 10

UNION ALL

-- Rejected rows for 2025-10
SELECT
  'rejected_rows_2025_10',
  COUNT(*)
FROM iowa_rejected
WHERE year = 2025
  AND month = 10;


metric,n
clean_strict_exists,1
clean_strict_rows_total,819
clean_strict_rows_2025_10,819
rejected_rows_2025_10,39746934


In [0]:
%sql
-- SINGLE STEP (diagnostics on SILVER)
USE CATALOG ct_andresolguin_finalproject;
USE SCHEMA silver;

-- 1) Year/month distribution of the strict view, derived from the parsed date
WITH dated AS (
  SELECT TRY_CAST(date AS DATE) AS parsed_date
  FROM iowa_clean_strict
)
SELECT
  YEAR(parsed_date)  AS y,
  MONTH(parsed_date) AS m,
  COUNT(*)           AS rows
FROM dated
GROUP BY
  YEAR(parsed_date),
  MONTH(parsed_date)
ORDER BY
  y,
  m;

-- 2) Min/max of the parsed date, plus how many values failed to parse
WITH dated AS (
  SELECT TRY_CAST(date AS DATE) AS parsed_date
  FROM iowa_clean_strict
)
SELECT
  COUNT(*)                                               AS rows,
  MIN(parsed_date)                                       AS min_date,
  MAX(parsed_date)                                       AS max_date,
  SUM(CASE WHEN parsed_date IS NULL THEN 1 ELSE 0 END)   AS null_dates
FROM dated;

-- 3) Quick check of the partition columns that GOLD inherits
SELECT
  year,
  month,
  COUNT(*) AS rows
FROM iowa_clean_strict
GROUP BY
  year,
  month
ORDER BY
  year,
  month;


year,month,rows
2025,10,910


In [0]:
%sql
-- Inspection cell: shows the column definitions of iowa_sales_clean and a few sample rows,
-- so later cells can be written against the real column names.
USE CATALOG ct_andresolguin_finalproject;
USE SCHEMA silver;

/* 1) Actual structure of the table */
DESCRIBE TABLE iowa_sales_clean;

/* 2) A raw sample (to see the real column names and value formats); SELECT * is intentional here */
SELECT *
FROM iowa_sales_clean
LIMIT 5;


invoice_line_no,sale_date,store_id,store_name,address,city,zipcode,county_number,county,category_id,category_name,vendor_no,vendor_name,item_no,item_desc,pack,bottle_volume_ml,state_bottle_cost,state_bottle_retail,sale_bottles,sale_dollars,sale_liters,sale_gallons,ingestion_ts,year,month
INV-14877300005,2018-10-05,4073,"UPTOWN LIQUOR, LLC",306 HWY 69 SOUTH,FOREST CITY,50436,95,WINNEBAGO,1011200,STRAIGHT BOURBON WHISKIES,65,JIM BEAM BRANDS,20248,OLD CROW,6,1750,10.49,15.74,6,94.44,10.5,2.77,2025-10-22T09:04:09.196Z,2025,10
INV-14877700031,2018-10-05,3477,SAM'S CLUB 6472 / COUNCIL BLUFFS,3221 MANAWA CENTRE DR,COUNCIL BLUFFS,51501,78,POTTAWATTAMIE,1032100,IMPORTED VODKAS,370,PERNOD RICARD USA,34007,ABSOLUT SWEDISH VODKA 80PRF,12,1000,14.99,22.49,12,269.88,12.0,3.17,2025-10-22T09:04:09.196Z,2025,10
INV-14888100048,2018-10-08,2560,HY-VEE FOOD STORE / MARION,3600 BUSINESS HWY 151 EAST,MARION,52302,57,LINN,1081400,AMERICAN SCHNAPPS,65,JIM BEAM BRANDS,82606,DEKUYPER SOUR APPLE PUCKER,12,750,6.47,9.71,6,58.26,4.5,1.18,2025-10-22T09:04:09.196Z,2025,10
INV-14909900102,2018-10-08,4152,FOOD LAND SUPER MARKETS,407 W HURON,MISSOURI VALLEY,51555,43,HARRISON,1031100,AMERICAN VODKAS,434,LUXCO INC,36308,HAWKEYE VODKA,6,1750,7.17,10.76,6,64.56,10.5,2.77,2025-10-22T09:04:09.196Z,2025,10
INV-14941300020,2018-10-09,4647,B AND B EAST / WATERLOO,1615 BISHOP AVE,WATERLOO,50707,7,BLACK HAWK,1012100,CANADIAN WHISKIES,260,DIAGEO AMERICAS,10803,CROWN ROYAL REGAL APPLE MINI,10,300,7.35,11.03,4,44.12,1.2,0.31,2025-10-22T09:04:09.196Z,2025,10


In [0]:
%sql
-- STEP 1: rebuild the strict SILVER table using the real column names of iowa_sales_clean.
-- All conversions use TRY_CAST: several of them narrow the source precision
-- (e.g. DECIMAL(12,2) -> DECIMAL(10,2)), and a plain CAST would abort the whole
-- statement under ANSI mode on a single out-of-range value. With TRY_CAST such a value
-- becomes NULL; if it feeds a filter below the row is dropped, because a NULL comparison
-- is never true.
USE CATALOG ct_andresolguin_finalproject;
USE SCHEMA silver;

CREATE OR REPLACE TABLE silver.iowa_clean_v2_strict
USING DELTA
AS
SELECT
    /* key + consistent date; year/month are derived from the sale date itself */
    invoice_line_no,
    TRY_CAST(sale_date AS DATE)                                  AS sale_date,
    YEAR(TRY_CAST(sale_date AS DATE))                            AS year,
    MONTH(TRY_CAST(sale_date AS DATE))                           AS month,

    /* store */
    TRY_CAST(store_id AS INT)                                    AS store_id,
    store_name,
    address,
    city,
    zipcode,
    county_number,
    county,

    /* product / vendor */
    TRY_CAST(item_no AS INT)                                     AS item_no,
    item_desc,
    TRY_CAST(category_id AS INT)                                 AS category,         -- renamed for compatibility
    category_name,
    TRY_CAST(vendor_no AS INT)                                   AS vendor_no,
    vendor_name,

    /* packaging / volume / prices */
    TRY_CAST(pack AS INT)                                        AS pack,
    TRY_CAST(bottle_volume_ml AS INT)                            AS bottle_volume_ml,
    TRY_CAST(state_bottle_cost   AS DECIMAL(10,2))               AS state_bottle_cost,
    TRY_CAST(state_bottle_retail AS DECIMAL(10,2))               AS state_bottle_retail,

    /* sales */
    TRY_CAST(sale_bottles AS INT)                                AS sale_bottles,
    TRY_CAST(sale_dollars AS DECIMAL(12,2))                      AS sale_dollars,
    TRY_CAST(sale_liters  AS DECIMAL(12,3))                      AS sale_liters,
    TRY_CAST(sale_gallons AS DECIMAL(12,3))                      AS sale_gallons,

    ingestion_ts
FROM silver.iowa_sales_clean
WHERE
      /* 0) valid date */
      TRY_CAST(sale_date AS DATE) IS NOT NULL

  AND /* 1) reasonable pack and volume, without the typical shifted-column values */
      TRY_CAST(pack AS INT) BETWEEN 1 AND 48
  AND TRY_CAST(bottle_volume_ml AS INT) BETWEEN 50 AND 2000
      -- Every value in this list is below 50, so the range check above already rejects it;
      -- the list is kept so the rule survives if the lower bound is ever relaxed.
  AND TRY_CAST(bottle_volume_ml AS INT) NOT IN (2,3,4,5,6,12,14,40)

  AND /* 2) sale_bottles is a whole number and within range */
      sale_bottles = TRY_CAST(sale_bottles AS INT)
  AND TRY_CAST(sale_bottles AS INT) BETWEEN 1 AND 500

  AND /* 3) positive prices and retail >= cost */
      TRY_CAST(state_bottle_cost   AS DECIMAL(10,2))  > 0
  AND TRY_CAST(state_bottle_retail AS DECIMAL(10,2))  > 0
  AND TRY_CAST(state_bottle_retail AS DECIMAL(10,2)) >= TRY_CAST(state_bottle_cost AS DECIMAL(10,2));


num_affected_rows,num_inserted_rows


In [0]:
%sql
-- 2.1 Total row count and sale_date range of the strict table
USE CATALOG ct_andresolguin_finalproject;
USE SCHEMA silver;

SELECT
  COUNT(*)       AS rows,
  MIN(sale_date) AS min_date,
  MAX(sale_date) AS max_date
FROM
  iowa_clean_v2_strict;


rows,min_date,max_date
108224000,2012-01-03,2025-05-23


In [0]:
%sql
-- 2.2 Distribution by year/month (several years are expected, not only 2012)
SELECT
  year,
  month,
  COUNT(*) AS rows
FROM
  iowa_clean_v2_strict
GROUP BY
  year,
  month
ORDER BY
  year,
  month;


year,month,rows
2012,1,634520
2012,2,658960
2012,3,688710
2012,4,706160
2012,5,827290
2012,6,725780
2012,7,779180
2012,8,817490
2012,9,644000
2012,10,841680


In [0]:
%sql
-- 2.3 Consistency: stored year/month vs. the actual sale_date (expected result: 0)
SELECT
  SUM(
    CASE
      WHEN NOT (YEAR(sale_date) = year AND MONTH(sale_date) = month) THEN 1
      ELSE 0
    END
  ) AS mismatched_rows
FROM
  iowa_clean_v2_strict;


mismatched_rows
0
