# Notebook 02 — Enriquecimiento y Unificación

Versión corregida para fijar `USE DATABASE` y `USE SCHEMA` en cada conexión, calificar objetos con esquema y usar `write_pandas(..., schema=...)`.

## 1) Parámetros y conexión

In [44]:
import os, datetime
import pandas as pd
import requests
import snowflake.connector as sf
from snowflake.connector.pandas_tools import write_pandas

# === Environment variables ===
# Snowflake connection settings; every value is read from the process environment.
SF_ACCOUNT   = os.getenv('SNOWFLAKE_ACCOUNT')
SF_USER      = os.getenv('SNOWFLAKE_USER')
SF_PASSWORD  = os.getenv('SNOWFLAKE_PASSWORD')
SF_ROLE      = os.getenv('SNOWFLAKE_ROLE', 'SYSADMIN')
SF_WAREHOUSE = os.getenv('SNOWFLAKE_WAREHOUSE')
SF_DATABASE  = os.getenv('SNOWFLAKE_DATABASE')
SCHEMA_RAW   = os.getenv('SNOWFLAKE_SCHEMA_RAW', 'RAW')
SCHEMA_AN    = os.getenv('SNOWFLAKE_SCHEMA_ANALYTICS', 'analytics')

# Tables
YELLOW_TABLE = os.getenv('RAW_YELLOW_TABLE', 'RAW_TLC_TRIPS_yellow')
GREEN_TABLE  = os.getenv('RAW_GREEN_TABLE',  'RAW_TLC_TRIPS_green')
ZONES_TABLE  = os.getenv('RAW_TAXI_ZONES_TABLE', 'TAXI_ZONE_LOOKUP_RAW')
DEST_TABLE   = os.getenv('AN_ENRICHED_TABLE', 'TRIPS_ENRICHED_UNIFIED')

# Taxi Zone Lookup source
TAXI_ZONE_URL = os.getenv('TAXI_ZONE_URL', "https://d37ci6vzurychx.cloudfront.net/misc/taxi+_zone_lookup.csv")
LOCAL_FALLBACK = os.getenv('TAXI_ZONE_LOCAL_PATH', None)

# Run identifier: RUN_ID from the environment when set, otherwise derived from
# the current UTC time. datetime.utcnow() is deprecated since Python 3.12; an
# aware "now" in UTC produces the same formatted string.
RUN_ID = os.getenv('RUN_ID') or f"nb02_{datetime.datetime.now(datetime.timezone.utc).strftime('%Y%m%dT%H%M%SZ')}"

# Fail fast with an explicit exception: an `assert` is stripped under
# `python -O` and would not say which variable is missing.
_missing = [
    env_name
    for env_name, env_value in (
        ('SNOWFLAKE_ACCOUNT', SF_ACCOUNT),
        ('SNOWFLAKE_USER', SF_USER),
        ('SNOWFLAKE_PASSWORD', SF_PASSWORD),
        ('SNOWFLAKE_WAREHOUSE', SF_WAREHOUSE),
        ('SNOWFLAKE_DATABASE', SF_DATABASE),
    )
    if not env_value
]
if _missing:
    raise RuntimeError(f"Faltan variables de conexión a Snowflake: {', '.join(_missing)}")
print('Conectar a:', SF_ACCOUNT, SF_DATABASE, '| Schemas:', SCHEMA_RAW, '→', SCHEMA_AN)

Conectar a: HGPAYPL-TF36096 SPARK_DATA | Schemas: SPARK_DATA.RAW → SPARK_DATA.analytics


### Helper de conexión/SQL

In [45]:
def snowflake_conn(schema: str | None = None):
    """Open a Snowflake connection with database and schema explicitly selected.

    Args:
        schema: Schema to select for the session. Falls back to ``SCHEMA_RAW``
            when omitted or empty.

    Returns:
        The open connection object returned by ``sf.connect``. The caller owns
        it and is responsible for closing it (the notebook does so by using it
        in a ``with`` statement).

    Raises:
        Whatever ``sf.connect`` or the ``USE`` statements raise. If a ``USE``
        statement fails, the connection is closed before the error propagates
        so it is not leaked.
    """
    target_schema = schema or SCHEMA_RAW
    ctx = sf.connect(
        account=SF_ACCOUNT,
        user=SF_USER,
        password=SF_PASSWORD,
        warehouse=SF_WAREHOUSE,
        role=SF_ROLE,
        database=SF_DATABASE,
        schema=target_schema,
        client_session_keep_alive=True,
    )
    try:
        c = ctx.cursor()
        try:
            # Pin the session context explicitly in addition to the connect() arguments.
            c.execute(f"USE DATABASE {SF_DATABASE}")
            c.execute(f"USE SCHEMA {target_schema}")
        finally:
            c.close()
    except BaseException:
        # The caller never receives `ctx` on this path, so close it here.
        ctx.close()
        raise
    return ctx

def run_sql(sql: str, schema: str | None = None):
    """Run one SQL statement on a fresh connection and return its rows.

    Args:
        sql: The statement to execute.
        schema: Schema to select for the session. Falls back to ``SCHEMA_RAW``
            when omitted or empty.

    Returns:
        The list produced by ``fetchall()``; an empty list when no result
        object is available or fetching the rows fails.

    Raises:
        Whatever ``snowflake_conn`` or ``cursor.execute`` raise; only errors
        from fetching the rows are suppressed.
    """
    # snowflake_conn() already issues USE DATABASE / USE SCHEMA for this
    # schema, so repeating them here would only add two round trips per call.
    with snowflake_conn(schema or SCHEMA_RAW) as conn:
        cur = conn.cursor()
        try:
            res = cur.execute(sql)
            if res is None:
                # Nothing to fetch from; handled explicitly rather than via the except below.
                return []
            try:
                return res.fetchall()
            except Exception:
                # Deliberate best-effort: a statement whose rows cannot be
                # fetched is reported as "no rows" instead of failing the cell.
                return []
        finally:
            cur.close()
print('✓ Helpers listos')

✓ Helpers listos


## 2) Bootstrap + auditoría

In [46]:
# Bootstrap: create both schemas and the audit table if they do not exist yet.
# The connection is opened without selecting a schema on purpose:
# snowflake_conn() runs `USE SCHEMA`, which fails while the schema is still
# missing, so the CREATE SCHEMA statements below could never run on a fresh
# database.
with sf.connect(
    account=SF_ACCOUNT,
    user=SF_USER,
    password=SF_PASSWORD,
    warehouse=SF_WAREHOUSE,
    role=SF_ROLE,
    database=SF_DATABASE,
    client_session_keep_alive=True,
) as conn:
    cur = conn.cursor()
    try:
        cur.execute(f"USE WAREHOUSE {SF_WAREHOUSE}")
        cur.execute(f"USE DATABASE {SF_DATABASE}")
        cur.execute(f"CREATE SCHEMA IF NOT EXISTS {SCHEMA_RAW}")
        cur.execute(f"CREATE SCHEMA IF NOT EXISTS {SCHEMA_AN}")
        # One audit row is appended per load step by the later cells.
        cur.execute(f"""
        CREATE TABLE IF NOT EXISTS {SCHEMA_AN}.LOAD_AUDIT (
          load_ts_utc      TIMESTAMP_NTZ,
          run_id           STRING,
          step             STRING,
          rows_affected    NUMBER,
          details          STRING
        )
        """)
    finally:
        cur.close()
print('✓ Esquemas y auditoría listos')

✓ Esquemas y auditoría listos


## 3) Taxi Zone Lookup → RAW

In [48]:
import io

# Read the lookup CSV: a local copy wins when TAXI_ZONE_LOCAL_PATH points at an
# existing file; otherwise the CSV is downloaded from TAXI_ZONE_URL.
if LOCAL_FALLBACK and os.path.exists(LOCAL_FALLBACK):
    df_zones = pd.read_csv(LOCAL_FALLBACK)
else:
    r = requests.get(TAXI_ZONE_URL, timeout=30)
    r.raise_for_status()
    df_zones = pd.read_csv(io.StringIO(r.text))

# Normalise headers and keep exactly the expected columns in table order;
# a column missing from the CSV is added as all-NULL.
df_zones.columns = [c.strip().upper() for c in df_zones.columns]
expected = ['LOCATIONID','BOROUGH','ZONE','SERVICE_ZONE']
for c in expected:
    if c not in df_zones.columns:
        df_zones[c] = None
df_zones = df_zones[expected]

# write_pandas quotes identifiers by default, which makes them case-sensitive,
# while the unquoted DDL below resolves names in upper case. Upper-casing keeps
# both in agreement. The schema may be configured as "DB.SCHEMA"; only the last
# part is a schema name.
zones_schema = SCHEMA_RAW.split('.')[-1].upper()
zones_table = ZONES_TABLE.upper()

with snowflake_conn(SCHEMA_RAW) as conn:
    cur = conn.cursor()
    try:
        # Schema-qualified like every other statement in the notebook, so the
        # target does not depend on the session's current schema.
        cur.execute(f"""
        CREATE OR REPLACE TABLE {SCHEMA_RAW}.{ZONES_TABLE} (
          LOCATIONID NUMBER,
          BOROUGH STRING,
          ZONE STRING,
          SERVICE_ZONE STRING
        )
        """)
        load_result = write_pandas(
            conn,
            df_zones,
            zones_table,
            schema=zones_schema,
            auto_create_table=False,
        )
        # First element of the returned tuple is the overall success flag.
        if not load_result[0]:
            raise RuntimeError(f"write_pandas no pudo cargar {SCHEMA_RAW}.{ZONES_TABLE}")
    finally:
        cur.close()

# Audit row: the row count is taken from the table itself, not from the DataFrame.
run_sql(
    f"""
    INSERT INTO {SCHEMA_AN}.LOAD_AUDIT(load_ts_utc, run_id, step, rows_affected, details)
    SELECT CURRENT_TIMESTAMP(), '{RUN_ID}', 'LOAD_TAXI_ZONE', COUNT(*), '{SCHEMA_RAW}.{ZONES_TABLE}' FROM {SCHEMA_RAW}.{ZONES_TABLE}
    """,
    schema=SCHEMA_AN
)
print('✓ Taxi Zone cargado:', f'{SCHEMA_RAW}.{ZONES_TABLE}', '| Filas:', len(df_zones))

✓ Taxi Zone cargado: SPARK_DATA.RAW.TAXI_ZONE_LOOKUP_RAW | Filas: 265


## 5) Enriquecimiento + Unificación → ANALYTICS

In [53]:
# Helper to check whether a column exists in a table
def column_exists(schema: str, table: str, column: str) -> bool:
    """Return True when ``column`` exists in ``schema.table``.

    The lookup goes through ``INFORMATION_SCHEMA.COLUMNS`` of ``SF_DATABASE``.
    The notebook references its tables as unquoted identifiers, which
    Snowflake stores in upper case, so schema and table names are upper-cased
    before comparison. Without that, a configured name such as
    ``RAW_TLC_TRIPS_yellow`` or ``analytics`` never matches and the column is
    wrongly reported as missing.

    Args:
        schema: Schema name, optionally given as ``DB.SCHEMA``.
        table: Table name as used (unquoted) in the notebook's SQL.
        column: Column name; compared case-insensitively.

    Returns:
        True if at least one matching column row is found, False otherwise.
    """
    q = f"""
    SELECT COUNT(*) FROM {SF_DATABASE}.INFORMATION_SCHEMA.COLUMNS
    WHERE TABLE_SCHEMA = %s AND TABLE_NAME = %s AND UPPER(COLUMN_NAME) = UPPER(%s)
    """
    # The schema may arrive as "DB.SCHEMA"; only the last part is the schema name.
    schema_name = schema.split('.')[-1].upper()
    table_name = table.upper()
    with snowflake_conn(schema) as conn:
        cur = conn.cursor()
        try:
            cur.execute(q, (schema_name, table_name, column))
            return cur.fetchone()[0] > 0
        finally:
            cur.close()

# Probe the RAW tables for the optional fee columns (yellow first, then green).
has_airport_fee_y, has_ehail_fee_y = (
    column_exists(SCHEMA_RAW, YELLOW_TABLE, fee_column)
    for fee_column in ('AIRPORT_FEE', 'EHAIL_FEE')
)
has_airport_fee_g, has_ehail_fee_g = (
    column_exists(SCHEMA_RAW, GREEN_TABLE, fee_column)
    for fee_column in ('AIRPORT_FEE', 'EHAIL_FEE')
)

# SELECT-list snippets: read the real column when it exists, otherwise emit a
# typed NULL under the same alias so both services expose identical columns.
if has_airport_fee_y:
    Y_AIRPORT_EXPR = "t.AIRPORT_FEE::FLOAT AS airport_fee"
else:
    Y_AIRPORT_EXPR = "NULL::FLOAT AS airport_fee"

if has_ehail_fee_y:
    Y_EHAIL_EXPR = "t.EHAIL_FEE::FLOAT  AS e_hail_fee"
else:
    Y_EHAIL_EXPR = "NULL::FLOAT AS e_hail_fee"

if has_airport_fee_g:
    G_AIRPORT_EXPR = "t.AIRPORT_FEE::FLOAT AS airport_fee"
else:
    G_AIRPORT_EXPR = "NULL::FLOAT AS airport_fee"

if has_ehail_fee_g:
    G_EHAIL_EXPR = "t.EHAIL_FEE::FLOAT  AS e_hail_fee"
else:
    G_EHAIL_EXPR = "NULL::FLOAT AS e_hail_fee"

# CTAS statement that rebuilds {SCHEMA_AN}.{DEST_TABLE} from the yellow and green RAW tables.
# - payment_map / ratecode_map / vendor_map: inline code-to-description lookups.
# - zones: typed projection of the taxi zone lookup table.
# - yellow_base / green_base: per-service projections with the same column order,
#   which the positional UNION ALL in `unified` relies on. Rows are kept only when
#   pickup/dropoff times and total_amount are non-NULL and trip_distance >= 0.
#   Yellow gets a NULL trip_type; the fee columns come from the *_EXPR snippets.
# - Final SELECT: LEFT JOINs the pickup/dropoff zones and the lookup maps, and
#   stamps every row with this notebook's RUN_ID and a processing timestamp.
sql_create = f"""
CREATE OR REPLACE TABLE {SCHEMA_AN}.{DEST_TABLE} AS
WITH payment_map AS (
  SELECT * FROM VALUES
    (0,'Unknown'),(1,'Credit Card'),(2,'Cash'),
    (3,'No Charge'),(4,'Dispute'),(5,'Unknown'),(6,'Voided Trip')
  AS T(payment_type, payment_type_desc)
),
ratecode_map AS (
  SELECT * FROM VALUES
    (1,'Standard rate'),(2,'JFK'),(3,'Newark'),
    (4,'Nassau or Westchester'),(5,'Negotiated fare'),(6,'Group ride')
  AS T(ratecodeid, rate_code_desc)
),
vendor_map AS (
  SELECT * FROM VALUES
    (1,'Creative Mobile Technologies'),(2,'VeriFone Inc.')
  AS T(vendorid, vendor_name)
),
zones AS (
  SELECT
    LOCATIONID::NUMBER AS locationid,
    ZONE::STRING       AS zone,
    BOROUGH::STRING    AS borough,
    SERVICE_ZONE::STRING AS service_zone
  FROM {SCHEMA_RAW}.{ZONES_TABLE}
),
yellow_base AS (
  SELECT
    'yellow'::STRING AS service_type,
    t.vendorid,
    t.PICKUP_DATETIME::TIMESTAMP_NTZ  AS pickup_datetime,
    t.DROPOFF_DATETIME::TIMESTAMP_NTZ AS dropoff_datetime,
    t.pulocationid, t.dolocationid,
    t.passenger_count, t.trip_distance, t.ratecodeid,
    t.store_and_fwd_flag, t.payment_type,
    t.fare_amount, t.extra, t.mta_tax, t.tip_amount, t.tolls_amount,
    t.improvement_surcharge, t.congestion_surcharge, t.total_amount,
    NULL::NUMBER AS trip_type,
    {Y_AIRPORT_EXPR},
    {Y_EHAIL_EXPR},
    t.run_id, t.source_year, t.source_month, t.ingested_at_utc, t.source_path
  FROM {SCHEMA_RAW}.{YELLOW_TABLE} t
  WHERE
    t.PICKUP_DATETIME IS NOT NULL
    AND t.DROPOFF_DATETIME IS NOT NULL
    AND t.trip_distance >= 0
    AND t.total_amount IS NOT NULL
),
green_base AS (
  SELECT
    'green'::STRING AS service_type,
    t.vendorid,
    t.PICKUP_DATETIME::TIMESTAMP_NTZ  AS pickup_datetime,
    t.DROPOFF_DATETIME::TIMESTAMP_NTZ AS dropoff_datetime,
    t.pulocationid, t.dolocationid,
    t.passenger_count, t.trip_distance, t.ratecodeid,
    t.store_and_fwd_flag, t.payment_type,
    t.fare_amount, t.extra, t.mta_tax, t.tip_amount, t.tolls_amount,
    t.improvement_surcharge, t.congestion_surcharge, t.total_amount,
    t.trip_type,
    {G_AIRPORT_EXPR},
    {G_EHAIL_EXPR},
    t.run_id, t.source_year, t.source_month, t.ingested_at_utc, t.source_path
  FROM {SCHEMA_RAW}.{GREEN_TABLE} t
  WHERE
    t.PICKUP_DATETIME IS NOT NULL
    AND t.DROPOFF_DATETIME IS NOT NULL
    AND t.trip_distance >= 0
    AND t.total_amount IS NOT NULL
),
unified AS (
  SELECT * FROM yellow_base
  UNION ALL
  SELECT * FROM green_base
)
SELECT
  b.service_type, b.vendorid, vm.vendor_name,
  b.pickup_datetime, b.dropoff_datetime,
  b.pulocationid,
  pz.zone AS pu_zone, pz.borough AS pu_borough, pz.service_zone AS pu_service_zone,
  b.dolocationid,
  dz.zone AS do_zone, dz.borough AS do_borough, dz.service_zone AS do_service_zone,
  b.passenger_count, b.trip_distance, b.ratecodeid, rm.rate_code_desc,
  b.store_and_fwd_flag, b.payment_type, pm.payment_type_desc,
  b.fare_amount, b.extra, b.mta_tax, b.tip_amount, b.tolls_amount,
  b.improvement_surcharge, b.congestion_surcharge, b.total_amount,
  b.airport_fee, b.e_hail_fee,
  b.trip_type,
  b.run_id, b.source_year, b.source_month, b.ingested_at_utc, b.source_path,
  '{RUN_ID}'::STRING AS nb02_run_id,
  CURRENT_TIMESTAMP()::TIMESTAMP_NTZ AS nb02_processed_at_utc
FROM unified b
LEFT JOIN zones        pz ON pz.locationid = b.pulocationid
LEFT JOIN zones        dz ON dz.locationid = b.dolocationid
LEFT JOIN ratecode_map rm ON rm.ratecodeid = b.ratecodeid
LEFT JOIN vendor_map   vm ON vm.vendorid   = b.vendorid
LEFT JOIN payment_map  pm ON pm.payment_type = b.payment_type
"""
# Rebuild the unified table, then count the rows that landed in it.
run_sql(sql_create, schema=SCHEMA_AN)
rows = run_sql(f"SELECT COUNT(*) FROM {SCHEMA_AN}.{DEST_TABLE}", schema=SCHEMA_AN)
if rows:
    total = rows[0][0]
else:
    # No rows came back from the count query; report zero.
    total = 0

# Record this step in the audit table.
audit_insert_sql = f"""
INSERT INTO {SCHEMA_AN}.LOAD_AUDIT(load_ts_utc, run_id, step, rows_affected, details)
VALUES(CURRENT_TIMESTAMP(), '{RUN_ID}', 'NB02_CREATE_{DEST_TABLE}', {total}, '{SCHEMA_AN}.{DEST_TABLE}')
"""
run_sql(audit_insert_sql, schema=SCHEMA_AN)

print('✓ Enriquecimiento/unificación OK. Filas destino:', total)


✓ Enriquecimiento/unificación OK. Filas destino: 818500455
   (airport_fee y e_hail_fee gestionadas dinámicamente; service_zone incluido)


## 6) Checks y vista BI

In [54]:
# Data-quality probes over the destination table, keyed by check name.
checks = {}
# Counts NULLs in the pickup/dropoff timestamps and location ids.
checks['nulos_en_claves'] = f"""
    SELECT SUM(IFF(pickup_datetime IS NULL,1,0)) AS null_pickup,
           SUM(IFF(dropoff_datetime IS NULL,1,0)) AS null_dropoff,
           SUM(IFF(pulocationid IS NULL,1,0)) AS null_pu,
           SUM(IFF(dolocationid IS NULL,1,0)) AS null_do
    FROM {SCHEMA_AN}.{DEST_TABLE}
  """
# Counts rows with a negative distance or a negative total amount.
checks['rangos_no_negativos'] = f"""
    SELECT SUM(IFF(trip_distance < 0,1,0)) AS dist_neg,
           SUM(IFF(total_amount  < 0,1,0)) AS total_neg
    FROM {SCHEMA_AN}.{DEST_TABLE}
  """

# Run each probe and print its first result row (None when nothing came back).
for check_name, check_sql in checks.items():
    check_rows = run_sql(check_sql, schema=SCHEMA_AN)
    if check_rows:
        first_row = check_rows[0]
    else:
        first_row = None
    print(f"-- {check_name}:", first_row)

# Pass-through view over the destination table.
view_ddl = f"CREATE OR REPLACE VIEW {SCHEMA_AN}.V_{DEST_TABLE} AS SELECT * FROM {SCHEMA_AN}.{DEST_TABLE}"
run_sql(view_ddl, schema=SCHEMA_AN)
print('✓ Vista creada:', f'V_{DEST_TABLE}')

-- nulos_en_claves: (0, 0, 0, 0)
-- rangos_no_negativos: (0, 2018450)
✓ Vista creada: V_TRIPS_ENRICHED_UNIFIED
