# Notebook 04: Validaciones y Exploración sobre **analytics.obt_trips** (solo Snowflake)

Objetivo: validar calidad de datos y hacer una exploración inicial que soporte las 20 preguntas del PSet.

### Incluye:
- Nulos en claves
- Rangos lógicos (duración/distancia/montos) y outliers razonables en `avg_speed_mph`
- Coherencia de fechas PU/DO
- Conteos por `service_type/year/month`
- Distribuciones: `payment_type_desc`, `rate_code_desc`


## 1) Parámetros y conexión

In [41]:
import os
import uuid
from datetime import datetime, timezone

import snowflake.connector as sf

# ======= ENVIRONMENT =======
# Snowflake connection settings are read from environment variables.
SF_ACCOUNT = os.getenv('SNOWFLAKE_ACCOUNT')
SF_USER = os.getenv('SNOWFLAKE_USER')
SF_PASSWORD = os.getenv('SNOWFLAKE_PASSWORD')
SF_ROLE = os.getenv('SNOWFLAKE_ROLE', 'SYSADMIN')
SF_WAREHOUSE = os.getenv('SNOWFLAKE_WAREHOUSE')
SF_DATABASE = os.getenv('SNOWFLAKE_DATABASE')
SCHEMA_AN = os.getenv('SNOWFLAKE_SCHEMA_ANALYTICS', 'analytics')
OBT_TABLE = os.getenv('AN_OBT_TABLE', 'OBT_TRIPS')

# Fail fast with an explicit exception instead of `assert` (asserts are
# stripped under `python -O`) and name the variables that are unset or empty.
_required_env = {
    'SNOWFLAKE_ACCOUNT': SF_ACCOUNT,
    'SNOWFLAKE_USER': SF_USER,
    'SNOWFLAKE_PASSWORD': SF_PASSWORD,
    'SNOWFLAKE_WAREHOUSE': SF_WAREHOUSE,
    'SNOWFLAKE_DATABASE': SF_DATABASE,
}
_missing_env = [name for name, value in _required_env.items() if not value]
if _missing_env:
    raise RuntimeError(
        'Faltan variables de conexión a Snowflake: ' + ', '.join(_missing_env)
    )
print('DB:', SF_DATABASE, '| Schema AN:', SCHEMA_AN)

# ======= HELPERS (tus mismos) =======
def snowflake_conn(schema: str | None = None):
    """Open a Snowflake connection with the database and schema selected.

    Args:
        schema: Schema to select for the session. Falls back to ``SCHEMA_AN``
            when ``None`` or empty.

    Returns:
        The open connection object returned by ``sf.connect``. The caller is
        responsible for closing it.

    Raises:
        Whatever ``sf.connect`` or the ``USE`` statements raise. If a ``USE``
        statement fails, the connection is closed before the error propagates.
    """
    target_schema = schema or SCHEMA_AN
    ctx = sf.connect(
        account=SF_ACCOUNT, user=SF_USER, password=SF_PASSWORD,
        warehouse=SF_WAREHOUSE, role=SF_ROLE, database=SF_DATABASE,
        schema=target_schema, client_session_keep_alive=True,
    )
    try:
        c = ctx.cursor()
        try:
            c.execute(f"USE DATABASE {SF_DATABASE}")
            c.execute(f"USE SCHEMA {target_schema}")
        finally:
            c.close()
    except BaseException:
        # The caller never receives `ctx` on this path, so it must be closed
        # here or the session would be leaked.
        ctx.close()
        raise
    return ctx

def run_sql(sql: str, schema: str | None = None):
    """Execute one SQL statement on a fresh connection and return its rows.

    A new connection is opened through ``snowflake_conn`` on every call and is
    used as a context manager for the duration of the statement.

    Args:
        sql: Statement text, executed as-is.
        schema: Schema to select for the session. Falls back to ``SCHEMA_AN``
            when ``None`` or empty.

    Returns:
        The list produced by ``fetchall()``. If the rows cannot be fetched, a
        ``RuntimeWarning`` is emitted and an empty list is returned.

    Raises:
        Whatever ``snowflake_conn`` or ``cursor.execute`` raise.
    """
    import warnings

    with snowflake_conn(schema or SCHEMA_AN) as conn:
        cur = conn.cursor()
        try:
            # snowflake_conn() has already issued USE DATABASE / USE SCHEMA on
            # this session, so they are not repeated here.
            res = cur.execute(sql)
            try:
                return res.fetchall()
            except Exception as exc:
                # Stay best-effort (callers expect a list), but surface the
                # failure instead of hiding it behind an empty result.
                warnings.warn(
                    f"run_sql: could not fetch rows, returning []: {exc!r}",
                    RuntimeWarning,
                    stacklevel=2,
                )
                return []
        finally:
            cur.close()
print('✓ Helpers listos')


# ===== Data-review parameters (each can be overridden through its env var) =====
# os.getenv defaults are given as strings so both branches yield the same type
# before the int()/float() conversion.
YEAR_MIN = int(os.getenv("QA_YEAR_MIN", "2015"))
YEAR_MAX = int(os.getenv("QA_YEAR_MAX", "2025"))
ALLOW_NEG_TOTAL = int(os.getenv("QA_ALLOW_NEG_TOTAL", "0"))  # 0 = not allowed, 1 = allowed
# Timezone-aware replacement for the deprecated datetime.utcnow(); the formatted
# UTC timestamp is the same.
RUN_ID_QA = os.getenv("RUN_ID", f"NB04_{datetime.now(timezone.utc):%Y%m%d%H%M%S}")
QA_RUN_UUID = str(uuid.uuid4())

# Reasonable limits (used to flag atypical rows, not to exclude them).
# MAX_SPEED_MPH and MAX_TIP_PCT are defined once here; they were previously
# assigned twice with identical values.
MAX_TRIP_DURATION_MIN = int(os.getenv("QA_MAX_TRIP_DURATION_MIN", "1440"))  # 24h
MAX_TRIP_DISTANCE_MI = float(os.getenv("QA_MAX_TRIP_DISTANCE_MI", "250"))   # 250 miles
MAX_SPEED_MPH = float(os.getenv("QA_MAX_SPEED_MPH", "120"))                 # >120 mph is an outlier
MAX_TIP_PCT = float(os.getenv("QA_MAX_TIP_PCT", "1.00"))                    # >100% is an outlier

# Fare reconciliation: maximum allowed residual
FARE_RESIDUAL_TOL = float(os.getenv("QA_FARE_RESIDUAL_TOL", "2.00"))        # USD

# Ping: confirm the table is reachable and report its row count
rows = run_sql(f"SELECT COUNT(*) FROM {SCHEMA_AN}.{OBT_TABLE}")
total = rows[0][0] if rows else 0
print("Tabla:", f"{SCHEMA_AN}.{OBT_TABLE}", "| Filas:", total)
print("Bounds → year:", YEAR_MIN, "…", YEAR_MAX, "| dur_min ≤", MAX_TRIP_DURATION_MIN, "| dist ≤", MAX_TRIP_DISTANCE_MI)



DB: SPARK_DATA | Schema AN: SPARK_DATA.analytics
✓ Helpers listos
Tabla: SPARK_DATA.analytics.OBT_TRIPS | Filas: 818414540
Bounds → year: 2015 … 2025 | dur_min ≤ 1440 | dist ≤ 250.0


In [42]:
# === NULLS: count missing values in the key columns ===
sql_nulls = f"""
SELECT
  SUM(IFF(pickup_datetime IS NULL,1,0))  AS null_pickup,
  SUM(IFF(dropoff_datetime IS NULL,1,0)) AS null_dropoff,
  SUM(IFF(pu_location_id IS NULL,1,0))   AS null_pu,
  SUM(IFF(do_location_id IS NULL,1,0))   AS null_do,
  SUM(IFF(service_type IS NULL,1,0))     AS null_service
FROM {SCHEMA_AN}.{OBT_TABLE}
"""
out_nulls = run_sql(sql_nulls)
# run_sql may hand back an empty list; report None in that case.
if out_nulls:
    print("Nulos clave:", out_nulls[0])
else:
    print("Nulos clave:", None)


Nulos clave: (0, 0, 0, 0, 0)


In [43]:
# === RANGES: negative duration/distance/total, plus speed and tip% outliers ===
sql_ranges = f"""
SELECT
  SUM(IFF(trip_duration_min < 0,1,0)) AS dur_neg,
  SUM(IFF(trip_distance < 0,1,0))     AS dist_neg,
  SUM(IFF(total_amount  < 0,1,0))     AS total_neg,
  SUM(IFF(avg_speed_mph > {MAX_SPEED_MPH},1,0)) AS speed_outlier,
  SUM(IFF(tip_pct > {MAX_TIP_PCT},1,0))         AS tip_pct_outlier
FROM {SCHEMA_AN}.{OBT_TABLE}
"""
out_ranges = run_sql(sql_ranges)
# run_sql may hand back an empty list; report None in that case.
if out_ranges:
    print("Rangos lógicos:", out_ranges[0])
else:
    print("Rangos lógicos:", None)


Rangos lógicos: (0, 0, 2018051, 167375, 122)


In [44]:
# === DATE COHERENCE: trips whose dropoff precedes their pickup ===
sql_dates = f"""
SELECT COUNT(*) AS drop_before_pick
FROM {SCHEMA_AN}.{OBT_TABLE}
WHERE dropoff_datetime < pickup_datetime
"""
out_dates = run_sql(sql_dates)
# First column of the first row holds the count; None when nothing came back.
if out_dates:
    print("Coherencia (dropoff < pickup):", out_dates[0][0])
else:
    print("Coherencia (dropoff < pickup):", None)


Coherencia (dropoff < pickup): 0


In [45]:
# === VALIDATION: trip counts per service_type / year / month ===
sql_counts = f"""
SELECT service_type, year, month, COUNT(*) AS trips
FROM {SCHEMA_AN}.{OBT_TABLE}
GROUP BY 1,2,3
ORDER BY 2,3,1
"""
rows_counts = run_sql(sql_counts)
print("Filas devueltas:", len(rows_counts))
# Show only the first 20 groups (ordered by year, month, service).
for count_row in rows_counts[:20]:
    print(count_row)


Filas devueltas: 296
('yellow', 2001, 1, 25)
('yellow', 2001, 2, 1)
('yellow', 2001, 8, 1)
('yellow', 2002, 2, 11)
('yellow', 2002, 10, 425)
('yellow', 2002, 12, 47)
('yellow', 2003, 1, 48)
('yellow', 2003, 3, 1)
('yellow', 2003, 12, 1)
('yellow', 2004, 4, 1)
('yellow', 2008, 8, 2)
('green', 2008, 10, 1)
('green', 2008, 12, 112)
('yellow', 2008, 12, 762)
('green', 2009, 1, 312)
('yellow', 2009, 1, 1290)
('yellow', 2010, 8, 1)
('green', 2010, 9, 347)
('yellow', 2011, 1, 2)
('yellow', 2011, 2, 2)


In [46]:
# Rows whose `year` falls outside [YEAR_MIN, YEAR_MAX], plus the observed year span
sql_year_window = f"""
WITH base AS (
  SELECT year, COUNT(*) c
  FROM {SCHEMA_AN}.{OBT_TABLE}
  GROUP BY 1
)
SELECT
  SUM(IFF(year < {YEAR_MIN} OR year > {YEAR_MAX}, c, 0)) AS rows_out_of_bounds,
  MIN(year), MAX(year)
FROM base
"""
out = run_sql(sql_year_window)
cnt_oob, min_year, max_year = out[0]
print("Años fuera de ventana:", cnt_oob, "| min_year:", min_year, "| max_year:", max_year)

# Min/max of both timestamps as a sanity check
sql_ts_bounds = f"""
SELECT
  MIN(pickup_datetime), MAX(pickup_datetime),
  MIN(dropoff_datetime), MAX(dropoff_datetime)
FROM {SCHEMA_AN}.{OBT_TABLE}
"""
ts_rows = run_sql(sql_ts_bounds)
mm = ts_rows[0]
print("Fechas → min/max pickup:", *mm[:2], "| min/max dropoff:", *mm[2:4])


Años fuera de ventana: 3450 | min_year: 2001 | max_year: 2098
Fechas → min/max pickup: 2001-01-01 00:01:48 2098-09-11 02:23:31 | min/max dropoff: 2001-01-01 00:04:49 2253-08-23 07:56:38


In [47]:
# Negative and excessive durations/distances, plus speed and tip_pct outliers,
# all counted in a single pass over the table.
q = f"""
SELECT
  SUM(IFF(trip_duration_min < 0,1,0))                                  AS dur_neg,
  SUM(IFF(trip_duration_min > {MAX_TRIP_DURATION_MIN},1,0))            AS dur_excesiva,
  SUM(IFF(trip_distance     < 0,1,0))                                   AS dist_neg,
  SUM(IFF(trip_distance     > {MAX_TRIP_DISTANCE_MI},1,0))             AS dist_excesiva,
  SUM(IFF(avg_speed_mph     > {MAX_SPEED_MPH},1,0))                     AS speed_outlier,
  SUM(IFF(tip_pct           > {MAX_TIP_PCT},1,0))                       AS tip_pct_outlier
FROM {SCHEMA_AN}.{OBT_TABLE}
"""
ext_rows = run_sql(q)
print("Rangos extendidos:", ext_rows[0])


Rangos extendidos: (0, 1849, 0, 10392, 167375, 122)


In [48]:
# Fare reconciliation: |total_amount - sum of its components| per trip.
# The residual CTE is defined once and shared by both queries below so the
# formula cannot drift between them. NULL amounts are treated as 0 (NVL).
resid_cte = f"""
WITH resid AS (
  SELECT
    ABS(
      NVL(total_amount,0)
      - NVL(fare_amount,0) - NVL(extra,0) - NVL(mta_tax,0)
      - NVL(tip_amount,0)  - NVL(tolls_amount,0)
      - NVL(improvement_surcharge,0) - NVL(congestion_surcharge,0)
      - NVL(airport_fee,0)
    ) AS residual,
    service_type, year, month
  FROM {SCHEMA_AN}.{OBT_TABLE}
)
"""

# Overall: rows above tolerance, plus mean and max residual
q = resid_cte + f"""SELECT
  COUNT_IF(residual > {FARE_RESIDUAL_TOL}) AS rows_resid_over_tol,
  AVG(residual) AS avg_residual, MAX(residual) AS max_residual
FROM resid
"""
print("Reconciliación (residuo > tol):", run_sql(q)[0])

# Hotspots: the 20 service/month groups with the most rows above tolerance
q2 = resid_cte + f"""SELECT service_type, year, month, COUNT(*) AS rows_over_tol
FROM resid
WHERE residual > {FARE_RESIDUAL_TOL}
GROUP BY 1,2,3
ORDER BY rows_over_tol DESC
LIMIT 20
"""
print("Top meses con residuo > tol (top 20):")
for r in run_sql(q2):
    print(r)


Reconciliación (residuo > tol): (74854643, 0.2518362981796493, 989965.59)
Top meses con residuo > tol (top 20):
('yellow', 2019, 3, 2719656)
('yellow', 2019, 4, 2562640)
('yellow', 2019, 5, 2558446)
('yellow', 2019, 2, 2321658)
('yellow', 2019, 6, 2311931)
('yellow', 2019, 10, 2296700)
('yellow', 2019, 11, 2148478)
('yellow', 2019, 9, 2106975)
('yellow', 2019, 12, 2103172)
('yellow', 2019, 7, 2039871)
('yellow', 2019, 8, 1957058)
('yellow', 2020, 1, 1936392)
('yellow', 2020, 2, 1912232)
('yellow', 2024, 5, 1089385)
('yellow', 2024, 9, 1081007)
('yellow', 2022, 10, 1077677)
('yellow', 2024, 3, 1076624)
('yellow', 2024, 10, 1074357)
('yellow', 2022, 5, 1072248)
('yellow', 2022, 3, 1061434)


In [49]:
# Catalog-domain checks: count rows whose coded columns fall outside the
# accepted value lists.
# NOTE(review): the accepted lists are hard-coded in the SQL below; confirm
# them against the source's data dictionary before treating non-zero counts as
# bad data. NULLs presumably are not counted by any of these checks (a NULL
# comparison sends IFF to its else branch) — verify.
q = f"""
SELECT
  SUM(IFF(LOWER(service_type) NOT IN ('yellow','green'),1,0))               AS bad_service_type,
  SUM(IFF(payment_type NOT IN (0,1,2,3,4,5,6),1,0))                          AS bad_payment_type,
  SUM(IFF(rate_code_id NOT IN (1,2,3,4,5,6) AND rate_code_id IS NOT NULL,1,0)) AS bad_rate_code,
  SUM(IFF(vendor_id NOT IN (1,2),1,0))                                       AS bad_vendor_id
FROM {SCHEMA_AN}.{OBT_TABLE}
"""
catalog_rows = run_sql(q)
print("Catálogos inválidos:", catalog_rows[0])


Catálogos inválidos: (0, 0, 853472, 938013)


In [50]:
# Reasonable passenger range: [0..6] (adjust if you work with another threshold)
q1 = f"""
SELECT
  SUM(IFF(passenger_count < 0,1,0)) AS pax_neg,
  SUM(IFF(passenger_count > 6,1,0)) AS pax_excesivo
FROM {SCHEMA_AN}.{OBT_TABLE}
"""
pax_rows = run_sql(q1)
print("Pasajeros fuera de rango:", pax_rows[0])

# Charged trips with zero distance (potential outliers)
q2 = f"""
SELECT
  COUNT(*) AS zero_dist_paid
FROM {SCHEMA_AN}.{OBT_TABLE}
WHERE trip_distance = 0 AND total_amount > 0
"""
zero_dist_rows = run_sql(q2)
print("Zero-distance con cobro:", zero_dist_rows[0][0])


Pasajeros fuera de rango: (0, 9510)
Zero-distance con cobro: 7177874


In [51]:
# Top-20 service/month groups for three kinds of anomaly. The three queries are
# built first and then executed in the same order as before.

# Negative totals per month/service
q1 = f"""
SELECT service_type, year, month, COUNT(*) AS rows_neg
FROM {SCHEMA_AN}.{OBT_TABLE}
WHERE total_amount < 0
GROUP BY 1,2,3
ORDER BY rows_neg DESC
LIMIT 20
"""

# Speed outliers per month/service
q2 = f"""
SELECT service_type, year, month, COUNT(*) AS rows_speed_out
FROM {SCHEMA_AN}.{OBT_TABLE}
WHERE avg_speed_mph > {MAX_SPEED_MPH}
GROUP BY 1,2,3
ORDER BY rows_speed_out DESC
LIMIT 20
"""

# tip_pct above the threshold per month/service
q3 = f"""
SELECT service_type, year, month, COUNT(*) AS rows_tip_out
FROM {SCHEMA_AN}.{OBT_TABLE}
WHERE tip_pct > {MAX_TIP_PCT}
GROUP BY 1,2,3
ORDER BY rows_tip_out DESC
LIMIT 20
"""

hotspot_reports = (
    ("Totales negativos (top 20):", q1),
    ("\nSpeed outliers (top 20):", q2),
    ("\nTip% > 100% (top 20):", q3),
)
for heading, hotspot_sql in hotspot_reports:
    print(heading)
    for hotspot_row in run_sql(hotspot_sql):
        print(hotspot_row)


Totales negativos (top 20):
('yellow', 2024, 12, 70321)
('yellow', 2024, 10, 61298)
('yellow', 2024, 11, 60410)
('yellow', 2024, 9, 55958)
('yellow', 2024, 8, 52813)
('yellow', 2024, 7, 50269)
('yellow', 2024, 5, 48862)
('yellow', 2024, 6, 48673)
('yellow', 2024, 3, 44208)
('yellow', 2024, 4, 43649)
('yellow', 2023, 12, 39716)
('yellow', 2023, 10, 36877)
('yellow', 2023, 11, 36637)
('yellow', 2024, 2, 36001)
('yellow', 2024, 1, 35422)
('yellow', 2023, 5, 31704)
('yellow', 2023, 6, 30955)
('yellow', 2023, 8, 30837)
('yellow', 2023, 7, 30719)
('yellow', 2023, 3, 29709)

Speed outliers (top 20):
('yellow', 2016, 5, 2659)
('yellow', 2016, 3, 2508)
('yellow', 2015, 12, 2475)
('yellow', 2016, 6, 2440)
('yellow', 2015, 5, 2395)
('yellow', 2015, 10, 2373)
('yellow', 2015, 3, 2367)
('yellow', 2016, 4, 2354)
('yellow', 2015, 11, 2334)
('yellow', 2016, 10, 2306)
('yellow', 2016, 7, 2279)
('yellow', 2016, 9, 2267)
('yellow', 2016, 12, 2261)
('yellow', 2016, 11, 2255)
('yellow', 2016, 2, 2230)
('ye

In [52]:
# Sample rows for manual inspection. Both queries are built first and then
# executed in the same order as before.

# The 10 trips with the largest fare residual above tolerance
q_resid = f"""
WITH resid AS (
  SELECT
    ABS(
      NVL(total_amount,0)
      - NVL(fare_amount,0) - NVL(extra,0) - NVL(mta_tax,0)
      - NVL(tip_amount,0)  - NVL(tolls_amount,0)
      - NVL(improvement_surcharge,0) - NVL(congestion_surcharge,0)
      - NVL(airport_fee,0)
    ) AS residual,
    *
  FROM {SCHEMA_AN}.{OBT_TABLE}
)
SELECT service_type, year, month, pickup_datetime, dropoff_datetime,
       trip_distance, trip_duration_min, avg_speed_mph,
       fare_amount, extra, mta_tax, tip_amount, tolls_amount,
       improvement_surcharge, congestion_surcharge, airport_fee, total_amount,
       residual
FROM resid
WHERE residual > {FARE_RESIDUAL_TOL}
ORDER BY residual DESC
LIMIT 10
"""

# The 10 trips with the highest average speed above the threshold
q_speed = f"""
SELECT service_type, year, month, pickup_datetime, dropoff_datetime,
       trip_distance, trip_duration_min, avg_speed_mph
FROM {SCHEMA_AN}.{OBT_TABLE}
WHERE avg_speed_mph > {MAX_SPEED_MPH}
ORDER BY avg_speed_mph DESC
LIMIT 10
"""

sample_reports = (
    ("Muestra residuo alto:", q_resid),
    ("\nMuestra speed outlier:", q_speed),
)
for heading, sample_sql in sample_reports:
    print(heading)
    for sample_row in run_sql(sample_sql):
        print(sample_row)


Muestra residuo alto:
('green', 2015, 1, datetime.datetime(2015, 1, 20, 0, 47, 29), datetime.datetime(2015, 1, 20, 0, 50, 1), 0.18, 3.0, 3.5999999999999996, 3.5, 0.5, 0.5, 0.0, 0.0, 0.3, None, None, 989970.39, 989965.59)
('yellow', 2015, 4, datetime.datetime(2015, 4, 12, 10, 3, 19), datetime.datetime(2015, 4, 12, 10, 10, 27), 1.16, 7.0, 9.942828734775043, 6.5, 0.0, 0.5, 0.0, 0.0, 0.3, None, None, 650262.85, 650255.5499999999)
('yellow', 2015, 3, datetime.datetime(2015, 3, 1, 18, 18, 48), datetime.datetime(2015, 3, 1, 18, 27, 6), 1.17, 9.0, 7.8, 7.0, 0.0, 0.5, 0.0, 0.0, 0.3, None, None, 548463.35, 548455.5499999999)
('yellow', 2015, 2, datetime.datetime(2015, 2, 24, 12, 49, 3), datetime.datetime(2015, 2, 24, 12, 58, 56), 1.19, 9.0, 7.933333333333334, 7.5, 0.0, 0.5, 0.0, 0.0, 0.3, None, None, 86330.55, 86322.25)
('yellow', 2015, 2, datetime.datetime(2015, 2, 5, 20, 56, 7), datetime.datetime(2015, 2, 5, 21, 8, 42), 1.53, 12.0, 7.6499999999999995, 9.0, 0.5, 0.5, 0.0, 0.0, 0.3, None, None, 

In [53]:
# QA summary. All counters are computed in ONE pass over the table (previously
# five separate full scans, each on its own connection). Column order:
#   [0:5]   NULL counts for the key columns
#   [5:11]  extended range counters
#   [11]    rows with dropoff before pickup
#   [12]    rows whose year is outside [YEAR_MIN, YEAR_MAX]
#   [13]    rows whose fare residual exceeds FARE_RESIDUAL_TOL
summary_sql = f"""
SELECT
  SUM(IFF(pickup_datetime IS NULL,1,0)),
  SUM(IFF(dropoff_datetime IS NULL,1,0)),
  SUM(IFF(pu_location_id IS NULL,1,0)),
  SUM(IFF(do_location_id IS NULL,1,0)),
  SUM(IFF(service_type IS NULL,1,0)),
  SUM(IFF(trip_duration_min < 0,1,0)),
  SUM(IFF(trip_duration_min > {MAX_TRIP_DURATION_MIN},1,0)),
  SUM(IFF(trip_distance     < 0,1,0)),
  SUM(IFF(trip_distance     > {MAX_TRIP_DISTANCE_MI},1,0)),
  SUM(IFF(avg_speed_mph     > {MAX_SPEED_MPH},1,0)),
  SUM(IFF(tip_pct           > {MAX_TIP_PCT},1,0)),
  COUNT_IF(dropoff_datetime < pickup_datetime),
  COUNT_IF(year < {YEAR_MIN} OR year > {YEAR_MAX}),
  COUNT_IF(
    ABS(
      NVL(total_amount,0)
      - NVL(fare_amount,0) - NVL(extra,0) - NVL(mta_tax,0)
      - NVL(tip_amount,0)  - NVL(tolls_amount,0)
      - NVL(improvement_surcharge,0) - NVL(congestion_surcharge,0)
      - NVL(airport_fee,0)
    ) > {FARE_RESIDUAL_TOL}
  )
FROM {SCHEMA_AN}.{OBT_TABLE}
"""
summary_row = run_sql(summary_sql)[0]

# SUM over an empty table yields NULL (None in Python); treat it as a zero
# count so the checks below do not fail with a TypeError.
summary_counts = tuple(value or 0 for value in summary_row)

# key-column nulls
nulls = summary_counts[0:5]

# extended ranges
rng = summary_counts[5:11]

# date coherence, time window, and residual above tolerance
bad_dates, years_oob, res_over = summary_counts[11:14]

print("Nulls:", nulls)
print("Rangos:", rng)
print("Fechas incoherentes:", bad_dates)
print("Años fuera de ventana:", years_oob)
print("Residuo>tol:", res_over)

# Gate only on hard violations: key nulls, negative duration (rng[0]),
# negative distance (rng[2]), incoherent dates, and rows outside the year
# window. The remaining counters are reported but do not affect the verdict.
ok = (
    sum(nulls) == 0
    and rng[0]==0 and rng[2]==0 and bad_dates==0
    and years_oob==0
)
print("\nEstado QA extendido:", " OK" if ok else "Revisar")


Nulls: (0, 0, 0, 0, 0)
Rangos: (0, 1849, 0, 10392, 167375, 122)
Fechas incoherentes: 0
Años fuera de ventana: 3450
Residuo>tol: 74854643

Estado QA extendido: ⚠️ Revisar
