In [2]:
import os
import snowflake.connector
from datetime import datetime

# ========= Config =========
SOURCE_SCOPE = "GLOBAL"  # scope label stored with every result of this run

# Largest observed count a check may report and still PASS, per check type.
THRESHOLDS = {
    "NULL": 100,   # up to 100 NULL values tolerated per check
    "RANGE": 500,  # up to 500 out-of-range rows tolerated per check
    "TIME": 0,     # no temporal inconsistencies tolerated
}

# ========= Connection =========
# Account, credentials, warehouse, database and role are read from SF_*
# environment variables (None when unset); the schema is fixed to ANALYTICS.
_sf_settings = {
    "account": os.environ.get("SF_ACCOUNT"),
    "user": os.environ.get("SF_USER"),
    "password": os.environ.get("SF_PASSWORD"),
    "warehouse": os.environ.get("SF_WAREHOUSE"),
    "database": os.environ.get("SF_DATABASE"),
    "schema": "ANALYTICS",
    "role": os.environ.get("SF_ROLE"),
}
conn = snowflake.connector.connect(**_sf_settings)
# Single module-level cursor; run() executes every statement through it.
cur = conn.cursor()

def run(sql, params=None):
    """Execute *sql* on the module-level cursor and return that cursor.

    Args:
        sql: Statement text to execute.
        params: Optional bind values. When ``None`` the statement is executed
            without a parameters argument.

    Returns:
        The module-level cursor ``cur``, so callers can chain ``fetchone()``
        or ``fetchall()`` on the result.
    """
    if params is not None:
        cur.execute(sql, params)
    else:
        cur.execute(sql)
    return cur

# ========= Log table setup =========
# Both statements use IF NOT EXISTS. They are executed in order: the schema
# first, then the DATA_QUALITY_LOG table inside it. The SQL text (including
# its inline "--" comments) is sent to the server exactly as written.
_SETUP_STATEMENTS = (
    """
CREATE SCHEMA IF NOT EXISTS ANALYTICS;
""",
    """
CREATE TABLE IF NOT EXISTS ANALYTICS.DATA_QUALITY_LOG (
    check_name   STRING,
    check_type   STRING,     -- NULL | RANGE | TIME | COUNT
    check_status STRING,     -- PASS | FAIL | INFO
    check_value  NUMBER,     -- cantidad observada
    source_scope STRING,     -- p.ej. 'GLOBAL' o '2017_08'
    run_ts       TIMESTAMP_NTZ DEFAULT CURRENT_TIMESTAMP
);
""",
)
for _ddl in _SETUP_STATEMENTS:
    run(_ddl)

def log_result(name, typ, value, status, scope=SOURCE_SCOPE):
    """Insert one check outcome into ANALYTICS.DATA_QUALITY_LOG.

    The INSERT does not list ``run_ts``, so that column is not set here.

    Args:
        name: Check identifier stored in ``check_name``.
        typ: Check category stored in ``check_type``.
        value: Observed count; converted with ``int()`` before insertion.
        status: Outcome label stored in ``check_status``.
        scope: Scope label stored in ``source_scope``; defaults to the
            module-level ``SOURCE_SCOPE``.
    """
    insert_sql = """
        INSERT INTO ANALYTICS.DATA_QUALITY_LOG (check_name, check_type, check_status, check_value, source_scope)
        VALUES (%s, %s, %s, %s, %s);
    """
    # Bind values follow the column order of the INSERT, not the argument order.
    row = (name, typ, status, int(value), scope)
    run(insert_sql, row)

def evaluate_and_log(name, typ, value):
    """Grade a check against its threshold and record the outcome.

    The check passes when *value* does not exceed ``THRESHOLDS[typ]``;
    otherwise it fails. The result is written through ``log_result``.

    Args:
        name: Check identifier passed through to ``log_result``.
        typ: Check category; must be a key of ``THRESHOLDS``.
        value: Observed count to compare with the threshold.

    Raises:
        KeyError: If *typ* has no entry in ``THRESHOLDS``.
    """
    limit = THRESHOLDS[typ]
    if value <= limit:
        verdict = "PASS"
    else:
        verdict = "FAIL"
    log_result(name, typ, value, verdict)

# ========= 1) Critical NULLs =========
# Maps each logged check name to the OBT_TRIPS column whose NULLs are counted.
# Column names are hard-coded here, so interpolating them into SQL is safe.
null_checks = {
    "null_pickup_datetime":      "pickup_datetime",
    "null_dropoff_datetime":     "dropoff_datetime",
    "null_pu_location_id":       "pu_location_id",
    "null_do_location_id":       "do_location_id",
    "null_passenger_count":      "passenger_count",
    "null_trip_distance":        "trip_distance",
    "null_fare_amount":          "fare_amount",
    "null_total_amount":         "total_amount",
    "null_service_type":         "service_type",
    "null_vendor_id":            "vendor_id",
    "null_payment_type":         "payment_type",
    "null_rate_code_id":         "rate_code_id"
}

# Count the NULLs of every column with ONE statement instead of one statement
# per column: fewer round trips, and all counts come from the same snapshot
# of the table. COUNT(<expr>) ignores NULL expressions, so each CASE only
# counts rows where the column IS NULL and yields 0 when there are none.
null_exprs = ",\n    ".join(
    f"COUNT(CASE WHEN {col} IS NULL THEN 1 END)" for col in null_checks.values()
)
sql = f"SELECT\n    {null_exprs}\nFROM ANALYTICS.OBT_TRIPS;"
null_counts = run(sql).fetchone()

# The result columns are in the same order as the dict, so zip pairs each
# check name with its own count; results are logged in the original order.
for check_name, cnt in zip(null_checks, null_counts):
    evaluate_and_log(check_name, "NULL", cnt)

# ========= 2) Logical ranges =========
# Maps each logged check name to the SQL predicate that flags an
# out-of-range row. Predicates are hard-coded, so interpolation is safe.
range_checks = {
    "range_trip_distance":  "trip_distance < 0 OR trip_distance > 200",
    "range_passenger_count":"passenger_count < 0 OR passenger_count > 6",
    "range_total_amount":   "total_amount < 0 OR total_amount > 1000",
    # Negative tip, or tip larger than twice the fare (deliberately
    # conservative). Kept in the same table as the other range checks so it
    # goes through the same code path.
    "range_tip_amount_vs_fare": "tip_amount < 0 OR tip_amount > (fare_amount * 2)",
}

# Evaluate every predicate with ONE statement instead of one statement per
# check. COUNT(<expr>) ignores NULLs, so a row is counted only when its
# predicate is TRUE; a predicate that evaluates to NULL is not counted, which
# matches what a WHERE clause would do. The parentheses keep each OR-predicate
# intact inside the CASE.
range_exprs = ",\n    ".join(
    f"COUNT(CASE WHEN ({cond}) THEN 1 END)" for cond in range_checks.values()
)
sql = f"SELECT\n    {range_exprs}\nFROM ANALYTICS.OBT_TRIPS;"
range_counts = run(sql).fetchone()

# Result columns follow the dict order, so zip pairs each name with its count.
for check_name, cnt in zip(range_checks, range_counts):
    evaluate_and_log(check_name, "RANGE", cnt)

# ========= 3) Temporal consistency =========
# Count trips whose pickup is not strictly before their dropoff.
sql = "SELECT COUNT(*) FROM ANALYTICS.OBT_TRIPS WHERE pickup_datetime >= dropoff_datetime;"
cnt = run(sql).fetchone()[0]
# TIME is critical: THRESHOLDS["TIME"] is 0. Grading goes through
# evaluate_and_log (same rule as the NULL and RANGE checks) instead of
# repeating the threshold comparison by hand here.
evaluate_and_log("time_pickup_before_dropoff", "TIME", cnt)

# ========= 4) Counts per month/service (INFO) =========
# Every (year, month, service_type) group is logged as an INFO event, and the
# first groups are printed as a sample.
counts_cur = run("""
    SELECT year, month, service_type, COUNT(*) AS total_trips
    FROM ANALYTICS.OBT_TRIPS
    GROUP BY 1,2,3
    ORDER BY 1,2,3;
""")
rows = counts_cur.fetchall()


def _month_label(month):
    """Return *month* zero-padded to two digits, or "NULL" when it is None."""
    return "NULL" if month is None else f"{int(month):02d}"


# Log one INFO row per group (a large number of groups is expected).
# A group whose year or month is NULL arrives as None; int(None) would raise
# and abort the run before the connection is closed, so such groups get a
# "NULL" label instead. Names for non-NULL groups are unchanged.
for y, m, svc, total in rows:
    year_label = "NULL" if y is None else str(int(y))
    name = f"count_{year_label}_{_month_label(m)}_{svc}"
    log_result(name, "COUNT", total, "INFO")

# Print the first rows as a textual "dashboard".
print("🧮 Conteos por año/mes/servicio (primeros 50):")
for i, (y, m, svc, total) in enumerate(rows[:50], 1):
    # str(svc): the "6s" format spec only accepts strings, so a NULL
    # service_type (None) would otherwise raise TypeError.
    print(f"{i:02d}. {y}-{_month_label(m)} | {str(svc):6s} | {total:,} viajes")

# ========= Final summary of this run =========
print("\n✅ Validaciones ejecutadas y registradas en ANALYTICS.DATA_QUALITY_LOG")

# Close the cursor first, then the connection.
for _handle in (cur, conn):
    _handle.close()


🧮 Conteos por año/mes/servicio (primeros 50):
01. 2001-01 | YELLOW | 24 viajes
02. 2001-02 | YELLOW | 1 viajes
03. 2001-08 | YELLOW | 1 viajes
04. 2002-02 | YELLOW | 11 viajes
05. 2002-10 | YELLOW | 439 viajes
06. 2002-12 | YELLOW | 45 viajes
07. 2003-01 | YELLOW | 48 viajes
08. 2003-03 | YELLOW | 1 viajes
09. 2003-12 | YELLOW | 1 viajes
10. 2004-04 | YELLOW | 1 viajes
11. 2007-12 | YELLOW | 1 viajes
12. 2008-08 | YELLOW | 3 viajes
13. 2008-10 | GREEN  | 1 viajes
14. 2008-12 | GREEN  | 113 viajes
15. 2008-12 | YELLOW | 764 viajes
16. 2009-01 | GREEN  | 315 viajes
17. 2009-01 | YELLOW | 1,289 viajes
18. 2010-08 | YELLOW | 1 viajes
19. 2010-09 | GREEN  | 348 viajes
20. 2011-01 | YELLOW | 2 viajes
21. 2011-02 | YELLOW | 2 viajes
22. 2012-02 | YELLOW | 1 viajes
23. 2012-09 | GREEN  | 3 viajes
24. 2014-11 | YELLOW | 1 viajes
25. 2015-01 | GREEN  | 1,508,493 viajes
26. 2015-02 | GREEN  | 1,574,830 viajes
27. 2015-02 | YELLOW | 12,442,394 viajes
28. 2015-03 | GREEN  | 1,722,574 viajes
29. 201