
# Notebook 05: Preguntas de Negocio usando Spark

**Objetivo:** Responder 20 preguntas de negocio usando `ANALYTICS.OBT_TRIPS` con Spark optimizado.

**Optimizaciones implementadas:**
1. Caché estratégico del DataFrame base
2. Reparticionado óptimo para agregaciones
3. Liberación de memoria entre queries
4. Uso de *approx percentiles* para grandes volúmenes
5. Persist solo cuando es necesario


In [1]:

from utils.snowflake_utils import get_spark_session, get_snowflake_options
from pyspark.sql import functions as F
from pyspark.sql import Window
import os, pathlib, gc
from datetime import datetime

spark = get_spark_session("preguntas_negocio_obt")

# Optimizaciones de configuración
spark.conf.set("spark.sql.adaptive.enabled", "true")
spark.conf.set("spark.sql.adaptive.coalescePartitions.enabled", "true")
spark.conf.set("spark.sql.shuffle.partitions", "200")

sf_options_analytics = get_snowflake_options(schema="ANALYTICS")
db = sf_options_analytics["sfDatabase"]
schema_analytics = sf_options_analytics["sfSchema"]

EVID_DIR = os.environ.get("EVID_DIR", "/home/jovyan/work/evidencias")
pathlib.Path(EVID_DIR).mkdir(parents=True, exist_ok=True)
RUN_ID = os.environ.get("RUN_ID", datetime.utcnow().strftime("%Y%m%d_%H%M%S"))

print("✓ Spark configurado con optimizaciones")
print(f"  Database:  {db}")
print(f"  Schema:    {schema_analytics}")
print(f"  RUN_ID:    {RUN_ID}")
print(f"  Evidencias: {EVID_DIR}")


✓ Snowflake context activo: DB=SPARK_DATA, SCHEMA=SPARK_DATA.RAW, WH=spark_wh, ROLE=ACCOUNTADMIN
✓ Spark configurado con optimizaciones
  Database:  SPARK_DATA
  Schema:    ANALYTICS
  RUN_ID:    20251020_005708
  Evidencias: /home/jovyan/work/evidencias


In [2]:
# Timeouts/heartbeat para conexiones lentas/red con latencia
spark.conf.set("spark.sql.broadcastTimeout", "600")

In [3]:

def save_result(df, filename, description, show_rows=20):
    """Guarda resultado, muestra preview y libera memoria."""
    try:
        print(f"\n{'='*60}")
        print(f"Pregunta: {description}")
        print(f"{'='*60}")
        df.show(show_rows, truncate=False)

        # Guardar CSV pequeño (agregados con límites)
        pdf = df.toPandas()
        filepath = f"{EVID_DIR}/{filename}"
        pdf.to_csv(filepath, index=False)
        print(f"✓ Guardado: {filepath}")
        del pdf
        gc.collect()
    except Exception as e:
        print(f"✗ Error en {description}: {e}")


In [4]:
from pyspark.sql import functions as F
from pyspark import StorageLevel
import os

db = sf_options_analytics["sfDatabase"]
schema_analytics = sf_options_analytics["sfSchema"]
OBT_TABLE = os.environ.get("OBT_TABLE", "OBT_TRIPS")

columns = [
    "service_type", "pickup_datetime", "dropoff_datetime",
    "pickup_date", "pickup_hour", "day_of_week", "month", "year",
    "pu_location_id", "pu_zone", "pu_borough",
    "do_location_id", "do_zone", "do_borough",
    "vendor_id", "vendor_name",
    "rate_code_id", "rate_code_desc",
    "payment_type", "payment_type_desc",
    "trip_type", "passenger_count", "trip_distance",
    "fare_amount", "tip_amount", "tip_pct",
    "tolls_amount", "congestion_surcharge",
    "total_amount", "trip_duration_min", "avg_speed_mph"
]

# 1) Descubre años disponibles con un query pequeño
years_df = (spark.read.format("net.snowflake.spark.snowflake")
            .options(**sf_options_analytics)
            .option("query", f"SELECT DISTINCT year FROM {db}.{schema_analytics}.{OBT_TABLE} ORDER BY year DESC")
            .load())

years = [int(r["YEAR"]) for r in years_df.collect()]
print("Años detectados:", years)

# 2) Lee por partición (año) y une, evitando materializar todo de golpe
dfs = []
for y in years:
    print(f"Cargando year={y} ...")
    part = (spark.read.format("net.snowflake.spark.snowflake")
            .options(**sf_options_analytics)
            .option("query", f"SELECT {', '.join(columns)} FROM {db}.{schema_analytics}.{OBT_TABLE} WHERE year={y}")
            .load())
    dfs.append(part.select(*columns))

assert dfs, "No se cargaron particiones."

df = dfs[0]
for p in dfs[1:]:
    df = df.unionByName(p, allowMissingColumns=True)

# 3) Persistencia segura y gatillo mínimo (sin count, sin foreach RDD)
df = df.repartition(64, "year", "month").persist(StorageLevel.MEMORY_AND_DISK)
df.select("year").groupBy().agg(F.count("*").alias("rows")).show(1)
print("✓ DataFrame cargado y persistido")
df.printSchema()


Años detectados: [2098, 2070, 2053, 2041, 2029, 2028, 2022, 2021, 2020, 2018, 2017, 2016, 2015, 2012, 2011, 2010, 2009, 2008, 2004, 2003, 2002, 2001]
Cargando year=2098 ...
Cargando year=2070 ...
Cargando year=2053 ...
Cargando year=2041 ...
Cargando year=2029 ...
Cargando year=2028 ...
Cargando year=2022 ...
Cargando year=2021 ...
Cargando year=2020 ...
Cargando year=2018 ...
Cargando year=2017 ...
Cargando year=2016 ...
Cargando year=2015 ...
Cargando year=2012 ...
Cargando year=2011 ...
Cargando year=2010 ...
Cargando year=2009 ...
Cargando year=2008 ...
Cargando year=2004 ...
Cargando year=2003 ...
Cargando year=2002 ...
Cargando year=2001 ...


ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 516, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 539, in send_command
    raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while sending or receiving
ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 516, in send_com

Py4JError: An error occurred while calling o1083.showString


## Preguntas de Negocio (20 según PDF)

Todas usando Spark con optimizaciones de memoria.


### a) Top 10 zonas de pickup por volumen mensual

In [None]:

qa = (
    df.groupBy("year", "month", "pu_zone")
    .count()
    .withColumnRenamed("count", "trips")
    .orderBy(F.desc("year"), F.desc("month"), F.desc("trips"))
    .limit(120)
)
save_result(qa, "a_top_pu_zones_monthly.csv", "Top 10 zonas de pickup por volumen mensual", 30)


### b) Top 10 zonas de dropoff por volumen mensual

In [None]:

qb = (
    df.groupBy("year", "month", "do_zone")
    .count()
    .withColumnRenamed("count", "trips")
    .orderBy(F.desc("year"), F.desc("month"), F.desc("trips"))
    .limit(120)
)
save_result(qb, "b_top_do_zones_monthly.csv", "Top 10 zonas de dropoff por volumen mensual", 30)


### c) Evolución mensual de total_amount y tip_pct por borough

In [None]:

qc = (
    df.filter(F.col("pu_borough") != "Unknown")
    .groupBy("year", "month", "pu_borough")
    .agg(
        F.round(F.avg("total_amount"), 2).alias("avg_total_amount"),
        F.round(F.avg("tip_pct"), 2).alias("avg_tip_pct"),
        F.count("*").alias("trips")
    )
    .orderBy("year", "month", "pu_borough")
)
save_result(qc, "c_monthly_evolution_by_borough.csv", "Evolución mensual por borough", 30)


### d) Ticket promedio por service_type y mes

In [None]:

qd = (
    df.groupBy("year", "month", "service_type")
    .agg(
        F.round(F.avg("total_amount"), 2).alias("avg_ticket"),
        F.count("*").alias("trips")
    )
    .orderBy("year", "month", "service_type")
)
save_result(qd, "d_avg_ticket_by_service_month.csv", "Ticket promedio por servicio y mes", 30)


### e) Viajes por hora del día y día de semana (picos)

In [None]:

qe = (
    df.groupBy("pickup_hour", "day_of_week")
    .count()
    .withColumnRenamed("count", "trips")
    .orderBy(F.desc("trips"))
    .limit(100)
)
save_result(qe, "e_trips_by_hour_dow.csv", "Viajes por hora y día de semana (picos)", 30)


### f) p50/p90 de trip_duration_min por borough de pickup

In [None]:

qf = (
    df.filter((F.col("pu_borough") != "Unknown") & (F.col("trip_duration_min") > 0))
    .groupBy("pu_borough")
    .agg(
        F.round(F.expr("percentile_approx(trip_duration_min, 0.5)"), 2).alias("p50_duration"),
        F.round(F.expr("percentile_approx(trip_duration_min, 0.9)"), 2).alias("p90_duration"),
        F.count("*").alias("trips")
    )
    .orderBy("pu_borough")
)
save_result(qf, "f_duration_percentiles_by_borough.csv", "Percentiles de duración por borough", 10)


### g) avg_speed_mph por franja horaria y borough

In [None]:

qg = (
    df.filter(
        (F.col("pu_borough") != "Unknown") & 
        (F.col("avg_speed_mph").isNotNull()) & 
        (F.col("avg_speed_mph") > 0)
    )
    .withColumn(
        "time_slot",
        F.when(F.col("pickup_hour").between(6, 9), "06-09 Morning")
         .when(F.col("pickup_hour").between(17, 20), "17-20 Evening")
         .otherwise("Other")
    )
    .groupBy("time_slot", "pu_borough")
    .agg(
        F.round(F.avg("avg_speed_mph"), 2).alias("avg_speed"),
        F.count("*").alias("trips")
    )
    .orderBy("time_slot", "pu_borough")
)
save_result(qg, "g_avg_speed_by_timeslot_borough.csv", "Velocidad promedio por franja y borough", 20)


### h) Participación por payment_type_desc y relación con tip_pct

In [None]:

total_trips = df.count()
qh = (
    df.groupBy("payment_type_desc")
    .agg(
        F.count("*").alias("trips"),
        F.round(F.avg("tip_pct"), 2).alias("avg_tip_pct"),
        F.round(F.avg("tip_amount"), 2).alias("avg_tip_amount")
    )
    .withColumn("pct_of_total", F.round(F.col("trips") * 100.0 / F.lit(total_trips), 2))
    .orderBy(F.desc("trips"))
)
save_result(qh, "h_payment_type_participation.csv", "Participación por tipo de pago", 10)


### i) Rate codes con mayor trip_distance y total_amount

In [None]:

qi = (
    df.groupBy("rate_code_desc")
    .agg(
        F.count("*").alias("trips"),
        F.round(F.sum("trip_distance"), 2).alias("total_distance"),
        F.round(F.avg("trip_distance"), 2).alias("avg_distance"),
        F.round(F.sum("total_amount"), 2).alias("total_revenue"),
        F.round(F.avg("total_amount"), 2).alias("avg_amount")
    )
    .orderBy(F.desc("total_distance"))
)
save_result(qi, "i_rate_codes_distance_revenue.csv", "Rate codes por distancia y revenue", 10)


### j) Mix yellow vs green por mes y borough

In [None]:

qj = (
    df.filter(F.col("pu_borough") != "Unknown")
    .groupBy("year", "month", "pu_borough", "service_type")
    .count()
    .withColumnRenamed("count", "trips")
    .orderBy(F.desc("year"), F.desc("month"), "pu_borough", "service_type")
    .limit(200)
)
save_result(qj, "j_service_mix_by_month_borough.csv", "Mix yellow vs green por mes y borough", 30)


### k) Top 20 flujos PU→DO por volumen y ticket promedio

In [None]:

qk = (
    df.filter((F.col("pu_zone") != "Unknown") & (F.col("do_zone") != "Unknown"))
    .groupBy("pu_zone", "do_zone")
    .agg(
        F.count("*").alias("trips"),
        F.round(F.avg("total_amount"), 2).alias("avg_ticket"),
        F.round(F.sum("total_amount"), 2).alias("total_revenue")
    )
    .orderBy(F.desc("trips"))
    .limit(20)
)
save_result(qk, "k_top_routes_volume.csv", "Top 20 flujos PU→DO", 20)


### l) Distribución de passenger_count y efecto en total_amount

In [None]:

ql = (
    df.filter(F.col("passenger_count").between(1, 9))
    .groupBy("passenger_count")
    .agg(
        F.count("*").alias("trips"),
        F.round(F.avg("total_amount"), 2).alias("avg_total_amount"),
        F.round(F.avg("trip_distance"), 2).alias("avg_distance")
    )
    .orderBy("passenger_count")
)
save_result(ql, "l_passenger_count_distribution.csv", "Distribución passenger_count", 10)


### m) Impacto de tolls_amount y congestion_surcharge por zona

In [None]:

qm = (
    df.filter(F.col("pu_zone") != "Unknown")
    .groupBy("pu_zone")
    .agg(
        F.count("*").alias("trips"),
        F.round(F.avg("tolls_amount"), 2).alias("avg_tolls"),
        F.round(F.avg("congestion_surcharge"), 2).alias("avg_congestion"),
        F.round(F.avg("total_amount"), 2).alias("avg_total")
    )
    .filter(F.col("trips") >= 1000)
    .orderBy(F.desc("avg_tolls"))
    .limit(30)
)
save_result(qm, "m_surcharges_impact_by_zone.csv", "Impacto de tolls y congestion por zona", 30)


### n) Proporción viajes cortos vs largos por borough y estacionalidad

In [None]:

qn = (
    df.filter(F.col("pu_borough") != "Unknown")
    .withColumn(
        "trip_category",
        F.when(F.col("trip_distance") <= 2, "Short (<=2mi)")
         .when(F.col("trip_distance") <= 5, "Medium (2-5mi)")
         .otherwise("Long (>5mi)")
    )
    .groupBy("pu_borough", "month", "trip_category")
    .count()
    .withColumnRenamed("count", "trips")
    .orderBy("pu_borough", "month", "trip_category")
    .limit(300)
)
save_result(qn, "n_trip_length_distribution.csv", "Proporción viajes cortos vs largos", 30)


### o) Diferencias por vendor en avg_speed_mph y trip_duration_min

In [None]:

qo = (
    df.filter((F.col("avg_speed_mph").isNotNull()) & (F.col("avg_speed_mph") > 0))
    .groupBy("vendor_name")
    .agg(
        F.count("*").alias("trips"),
        F.round(F.avg("avg_speed_mph"), 2).alias("avg_speed"),
        F.round(F.avg("trip_duration_min"), 2).alias("avg_duration"),
        F.round(F.avg("trip_distance"), 2).alias("avg_distance")
    )
    .orderBy(F.desc("trips"))
)
save_result(qo, "o_vendor_performance.csv", "Diferencias por vendor", 10)


### p) Relación método de pago ↔ tip_amount por hora

In [None]:

qp = (
    df.groupBy("pickup_hour", "payment_type_desc")
    .agg(
        F.count("*").alias("trips"),
        F.round(F.avg("tip_amount"), 2).alias("avg_tip"),
        F.round(F.avg("tip_pct"), 2).alias("avg_tip_pct")
    )
    .orderBy("pickup_hour", F.desc("trips"))
    .limit(200)
)
save_result(qp, "p_payment_tip_by_hour.csv", "Método de pago y propina por hora", 30)


### q) Zonas con percentil 99 de duración/distancia fuera de rango

In [None]:

qq = (
    df.filter(F.col("pu_zone") != "Unknown")
    .groupBy("pu_zone")
    .agg(
        F.count("*").alias("trips"),
        F.round(F.expr("percentile_approx(trip_duration_min, 0.99)"), 2).alias("p99_duration"),
        F.round(F.expr("percentile_approx(trip_distance, 0.99)"), 2).alias("p99_distance")
    )
    .filter(F.col("trips") >= 1000)
    .orderBy(F.desc("p99_duration"))
    .limit(30)
)
save_result(qq, "q_zones_extreme_p99.csv", "Zonas con p99 extremo (congestión potencial)", 30)


### r) Yield por milla (total_amount/trip_distance) por borough y hora

In [None]:

qr = (
    df.filter((F.col("pu_borough") != "Unknown") & (F.col("trip_distance") > 0))
    .groupBy("pu_borough", "pickup_hour")
    .agg(
        F.count("*").alias("trips"),
        F.round(F.avg(F.col("total_amount") / F.col("trip_distance")), 2).alias("yield_per_mile"),
        F.round(F.avg("total_amount"), 2).alias("avg_amount"),
        F.round(F.avg("trip_distance"), 2).alias("avg_distance")
    )
    .orderBy(F.desc("yield_per_mile"))
    .limit(100)
)
save_result(qr, "r_yield_per_mile.csv", "Yield por milla por borough y hora", 30)


### s) Cambios YoY en volumen y ticket promedio por service_type

In [None]:

yearly = (
    df.groupBy("year", "service_type")
    .agg(
        F.count("*").alias("trips"),
        F.round(F.avg("total_amount"), 2).alias("avg_ticket")
    )
)

w = Window.partitionBy("service_type").orderBy("year")
qs = (
    yearly
    .withColumn("prev_year_trips", F.lag("trips").over(w))
    .withColumn("prev_year_ticket", F.lag("avg_ticket").over(w))
    .withColumn("yoy_trips_pct", F.round((F.col("trips") - F.col("prev_year_trips")) * 100.0 / F.col("prev_year_trips"), 2))
    .withColumn("yoy_ticket_pct", F.round((F.col("avg_ticket") - F.col("prev_year_ticket")) * 100.0 / F.col("prev_year_ticket"), 2))
    .filter(F.col("prev_year_trips").isNotNull())
    .orderBy("year", "service_type")
)
save_result(qs, "s_yoy_changes.csv", "Cambios YoY en volumen y ticket", 30)


### t) Días con alta congestion_surcharge: efecto en total_amount vs días normales

In [None]:

daily = (
    df.filter(F.col("pickup_date").isNotNull())
    .groupBy("pickup_date")
    .agg(
        F.avg("congestion_surcharge").alias("avg_congestion"),
        F.avg("total_amount").alias("avg_total"),
        F.count("*").alias("trips")
    )
)

classified = (
    daily.select(
        F.when(F.col("avg_congestion") > 2, "High Congestion").otherwise("Normal").alias("day_type"),
        F.col("avg_congestion"),
        F.col("avg_total"),
        F.col("trips")
    )
)

qt = (
    classified.groupBy("day_type")
    .agg(
        F.count("*").alias("days"),
        F.round(F.avg("avg_congestion"), 2).alias("avg_congestion_charge"),
        F.round(F.avg("avg_total"), 2).alias("avg_total_amount"),
        F.round(F.avg("trips"), 0).alias("avg_trips_per_day")
    )
    .orderBy("day_type")
)
save_result(qt, "t_congestion_effect.csv", "Efecto de alta congestión vs días normales", 10)


## Resumen de Ejecución

In [None]:

import os

csv_files = sorted([f for f in os.listdir(EVID_DIR) if f.endswith('.csv')])


print(f"RUN_ID: {RUN_ID}")
print(f" Evidencias en: {EVID_DIR}")
print("\n📋 Archivos generados (muestra):")
for f in csv_files[:10]:
    print(f"   - {f}")
if len(csv_files) > 10:
    print(f"   ... y {len(csv_files) - 10} más")

gc.collect()
