# 05 - Análisis de Datos (20 Preguntas de Negocio)

## Objetivo
Responder las 20 preguntas de negocio especificadas en el PDF

In [14]:
from utils.snowflake_utils import get_spark_session, get_snowflake_options
from pyspark.sql import functions as F
import os, gc
from pathlib import Path
from datetime import datetime

spark = get_spark_session("data_analysis")
sf_options = get_snowflake_options(schema="ANALYTICS")

# Configuración para optimizar memoria
spark.conf.set("spark.sql.adaptive.enabled", "true")
spark.conf.set("spark.sql.adaptive.coalescePartitions.enabled", "true")
spark.conf.set("spark.sql.shuffle.partitions", "200")

RUN_ID = os.environ.get("RUN_ID", datetime.utcnow().strftime("%Y%m%d_%H%M%S"))
EVID_DIR = Path("/home/jovyan/work/evidencias")
EVID_DIR.mkdir(parents=True, exist_ok=True)

print(f"✓ Spark configurado con optimizaciones de memoria")
print(f"  RUN_ID: {RUN_ID}")
print(f"  Evidencias: {EVID_DIR}")

✓ Snowflake context activo: DB=SPARK_DATA, SCHEMA=SPARK_DATA.RAW, WH=spark_wh, ROLE=ACCOUNTADMIN
✓ Spark configurado con optimizaciones de memoria
  RUN_ID: 20251020_011122
  Evidencias: /home/jovyan/work/evidencias


In [15]:
def query_snowflake(sql, description=""):
    """Ejecuta query directamente en Snowflake y retorna DataFrame pequeño."""
    print(f"\n{'='*60}")
    print(f"Pregunta: {description}")
    print(f"{'='*60}")
    
    df = (spark.read.format("net.snowflake.spark.snowflake")
          .options(**sf_options)
          .option("query", sql)
          .load())
    return df

def save_and_show(df, filename, description, limit=20):
    """Guarda resultado y muestra preview, luego libera memoria."""
    try:
        # Forzar ejecución y mostrar
        df.show(limit, truncate=False)
        
        # Guardar solo el resultado final (ya ejecutado)
        pdf = df.toPandas()
        filepath = EVID_DIR / filename
        pdf.to_csv(filepath, index=False)
        print(f"✓ Guardado: {filepath}")
        
        # Liberar memoria
        del pdf
        
    except Exception as e:
        print(f"✗ Error en {description}: {e}")
    finally:
        # Forzar garbage collection
        gc.collect()

## Preguntas de Negocio 

### a) Top 10 zonas de pickup por volumen mensual

In [16]:
sql_a = """
SELECT 
    year,
    month,
    pu_zone,
    COUNT(*) AS trips
FROM SPARK_DATA.ANALYTICS.OBT_TRIPS
GROUP BY year, month, pu_zone
ORDER BY year DESC, month DESC, trips DESC
LIMIT 120
"""

df_a = query_snowflake(sql_a, "Top 10 zonas de pickup por volumen mensual")
save_and_show(df_a, "a_top_pu_zones_monthly.csv", "Top PU zones", 20)


Pregunta: Top 10 zonas de pickup por volumen mensual
+----+-----+----------------------------+-----+
|YEAR|MONTH|PU_ZONE                     |TRIPS|
+----+-----+----------------------------+-----+
|2098|9    |Midtown North               |1    |
|2090|12   |JFK Airport                 |1    |
|2088|1    |Bloomingdale                |1    |
|2088|1    |Central Harlem              |1    |
|2084|11   |Upper West Side North       |2    |
|2084|11   |TriBeCa/Civic Center        |1    |
|2084|11   |Upper East Side North       |1    |
|2084|11   |East Chelsea                |1    |
|2084|11   |Murray Hill                 |1    |
|2084|11   |Gramercy                    |1    |
|2084|11   |Greenwich Village North     |1    |
|2081|6    |Flushing Meadows-Corona Park|1    |
|2070|8    |Queensbridge/Ravenswood     |1    |
|2066|12   |UN/Turtle Bay South         |1    |
|2062|8    |Astoria                     |1    |
|2058|12   |Upper East Side South       |1    |
|2058|12   |Penn Station/Madison S

### b) Top 10 zonas de dropoff por volumen mensual

In [17]:
sql_b = """
SELECT 
    year,
    month,
    do_zone,
    COUNT(*) AS trips
FROM SPARK_DATA.ANALYTICS.OBT_TRIPS
GROUP BY year, month, do_zone
ORDER BY year DESC, month DESC, trips DESC
LIMIT 120
"""

df_b = query_snowflake(sql_b, "Top 10 zonas de dropoff por volumen mensual")
save_and_show(df_b, "b_top_do_zones_monthly.csv", "Top DO zones", 20)


Pregunta: Top 10 zonas de dropoff por volumen mensual
+----+-----+-----------------------+-----+
|YEAR|MONTH|DO_ZONE                |TRIPS|
+----+-----+-----------------------+-----+
|2098|9    |TriBeCa/Civic Center   |1    |
|2090|12   |Midtown North          |1    |
|2088|1    |Midtown East           |1    |
|2088|1    |Morningside Heights    |1    |
|2084|11   |Upper West Side North  |2    |
|2084|11   |Upper East Side South  |1    |
|2084|11   |TriBeCa/Civic Center   |1    |
|2084|11   |Upper East Side North  |1    |
|2084|11   |East Chelsea           |1    |
|2084|11   |Kips Bay               |1    |
|2084|11   |Murray Hill            |1    |
|2081|6    |Hammels/Arverne        |1    |
|2070|8    |Queensbridge/Ravenswood|1    |
|2066|12   |JFK Airport            |1    |
|2062|8    |NULL                   |1    |
|2058|12   |Midtown Center         |1    |
|2058|12   |Midtown East           |1    |
|2058|12   |Midtown North          |1    |
|2053|7    |NULL                   |1    |

### c) Evolución mensual de total_amount y tip_pct por borough

In [18]:
sql_c = """
SELECT 
    year,
    month,
    pu_borough,
    ROUND(AVG(total_amount), 2) AS avg_total_amount,
    ROUND(AVG(tip_pct), 2) AS avg_tip_pct,
    COUNT(*) AS trips
FROM SPARK_DATA.ANALYTICS.OBT_TRIPS
WHERE pu_borough != 'Unknown'
GROUP BY year, month, pu_borough
ORDER BY year, month, pu_borough
"""

df_c = query_snowflake(sql_c, "Evolución mensual por borough")
save_and_show(df_c, "c_monthly_evolution_by_borough.csv", "Evolution borough", 30)


Pregunta: Evolución mensual por borough
+----+-----+----------+----------------+-----------+-----+
|YEAR|MONTH|PU_BOROUGH|AVG_TOTAL_AMOUNT|AVG_TIP_PCT|TRIPS|
+----+-----+----------+----------------+-----------+-----+
|2001|1    |Brooklyn  |8.8             |0.0        |1    |
|2001|1    |Manhattan |18.99           |0.01       |18   |
|2001|1    |Queens    |49.9            |0.03       |5    |
|2001|2    |Queens    |3.8             |0.0        |1    |
|2001|8    |Queens    |24.55           |0.0        |1    |
|2002|2    |Manhattan |17.9            |0.0        |11   |
|2002|10   |Bronx     |3.8             |0.0        |2    |
|2002|10   |Brooklyn  |25.91           |0.01       |14   |
|2002|10   |EWR       |92.14           |0.06       |3    |
|2002|10   |Manhattan |18.96           |0.05       |309  |
|2002|10   |Queens    |44.46           |0.05       |78   |
|2002|12   |Brooklyn  |31.3            |0.0        |1    |
|2002|12   |Manhattan |18.44           |0.02       |22   |
|2002|12   |Que

### d) Ticket promedio por service_type y mes

In [19]:
sql_d = """
SELECT 
    year,
    month,
    service_type,
    ROUND(AVG(total_amount), 2) AS avg_ticket,
    COUNT(*) AS trips
FROM SPARK_DATA.ANALYTICS.OBT_TRIPS
GROUP BY year, month, service_type
ORDER BY year, month, service_type
"""

df_d = query_snowflake(sql_d, "Ticket promedio por servicio y mes")
save_and_show(df_d, "d_avg_ticket_by_service_month.csv", "Avg ticket", 30)


Pregunta: Ticket promedio por servicio y mes
+----+-----+------------+----------+--------+
|YEAR|MONTH|SERVICE_TYPE|AVG_TICKET|TRIPS   |
+----+-----+------------+----------+--------+
|2001|1    |yellow      |24.16     |25      |
|2001|2    |yellow      |3.8       |1       |
|2001|8    |yellow      |24.55     |1       |
|2002|2    |yellow      |17.9      |11      |
|2002|10   |yellow      |25.42     |425     |
|2002|12   |yellow      |27.97     |47      |
|2003|1    |yellow      |25.97     |48      |
|2003|3    |yellow      |3.31      |1       |
|2003|12   |yellow      |7.3       |1       |
|2004|4    |yellow      |12.3      |1       |
|2008|8    |yellow      |2.83      |2       |
|2008|10   |green       |0.0       |1       |
|2008|12   |green       |13.55     |112     |
|2008|12   |yellow      |23.33     |762     |
|2009|1    |green       |16.36     |312     |
|2009|1    |yellow      |19.74     |1290    |
|2010|8    |yellow      |18.36     |1       |
|2010|9    |green       |17.98    

### e) Viajes por hora del día y día de semana (picos)

In [20]:
sql_e = """
SELECT 
    pickup_hour,
    day_of_week,
    COUNT(*) AS trips
FROM SPARK_DATA.ANALYTICS.OBT_TRIPS
GROUP BY pickup_hour, day_of_week
ORDER BY trips DESC
LIMIT 100
"""

df_e = query_snowflake(sql_e, "Viajes por hora y día de semana")
save_and_show(df_e, "e_trips_by_hour_dow.csv", "Peak hours", 30)


Pregunta: Viajes por hora y día de semana
+-----------+-----------+-------+
|PICKUP_HOUR|DAY_OF_WEEK|TRIPS  |
+-----------+-----------+-------+
|19         |Fri        |8085381|
|18         |Fri        |8079215|
|18         |Thu        |8065882|
|18         |Wed        |7969811|
|19         |Thu        |7889838|
|18         |Tue        |7866078|
|19         |Wed        |7747710|
|21         |Thu        |7527772|
|20         |Thu        |7490767|
|19         |Sat        |7477011|
|19         |Tue        |7414601|
|18         |Sat        |7400524|
|18         |Mon        |7271765|
|21         |Wed        |7267044|
|20         |Wed        |7266850|
|22         |Fri        |7178517|
|22         |Thu        |7088315|
|20         |Fri        |7078196|
|17         |Fri        |7066933|
|17         |Thu        |7019118|
|20         |Tue        |6966762|
|17         |Wed        |6966428|
|21         |Fri        |6929618|
|21         |Tue        |6929545|
|23         |Fri        |6920020|
|23  

### f) p50/p90 de trip_duration_min por borough de pickup

In [39]:
sql_f = """
SELECT 
    pu_borough,
    ROUND(APPROX_PERCENTILE(trip_duration_min, 0.5), 2) AS p50_duration_approx,
    ROUND(APPROX_PERCENTILE(trip_duration_min, 0.9), 2) AS p90_duration_approx,
    COUNT(*) AS trips
FROM SPARK_DATA.ANALYTICS.OBT_TRIPS
WHERE pu_borough != 'Unknown'
  AND trip_duration_min > 0
GROUP BY pu_borough
ORDER BY pu_borough;

"""

df_f = query_snowflake(sql_f, "Percentiles de duración por borough")
save_and_show(df_f, "f_duration_percentiles_by_borough.csv", "Duration percentiles", 10)


Pregunta: Percentiles de duración por borough
+-------------+-------------------+-------------------+---------+
|PU_BOROUGH   |P50_DURATION_APPROX|P90_DURATION_APPROX|TRIPS    |
+-------------+-------------------+-------------------+---------+
|Bronx        |14.0               |38.0               |4822364  |
|Brooklyn     |13.0               |32.01              |33702956 |
|EWR          |1.0                |11.94              |29229    |
|Manhattan    |11.0               |25.0               |697547525|
|Queens       |25.0               |54.07              |67519443 |
|Staten Island|29.0               |74.0               |44996    |
+-------------+-------------------+-------------------+---------+

✓ Guardado: /home/jovyan/work/evidencias/f_duration_percentiles_by_borough.csv


### g) avg_speed_mph por franja horaria y borough

In [24]:
sql_g = """
SELECT 
    CASE 
        WHEN pickup_hour BETWEEN 6 AND 9 THEN '06-09 Morning'
        WHEN pickup_hour BETWEEN 17 AND 20 THEN '17-20 Evening'
        ELSE 'Other'
    END AS time_slot,
    pu_borough,
    ROUND(AVG(avg_speed_mph), 2) AS avg_speed,
    COUNT(*) AS trips
FROM SPARK_DATA.ANALYTICS.OBT_TRIPS
WHERE pu_borough != 'Unknown'
  AND avg_speed_mph IS NOT NULL
  AND avg_speed_mph > 0
GROUP BY time_slot, pu_borough
ORDER BY time_slot, pu_borough
"""

df_g = query_snowflake(sql_g, "Velocidad promedio por franja horaria y borough")
save_and_show(df_g, "g_avg_speed_by_timeslot_borough.csv", "Avg speed", 20)


Pregunta: Velocidad promedio por franja horaria y borough
+-------------+-------------+---------+---------+
|TIME_SLOT    |PU_BOROUGH   |AVG_SPEED|TRIPS    |
+-------------+-------------+---------+---------+
|06-09 Morning|Bronx        |130.76   |946281   |
|06-09 Morning|Brooklyn     |46.33    |4234992  |
|06-09 Morning|EWR          |170.74   |1963     |
|06-09 Morning|Manhattan    |20.89    |102470517|
|06-09 Morning|Queens       |35.83    |8434854  |
|06-09 Morning|Staten Island|33.73    |8111     |
|17-20 Evening|Bronx        |40.98    |927816   |
|17-20 Evening|Brooklyn     |19.13    |7461647  |
|17-20 Evening|EWR          |194.66   |2590     |
|17-20 Evening|Manhattan    |39.87    |168349304|
|17-20 Evening|Queens       |34.11    |16257703 |
|17-20 Evening|Staten Island|96.95    |7308     |
|Other        |Bronx        |92.28    |2792403  |
|Other        |Brooklyn     |33.69    |21627585 |
|Other        |EWR          |186.81   |7029     |
|Other        |Manhattan    |19.98    |42

### h) Participación por payment_type_desc y relación con tip_pct

In [25]:
sql_h = """
SELECT 
    payment_type_desc,
    COUNT(*) AS trips,
    ROUND(100.0 * COUNT(*) / SUM(COUNT(*)) OVER (), 2) AS pct_of_total,
    ROUND(AVG(tip_pct), 2) AS avg_tip_pct,
    ROUND(AVG(tip_amount), 2) AS avg_tip_amount
FROM SPARK_DATA.ANALYTICS.OBT_TRIPS
GROUP BY payment_type_desc
ORDER BY trips DESC
"""

df_h = query_snowflake(sql_h, "Participación por tipo de pago y propina")
save_and_show(df_h, "h_payment_type_participation.csv", "Payment types", 10)


Pregunta: Participación por tipo de pago y propina
+-----------------+---------+------------+-----------+--------------+
|PAYMENT_TYPE_DESC|TRIPS    |PCT_OF_TOTAL|AVG_TIP_PCT|AVG_TIP_AMOUNT|
+-----------------+---------+------------+-----------+--------------+
|Credit Card      |549846312|67.18       |0.15       |2.98          |
|Cash             |251349549|30.71       |0.0        |0.0           |
|Unknown          |9382205  |1.15        |0.28       |20.71         |
|No Charge        |3787215  |0.46        |0.0        |0.04          |
|Dispute          |2219711  |0.27        |0.0        |0.03          |
|NULL             |1829548  |0.22        |0.03       |0.88          |
+-----------------+---------+------------+-----------+--------------+

✓ Guardado: /home/jovyan/work/evidencias/h_payment_type_participation.csv


### i) Rate codes con mayor trip_distance y total_amount

In [26]:
sql_i = """
SELECT 
    rate_code_desc,
    COUNT(*) AS trips,
    ROUND(SUM(trip_distance), 2) AS total_distance,
    ROUND(AVG(trip_distance), 2) AS avg_distance,
    ROUND(SUM(total_amount), 2) AS total_revenue,
    ROUND(AVG(total_amount), 2) AS avg_amount
FROM SPARK_DATA.ANALYTICS.OBT_TRIPS
GROUP BY rate_code_desc
ORDER BY total_distance DESC
"""

df_i = query_snowflake(sql_i, "Rate codes por distancia y revenue")
save_and_show(df_i, "i_rate_codes_distance_revenue.csv", "Rate codes", 10)


Pregunta: Rate codes por distancia y revenue
+---------------------+---------+---------------+------------+-----------------+----------+
|RATE_CODE_DESC       |TRIPS    |TOTAL_DISTANCE |AVG_DISTANCE|TOTAL_REVENUE    |AVG_AMOUNT|
+---------------------+---------+---------------+------------+-----------------+----------+
|Standard rate        |780536229|3.47480959413E9|4.45        |1.270860302435E10|16.28     |
|NULL                 |12062203 |6.4959461609E8 |53.85       |3.661169662E8    |30.35     |
|JFK                  |18681164 |4.1305612683E8 |22.11       |1.29680395028E9  |69.42     |
|Negotiated fare      |4866382  |8.882004435E7  |18.25       |2.6374460516E8   |54.2      |
|Newark               |1634103  |2.711823245E7  |16.6        |1.5093149036E8   |92.36     |
|Nassau or Westchester|627343   |1.84984592E7   |29.49       |5.819990246E7    |92.77     |
|Group ride           |7116     |8903.37        |1.25        |179496.34        |25.22     |
+---------------------+---------+-

### j) Mix yellow vs green por mes y borough

In [27]:
sql_j = """
SELECT 
    year,
    month,
    pu_borough,
    service_type,
    COUNT(*) AS trips
FROM SPARK_DATA.ANALYTICS.OBT_TRIPS
WHERE pu_borough != 'Unknown'
GROUP BY year, month, pu_borough, service_type
ORDER BY year DESC, month DESC, pu_borough, service_type
LIMIT 200
"""

df_j = query_snowflake(sql_j, "Mix yellow vs green por mes y borough")
save_and_show(df_j, "j_service_mix_by_month_borough.csv", "Service mix", 30)


Pregunta: Mix yellow vs green por mes y borough
+----+-----+----------+------------+-----+
|YEAR|MONTH|PU_BOROUGH|SERVICE_TYPE|TRIPS|
+----+-----+----------+------------+-----+
|2098|9    |Manhattan |yellow      |1    |
|2090|12   |Queens    |yellow      |1    |
|2088|1    |Manhattan |yellow      |2    |
|2084|11   |Manhattan |yellow      |8    |
|2081|6    |Queens    |green       |1    |
|2070|8    |Queens    |yellow      |1    |
|2066|12   |Manhattan |yellow      |1    |
|2062|8    |Queens    |green       |1    |
|2058|12   |Manhattan |yellow      |3    |
|2053|3    |Brooklyn  |yellow      |1    |
|2042|12   |Manhattan |yellow      |1    |
|2041|11   |Manhattan |yellow      |1    |
|2041|8    |Queens    |green       |1    |
|2041|6    |Manhattan |yellow      |1    |
|2041|3    |Manhattan |yellow      |1    |
|2038|2    |Manhattan |yellow      |4    |
|2037|11   |Manhattan |yellow      |1    |
|2035|9    |Queens    |green       |1    |
|2033|4    |Manhattan |yellow      |3    |
|2032

### k) Top 20 flujos PU→DO por volumen y ticket promedio

In [28]:
sql_k = """
SELECT 
    pu_zone,
    do_zone,
    COUNT(*) AS trips,
    ROUND(AVG(total_amount), 2) AS avg_ticket,
    ROUND(SUM(total_amount), 2) AS total_revenue
FROM SPARK_DATA.ANALYTICS.OBT_TRIPS
WHERE pu_zone != 'Unknown' AND do_zone != 'Unknown'
GROUP BY pu_zone, do_zone
ORDER BY trips DESC
LIMIT 20
"""

df_k = query_snowflake(sql_k, "Top 20 flujos PU→DO")
save_and_show(df_k, "k_top_routes_volume.csv", "Top routes", 20)


Pregunta: Top 20 flujos PU→DO
+----------------------------+----------------------------+-------+----------+-------------+
|PU_ZONE                     |DO_ZONE                     |TRIPS  |AVG_TICKET|TOTAL_REVENUE|
+----------------------------+----------------------------+-------+----------+-------------+
|Upper East Side South       |Upper East Side North       |4284312|10.09     |4.322970797E7|
|Upper East Side North       |Upper East Side South       |3661476|11.03     |4.037671847E7|
|Upper East Side North       |Upper East Side North       |3418818|8.35      |2.853545724E7|
|Upper East Side South       |Upper East Side South       |3255592|8.86      |2.88366022E7 |
|Upper West Side South       |Upper West Side North       |1932930|8.73      |1.686584232E7|
|Upper West Side South       |Lincoln Square East         |1918577|9.26      |1.776883937E7|
|Upper East Side South       |Midtown East                |1845687|10.47     |1.932434152E7|
|Upper East Side South       |Midtown C

### l) Distribución de passenger_count y efecto en total_amount

In [29]:
sql_l = """
SELECT 
    passenger_count,
    COUNT(*) AS trips,
    ROUND(AVG(total_amount), 2) AS avg_total_amount,
    ROUND(AVG(trip_distance), 2) AS avg_distance
FROM SPARK_DATA.ANALYTICS.OBT_TRIPS
WHERE passenger_count BETWEEN 1 AND 9
GROUP BY passenger_count
ORDER BY passenger_count
"""

df_l = query_snowflake(sql_l, "Distribución passenger_count")
save_and_show(df_l, "l_passenger_count_distribution.csv", "Passenger count", 10)


Pregunta: Distribución passenger_count
+---------------+---------+----------------+------------+
|PASSENGER_COUNT|TRIPS    |AVG_TOTAL_AMOUNT|AVG_DISTANCE|
+---------------+---------+----------------+------------+
|1.0            |587087258|17.77           |5.26        |
|2.0            |113770945|19.11           |5.04        |
|3.0            |31687694 |18.63           |4.28        |
|4.0            |15160104 |19.29           |4.35        |
|5.0            |33328168 |17.01           |3.05        |
|6.0            |20487740 |16.84           |3.0         |
|7.0            |3756     |46.08           |2.67        |
|8.0            |3787     |47.17           |3.13        |
|9.0            |1960     |61.08           |4.7         |
+---------------+---------+----------------+------------+

✓ Guardado: /home/jovyan/work/evidencias/l_passenger_count_distribution.csv


### m) Impacto de tolls_amount y congestion_surcharge por zona

In [30]:
sql_m = """
SELECT 
    pu_zone,
    COUNT(*) AS trips,
    ROUND(AVG(tolls_amount), 2) AS avg_tolls,
    ROUND(AVG(congestion_surcharge), 2) AS avg_congestion,
    ROUND(AVG(total_amount), 2) AS avg_total
FROM SPARK_DATA.ANALYTICS.OBT_TRIPS
WHERE pu_zone != 'Unknown'
GROUP BY pu_zone
HAVING COUNT(*) >= 1000
ORDER BY avg_tolls DESC
LIMIT 30
"""

df_m = query_snowflake(sql_m, "Impacto de tolls y congestion por zona")
save_and_show(df_m, "m_surcharges_impact_by_zone.csv", "Surcharges", 30)


Pregunta: Impacto de tolls y congestion por zona
+-----------------------------------+--------+---------+--------------+---------+
|PU_ZONE                            |TRIPS   |AVG_TOLLS|AVG_CONGESTION|AVG_TOTAL|
+-----------------------------------+--------+---------+--------------+---------+
|Arden Heights                      |2207    |13.82    |0.11          |85.04    |
|Arrochar/Fort Wadsworth            |4195    |9.26     |0.08          |33.16    |
|Bloomfield/Emerson Hill            |7270    |7.29     |0.04          |69.1     |
|Charleston/Tottenville             |3842    |6.99     |0.01          |89.63    |
|South Beach/Dongan Hills           |1758    |5.19     |0.08          |50.27    |
|Grymes Hill/Clifton                |5266    |4.84     |0.06          |45.19    |
|Mariners Harbor                    |4988    |4.78     |0.07          |48.7     |
|Randalls Island                    |54426   |4.12     |1.09          |57.57    |
|Westerleigh                        |1542    |3.

### n) Proporción viajes cortos vs largos por borough y estacionalidad

In [31]:
sql_n = """
SELECT 
    pu_borough,
    month,
    CASE 
        WHEN trip_distance <= 2 THEN 'Short (<=2mi)'
        WHEN trip_distance <= 5 THEN 'Medium (2-5mi)'
        ELSE 'Long (>5mi)'
    END AS trip_category,
    COUNT(*) AS trips
FROM SPARK_DATA.ANALYTICS.OBT_TRIPS
WHERE pu_borough != 'Unknown'
GROUP BY pu_borough, month, trip_category
ORDER BY pu_borough, month, trip_category
LIMIT 300
"""

df_n = query_snowflake(sql_n, "Proporción viajes cortos vs largos")
save_and_show(df_n, "n_trip_length_distribution.csv", "Trip length", 30)


Pregunta: Proporción viajes cortos vs largos
+----------+-----+--------------+------+
|PU_BOROUGH|MONTH|TRIP_CATEGORY |TRIPS |
+----------+-----+--------------+------+
|Bronx     |1    |Long (>5mi)   |111388|
|Bronx     |1    |Medium (2-5mi)|131547|
|Bronx     |1    |Short (<=2mi) |190493|
|Bronx     |2    |Long (>5mi)   |113891|
|Bronx     |2    |Medium (2-5mi)|143194|
|Bronx     |2    |Short (<=2mi) |208412|
|Bronx     |3    |Long (>5mi)   |129170|
|Bronx     |3    |Medium (2-5mi)|163687|
|Bronx     |3    |Short (<=2mi) |233679|
|Bronx     |4    |Long (>5mi)   |115779|
|Bronx     |4    |Medium (2-5mi)|139603|
|Bronx     |4    |Short (<=2mi) |196729|
|Bronx     |5    |Long (>5mi)   |121059|
|Bronx     |5    |Medium (2-5mi)|141439|
|Bronx     |5    |Short (<=2mi) |197349|
|Bronx     |6    |Long (>5mi)   |110142|
|Bronx     |6    |Medium (2-5mi)|127093|
|Bronx     |6    |Short (<=2mi) |180151|
|Bronx     |7    |Long (>5mi)   |106295|
|Bronx     |7    |Medium (2-5mi)|114418|
|Bronx     

### o) Diferencias por vendor en avg_speed_mph y trip_duration_min

In [32]:
sql_o = """
SELECT 
    vendor_name,
    COUNT(*) AS trips,
    ROUND(AVG(avg_speed_mph), 2) AS avg_speed,
    ROUND(AVG(trip_duration_min), 2) AS avg_duration,
    ROUND(AVG(trip_distance), 2) AS avg_distance
FROM SPARK_DATA.ANALYTICS.OBT_TRIPS
WHERE avg_speed_mph IS NOT NULL AND avg_speed_mph > 0
GROUP BY vendor_name
ORDER BY trips DESC
"""

df_o = query_snowflake(sql_o, "Diferencias por vendor")
save_and_show(df_o, "o_vendor_performance.csv", "Vendor performance", 10)


Pregunta: Diferencias por vendor
+----------------------------+---------+---------+------------+------------+
|VENDOR_NAME                 |TRIPS    |AVG_SPEED|AVG_DURATION|AVG_DISTANCE|
+----------------------------+---------+---------+------------+------------+
|VeriFone Inc.               |496742523|16.55    |21.53       |4.34        |
|Creative Mobile Technologies|311658990|42.66    |16.98       |8.01        |
|NULL                        |881594   |10.6     |21.39       |3.83        |
+----------------------------+---------+---------+------------+------------+

✓ Guardado: /home/jovyan/work/evidencias/o_vendor_performance.csv


### p) Relación método de pago ↔ tip_amount por hora

In [33]:
sql_p = """
SELECT 
    pickup_hour,
    payment_type_desc,
    COUNT(*) AS trips,
    ROUND(AVG(tip_amount), 2) AS avg_tip,
    ROUND(AVG(tip_pct), 2) AS avg_tip_pct
FROM SPARK_DATA.ANALYTICS.OBT_TRIPS
GROUP BY pickup_hour, payment_type_desc
ORDER BY pickup_hour, trips DESC
LIMIT 200
"""

df_p = query_snowflake(sql_p, "Método de pago y propina por hora")
save_and_show(df_p, "p_payment_tip_by_hour.csv", "Payment tip", 30)


Pregunta: Método de pago y propina por hora
+-----------+-----------------+--------+-------+-----------+
|PICKUP_HOUR|PAYMENT_TYPE_DESC|TRIPS   |AVG_TIP|AVG_TIP_PCT|
+-----------+-----------------+--------+-------+-----------+
|0          |Credit Card      |18310150|2.98   |0.15       |
|0          |Cash             |8125794 |0.0    |0.0        |
|0          |Unknown          |344766  |1.36   |0.05       |
|0          |No Charge        |147116  |0.01   |0.0        |
|0          |Dispute          |87972   |0.03   |0.0        |
|0          |NULL             |11856   |1.64   |0.06       |
|1          |Credit Card      |12957228|2.77   |0.15       |
|1          |Cash             |5971306 |0.0    |0.0        |
|1          |Unknown          |222922  |1.34   |0.05       |
|1          |No Charge        |120965  |0.0    |0.0        |
|1          |Dispute          |69039   |0.04   |0.0        |
|1          |NULL             |7261    |1.56   |0.06       |
|2          |Credit Card      |9122892 |

### q) Zonas con percentil 99 de duración/distancia fuera de rango

In [34]:
sql_q = """
SELECT 
    pu_zone,
    COUNT(*) AS trips,
    ROUND(PERCENTILE_CONT(0.99) WITHIN GROUP (ORDER BY trip_duration_min), 2) AS p99_duration,
    ROUND(PERCENTILE_CONT(0.99) WITHIN GROUP (ORDER BY trip_distance), 2) AS p99_distance
FROM SPARK_DATA.ANALYTICS.OBT_TRIPS
WHERE pu_zone != 'Unknown'
GROUP BY pu_zone
HAVING COUNT(*) >= 1000
ORDER BY p99_duration DESC
LIMIT 30
"""

df_q = query_snowflake(sql_q, "Zonas con p99 extremo (congestión potencial)")
save_and_show(df_q, "q_zones_extreme_p99.csv", "Extreme p99", 30)


Pregunta: Zonas con p99 extremo (congestión potencial)
+--------------------------------+------+------------+------------+
|PU_ZONE                         |TRIPS |P99_DURATION|P99_DISTANCE|
+--------------------------------+------+------------+------------+
|Bronx Park                      |32712 |225.56      |21.94       |
|Coney Island                    |181244|186.0       |29.96       |
|Arden Heights                   |2207  |184.94      |40.18       |
|West Brighton                   |1575  |155.94      |39.67       |
|Mariners Harbor                 |4988  |150.43      |36.42       |
|Hammels/Arverne                 |34314 |136.0       |32.21       |
|Far Rockaway                    |28946 |133.0       |31.63       |
|Marine Park/Floyd Bennett Field |6128  |132.0       |49.35       |
|Heartland Village/Todt Hill     |2398  |130.06      |39.4        |
|Saint George/New Brighton       |6213  |124.88      |33.49       |
|Charleston/Tottenville          |3842  |122.95      |53.34 

### r) Yield por milla (total_amount/trip_distance) por borough y hora

In [35]:
sql_r = """
SELECT 
    pu_borough,
    pickup_hour,
    COUNT(*) AS trips,
    ROUND(AVG(total_amount / NULLIF(trip_distance, 0)), 2) AS yield_per_mile,
    ROUND(AVG(total_amount), 2) AS avg_amount,
    ROUND(AVG(trip_distance), 2) AS avg_distance
FROM SPARK_DATA.ANALYTICS.OBT_TRIPS
WHERE pu_borough != 'Unknown'
  AND trip_distance > 0
GROUP BY pu_borough, pickup_hour
ORDER BY yield_per_mile DESC
LIMIT 100
"""

df_r = query_snowflake(sql_r, "Yield por milla por borough y hora")
save_and_show(df_r, "r_yield_per_mile.csv", "Yield per mile", 30)


Pregunta: Yield por milla por borough y hora
+-------------+-----------+-----+--------------+----------+------------+
|PU_BOROUGH   |PICKUP_HOUR|TRIPS|YIELD_PER_MILE|AVG_AMOUNT|AVG_DISTANCE|
+-------------+-----------+-----+--------------+----------+------------+
|EWR          |17         |1732 |1685.93       |89.2      |5.4         |
|EWR          |6          |1068 |1648.13       |82.25     |4.43        |
|EWR          |18         |1415 |1558.96       |89.59     |5.36        |
|EWR          |15         |2139 |1544.56       |85.12     |5.37        |
|EWR          |14         |1865 |1536.76       |88.29     |5.16        |
|EWR          |16         |2112 |1515.85       |89.7      |5.21        |
|EWR          |9          |741  |1438.86       |87.18     |6.94        |
|EWR          |7          |898  |1414.84       |86.61     |6.2         |
|EWR          |19         |1070 |1410.54       |86.33     |5.94        |
|EWR          |8          |728  |1401.85       |83.28     |5.45        |
|EWR 

### s) Cambios YoY en volumen y ticket promedio por service_type

In [36]:
sql_s = """
WITH yearly_stats AS (
    SELECT 
        year,
        service_type,
        COUNT(*) AS trips,
        ROUND(AVG(total_amount), 2) AS avg_ticket
    FROM SPARK_DATA.ANALYTICS.OBT_TRIPS
    GROUP BY year, service_type
),
yoy_changes AS (
    SELECT 
        year,
        service_type,
        trips,
        avg_ticket,
        LAG(trips) OVER (PARTITION BY service_type ORDER BY year) AS prev_year_trips,
        LAG(avg_ticket) OVER (PARTITION BY service_type ORDER BY year) AS prev_year_ticket
    FROM yearly_stats
)
SELECT 
    year,
    service_type,
    trips,
    avg_ticket,
    prev_year_trips,
    prev_year_ticket,
    ROUND(100.0 * (trips - prev_year_trips) / NULLIF(prev_year_trips, 0), 2) AS yoy_trips_pct,
    ROUND(100.0 * (avg_ticket - prev_year_ticket) / NULLIF(prev_year_ticket, 0), 2) AS yoy_ticket_pct
FROM yoy_changes
WHERE prev_year_trips IS NOT NULL
ORDER BY year, service_type
"""

df_s = query_snowflake(sql_s, "Cambios YoY en volumen y ticket")
save_and_show(df_s, "s_yoy_changes.csv", "YoY changes", 30)


Pregunta: Cambios YoY en volumen y ticket
+----+------------+---------+----------+---------------+----------------+--------------+--------------+
|YEAR|SERVICE_TYPE|TRIPS    |AVG_TICKET|PREV_YEAR_TRIPS|PREV_YEAR_TICKET|YOY_TRIPS_PCT |YOY_TICKET_PCT|
+----+------------+---------+----------+---------------+----------------+--------------+--------------+
|2002|yellow      |483      |25.5      |27             |23.42           |1688.89       |8.88          |
|2003|yellow      |50       |25.15     |483            |25.5            |-89.65        |-1.37         |
|2004|yellow      |1        |12.3      |50             |25.15           |-98.00        |-51.09        |
|2008|yellow      |764      |23.28     |1              |12.3            |76300.00      |89.27         |
|2009|green       |312      |16.36     |113            |13.43           |176.11        |21.82         |
|2009|yellow      |1290     |19.74     |764            |23.28           |68.85         |-15.21        |
|2010|green       |34

### t) Días con alta congestion_surcharge: efecto en total_amount vs días normales

In [37]:
sql_t = """
WITH daily_congestion AS (
    SELECT 
        pickup_date,
        AVG(congestion_surcharge) AS avg_congestion,
        AVG(total_amount) AS avg_total,
        COUNT(*) AS trips
    FROM SPARK_DATA.ANALYTICS.OBT_TRIPS
    WHERE pickup_date IS NOT NULL
    GROUP BY pickup_date
),
classified AS (
    SELECT 
        CASE 
            WHEN avg_congestion > 2 THEN 'High Congestion'
            ELSE 'Normal'
        END AS day_type,
        avg_congestion,
        avg_total,
        trips
    FROM daily_congestion
)
SELECT 
    day_type,
    COUNT(*) AS days,
    ROUND(AVG(avg_congestion), 2) AS avg_congestion_charge,
    ROUND(AVG(avg_total), 2) AS avg_total_amount,
    ROUND(AVG(trips), 0) AS avg_trips_per_day
FROM classified
GROUP BY day_type
ORDER BY day_type
"""

df_t = query_snowflake(sql_t, "Efecto de alta congestión vs días normales")
save_and_show(df_t, "t_congestion_effect.csv", "Congestion effect", 10)


Pregunta: Efecto de alta congestión vs días normales
+---------------+----+---------------------+----------------+-----------------+
|DAY_TYPE       |DAYS|AVG_CONGESTION_CHARGE|AVG_TOTAL_AMOUNT|AVG_TRIPS_PER_DAY|
+---------------+----+---------------------+----------------+-----------------+
|High Congestion|2080|2.22                 |22.89           |124797           |
|Normal         |1648|1.33                 |16.27           |339100           |
+---------------+----+---------------------+----------------+-----------------+

✓ Guardado: /home/jovyan/work/evidencias/t_congestion_effect.csv


## Resumen de Ejecución

In [42]:
import os

# Listar todos los CSVs generados
csv_files = sorted([f for f in os.listdir(EVID_DIR) if f.endswith('.csv')])

print(f"\nRUN_ID: {RUN_ID}")
print(f"\n Total de análisis generados: {len([f for f in csv_files if f.startswith(('a_','b_','c_','d_','e_','f_','g_','h_','i_','j_','k_','l_','m_','n_','o_','p_','q_','r_','s_','t_'))])}")
print(f"\n Evidencias generadas en: {EVID_DIR}")


RUN_ID: 20251020_011122

 Total de análisis generados: 20

 Evidencias generadas en: /home/jovyan/work/evidencias
