In [24]:
from pyspark.sql import SparkSession, functions as F, types as T
from pyspark import SparkConf, SparkContext
import os
from dotenv import load_dotenv
load_dotenv(dotenv_path="/home/jovyan/work/.env")


True

In [25]:
query_counts = """
SELECT
    YEAR,
    MONTH,
    SERVICE_TYPE,
    COUNT(*) AS total_viajes,
    ROUND(AVG(TRIP_DISTANCE), 2) AS avg_distance,
    ROUND(MAX(TRIP_DISTANCE), 2) AS max_distance,
    ROUND(MIN(TRIP_DISTANCE), 2) AS min_distance,
    ROUND(AVG(TOTAL_AMOUNT), 2) AS avg_total_amount,
    ROUND(MAX(TOTAL_AMOUNT), 2) AS max_total_amount,
    ROUND(MIN(TOTAL_AMOUNT), 2) AS min_total_amount
FROM ANALYTICS.OBT_TRIPS
GROUP BY YEAR, MONTH, SERVICE_TYPE
ORDER BY YEAR, MONTH, SERVICE_TYPE;
"""



In [26]:
import snowflake.connector

conn = snowflake.connector.connect(
    user=os.getenv('USER'),
    password=os.getenv('PASSWORD'),
    account=os.getenv('ACCOUNT'),
    warehouse="COMPUTE_WH",
    database="NY_TAXI",
    schema="ANALYTICS",
    role =os.getenv('ROLE')
)

cur = conn.cursor()
cur.execute(query_counts)
print("cursor creado")
results = cur.fetchall()

# Obtener nombres de columnas
columns = [desc[0] for desc in cur.description]

# Mostrar como tabla
for row in results:
    print(dict(zip(columns, row)))

cur.close()

cursor creado
{'YEAR': 2000, 'MONTH': 12, 'SERVICE_TYPE': 'yellow', 'TOTAL_VIAJES': 19, 'AVG_DISTANCE': 5.25, 'MAX_DISTANCE': 20.77, 'MIN_DISTANCE': 0.0, 'AVG_TOTAL_AMOUNT': 26.36, 'MAX_TOTAL_AMOUNT': 98.76, 'MIN_TOTAL_AMOUNT': 3.8}
{'YEAR': 2001, 'MONTH': 1, 'SERVICE_TYPE': 'yellow', 'TOTAL_VIAJES': 6, 'AVG_DISTANCE': 2.65, 'MAX_DISTANCE': 9.13, 'MIN_DISTANCE': 0.0, 'AVG_TOTAL_AMOUNT': 17.17, 'MAX_TOTAL_AMOUNT': 43.0, 'MIN_TOTAL_AMOUNT': 3.3}
{'YEAR': 2001, 'MONTH': 2, 'SERVICE_TYPE': 'yellow', 'TOTAL_VIAJES': 1, 'AVG_DISTANCE': 0.0, 'MAX_DISTANCE': 0.0, 'MIN_DISTANCE': 0.0, 'AVG_TOTAL_AMOUNT': 3.8, 'MAX_TOTAL_AMOUNT': 3.8, 'MIN_TOTAL_AMOUNT': 3.8}
{'YEAR': 2001, 'MONTH': 8, 'SERVICE_TYPE': 'yellow', 'TOTAL_VIAJES': 1, 'AVG_DISTANCE': 7.08, 'MAX_DISTANCE': 7.08, 'MIN_DISTANCE': 7.08, 'AVG_TOTAL_AMOUNT': 24.55, 'MAX_TOTAL_AMOUNT': 24.55, 'MIN_TOTAL_AMOUNT': 24.55}
{'YEAR': 2002, 'MONTH': 2, 'SERVICE_TYPE': 'yellow', 'TOTAL_VIAJES': 11, 'AVG_DISTANCE': 2.93, 'MAX_DISTANCE': 11.28, 'MIN_

True

In [27]:
import pandas as pd

In [28]:
query_val = """
WITH base AS (
    SELECT *
    FROM ANALYTICS.OBT_TRIPS
)
SELECT
    -- TOTAL DE REGISTROS
    COUNT(*) AS total_registros,
    
    -- NULOS
    COUNT_IF(PICKUP_DATETIME IS NULL) AS null_pickup_datetime,
    COUNT_IF(DROPOFF_DATETIME IS NULL) AS null_dropoff_datetime,
    COUNT_IF(PICKUP_DATE IS NULL) AS null_pickup_date,
    COUNT_IF(PU_LOCATION_ID IS NULL) AS null_pu_location,
    COUNT_IF(DO_LOCATION_ID IS NULL) AS null_do_location,
    COUNT_IF(TRIP_DISTANCE IS NULL) AS null_trip_distance,
    COUNT_IF(TOTAL_AMOUNT IS NULL) AS null_total_amount,
    COUNT_IF(SERVICE_TYPE IS NULL) AS null_service_type,
    COUNT_IF(PAYMENT_TYPE_DESC IS NULL) AS null_payment_type,

    -- RANGOS
    COUNT_IF(TRIP_DISTANCE < 0 OR TRIP_DISTANCE > 200) AS distancia_fuera_rango,
    COUNT_IF(TOTAL_AMOUNT < 0 OR TOTAL_AMOUNT > 2000) AS monto_fuera_rango,
    COUNT_IF(PASSENGER_COUNT < 0 OR PASSENGER_COUNT > 10) AS pasajeros_fuera_rango,
    COUNT_IF(AVG_SPEED_MPH < 0 OR AVG_SPEED_MPH > 120) AS velocidad_fuera_rango,
    COUNT_IF(TIP_PCT < 0 OR TIP_PCT > 100) AS tip_pct_fuera_rango,

    -- COHERENCIA DE FECHAS
    COUNT_IF(DROPOFF_DATETIME < PICKUP_DATETIME) AS dropoff_antes_pickup,
    COUNT_IF(TRIP_DURATION_MIN <= 0) AS duracion_invalida,

    -- ESTADÍSTICAS
    MIN(TRIP_DISTANCE) AS min_trip_distance,
    MAX(TRIP_DISTANCE) AS max_trip_distance,
    AVG(TRIP_DISTANCE) AS avg_trip_distance,
    
    MIN(TOTAL_AMOUNT) AS min_total_amount,
    MAX(TOTAL_AMOUNT) AS max_total_amount,
    AVG(TOTAL_AMOUNT) AS avg_total_amount,
    
    MIN(TRIP_DURATION_MIN) AS min_trip_duration,
    MAX(TRIP_DURATION_MIN) AS max_trip_duration,
    AVG(TRIP_DURATION_MIN) AS avg_trip_duration,
    
    MIN(AVG_SPEED_MPH) AS min_avg_speed,
    MAX(AVG_SPEED_MPH) AS max_avg_speed,
    AVG(AVG_SPEED_MPH) AS avg_speed,
    
    MIN(TIP_PCT) AS min_tip_pct,
    MAX(TIP_PCT) AS max_tip_pct,
    AVG(TIP_PCT) AS avg_tip_pct

FROM base;

"""

In [29]:
df = pd.read_sql(query_val, conn)

  df = pd.read_sql(query_val, conn)


In [30]:
pd.set_option('display.max_columns', None)
df.head()

Unnamed: 0,TOTAL_REGISTROS,NULL_PICKUP_DATETIME,NULL_DROPOFF_DATETIME,NULL_PICKUP_DATE,NULL_PU_LOCATION,NULL_DO_LOCATION,NULL_TRIP_DISTANCE,NULL_TOTAL_AMOUNT,NULL_SERVICE_TYPE,NULL_PAYMENT_TYPE,DISTANCIA_FUERA_RANGO,MONTO_FUERA_RANGO,PASAJEROS_FUERA_RANGO,VELOCIDAD_FUERA_RANGO,TIP_PCT_FUERA_RANGO,DROPOFF_ANTES_PICKUP,DURACION_INVALIDA,MIN_TRIP_DISTANCE,MAX_TRIP_DISTANCE,AVG_TRIP_DISTANCE,MIN_TOTAL_AMOUNT,MAX_TOTAL_AMOUNT,AVG_TOTAL_AMOUNT,MIN_TRIP_DURATION,MAX_TRIP_DURATION,AVG_TRIP_DURATION,MIN_AVG_SPEED,MAX_AVG_SPEED,AVG_SPEED,MIN_TIP_PCT,MAX_TIP_PCT,AVG_TIP_PCT
0,855907899,0,0,0,0,0,0,0,0,0,43887,2697394,7,202226,193,93708,6240058,-40840124.4,59016609.3,5.656175,-2567.8,3950611.6,18.485043,-61001725,125373161,19.241372,-2405684000.0,3540926000.0,23.601116,-7933.333333,210928800.0,10.446729
