# Configuración de paths. 
Busco y agrego el 'root' del Proyecto en sys.path, para evitar problemas al importar mis scripts.

In [1]:
from pathlib import Path
import sys

# Subimos hasta encontrar config.py
base = Path().resolve()
while not (base / "config.py").exists() and base != base.parent:
    base = base.parent

if not (base / "config.py").exists():
    print(f"❌ No se encontró config.py en la jerarquía de carpetas desde: {Path().resolve()}")
    print("➜ sys.path no fue modificado.")
else:
    if str(base) not in sys.path:
        sys.path.insert(0, str(base))
        print(f"✔️ sys.path configurado con raíz del proyecto: {base}")
    else:
        print(f"✔️ sys.path ya está configurado con raíz del proyecto: {base}")

✔️ sys.path configurado con raíz del proyecto: D:\CHardyE-Projects\Python\DataAnalisis\Repositorios\DA_NYC_Taxis_Yellow_LAB


# 1. ETL

## 1.1. Extracción.

In [2]:
import pandas as pd
from pathlib import Path

# Ruta al archivo Parquet
ruta_archivo = Path("../data/raw/yellow_tripdata_2025-01.parquet")

try:
    if not ruta_archivo.exists():
        raise FileNotFoundError(f"Archivo no encontrado: {ruta_archivo.resolve()}")
    # Carga con motor explícito
    df = pd.read_parquet(ruta_archivo, engine="pyarrow")

    print(" Extracción correcta")
    print(f" Registros cargados: {len(df)}")
    print(f" Columnas: {len(df.columns)}")

except FileNotFoundError as e:
    print(f"❌ Error: {e}")
except Exception as e:
    print(f"⚠️ Error inesperado: {e}")

 Extracción correcta
 Registros cargados: 3475226
 Columnas: 20


In [3]:
df.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3475226 entries, 0 to 3475225
Data columns (total 20 columns):
 #   Column                 Non-Null Count    Dtype         
---  ------                 --------------    -----         
 0   VendorID               3475226 non-null  int32         
 1   tpep_pickup_datetime   3475226 non-null  datetime64[us]
 2   tpep_dropoff_datetime  3475226 non-null  datetime64[us]
 3   passenger_count        2935077 non-null  float64       
 4   trip_distance          3475226 non-null  float64       
 5   RatecodeID             2935077 non-null  float64       
 6   store_and_fwd_flag     2935077 non-null  object        
 7   PULocationID           3475226 non-null  int32         
 8   DOLocationID           3475226 non-null  int32         
 9   payment_type           3475226 non-null  int64         
 10  fare_amount            3475226 non-null  float64       
 11  extra                  3475226 non-null  float64       
 12  mta_tax                34752

In [4]:
df.head(3)

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee,cbd_congestion_fee
0,1,2025-01-01 00:18:38,2025-01-01 00:26:59,1.0,1.6,1.0,N,229,237,1,10.0,3.5,0.5,3.0,0.0,1.0,18.0,2.5,0.0,0.0
1,1,2025-01-01 00:32:40,2025-01-01 00:35:13,1.0,0.5,1.0,N,236,237,1,5.1,3.5,0.5,2.02,0.0,1.0,12.12,2.5,0.0,0.0
2,1,2025-01-01 00:44:04,2025-01-01 00:46:01,1.0,0.6,1.0,N,141,141,1,5.1,3.5,0.5,2.0,0.0,1.0,12.1,2.5,0.0,0.0


In [5]:
ruta_archivo = Path("../data/raw/taxi_zone_lookup.csv")

try:
    if not ruta_archivo.exists():
        raise FileNotFoundError(f"Archivo no encontrado: {ruta_archivo.resolve()}")
    
    df_zonas = pd.read_csv(ruta_archivo)

    print(" Extracción correcta")
    print(f" Registros cargados: {len(df_zonas)}")
    print(f" Columnas: {len(df_zonas.columns)}")

except FileNotFoundError as e:
    print(f"❌ Error: {e}")
except Exception as e:
    print(f"⚠️ Error inesperado: {e}")

 Extracción correcta
 Registros cargados: 265
 Columnas: 4


In [6]:
df_zonas.info(verbose=True, show_counts=True)
df_zonas.head(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 265 entries, 0 to 264
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   LocationID    265 non-null    int64 
 1   Borough       264 non-null    object
 2   Zone          264 non-null    object
 3   service_zone  263 non-null    object
dtypes: int64(1), object(3)
memory usage: 8.4+ KB


Unnamed: 0,LocationID,Borough,Zone,service_zone
0,1,EWR,Newark Airport,EWR
1,2,Queens,Jamaica Bay,Boro Zone
2,3,Bronx,Allerton/Pelham Gardens,Boro Zone
3,4,Manhattan,Alphabet City,Yellow Zone
4,5,Staten Island,Arden Heights,Boro Zone
5,6,Staten Island,Arrochar/Fort Wadsworth,Boro Zone
6,7,Queens,Astoria,Boro Zone
7,8,Queens,Astoria Park,Boro Zone
8,9,Queens,Auburndale,Boro Zone
9,10,Queens,Baisley Park,Boro Zone


## 1.2. Transformación.

### 1.2.1. Analisis de Nulos

In [7]:
df.info(verbose=True, show_counts=True)
df.head(3)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3475226 entries, 0 to 3475225
Data columns (total 20 columns):
 #   Column                 Non-Null Count    Dtype         
---  ------                 --------------    -----         
 0   VendorID               3475226 non-null  int32         
 1   tpep_pickup_datetime   3475226 non-null  datetime64[us]
 2   tpep_dropoff_datetime  3475226 non-null  datetime64[us]
 3   passenger_count        2935077 non-null  float64       
 4   trip_distance          3475226 non-null  float64       
 5   RatecodeID             2935077 non-null  float64       
 6   store_and_fwd_flag     2935077 non-null  object        
 7   PULocationID           3475226 non-null  int32         
 8   DOLocationID           3475226 non-null  int32         
 9   payment_type           3475226 non-null  int64         
 10  fare_amount            3475226 non-null  float64       
 11  extra                  3475226 non-null  float64       
 12  mta_tax                34752

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee,cbd_congestion_fee
0,1,2025-01-01 00:18:38,2025-01-01 00:26:59,1.0,1.6,1.0,N,229,237,1,10.0,3.5,0.5,3.0,0.0,1.0,18.0,2.5,0.0,0.0
1,1,2025-01-01 00:32:40,2025-01-01 00:35:13,1.0,0.5,1.0,N,236,237,1,5.1,3.5,0.5,2.02,0.0,1.0,12.12,2.5,0.0,0.0
2,1,2025-01-01 00:44:04,2025-01-01 00:46:01,1.0,0.6,1.0,N,141,141,1,5.1,3.5,0.5,2.0,0.0,1.0,12.1,2.5,0.0,0.0


In [8]:
print(df.isnull().sum()) # Muestra valores nulos (valor cero no es nulo)

VendorID                      0
tpep_pickup_datetime          0
tpep_dropoff_datetime         0
passenger_count          540149
trip_distance                 0
RatecodeID               540149
store_and_fwd_flag       540149
PULocationID                  0
DOLocationID                  0
payment_type                  0
fare_amount                   0
extra                         0
mta_tax                       0
tip_amount                    0
tolls_amount                  0
improvement_surcharge         0
total_amount                  0
congestion_surcharge     540149
Airport_fee              540149
cbd_congestion_fee            0
dtype: int64


In [9]:
cols = ["passenger_count", "RatecodeID",  "store_and_fwd_flag", "congestion_surcharge", "Airport_fee"]

for col in cols:
    print(f"\n--- {col} ---")
    print(df[col].value_counts(dropna=False).sort_index())    # Hace que muestre los NaN


--- passenger_count ---
passenger_count
0.0      24656
1.0    2322434
2.0     407761
3.0      91409
4.0      59009
5.0      17786
6.0      12004
7.0          4
8.0         11
9.0          3
NaN     540149
Name: count, dtype: int64

--- RatecodeID ---
RatecodeID
1.0     2756472
2.0       94420
3.0        8622
4.0        7092
5.0       26501
6.0           7
99.0      41963
NaN      540149
Name: count, dtype: int64

--- store_and_fwd_flag ---
store_and_fwd_flag
N       2927431
Y          7646
None     540149
Name: count, dtype: int64

--- congestion_surcharge ---
congestion_surcharge
-2.5      48321
 0.0     225938
 2.5    2660818
 NaN     540149
Name: count, dtype: int64

--- Airport_fee ---
Airport_fee
-1.75      10411
 0.00    2706446
 0.75          1
 1.25          8
 1.75     218203
 5.00          7
 6.75          1
 NaN      540149
Name: count, dtype: int64


Vamos a corregir ``passenger_count`` partiendo de la lógica que si se registro un cobro y además existe una distancia recorrida, el taxi no pudo ir vacio, tuvo que llevar al menos 1 pasajero.
Creo una nueva columna ``xx_passenger_count_imputed = True`` que me permitiria filtrar, visualizar o excluir esos registros en análisis sensibles.

In [10]:
passenger_null =  df["passenger_count"].isnull().sum() # solo para tener el total
condicion_imputacion = (
    df["passenger_count"].isna() &  # es nulo
    (df["fare_amount"] > 0) &       # hubo 1 cobro positivo
    (df["trip_distance"] > 0)       # se recorrio alguna distancia
)
# Imputar valor 1 solo en esos casos
df.loc[condicion_imputacion, "passenger_count"] = 1.0

# Crear columna de trazabilidad
df["xx_passenger_count_imputed"] = False
# Marcar trazabilidad
df.loc[condicion_imputacion, "xx_passenger_count_imputed"] = True

# Resumen
print("Total imputados:", df["xx_passenger_count_imputed"].sum(), " de ", passenger_null)
print("Distribución de passenger_count:")
print(df["passenger_count"].value_counts(dropna=False).sort_index())

Total imputados: 413394  de  540149
Distribución de passenger_count:
passenger_count
0.0      24656
1.0    2735828
2.0     407761
3.0      91409
4.0      59009
5.0      17786
6.0      12004
7.0          4
8.0         11
9.0          3
NaN     126755
Name: count, dtype: int64


In [11]:
# Ver cuántos NaN (nulos) quedan en cada columna
print("xx_passenger_count_imputed nulos:", df["xx_passenger_count_imputed"].isna().sum()) 
print("passenger_count nulos:           ", df["passenger_count"].isna().sum())

xx_passenger_count_imputed nulos: 0
passenger_count nulos:            126755


In [12]:
df_restantes = df[df["passenger_count"].isna()]

# sin cobro ni distancia
caso_A = df_restantes[(df_restantes["fare_amount"] == 0) & (df_restantes["trip_distance"] == 0)]
# cobro sin distancia
caso_B = df_restantes[(df_restantes["fare_amount"] > 0) & (df_restantes["trip_distance"] == 0)] 
# distancia sin cobro
caso_C = df_restantes[(df_restantes["fare_amount"] == 0) & (df_restantes["trip_distance"] > 0)]
# ambos > 0 pero no imputados
caso_D = df_restantes[(df_restantes["fare_amount"] > 0) & (df_restantes["trip_distance"] > 0)]

print("Caso A (sin cobro ni distancia):     ", len(caso_A))
print("Caso B (cobro sin distancia):        ", len(caso_B))
print("Caso C (distancia sin cobro):        ", len(caso_C))
print("Caso D (ambos > 0 pero no imputados):", len(caso_D))


Caso A (sin cobro ni distancia):      11
Caso B (cobro sin distancia):         41894
Caso C (distancia sin cobro):         28
Caso D (ambos > 0 pero no imputados): 0


##### - Caso A.
Sin cobro de tarifa ni distancia recorrida.

In [13]:
# marco todos los del Caso A como registro no valido.

df["xx_invalid_record"] = False  # Creo la una nueva columna mara ir marcando los registros no validos (=True).

df.loc[caso_A.index, "xx_invalid_record"] = True
print(f"Se marcaron como No Válidos {df['xx_invalid_record'].sum():,} registros.")


Se marcaron como No Válidos 11 registros.


#### - Caso B.
Se cobro la tarifa pero no hay distancia recorrida.

In [14]:
cols = ["fare_amount", "extra", "mta_tax", "tip_amount", "tolls_amount", "PULocationID", "DOLocationID", "total_amount"]
caso_B[cols].describe()
caso_B[cols].info()


<class 'pandas.core.frame.DataFrame'>
Index: 41894 entries, 2935079 to 3475157
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   fare_amount   41894 non-null  float64
 1   extra         41894 non-null  float64
 2   mta_tax       41894 non-null  float64
 3   tip_amount    41894 non-null  float64
 4   tolls_amount  41894 non-null  float64
 5   PULocationID  41894 non-null  int32  
 6   DOLocationID  41894 non-null  int32  
 7   total_amount  41894 non-null  float64
dtypes: float64(6), int32(2)
memory usage: 2.6 MB


In [15]:
# Totales globales
total_viajes = len(df)
total_monto = df["total_amount"].sum()

# Totales del caso B
total_caso_b = len(caso_B)
monto_caso_b = caso_B["total_amount"].sum()

# Porcentajes
porcentaje_viajes = (total_caso_b / total_viajes) * 100
porcentaje_monto = (monto_caso_b / total_monto) * 100

print(f"Los Registros del caso B, Representan ≈{porcentaje_viajes:.1f}% del total de viajes y ≈{porcentaje_monto:.1f}% del monto total.")

Los Registros del caso B, Representan ≈1.2% del total de viajes y ≈1.1% del monto total.


---

**Diagnóstico del caso B**

*Características:*

- fare_amount **> 0**   ----------------------→ **hay cobro**
- trip_distance **== 0**  --------------------→ **no se registró distancia**
- RatecodeID **nulo**  -----------------------→ **no se registró tipo de tarifa**
- passenger_count **nulo**  ------------------→ **no se pudo imputar**
- PULocationID y DOLocationID **presentes** --→ **hay indicio de trayecto**
- total_amount **> 0**  ----------------------→ **hubo cobro total**

*Interpretación:*

- Posible fallo en el equipo de registro (GPS, taxímetro, sincronización)
- No hay evidencia suficiente para imputar ``passenger_count`` con certeza
- Representan ≈1.2% del total de viajes y ≈1.1% del monto total
- No comprometen el análisis si se excluyen

***Se marcan los registros como no válidos (para poder excluirlos) y se exporta DF del Caso B para analisis posterior.***

---

In [16]:
df.loc[caso_B.index, "xx_invalid_record"] = True
print(f"Se marcaron como No Válidos {df.loc[caso_B.index, 'xx_invalid_record'].sum()} registros.")

Se marcaron como No Válidos 41894 registros.


In [17]:
df_caso_b = df.loc[caso_B.index]

for col in df_caso_b.columns:
    if df_caso_b[col].dtype == "object":
        df_caso_b[col] = df_caso_b[col].astype(str)

# Guardar dataset 
#df_caso_b.to_parquet("../data/to_review/registros_invalidos_caso_b.parquet", index=False, engine="pyarrow")
df_caso_b.to_csv("../data/to_review/registros_invalidos_caso_b.csv", index=False)

#### Caso C.
En términos de negocio, son viajes que se movieron físicamente pero no generaron tarifa. Esto puede deberse a:
- Errores de carga (distancia registrada pero tarifa olvidada).
- Viajes de prueba o cortesía.
- Inconsistencias en la imputación de datos.
- Son solo 28 reg. y No comprometen el análisis si se excluyen debido

***Se marcan los registros como no válidos (para poder excluirlos) y se exporta DF del Caso C para analisis posterior.***


In [18]:

df.loc[caso_C.index, "xx_invalid_record"] = True
print(f"Se marcaron como No Válidos {len(caso_C)} registros en caso C.")

df_caso_c = df.loc[caso_C.index]

for col in df_caso_c.columns:
    if df_caso_c[col].dtype == "object":
        df_caso_c[col] = df_caso_c[col].astype(str)

df_caso_c.to_csv("../data/to_review/registros_invalidos_caso_c.csv", index=False)

Se marcaron como No Válidos 28 registros en caso C.


In [19]:
true_count = df["xx_invalid_record"].sum()
print(f"Se marcaron como 'No Válido' {true_count} registros.")

Se marcaron como 'No Válido' 41933 registros.


**Analizo ahora los que aparecen con 0.0, ya que tampoco deberian estar registrados los viajes con cero pasajeros.**

In [20]:
df_pasajero_cero = df[df["passenger_count"] == 0.0].copy()

In [21]:
# verifico si hubo cobros y su variedad cuando no se registraron pasajeros
df_pasajero_cero["fare_amount"].describe()

count    24656.000000
mean        15.494218
std         14.833706
min        -63.230000
25%          7.900000
50%         11.400000
75%         17.000000
max        500.000000
Name: fare_amount, dtype: float64

- count    24656.000000 -> Total de viajes registrados con 0 pasajeros.
- mean        15.494218 -> Tarifa promedio cobrada en esos viajes.
- std         14.833706 -> Alta variabilidad en las tarifas.
- min        -63.230000 -> ⚠️ Tarifa negativa: posible error o ajuste.
- 25%          7.900000
- 50%         11.400000
- 75%         17.000000
- max        500.000000 -> ⚠️ Tarifa extremadamente alta para un viaje sin pasajeros. 

In [22]:
total_con_cobro_valido = df_pasajero_cero[
    (df_pasajero_cero["fare_amount"] > 0) &     # excluyo viajes sin cobro o con tarifa negativa.
    (df_pasajero_cero["fare_amount"] <= 17.0)   # descarto valores atípicos por encima del tercer cuartil.
].shape[0]

# Resumen
print(f"Total de viajes con passenger_count = 0.0 y cobro válido:         {total_con_cobro_valido}")

total_con_cobro_no_valido = df_pasajero_cero[df_pasajero_cero["fare_amount"] < 0].shape[0]
print(f"Total de viajes con passenger_count = 0.0 y cobro no válido:      {total_con_cobro_no_valido}")

total_con_cobro_mayor_75 = df_pasajero_cero[df_pasajero_cero["fare_amount"] > 17].shape[0]
print(f"Total de viajes con passenger_count = 0.0 y cobro mayor al 75%:   {total_con_cobro_mayor_75}")

Total de viajes con passenger_count = 0.0 y cobro válido:         18661
Total de viajes con passenger_count = 0.0 y cobro no válido:      5
Total de viajes con passenger_count = 0.0 y cobro mayor al 75%:   5956


In [23]:
condicion_cero_valido = (
    (df["passenger_count"] == 0.0) &
    (df["fare_amount"] > 0) &
    (df["fare_amount"] <= 17.0)
)
# Imputar valor 1 solo en esos casos de cobro validado
df.loc[condicion_cero_valido, "passenger_count"] = 1
# Marcar trazabilidad
df.loc[condicion_cero_valido, "xx_passenger_count_imputed"] = True

total_imputados_cero = condicion_cero_valido.sum()
print("Total imputados desde 0.0 con cobro válido:", total_imputados_cero, " de ", df_pasajero_cero.shape[0])
print("Distribución de passenger_count tras imputación:")
print(df["passenger_count"].value_counts().sort_index())

Total imputados desde 0.0 con cobro válido: 18661  de  24656
Distribución de passenger_count tras imputación:
passenger_count
0.0       5995
1.0    2754489
2.0     407761
3.0      91409
4.0      59009
5.0      17786
6.0      12004
7.0          4
8.0         11
9.0          3
Name: count, dtype: int64


In [24]:
# Analizo ahora las tarifa del 4to cuartil
df_pasajero_cero[df_pasajero_cero["fare_amount"] > 17]["fare_amount"].describe()

count    5956.000000
mean       32.837080
std        21.769846
min        17.050000
25%        19.800000
50%        24.700000
75%        37.300000
max       500.000000
Name: fare_amount, dtype: float64

In [25]:
df_pasajero_cero_alto = df_pasajero_cero[df_pasajero_cero["fare_amount"] > 17].copy()
# Total general del subconjunto
print(f"Total de registros con passenger_count = 0.0 y cobro alto: {df_pasajero_cero_alto.shape[0]}")
# Cobros válidos (positivos)
total_valido_alto = df_pasajero_cero_alto[df_pasajero_cero_alto["fare_amount"] < 37.30].shape[0]
print(f"Total de viajes con passenger_count = 0.0 y cobro < 37.30 válido: {total_valido_alto}")

Total de registros con passenger_count = 0.0 y cobro alto: 5956
Total de viajes con passenger_count = 0.0 y cobro < 37.30 válido: 4434


In [26]:
condicion_cero_alto_valido = (
    (df["passenger_count"] == 0.0) &
    (df["fare_amount"] > 17.0) &
    (df["fare_amount"] < 37.30)
)

df.loc[condicion_cero_alto_valido, "passenger_count"] = 1
# Marcar trazabilidad
df.loc[condicion_cero_alto_valido, "xx_passenger_count_imputed"] = True
# Resumen
total_imputados_cero_alto = condicion_cero_alto_valido.sum()
print("Total imputados desde 0.0 con cobro entre 17.0 y 37.30:", total_imputados_cero_alto, " de ", df_pasajero_cero_alto.shape[0])
print("Distribución de passenger_count tras imputación extendida:")
print(df["passenger_count"].value_counts().sort_index())


Total imputados desde 0.0 con cobro entre 17.0 y 37.30: 4434  de  5956
Distribución de passenger_count tras imputación extendida:
passenger_count
0.0       1561
1.0    2758923
2.0     407761
3.0      91409
4.0      59009
5.0      17786
6.0      12004
7.0          4
8.0         11
9.0          3
Name: count, dtype: int64


In [27]:
condicion_nan_no_imputado = df["passenger_count"].isnull() & (~df["xx_passenger_count_imputed"])
df.loc[condicion_nan_no_imputado, "xx_invalid_record"] = True

condicion_cero_excesivo = (df["passenger_count"] == 0.0) & (df["fare_amount"] > 37.30)
df.loc[condicion_cero_excesivo, "xx_invalid_record"] = True

print("Total de registros marcados como inválidos:", df["xx_invalid_record"].sum())
print("Distribución de passenger_count en registros inválidos:")
print(df[df["xx_invalid_record"]]["passenger_count"].value_counts().sort_index())


Total de registros marcados como inválidos: 128227
Distribución de passenger_count en registros inválidos:
passenger_count
0.0    1472
Name: count, dtype: int64


### 1.2.2. Reemplazo los códigos numéricos por sus descripciones legibles.
 
Objetivos:
1. Mejorar la legibilidad inmediata mapeando códigos a descripciones.
2. Facilitar el análisis exploratorio
3. Evitar errores de interpretación, eliminando la necesidad de recordar qué significa cada número.

In [28]:
ratecode_map = {
    1: "Standard",
    2: "JFK",
    3: "Newark",
    4: "Nassau or Westchester",
    5: "Negotiated",
    6: "Group",
    99: "Unknown"
}

df["RateCode"] = df["RatecodeID"].map(ratecode_map).fillna("Unknown")           

# Reordenar columnas: insertar RateCode justo después de RatecodeID
cols = df.columns.tolist()
if "RateCode" in cols:
    cols.remove("RateCode")  # Eliminar la versión que quedó al final
idx = cols.index("RatecodeID")
cols = cols[:idx+1] + ["RateCode"] + cols[idx+1:]
df = df[cols]


df.head(3)

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,RateCode,store_and_fwd_flag,PULocationID,DOLocationID,...,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee,cbd_congestion_fee,xx_passenger_count_imputed,xx_invalid_record
0,1,2025-01-01 00:18:38,2025-01-01 00:26:59,1.0,1.6,1.0,Standard,N,229,237,...,0.5,3.0,0.0,1.0,18.0,2.5,0.0,0.0,False,False
1,1,2025-01-01 00:32:40,2025-01-01 00:35:13,1.0,0.5,1.0,Standard,N,236,237,...,0.5,2.02,0.0,1.0,12.12,2.5,0.0,0.0,False,False
2,1,2025-01-01 00:44:04,2025-01-01 00:46:01,1.0,0.6,1.0,Standard,N,141,141,...,0.5,2.0,0.0,1.0,12.1,2.5,0.0,0.0,False,False


Repitiendo la misma lógica la transformaciona a mis columnas "payment_type", "store_and_fwd_flag"

In [29]:
payment_map = {
        1: "Credit card",
        2: "Cash",
        3: "No charge",
        4: "Dispute",
        5: "Unknown",
        6: "Voided trip"
    }

payment_map_es = {
    1: "T.Crédito",
    2: "Efectivo",
    3: "Sin cargo",
    4: "Reclamación",
    5: "Desconocido",
    6: "Viaje anulado"
}

df["PaymentType"] = df["payment_type"].map(payment_map_es).fillna("Unknown")
# Reordenar columnas: insertar RateCode justo después de RatecodeID
cols = df.columns.tolist()
if "PaymentType" in cols:
    cols.remove("PaymentType")  # Eliminar la versión que quedó al final
idx = cols.index("payment_type")
cols = cols[:idx+1] + ["PaymentType"] + cols[idx+1:]
df = df[cols]

df.head(3)

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,RateCode,store_and_fwd_flag,PULocationID,DOLocationID,...,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee,cbd_congestion_fee,xx_passenger_count_imputed,xx_invalid_record
0,1,2025-01-01 00:18:38,2025-01-01 00:26:59,1.0,1.6,1.0,Standard,N,229,237,...,0.5,3.0,0.0,1.0,18.0,2.5,0.0,0.0,False,False
1,1,2025-01-01 00:32:40,2025-01-01 00:35:13,1.0,0.5,1.0,Standard,N,236,237,...,0.5,2.02,0.0,1.0,12.12,2.5,0.0,0.0,False,False
2,1,2025-01-01 00:44:04,2025-01-01 00:46:01,1.0,0.6,1.0,Standard,N,141,141,...,0.5,2.0,0.0,1.0,12.1,2.5,0.0,0.0,False,False


In [30]:
# La columna StoreAndFwd (store_and_fwd_flag) indica si los datos del viaje fueron almacenados 
# temporalmente en el dispositivo del taxi antes de ser enviados al servidor central. Esto 
# suele ocurrir por problemas de conectividad en tiempo real.

flag_map = {
        "Y": "Stored and forwarded",
        "N": "Not stored",
        None: "Unknown"
    }
df["StoreAndFwd"] = df["store_and_fwd_flag"].map(flag_map).fillna("Unknown")
# Reordenar columnas: insertar RateCode justo después de RatecodeID
cols = df.columns.tolist()
if "StoreAndFwd" in cols:
    cols.remove("StoreAndFwd")  # Eliminar la versión que quedó al final
idx = cols.index("store_and_fwd_flag")
cols = cols[:idx+1] + ["StoreAndFwd"] + cols[idx+1:]
df = df[cols]

df.head(3)

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,RateCode,store_and_fwd_flag,StoreAndFwd,PULocationID,...,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee,cbd_congestion_fee,xx_passenger_count_imputed,xx_invalid_record
0,1,2025-01-01 00:18:38,2025-01-01 00:26:59,1.0,1.6,1.0,Standard,N,Not stored,229,...,0.5,3.0,0.0,1.0,18.0,2.5,0.0,0.0,False,False
1,1,2025-01-01 00:32:40,2025-01-01 00:35:13,1.0,0.5,1.0,Standard,N,Not stored,236,...,0.5,2.02,0.0,1.0,12.12,2.5,0.0,0.0,False,False
2,1,2025-01-01 00:44:04,2025-01-01 00:46:01,1.0,0.6,1.0,Standard,N,Not stored,141,...,0.5,2.0,0.0,1.0,12.1,2.5,0.0,0.0,False,False


In [31]:
# Elimino columnas innecesarias
df = df.drop(columns=["RatecodeID"])
df = df.drop(columns=["payment_type"])
df = df.drop(columns=["store_and_fwd_flag"])

df.head(3)

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RateCode,StoreAndFwd,PULocationID,DOLocationID,PaymentType,...,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee,cbd_congestion_fee,xx_passenger_count_imputed,xx_invalid_record
0,1,2025-01-01 00:18:38,2025-01-01 00:26:59,1.0,1.6,Standard,Not stored,229,237,T.Crédito,...,0.5,3.0,0.0,1.0,18.0,2.5,0.0,0.0,False,False
1,1,2025-01-01 00:32:40,2025-01-01 00:35:13,1.0,0.5,Standard,Not stored,236,237,T.Crédito,...,0.5,2.02,0.0,1.0,12.12,2.5,0.0,0.0,False,False
2,1,2025-01-01 00:44:04,2025-01-01 00:46:01,1.0,0.6,Standard,Not stored,141,141,T.Crédito,...,0.5,2.0,0.0,1.0,12.1,2.5,0.0,0.0,False,False


### 1.2.3 Preparando DataFrame para Exportar.

In [32]:
# Excluyo los registros marcados como No V}alidos
df_validos = df[~df["xx_invalid_record"]].copy()

print(f"Total de registros válidos: {df_validos.shape[0]}")
print("Distribución de passenger_count en datos válidos:")
print(df_validos["passenger_count"].value_counts(dropna=False).sort_index())


Total de registros válidos: 3346999
Distribución de passenger_count en datos válidos:
passenger_count
0.0         89
1.0    2758923
2.0     407761
3.0      91409
4.0      59009
5.0      17786
6.0      12004
7.0          4
8.0         11
9.0          3
Name: count, dtype: int64


In [33]:
# Forzar tipos antes de exportar
df_validos["VendorID"] = df["VendorID"].astype("int32")
df_validos["passenger_count"] = df["passenger_count"].fillna(0).astype("int32")

print(df_validos.dtypes)
df_validos.head(3)

VendorID                               int32
tpep_pickup_datetime          datetime64[us]
tpep_dropoff_datetime         datetime64[us]
passenger_count                        int32
trip_distance                        float64
RateCode                              object
StoreAndFwd                           object
PULocationID                           int32
DOLocationID                           int32
PaymentType                           object
fare_amount                          float64
extra                                float64
mta_tax                              float64
tip_amount                           float64
tolls_amount                         float64
improvement_surcharge                float64
total_amount                         float64
congestion_surcharge                 float64
Airport_fee                          float64
cbd_congestion_fee                   float64
xx_passenger_count_imputed              bool
xx_invalid_record                       bool
dtype: obj

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RateCode,StoreAndFwd,PULocationID,DOLocationID,PaymentType,...,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee,cbd_congestion_fee,xx_passenger_count_imputed,xx_invalid_record
0,1,2025-01-01 00:18:38,2025-01-01 00:26:59,1,1.6,Standard,Not stored,229,237,T.Crédito,...,0.5,3.0,0.0,1.0,18.0,2.5,0.0,0.0,False,False
1,1,2025-01-01 00:32:40,2025-01-01 00:35:13,1,0.5,Standard,Not stored,236,237,T.Crédito,...,0.5,2.02,0.0,1.0,12.12,2.5,0.0,0.0,False,False
2,1,2025-01-01 00:44:04,2025-01-01 00:46:01,1,0.6,Standard,Not stored,141,141,T.Crédito,...,0.5,2.0,0.0,1.0,12.1,2.5,0.0,0.0,False,False


In [34]:
print(df_validos.isnull().sum()) # Muestra valores nulos

VendorID                           0
tpep_pickup_datetime               0
tpep_dropoff_datetime              0
passenger_count                    0
trip_distance                      0
RateCode                           0
StoreAndFwd                        0
PULocationID                       0
DOLocationID                       0
PaymentType                        0
fare_amount                        0
extra                              0
mta_tax                            0
tip_amount                         0
tolls_amount                       0
improvement_surcharge              0
total_amount                       0
congestion_surcharge          413394
Airport_fee                   413394
cbd_congestion_fee                 0
xx_passenger_count_imputed         0
xx_invalid_record                  0
dtype: int64


## 1.3. Carga.

In [35]:
import os

ruta = "../data/processed/yellow_tripdata_validado.csv"

try:
    df_validos.to_csv(ruta, index=False)
    if os.path.exists(ruta):
        print(f"✔️ Dataset guardado con éxito en: {ruta}")
    else:
        print("⚠️ No se encontró el archivo después de guardar.")
except Exception as e:
    print(f"❌ Error al guardar el dataset: {e}")


✔️ Dataset guardado con éxito en: ../data/processed/yellow_tripdata_validado.csv


In [36]:

df_validos.info()
df_validos.head(3)

<class 'pandas.core.frame.DataFrame'>
Index: 3346999 entries, 0 to 3475225
Data columns (total 22 columns):
 #   Column                      Dtype         
---  ------                      -----         
 0   VendorID                    int32         
 1   tpep_pickup_datetime        datetime64[us]
 2   tpep_dropoff_datetime       datetime64[us]
 3   passenger_count             int32         
 4   trip_distance               float64       
 5   RateCode                    object        
 6   StoreAndFwd                 object        
 7   PULocationID                int32         
 8   DOLocationID                int32         
 9   PaymentType                 object        
 10  fare_amount                 float64       
 11  extra                       float64       
 12  mta_tax                     float64       
 13  tip_amount                  float64       
 14  tolls_amount                float64       
 15  improvement_surcharge       float64       
 16  total_amount           

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RateCode,StoreAndFwd,PULocationID,DOLocationID,PaymentType,...,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee,cbd_congestion_fee,xx_passenger_count_imputed,xx_invalid_record
0,1,2025-01-01 00:18:38,2025-01-01 00:26:59,1,1.6,Standard,Not stored,229,237,T.Crédito,...,0.5,3.0,0.0,1.0,18.0,2.5,0.0,0.0,False,False
1,1,2025-01-01 00:32:40,2025-01-01 00:35:13,1,0.5,Standard,Not stored,236,237,T.Crédito,...,0.5,2.02,0.0,1.0,12.12,2.5,0.0,0.0,False,False
2,1,2025-01-01 00:44:04,2025-01-01 00:46:01,1,0.6,Standard,Not stored,141,141,T.Crédito,...,0.5,2.0,0.0,1.0,12.1,2.5,0.0,0.0,False,False


# 2 Celda Temp - Tools Varias

## 2.1 Backup

In [37]:
# Temporalmente creo un copia de mi df para ir probando las distintas conversiones y o uniones, 
# y no tener que estar realizando la extraccion nuevamente
df_backup = df.copy(deep=True)
df.head(3)

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RateCode,StoreAndFwd,PULocationID,DOLocationID,PaymentType,...,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee,cbd_congestion_fee,xx_passenger_count_imputed,xx_invalid_record
0,1,2025-01-01 00:18:38,2025-01-01 00:26:59,1.0,1.6,Standard,Not stored,229,237,T.Crédito,...,0.5,3.0,0.0,1.0,18.0,2.5,0.0,0.0,False,False
1,1,2025-01-01 00:32:40,2025-01-01 00:35:13,1.0,0.5,Standard,Not stored,236,237,T.Crédito,...,0.5,2.02,0.0,1.0,12.12,2.5,0.0,0.0,False,False
2,1,2025-01-01 00:44:04,2025-01-01 00:46:01,1.0,0.6,Standard,Not stored,141,141,T.Crédito,...,0.5,2.0,0.0,1.0,12.1,2.5,0.0,0.0,False,False


## 2.2 Restauración

In [38]:
# Restauro mi df_bkp
df = df_backup.copy(deep=True)
df.head(3)

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RateCode,StoreAndFwd,PULocationID,DOLocationID,PaymentType,...,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee,cbd_congestion_fee,xx_passenger_count_imputed,xx_invalid_record
0,1,2025-01-01 00:18:38,2025-01-01 00:26:59,1.0,1.6,Standard,Not stored,229,237,T.Crédito,...,0.5,3.0,0.0,1.0,18.0,2.5,0.0,0.0,False,False
1,1,2025-01-01 00:32:40,2025-01-01 00:35:13,1.0,0.5,Standard,Not stored,236,237,T.Crédito,...,0.5,2.02,0.0,1.0,12.12,2.5,0.0,0.0,False,False
2,1,2025-01-01 00:44:04,2025-01-01 00:46:01,1.0,0.6,Standard,Not stored,141,141,T.Crédito,...,0.5,2.0,0.0,1.0,12.1,2.5,0.0,0.0,False,False


## 2.3 Peso de los DataFrames

In [39]:
try:
    peso_mb_df = df.memory_usage(deep=True).sum() / (1024 ** 2)
    print(f"Peso del DataFrame df:         {peso_mb_df:.2f} MB")
    peso_mb_df_zonas = df_zonas.memory_usage(deep=True).sum() / (1024 ** 2)
    print(f"Peso del DataFrame df_zonas:   {peso_mb_df_zonas:.2f} MB")
    peso_mb_df_validos = df_validos.memory_usage(deep=True).sum() / (1024 ** 2)
    print(f"Peso del DataFrame df_validos: {peso_mb_df_validos:.2f} MB")
except:
    pass

Peso del DataFrame df:         1127.90 MB
Peso del DataFrame df_zonas:   0.05 MB
Peso del DataFrame df_validos: 1101.74 MB
