# NYC Yellow Taxi

## Paso 0: Problema de negocio

### Paso 0.1 Contexto

### Paso 0.2: Problema

### Paso 0.3: Objetivos

## Paso 1: Importar Librerías

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import kagglehub
import shutil
import os


## Paso 2: Cargar los datos

In [3]:
# Fetch the dataset through kagglehub; the call returns the local directory
# that holds the downloaded files.
path = kagglehub.dataset_download("elemento/nyc-yellow-taxi-trip-data")
print("Path to dataset files:", path)

# Destination: a 'data' directory one level above the current working directory.
target_dir = os.path.abspath(os.path.join(os.getcwd(), "../data"))
os.makedirs(target_dir, exist_ok=True)  # no-op when the folder already exists

# Relocate every entry of the download directory into target_dir.
# NOTE(review): moving (not copying) leaves the kagglehub download directory
# empty — confirm a later dataset_download() call still restores the files
# if ../data is ever deleted.
for file_name in os.listdir(path):
    src, dst = (os.path.join(base, file_name) for base in (path, target_dir))
    shutil.move(src, dst)

print("Files moved to:", target_dir)

Downloading from https://www.kaggle.com/api/v1/datasets/download/elemento/nyc-yellow-taxi-trip-data?dataset_version_number=2...


100%|██████████| 1.78G/1.78G [09:28<00:00, 3.37MB/s]

Extracting files...





Path to dataset files: C:\Users\guill\.cache\kagglehub\datasets\elemento\nyc-yellow-taxi-trip-data\versions\2
Files moved to: C:\Users\guill\Documents\GitHub\nyc-yellow-taxi\data


In [None]:
# Load the March 2016 trips from the local data folder populated by the
# download cell (target_dir) instead of a Colab-only Google Drive mount path,
# which does not exist on the machine where the files were moved.
df = pd.read_csv(os.path.join(target_dir, "yellow_tripdata_2016-03.csv"))

In [None]:
# Dimensions of the raw DataFrame as (rows, columns).
df.shape

In [None]:
# Preview the first rows of the raw DataFrame.
df.head()

In [None]:
# Preview the last rows of the raw DataFrame.
df.tail()

## Paso 3: Análisis de datos

In [None]:
# Print the column names, non-null counts, dtypes and memory usage.
df.info()

In [None]:
# Data type of every column in the raw DataFrame.
df.dtypes

In [None]:
# Columns to summarise.
# The list previously named 'tip_amount' twice, which selected that column
# twice; the second entry is presumably meant to be 'tolls_amount'
# (TODO confirm). The membership filter below drops it if the column is absent.
selected_columns = [
    'passenger_count', 'trip_distance', 'fare_amount', 'extra',
    'mta_tax', 'tip_amount', 'tolls_amount', 'total_amount',
]
# Keep only the names that actually exist in df so the selection cannot raise KeyError.
numerical_columns = [col for col in selected_columns if col in df.columns]

# Compute summary statistics: one row per column after the transpose,
# with the aggregation names replaced by readable labels.
statistics = df[numerical_columns].agg(
    ['mean', 'median', 'std', 'var', 'min', 'max', 'skew', 'kurt']
).T.rename(columns={
    "mean": "Mean",
    "median": "Median",
    "std": "Standard Deviation",
    "var": "Variance",
    "min": "Minimum",
    "max": "Maximum",
    "skew": "Skewness",
    "kurt": "Kurtosis"
})

# Display results
print("\nMedidas de tendencia central y dispersión:\n")
statistics

## Paso 4: Gestión de duplicaciones y valores faltantes

### 4.1 Manejo de duplicaciones

In [None]:
# Number of rows that are exact repeats of an earlier row.
duplicates = df.duplicated().sum()

In [None]:
# Report the duplicate count computed in the previous cell.
print(f"Número de registros duplicados: {duplicates}")

In [None]:
# Remove duplicated rows, keeping one occurrence of each.
df = df.drop_duplicates()

### 4.2 Manejo de valores faltantes

In [None]:
# Row count after duplicate removal.
len(df)

In [None]:
# Count nulls per column once (the frame was previously scanned twice),
# then keep only the columns that have at least one missing value.
missing_data = df.isnull().sum().loc[lambda null_counts: null_counts > 0]

In [None]:
# Display the per-column missing-value counts (columns without nulls are omitted).
missing_data

## Paso 5: Reducción de Datos

In [None]:
# Build the working frame without the identifier, flag and coordinate columns.
data = df.drop(
    columns=[
        "VendorID",
        "RatecodeID",
        "store_and_fwd_flag",
        "pickup_longitude",
        "pickup_latitude",
        "dropoff_longitude",
        "dropoff_latitude",
    ]
)

## Paso 6: Ingeniería de Características (Feature Engineering)

### Paso 6.1: Crear Características

In [None]:
# Preview the reduced frame before adding derived features.
data.head()

In [None]:
# Parse both timestamp columns into datetime values.
for stamp_col in ('tpep_pickup_datetime', 'tpep_dropoff_datetime'):
    data[stamp_col] = pd.to_datetime(data[stamp_col])

# Elapsed time between pickup and dropoff, stored as a timedelta column.
data['trip_duration'] = data['tpep_dropoff_datetime'].sub(data['tpep_pickup_datetime'])

# The same duration expressed in minutes (seconds / 60).
data['trip_duration_minutes'] = data['trip_duration'].dt.total_seconds().div(60)

In [None]:
# Preview the frame with the new trip_duration columns.
data.head()

## Paso 7: Limpieza y Manipulación de Datos

In [None]:
# The pickup timestamp is no longer needed once the duration has been derived.
data = data.drop(columns=["tpep_pickup_datetime"])

In [None]:
# The dropoff timestamp is no longer needed once the duration has been derived.
data = data.drop(columns=["tpep_dropoff_datetime"])

In [None]:
# Keep only trips whose total charge is zero or positive.
non_negative_total = data['total_amount'] >= 0
data = data.loc[non_negative_total]

In [None]:
# Independent copy of `data`, taken right after the non-negative total_amount filter.
# NOTE(review): the plots in Paso 9 label data_cleaned as "Después de remover
# atípicos", but no outlier filter is applied to it in the visible code, so it
# holds the same rows as `data` — confirm whether an outlier-removal step is missing.
data_cleaned = data.copy()

## Paso 8: Agrupar las variables según el tipo

### Paso 8.1: Guardar la variable objetivo

In [None]:
# Name of the numeric target column; it is excluded when vars_nums is built.
target_var_num = "total_amount"

In [None]:
# Placeholder for a categorical target; the empty string is compared against
# column names when vars_cat is built.
target_var_cat = ""

### Paso 8.2: Guardar las variables categóricas

In [None]:
# Preview the cleaned frame before grouping its columns by type.
data.head()

In [None]:
# Categorical variables: object-dtype columns plus payment_type (treated as a
# category by name), leaving out the categorical target.
vars_cat = [
    var
    for var in data.columns
    if var != target_var_cat
    and (data[var].dtype == "O" or var == "payment_type")
]

In [None]:
# Display the categorical variable names.
vars_cat

### Paso 8.3: Guardar las variables numéricas

In [None]:
# Numeric predictors: columns with a genuinely numeric dtype that are neither
# the target nor already classified as categorical.
# is_numeric_dtype is used instead of `dtype != "O"` because the latter also
# lets the timedelta column `trip_duration` through, which would then be sent
# to the continuous-variable histogram/KDE plots; the same information is
# already available as the float column trip_duration_minutes.
vars_nums = [
    var
    for var in data.columns
    if pd.api.types.is_numeric_dtype(data[var])
    and var != target_var_num
    and var not in vars_cat
]

In [None]:
# Display the numeric variable names.
vars_nums

### Paso 8.4: Variables discretas y continuas

In [None]:
# Split the numeric variables by dtype: integer columns are treated as
# discrete, every other numeric column as continuous.
vars_disc = [col for col in vars_nums if pd.api.types.is_integer_dtype(data[col])]
vars_con = [col for col in vars_nums if not pd.api.types.is_integer_dtype(data[col])]

In [None]:
# Display the discrete (integer-dtype) variable names.
vars_disc

In [None]:
# Display the continuous (non-integer numeric) variable names.
vars_con

## Paso 9: Análisis Univariado

### Paso 9.1: Análisis Univariado de Variables Categóricas

In [None]:
# One count plot per categorical variable, bars ordered from most to least frequent.
for var in vars_cat:
    order = data[var].value_counts().index
    plt.figure(figsize=(12, 6))
    ax = sns.countplot(data=data, x=var, order=order)
    ax.set_title(f'Distribución de {var}', fontsize=14)
    ax.set_xlabel(var)
    ax.set_ylabel('Frecuencia')
    plt.show()

### Paso 9.2: Análisis Univariado de Variables Numéricas

#### Paso 9.2.1: Análisis Univariado de Variables Discretas

In [None]:
# One count plot per discrete variable, with categories in seaborn's default order.
for var in vars_disc:
    plt.figure(figsize=(10, 6))
    ax = sns.countplot(data=data, x=var)
    ax.set_title(f'Distribución de {var}', fontsize=14)
    ax.set_xlabel(var)
    ax.set_ylabel('Frecuencia')
    plt.show()

#### Paso 9.2.2: Análisis Univariado de Variables Continuas

In [None]:
# For every continuous variable draw two side-by-side histograms (with KDE):
# left from `data`, right from `data_cleaned`.
for var in vars_con:
    fig, axes = plt.subplots(1, 2, figsize=(16, 6))
    panels = (
        (data, f'Distribución de {var} (Antes de remover atípicos)'),
        (data_cleaned, f'Distribución de {var} (Después de remover atípicos)'),
    )
    for ax, (frame, title) in zip(axes, panels):
        sns.histplot(frame[var], kde=True, bins=30, ax=ax)
        ax.set_title(title, fontsize=14)
        ax.set_xlabel(var)
        ax.set_ylabel('Frecuencia')
    plt.show()

### Paso 9.3: Análisis Univariado de la Variable Objetivo

In [None]:
# Side-by-side histograms (with KDE) of the target variable:
# left from `data`, right from `data_cleaned`.
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
panels = (
    (data, f'Distribución de {target_var_num} (Antes de remover atípicos)'),
    (data_cleaned, f'Distribución de {target_var_num} (Después de remover atípicos)'),
)
for ax, (frame, title) in zip(axes, panels):
    sns.histplot(frame[target_var_num], kde=True, bins=10, ax=ax)
    ax.set_title(title, fontsize=14)
    ax.set_xlabel(target_var_num)
    ax.set_ylabel('Frecuencia')
plt.show()

## Paso 10: Análisis Bivariado

### Paso 10.1: Análisis Bivariado: Relación con el Precio