In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor


In [2]:
# Locations of the two input CSV files, relative to the notebook's directory.
file_path_taxi = '../Datasets/yellow_tripdata_2023-09s.csv'
file_path_boroughs = '../Datasets/taxi+_zone_lookup.csv'

# Load the taxi trip records. low_memory=False makes pandas parse the file in
# a single pass instead of in chunks, so column dtypes are inferred from the
# whole column.
taxi_data = pd.read_csv(file_path_taxi, low_memory=False)

# Load the zone lookup table used later to map location IDs to boroughs.
borough_data = pd.read_csv(file_path_boroughs)

In [3]:
# Preprocessing and borough mapping

# Parse the pickup timestamp so the .dt accessor can be used downstream.
taxi_data['tpep_pickup_datetime'] = pd.to_datetime(taxi_data['tpep_pickup_datetime'])

# Attach the pickup borough and then the drop-off borough. The lookup's
# 'Borough' column is renamed before each merge, and the lookup key
# ('LocationID') is dropped right after it, so the two merges never collide
# on column names. Left joins keep every trip, including those whose
# location ID has no match in the lookup table.
taxi_data_with_boroughs = (
    taxi_data
    .merge(
        borough_data[['LocationID', 'Borough']].rename(columns={'Borough': 'PUBorough'}),
        left_on='PULocationID',
        right_on='LocationID',
        how='left',
    )
    .drop(columns='LocationID')
    .merge(
        borough_data[['LocationID', 'Borough']].rename(columns={'Borough': 'DOBorough'}),
        left_on='DOLocationID',
        right_on='LocationID',
        how='left',
    )
    .drop(columns='LocationID')
)

In [4]:
# Grouping and analysis

# Derive the hour of day and the day of week from the pickup timestamp.
taxi_data_with_boroughs = taxi_data_with_boroughs.assign(
    pickup_hour=taxi_data_with_boroughs['tpep_pickup_datetime'].dt.hour,
    pickup_day_of_week=taxi_data_with_boroughs['tpep_pickup_datetime'].dt.dayofweek,
)

# Count trips per (pickup borough, day of week, hour). The calendar date is
# not a grouping key, so each count is the total over every date in the file
# that shares that weekday and hour.
grouped_data = (
    taxi_data_with_boroughs
    .groupby(['PUBorough', 'pickup_day_of_week', 'pickup_hour'])
    .size()
    .rename('trip_count')
    .reset_index()
)

In [5]:
# Modelling preparation

# One-hot encode the pickup borough; columns are named 'Borough_<name>'.
one_hot_boroughs = pd.get_dummies(grouped_data['PUBorough'], prefix='Borough')

# Swap the categorical borough column for its one-hot columns. Both frames
# share grouped_data's index, so the join aligns them row by row.
model_data = grouped_data.drop(columns='PUBorough').join(one_hot_boroughs)

# Target is the trip count; every remaining column is a feature.
y = model_data['trip_count']
X = model_data.drop(columns='trip_count')

# Hold out 30% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [6]:
# Model training

# fit() returns the estimator itself, so construction and fitting can be
# chained; the seed makes the forest reproducible.
rf_model = RandomForestRegressor(random_state=42).fit(X_train, y_train)

# Bare expression so the cell still displays the fitted estimator.
rf_model

In [7]:
# Prediction helper
def predict_taxi_demand(pickup_day_of_week, pickup_hour, borough):
    """Predict the trip count for a weekday, hour and pickup borough.

    The single-row input is built from the feature names the fitted model
    recorded (``rf_model.feature_names_in_``), so it always matches the
    columns and column order used for training, even when the training data
    did not contain every borough. If the model exposes no feature names,
    the original fixed column layout is used.

    Args:
        pickup_day_of_week: Day of week in the same encoding as the training
            feature (pandas ``dt.dayofweek``: Monday=0 ... Sunday=6).
        pickup_hour: Hour of day in the same encoding as the training
            feature (pandas ``dt.hour``).
        borough: Pickup borough name; it must correspond to one of the
            model's ``Borough_<name>`` columns.

    Returns:
        The model's prediction for that row, clamped so it is never negative.

    Raises:
        ValueError: If ``borough`` has no matching one-hot column. An
            all-zero borough encoding never occurs in training, so predicting
            from it would silently return a meaningless number.
    """
    # Column layout used when the model does not expose feature names.
    default_columns = [
        'pickup_day_of_week',
        'pickup_hour',
        'Borough_Bronx',
        'Borough_Brooklyn',
        'Borough_EWR',
        'Borough_Manhattan',
        'Borough_Queens',
        'Borough_Staten Island',
        'Borough_Unknown',
    ]
    trained_columns = getattr(rf_model, 'feature_names_in_', None)
    feature_columns = list(trained_columns) if trained_columns is not None else default_columns

    borough_column = f'Borough_{borough}'
    if borough_column not in feature_columns:
        known_boroughs = sorted(
            name[len('Borough_'):] for name in feature_columns if name.startswith('Borough_')
        )
        raise ValueError(
            f"Unknown borough {borough!r}; expected one of {known_boroughs}"
        )

    # Start from an all-zero row, then fill in the two numeric features and
    # switch on the one-hot column for the requested borough.
    row = {column: 0 for column in feature_columns}
    row['pickup_day_of_week'] = pickup_day_of_week
    row['pickup_hour'] = pickup_hour
    row[borough_column] = 1
    input_data = pd.DataFrame([row], columns=feature_columns)

    # Run the prediction; demand can never be negative, so clamp at zero.
    predicted_demand = rf_model.predict(input_data)
    return max(0, predicted_demand[0])


In [9]:

# Try out the prediction helper: day-of-week code 2, hour 18, Manhattan pickups.
example_prediction = predict_taxi_demand(
    pickup_day_of_week=2,
    pickup_hour=18,
    borough='Manhattan',
)
# The prediction is truncated to a whole number of trips for display.
print("Predicción de demanda:", int(example_prediction), "viajes")


Predicción de demanda: 29081 viajes


In [10]:
import os

import joblib

# Persist the trained model to disk.
model_filename = '../Modelo/random_forest_model.joblib'

# joblib.dump opens the target path for writing and does not create missing
# parent directories, so make sure the output folder exists first (for
# example on a fresh checkout).
os.makedirs(os.path.dirname(model_filename), exist_ok=True)

# Last expression of the cell: joblib.dump returns the list of files written.
joblib.dump(rf_model, model_filename)


['../Modelo/random_forest_model.joblib']