In [7]:
# # Configura el entorno de trabajo
%reload_ext autoreload
%autoreload 2

In [8]:
import sys
from pathlib import Path
from datetime import datetime

# Añade src al path
sys.path.append(str(Path().resolve().parent / "src"))

import config

In [9]:
import hopsworks

# Conecta a Hopsworks
project = hopsworks.login(
    project=config.HOPSWORKS_PROJECT_NAME,
    api_key_value=config.HOPSWORKS_API_KEY
)

# Conecta a la API de Feature Store
feature_store = project.get_feature_store()

# Conecta al grupo de características
feature_group = feature_store.get_feature_group(
    name=config.FEATURE_GROUP_NAME,
    version=config.FEATURE_GROUP_VERSION,
)

2025-06-04 14:03:21,817 INFO: Initializing external client
2025-06-04 14:03:21,818 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-06-04 14:03:23,093 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1224869


In [10]:
# Crea una vista de características (si no existe ya)
# Esta vista de características usa solo un grupo de características, por lo que la consulta es trivial
try:
    # Crea la vista de características si no existe
    feature_store.create_feature_view(
        name=config.FEATURE_VIEW_NAME,
        version=config.FEATURE_VIEW_VERSION,
        query=feature_group.select_all()
    )
except:
    print('Feature view already existed. Skip creation.')

Feature view already existed. Skip creation.


In [11]:
# obtenemos la vista de características
feature_view = feature_store.get_feature_view(
    name=config.FEATURE_VIEW_NAME,
    version=config.FEATURE_VIEW_VERSION
)

In [12]:
# obtenemos la vista de características necesaria para el entrenamiento
ts_data, _ = feature_view.training_data(
    description='Time-series hourly taxi rides',
)

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (131.34s) 




In [13]:
ts_data.head()

Unnamed: 0,pickup_hour,pickup_location_id,rides
0,2024-10-16 16:00:00+00:00,118,0
1,2024-06-30 06:00:00+00:00,226,9
2,2024-07-25 09:00:00+00:00,132,163
3,2024-06-15 16:00:00+00:00,138,163
4,2024-09-19 03:00:00+00:00,246,35


In [14]:
import pandas as pd

ts_data['pickup_hour'] = pd.to_datetime(ts_data['pickup_hour'])

ts_data.describe(include='all')

Unnamed: 0,pickup_hour,pickup_location_id,rides
count,2854812,2854812.0,2854812.0
mean,2024-08-16 14:10:23.051606784+00:00,133.2348,18.05797
min,2024-01-01 00:00:00+00:00,1.0,0.0
25%,2024-04-23 01:00:00+00:00,66.0,0.0
50%,2024-08-14 03:00:00+00:00,134.0,0.0
75%,2024-12-05 05:00:00+00:00,200.0,3.0
max,2025-05-08 16:00:00+00:00,265.0,1239.0
std,,76.74766,55.19342


In [None]:
from data import transform_ts_data_into_features_and_target

features, targets = transform_ts_data_into_features_and_target(
    ts_data,
    input_seq_len=24,
    step_size=23,
)

features_and_target = features.copy()
features_and_target['target_rides_next_hour'] = targets

print(f'{features_and_target.shape=}')

100%|██████████| 263/263 [00:10<00:00, 25.37it/s]

features_and_target.shape=(123875, 27)





In [16]:
import pandas as pd

# Convertimos la columna de fecha y hora a un objeto datetime

features_and_target['pickup_hour'] = pd.to_datetime(features_and_target['pickup_hour'])

In [17]:
from datetime import date, timedelta
from pytz import timezone
import pandas as pd
from data_split import train_test_split

# training data -> from January 2025 up until 2 months ago
# test data -> last 2 months
cutoff_date = pd.to_datetime(date.today() - timedelta(days=28*1), utc=True)

print(f'{cutoff_date=}')

X_train, y_train, X_test, y_test = train_test_split(
    features_and_target,
    cutoff_date,
    target_column_name='target_rides_next_hour'   
)


# eliminamos la columna de fecha y hora
X_train = X_train.drop(columns=['pickup_hour'])
X_test = X_test.drop(columns=['pickup_hour'])

print(f'{X_train.shape=}')
print(f'{y_train.shape=}')
print(f'{X_test.shape=}')
print(f'{y_test.shape=}')

cutoff_date=Timestamp('2025-05-07 00:00:00+0000', tz='UTC')
X_train.shape=(123621, 25)
y_train.shape=(123621,)
X_test.shape=(254, 25)
y_test.shape=(254,)


In [18]:
X_train.tail()

Unnamed: 0,rides_previous_24_hour,rides_previous_23_hour,rides_previous_22_hour,rides_previous_21_hour,rides_previous_20_hour,rides_previous_19_hour,rides_previous_18_hour,rides_previous_17_hour,rides_previous_16_hour,rides_previous_15_hour,...,rides_previous_9_hour,rides_previous_8_hour,rides_previous_7_hour,rides_previous_6_hour,rides_previous_5_hour,rides_previous_4_hour,rides_previous_3_hour,rides_previous_2_hour,rides_previous_1_hour,pickup_location_id
123616,0.0,3.0,0.0,1.0,0.0,1.0,2.0,3.0,0.0,2.0,...,2.0,2.0,1.0,1.0,0.0,2.0,0.0,0.0,2.0,129
123617,2.0,2.0,1.0,3.0,3.0,0.0,1.0,1.0,4.0,1.0,...,3.0,2.0,1.0,1.0,1.0,0.0,7.0,4.0,4.0,129
123618,4.0,3.0,1.0,2.0,10.0,9.0,3.0,0.0,4.0,2.0,...,2.0,2.0,2.0,1.0,1.0,1.0,3.0,0.0,3.0,129
123619,3.0,3.0,4.0,0.0,1.0,2.0,5.0,2.0,1.0,3.0,...,2.0,2.0,2.0,1.0,2.0,1.0,0.0,1.0,3.0,129
123620,3.0,4.0,1.0,3.0,1.0,0.0,0.0,0.0,0.0,2.0,...,2.0,2.0,5.0,5.0,2.0,0.0,1.0,1.0,0.0,129


In [19]:
y_train.tail()

123616    2.0
123617    3.0
123618    3.0
123619    4.0
123620    1.0
Name: target_rides_next_hour, dtype: float32

In [25]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
# from sklearn.neighbors import KNeighborsRegressor
# from sklearn.svm import SVR
from sklearn.linear_model import Ridge, Lasso, ElasticNet
# from xgboost import XGBRegressor
# from lightgbm import LGBMRegressor
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
# Importamos mlflow para el seguimiento de experimentos
import mlflow
import mlflow.sklearn

# Importamos la configuración de logging
import logging

logging.basicConfig(level=logging.WARN)
logger = logging.getLogger(__name__)

In [24]:
# Configuración de MLflow
mlflow.set_tracking_uri("mlruns")  # En local, cambia si usas server
mlflow.set_experiment("TaxiDemandRegression")

2025/06/04 14:14:47 INFO mlflow.tracking.fluent: Experiment with name 'TaxiDemandRegression' does not exist. Creating a new experiment.


<Experiment: artifact_location='/Users/cdonairem/Documents/Workspace/proyecto_mlops_prueba_poetry/notebooks/mlruns/593164492010380515', creation_time=1749039287979, experiment_id='593164492010380515', last_update_time=1749039287979, lifecycle_stage='active', name='TaxiDemandRegression', tags={}>

In [27]:
# Definimos los modelos a entrenar
with mlflow.start_run(run_name="LinearRegression"):
    # Entrenamos el modelo
    model = LinearRegression()
    model.fit(X_train, y_train)
    # Realizamos predicciones
    y_pred = model.predict(X_test)

    # Calcular métricas
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Log de los parámetros
    mlflow.log_param("model", "LinearRegression")
    mlflow.log_params(model.get_params())  # coeficiente y intercepto
    # Log de las métricas
    mlflow.log_metric("mse", mse)
    mlflow.log_metric("mae", mae)
    mlflow.log_metric("r2", r2)
    # Log del modelo
    mlflow.sklearn.log_model(model, "model")
    # Imprimir resultados
    print(f"LinearRegression -> MSE: {mse:.4f}, R2: {r2:.4f}")



LinearRegression -> MSE: 84.2003, R2: 0.9857


In [26]:
# Definimos los modelos a entrenar
models = {

    "LinearRegression": LinearRegression(),
    "Ridge": Ridge(alpha=1.0),
    "Lasso": Lasso(alpha=0.1),
    "RandomForest": RandomForestRegressor(n_estimators=100),
    "GradientBoosting": GradientBoostingRegressor(n_estimators=100)
}

In [41]:
# Entrenamos y evaluamos varios modelos a la vez
for model_name, model in models.items():
    with mlflow.start_run(run_name=model_name):
        # Entrenamos el modelo
        model.fit(X_train, y_train)

        # Hacemos predicciones
        y_pred = model.predict(X_test)

        # Calculamos las métricas
        mae = mean_absolute_error(y_test, y_pred)
        mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)

        # Registramos las métricas en MLflow
        mlflow.log_metric("mae", mae)
        mlflow.log_metric("mse", mse)
        mlflow.log_metric("r2", r2)

        mlflow.log_param("model", model_name)
        mlflow.log_params(model.get_params())

        # Registramos el modelo
        mlflow.sklearn.log_model(model, "model")

        logger.info(f"Modelo: {model_name}, MAE: {mae}, MSE: {mse}, R2: {r2}")



2025-06-04 19:03:58,948 INFO: Modelo: LinearRegression, MAE: 3.5303730737216292, MSE: 84.20030508054398, R2: 0.9857374568656027




2025-06-04 19:04:01,553 INFO: Modelo: Ridge, MAE: 3.530372952620165, MSE: 84.20029799130599, R2: 0.9857374580664364




2025-06-04 19:04:04,960 INFO: Modelo: Lasso, MAE: 3.5251848695430557, MSE: 83.85928341670531, R2: 0.9857952219317222




2025-06-04 19:05:41,240 INFO: Modelo: RandomForest, MAE: 3.425621883563887, MSE: 64.75600756516121, R2: 0.9890310925806505




2025-06-04 19:06:03,212 INFO: Modelo: GradientBoosting, MAE: 3.241406311311049, MSE: 61.69710361230693, R2: 0.9895492349974723


In [None]:
# Registramos el mejor modelo
# ID del run del mejor modelo (puedes copiarlo de la UI)
run_id = "e87d7ea6defe43b489412f939aca9b12"

model_uri = f"runs:/{run_id}/model"

mlflow.register_model(
    model_uri=model_uri,
    name="LinearRegressionTaxiDemandModel"
)

Successfully registered model 'LinearRegressionTaxiDemandModel'.
Created version '1' of model 'LinearRegressionTaxiDemandModel'.


<ModelVersion: aliases=[], creation_timestamp=1749057066804, current_stage='None', description=None, last_updated_timestamp=1749057066804, name='LinearRegressionTaxiDemandModel', run_id='e87d7ea6defe43b489412f939aca9b12', run_link=None, source='/Users/cdonairem/Documents/Workspace/proyecto_mlops_prueba_poetry/notebooks/mlruns/593164492010380515/e87d7ea6defe43b489412f939aca9b12/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=1>

In [None]:
# Cargamos el modelo registrado
from mlflow.tracking import MlflowClient

# Cargamos el modelo registrado por su nombre y versión
client = MlflowClient()
model_name = "GradientBoostingTaxiDemandModel"
model_version = 1  # Cambia según la versión que quieras cargar
model = mlflow.pyfunc.load_model(f"models:/{model_name}/{model_version}")

# Realizamos una predicción con el modelo cargado
sample_data = X_test
predictions = model.predict(sample_data)
print(predictions)

# Evaluamos el modelo cargado
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Evaluación del modelo cargado -> MSE: {mse:.4f}, R2: {r2:.4f}, MAE: {mae:.4f}")

# Guardamos el modelo en un archivo
model_path = "gradient_boosting_taxi_demand_model.pkl"
mlflow.sklearn.save_model(model, model_path)


[3.44747865e-01 1.37398487e+00 3.87506173e+02 2.46117381e+02
 1.60363092e+02 6.05339171e-01 4.09739212e-01 3.44747865e-01
 4.73903090e-01 3.44747865e-01 3.44747865e-01 2.97888592e+02
 2.89306417e+01 2.51187411e+00 1.82164855e+01 4.17793967e-01
 1.86716135e+00 4.09739212e-01 4.09739212e-01 4.78712884e+01
 3.44747865e-01 3.44747865e-01 4.17793967e-01 2.85056466e+02
 1.19620678e+00 4.17793967e-01 9.63279975e+01 9.74698404e+01
 2.05302928e+00 4.73903090e-01 4.73903090e-01 5.63322799e+02
 3.44747865e-01 4.04844811e-01 3.44747865e-01 3.44747865e-01
 4.06361386e-01 5.34000036e-01 5.84936163e-01 3.44747865e-01
 1.05610740e+00 9.78306584e+01 3.44747865e-01 3.44747865e-01
 3.44747865e-01 3.44747865e-01 5.38894437e-01 5.16209400e-01
 4.60300319e+01 3.44747865e-01 4.17793967e-01 3.44747865e-01
 3.44747865e-01 5.43725650e-01 1.24239086e+01 3.44747865e-01
 4.82785314e-01 3.44747865e-01 4.11131899e+02 3.44747865e-01
 9.91270339e-01 3.44747865e-01 9.42520079e+01 3.44747865e-01
 2.62833685e+01 3.447478

MlflowException: Path 'gradient_boosting_taxi_demand_model.pkl' already exists and is not empty