In [1]:
import sys
from pathlib import Path
from datetime import datetime

# Añade src al path
sys.path.append(str(Path().resolve().parent / "src"))

import config

In [2]:
# # Configura el entorno de trabajo
%reload_ext autoreload
%autoreload 2

In [3]:
import hopsworks

# Conecta a Hopsworks
project = hopsworks.login(
    project=config.HOPSWORKS_PROJECT_NAME,
    api_key_value=config.HOPSWORKS_API_KEY
)

# Conecta a la API de Feature Store
feature_store = project.get_feature_store()

# Conecta al grupo de características
feature_group = feature_store.get_feature_group(
    name=config.FEATURE_GROUP_NAME,
    version=config.FEATURE_GROUP_VERSION,
)

2025-05-23 15:33:53,993 INFO: Initializing external client
2025-05-23 15:33:53,994 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-05-23 15:33:55,213 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1224869


In [4]:
# Crea una vista de características (si no existe ya)
# Esta vista de características usa solo un grupo de características, por lo que la consulta es trivial
try:
    # Crea la vista de características si no existe
    feature_store.create_feature_view(
        name=config.FEATURE_VIEW_NAME,
        version=config.FEATURE_VIEW_VERSION,
        query=feature_group.select_all()
    )
except:
    print('Feature view already existed. Skip creation.')

Feature view already existed. Skip creation.


In [5]:
# obtenemos la vista de características
feature_view = feature_store.get_feature_view(
    name=config.FEATURE_VIEW_NAME,
    version=config.FEATURE_VIEW_VERSION
)

In [6]:
# obtenemos la vista de características necesaria para el entrenamiento
ts_data, _ = feature_view.training_data(
    description='Time-series hourly taxi rides',
)

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (11.22s) 




In [7]:
ts_data.head()

Unnamed: 0,pickup_hour,pickup_location_id,rides
0,2024-10-16 16:00:00+00:00,118,0
1,2024-06-30 06:00:00+00:00,226,9
2,2024-07-25 09:00:00+00:00,132,163
3,2024-06-15 16:00:00+00:00,138,163
4,2024-09-19 03:00:00+00:00,246,35


In [10]:
import pandas as pd

ts_data['pickup_hour'] = pd.to_datetime(ts_data['pickup_hour'])

ts_data.describe(include='all')

Unnamed: 0,pickup_hour,pickup_location_id,rides
count,2854812,2854812.0,2854812.0
mean,2024-08-16 14:10:23.051606784+00:00,133.2348,18.05797
min,2024-01-01 00:00:00+00:00,1.0,0.0
25%,2024-04-23 01:00:00+00:00,66.0,0.0
50%,2024-08-14 03:00:00+00:00,134.0,0.0
75%,2024-12-05 05:00:00+00:00,200.0,3.0
max,2025-05-08 16:00:00+00:00,265.0,1239.0
std,,76.74766,55.19342


In [12]:
from data import transform_ts_data_into_features_and_target

features, targets = transform_ts_data_into_features_and_target(
    ts_data,
    input_seq_len=24, # one month
    step_size=23,
)

features_and_target = features.copy()
features_and_target['target_rides_next_hour'] = targets

print(f'{features_and_target.shape=}')

100%|██████████| 263/263 [00:17<00:00, 15.31it/s]


features_and_target.shape=(123875, 27)


In [28]:
import pandas as pd

# Convertimos la columna de fecha y hora a un objeto datetime

features_and_target['pickup_hour'] = pd.to_datetime(features_and_target['pickup_hour'])

In [None]:
from datetime import date, timedelta
from pytz import timezone
import pandas as pd
from data_split import train_test_split

# training data -> from January 2025 up until 2 months ago
# test data -> last 2 months
cutoff_date = pd.to_datetime(date.today() - timedelta(days=28*1), utc=True)

print(f'{cutoff_date=}')

X_train, y_train, X_test, y_test = train_test_split(
    features_and_target,
    cutoff_date,
    target_column_name='target_rides_next_hour'   
)


# eliminamos la columna de fecha y hora
X_train = X_train.drop(columns=['pickup_hour'])
X_test = X_test.drop(columns=['pickup_hour'])

print(f'{X_train.shape=}')
print(f'{y_train.shape=}')
print(f'{X_test.shape=}')
print(f'{y_test.shape=}')

cutoff_date=Timestamp('2025-04-25 00:00:00+0000', tz='UTC')
X_train.shape=(120319, 25)
y_train.shape=(120319,)
X_test.shape=(3556, 25)
y_test.shape=(3556,)


In [26]:
X_train.tail()

Unnamed: 0,rides_previous_24_hour,rides_previous_23_hour,rides_previous_22_hour,rides_previous_21_hour,rides_previous_20_hour,rides_previous_19_hour,rides_previous_18_hour,rides_previous_17_hour,rides_previous_16_hour,rides_previous_15_hour,...,rides_previous_9_hour,rides_previous_8_hour,rides_previous_7_hour,rides_previous_6_hour,rides_previous_5_hour,rides_previous_4_hour,rides_previous_3_hour,rides_previous_2_hour,rides_previous_1_hour,pickup_location_id
120314,1.0,1.0,1.0,3.0,3.0,3.0,0.0,0.0,3.0,0.0,...,3.0,0.0,0.0,2.0,1.0,4.0,4.0,4.0,1.0,129
120315,1.0,0.0,4.0,3.0,4.0,2.0,2.0,1.0,2.0,2.0,...,2.0,1.0,3.0,4.0,4.0,1.0,2.0,2.0,1.0,129
120316,1.0,4.0,2.0,0.0,1.0,2.0,0.0,3.0,3.0,2.0,...,2.0,0.0,1.0,3.0,0.0,1.0,0.0,1.0,3.0,129
120317,3.0,1.0,1.0,1.0,3.0,2.0,1.0,1.0,2.0,1.0,...,0.0,2.0,2.0,0.0,1.0,4.0,1.0,4.0,3.0,129
120318,3.0,2.0,5.0,1.0,5.0,3.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,3.0,1.0,0.0,0.0,2.0,3.0,129


In [14]:
y_train.tail()

120314    0.0
120315    4.0
120316    1.0
120317    2.0
120318    3.0
Name: target_rides_next_hour, dtype: float32

In [15]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error

# Train the Random Forest model
rf_model = RandomForestRegressor()
rf_model.fit(X_train, y_train)

# Predicciones
y_pred_rf = rf_model.predict(X_test)

# Evaluación
mae_rf = mean_absolute_error(y_test, y_pred_rf)
rmse_rf = mean_squared_error(y_test, y_pred_rf)
r2 = r2_score(y_test, y_pred_rf)

print(f"Random Forest - MAE: {mae_rf:.2f} | RMSE: {rmse_rf:.2f} | R2: {r2:.2f}")

Random Forest - MAE: 3.27 | RMSE: 143.92 | R2: 0.94


In [None]:
import joblib
from paths import MODELS_DIR

joblib.dump(rf_model, MODELS_DIR / 'rf_model_v2.pkl')

array(['rides_previous_24_hour', 'rides_previous_23_hour',
       'rides_previous_22_hour', 'rides_previous_21_hour',
       'rides_previous_20_hour', 'rides_previous_19_hour',
       'rides_previous_18_hour', 'rides_previous_17_hour',
       'rides_previous_16_hour', 'rides_previous_15_hour',
       'rides_previous_14_hour', 'rides_previous_13_hour',
       'rides_previous_12_hour', 'rides_previous_11_hour',
       'rides_previous_10_hour', 'rides_previous_9_hour',
       'rides_previous_8_hour', 'rides_previous_7_hour',
       'rides_previous_6_hour', 'rides_previous_5_hour',
       'rides_previous_4_hour', 'rides_previous_3_hour',
       'rides_previous_2_hour', 'rides_previous_1_hour',
       'pickup_location_id'], dtype=object)

In [None]:
# cargamos el modelo para probarlo
rf_model = joblib.load(MODELS_DIR / 'rf_model_v2.pkl')

array(['rides_previous_24_hour', 'rides_previous_23_hour',
       'rides_previous_22_hour', 'rides_previous_21_hour',
       'rides_previous_20_hour', 'rides_previous_19_hour',
       'rides_previous_18_hour', 'rides_previous_17_hour',
       'rides_previous_16_hour', 'rides_previous_15_hour',
       'rides_previous_14_hour', 'rides_previous_13_hour',
       'rides_previous_12_hour', 'rides_previous_11_hour',
       'rides_previous_10_hour', 'rides_previous_9_hour',
       'rides_previous_8_hour', 'rides_previous_7_hour',
       'rides_previous_6_hour', 'rides_previous_5_hour',
       'rides_previous_4_hour', 'rides_previous_3_hour',
       'rides_previous_2_hour', 'rides_previous_1_hour',
       'pickup_location_id'], dtype=object)

In [23]:
X_train.columns

Index(['rides_previous_24_hour', 'rides_previous_23_hour',
       'rides_previous_22_hour', 'rides_previous_21_hour',
       'rides_previous_20_hour', 'rides_previous_19_hour',
       'rides_previous_18_hour', 'rides_previous_17_hour',
       'rides_previous_16_hour', 'rides_previous_15_hour',
       'rides_previous_14_hour', 'rides_previous_13_hour',
       'rides_previous_12_hour', 'rides_previous_11_hour',
       'rides_previous_10_hour', 'rides_previous_9_hour',
       'rides_previous_8_hour', 'rides_previous_7_hour',
       'rides_previous_6_hour', 'rides_previous_5_hour',
       'rides_previous_4_hour', 'rides_previous_3_hour',
       'rides_previous_2_hour', 'rides_previous_1_hour', 'pickup_location_id'],
      dtype='object')

In [20]:
y_test.describe()

count    3556.000000
mean       13.294150
std        48.741875
min         0.000000
25%         0.000000
50%         0.000000
75%         2.000000
max       686.000000
Name: target_rides_next_hour, dtype: float64

In [30]:
from sklearn.metrics import mean_absolute_error

predictions = rf_model.predict(X_test)

mean_absolute_error(y_test, predictions)

3.2653103610817307

In [None]:
rf_model.feature_names_in_

array(['rides_previous_24_hour', 'rides_previous_23_hour',
       'rides_previous_22_hour', 'rides_previous_21_hour',
       'rides_previous_20_hour', 'rides_previous_19_hour',
       'rides_previous_18_hour', 'rides_previous_17_hour',
       'rides_previous_16_hour', 'rides_previous_15_hour',
       'rides_previous_14_hour', 'rides_previous_13_hour',
       'rides_previous_12_hour', 'rides_previous_11_hour',
       'rides_previous_10_hour', 'rides_previous_9_hour',
       'rides_previous_8_hour', 'rides_previous_7_hour',
       'rides_previous_6_hour', 'rides_previous_5_hour',
       'rides_previous_4_hour', 'rides_previous_3_hour',
       'rides_previous_2_hour', 'rides_previous_1_hour',
       'pickup_location_id'], dtype=object)

In [None]:
# creamos el model registry
# el modelo se registrará en el registro de modelos
from hsml.schema import Schema
from hsml.model_schema import ModelSchema
from sklearn.metrics import mean_absolute_error
from paths import MODELS_DIR

# creamos el esquema de entrada y salida
# el esquema de entrada es el esquema de las características
# el esquema de salida es el esquema de la variable objetivo
input_schema = Schema(X_train)
output_schema = Schema(y_train)

# model schema
model_schema = ModelSchema(input_schema=input_schema, output_schema=output_schema)
test_mae = mean_absolute_error(y_test, predictions)

# obtenerlo el model registry - Hopsworks
model_registry = project.get_model_registry()


# registramos el modelo en el registro de modelos
model_hopsworks = model_registry.sklearn.create_model(
    name="taxi_demand_predictor_next_hour",
    metrics={"test_mae": test_mae},
    description="Random Forest regressor with a bit of hyper-parameter tuning",
    input_example=X_train.sample(),
    model_schema=model_schema
)

model_hopsworks.save(str(MODELS_DIR / 'rf_model_v2.pkl'))

  0%|          | 0/6 [00:00<?, ?it/s]

Uploading /Users/cdonairem/Documents/Workspace/proyecto_mlops_prueba_poetry/models/rf_model_v2.pkl: 0.000%|   …

Uploading /Users/cdonairem/Documents/Workspace/proyecto_mlops_prueba_poetry/notebooks/input_example.json: 0.00…

Uploading /Users/cdonairem/Documents/Workspace/proyecto_mlops_prueba_poetry/notebooks/model_schema.json: 0.000…

Model created, explore it at https://c.app.hopsworks.ai:443/p/1224869/models/taxi_demand_predictor_next_hour/4


Model(name: 'taxi_demand_predictor_next_hour', version: 4)