In [1]:
import sys
from pathlib import Path
from datetime import datetime

# Make the project's `src` directory importable. It is resolved relative to the
# current working directory as <cwd>/../src, so the notebook must be run from a
# folder that sits next to `src`.
sys.path.append(str(Path().resolve().parent / "src"))

# Project configuration module (expected to live in `src`, hence the path entry above).
import config

In [2]:
# Enable IPython's autoreload extension; mode 2 re-imports modules before each
# cell executes, so edits to project code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

## (Opcional) Este código permite conocer lo que hay en Hopsworks

In [3]:
# Optional, disabled exploration snippet: uncomment to inspect what the
# Hopsworks Feature Store currently contains.
# # Inspect what data is available in the Feature Store
# import hopsworks
# import pandas as pd

# FV_NAME = "time_series_hourly_feature_view"
# FV_VERSION = 1

# ## 1) Connect to Hopsworks and to the Feature Store
# project = hopsworks.login(project=config.HOPSWORKS_PROJECT_NAME, api_key_value=config.HOPSWORKS_API_KEY)
# fs  = project.get_feature_store()

# # 2) List the Feature Groups
# fgs = fs.get_feature_groups(name='time_series_hourly_feature_group')
# print("=== Feature Groups ===")
# for fg in fgs:
#     print(f"- {fg.name} v{fg.version}")


# # 3) For each FG, report the row count and the timestamp range
# for fg in fgs:
#     print(f"\n> FG: {fg.name} v{fg.version}")
#     df = fg.read()            # adjust the limit if needed
#     print("  • Shape:", df.shape)
#     # replace 'pickup_hour' with your FG's timestamp column if it differs
#     if 'pickup_hour' in df.columns:
#         print("  • Min pickup_hour:", df.pickup_hour.min())
#         print("  • Max pickup_hour:", df.pickup_hour.max())
#     print("  • Primeras filas:")
#     print(df.head(5))

# # 4) List the Feature Views
# fvs = fs.get_feature_views(name='time_series_hourly_feature_view')
# print("\n=== Feature Views ===")
# for fv in fvs:
#     print(f"- {fv.name} v{fv.version}")

# # 5) For the FV in use, request a wide batch_data window
# fv = fs.get_feature_view(name=FV_NAME, version=FV_VERSION)
# print(f"\n> Feature View: {fv.name} v{fv.version}")

# # Try a wide range: from one year ago until today
# start = pd.Timestamp.today() - pd.Timedelta(days=365)
# end   = pd.Timestamp.today()
# df_fv = fv.get_batch_data(start_time=start, end_time=end)
# print("  • Shape batch_data:", df_fv.shape)
# print("  • Min timestamp:", df_fv['pickup_hour'].min())
# print("  • Max timestamp:", df_fv['pickup_hour'].max())
# print("  • Primeras filas:")
# print(df_fv.head(5))


## Conexión al proyecto, feature store y model registry en Hopsworks

In [None]:
import hopsworks

## 1) Log in to the Hopsworks project (credentials come from the `config` module)
project = hopsworks.login(project=config.HOPSWORKS_PROJECT_NAME, api_key_value=config.HOPSWORKS_API_KEY) # Project handle
fs      = project.get_feature_store() # Feature Store handle for that project

# Echo the names of the objects we are connected to, as a quick sanity check
print("Conectado a proyecto:", project.name)
print("Conectado a Feature Store:", fs.name)

2025-06-04 14:57:16,487 INFO: Initializing external client
2025-06-04 14:57:16,488 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-06-04 14:57:17,549 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1224869
Conectado a proyecto: taxo_demand
Conectado a Feature Store: taxo_demand_featurestore
Conectado a Model Registry: ModelRegistry(project: 'taxo_demand')


## Leemos Batch Score del Feature View

In [7]:
from datetime import datetime, timedelta
import numpy as np
import pandas as pd

# Offset (in days) between wall-clock time and the simulated "current" hour.
# NOTE(review): presumably the feature store only holds historical data, hence the lag. Confirm.
INFERENCE_LAG_DAYS = 100

# Naive UTC timestamp floored to the hour. It is built from a tz-aware "now"
# because `datetime.utcnow()` is deprecated (Python 3.12+), and floored with the
# lowercase 'h' alias because the uppercase 'H' alias is deprecated in pandas 2.2+.
# The timezone is stripped on purpose: later cells call `tz_localize("UTC")` on
# values derived from `current_date`, which requires a tz-naive timestamp.
current_date = (
    pd.Timestamp.now(tz="UTC").tz_localize(None).floor("h")
    - timedelta(days=INFERENCE_LAG_DAYS)
)
print(f'{current_date=}')

# Inference window: the 24 hourly observations that precede `current_date`
fetch_data_from = current_date - timedelta(days=1)
fetch_data_to = current_date - timedelta(hours=1)

n_features = 24  # 24 hours of previous data per location

FV_NAME = "time_series_hourly_feature_view"
FV_VERSION = 1

# Raw batch read from the Feature View. The request deliberately spans one day
# on each side of `current_date`; the exact window is filtered in a later cell.
fv = fs.get_feature_view(name=FV_NAME, version=FV_VERSION)

ts_data = fv.get_batch_data(
    start_time=fetch_data_from,
    end_time=current_date + timedelta(days=1)
)
print("Registros en ±1 día:", ts_data.shape[0])
print("Timestamps únicos:", ts_data["pickup_hour"].drop_duplicates().sort_values().tolist()[:5])

ts_data.head()

current_date=Timestamp('2025-02-24 12:00:00')
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (6.20s) 
Registros en ±1 día: 12624
Timestamps únicos: [Timestamp('2025-02-23 12:00:00+0000', tz='Etc/UTC'), Timestamp('2025-02-23 13:00:00+0000', tz='Etc/UTC'), Timestamp('2025-02-23 14:00:00+0000', tz='Etc/UTC'), Timestamp('2025-02-23 15:00:00+0000', tz='Etc/UTC'), Timestamp('2025-02-23 16:00:00+0000', tz='Etc/UTC')]


Unnamed: 0,pickup_hour,pickup_location_id,rides
0,2025-02-23 22:00:00+00:00,178,0
1,2025-02-25 01:00:00+00:00,235,0
2,2025-02-24 22:00:00+00:00,223,3
3,2025-02-24 12:00:00+00:00,248,0
4,2025-02-24 11:00:00+00:00,261,23


In [8]:
# Summary statistics for every column of the fetched batch (quick sanity check
# of the timestamp range, location ids and ride counts).
ts_data.describe(include='all')

Unnamed: 0,pickup_hour,pickup_location_id,rides
count,12624,12624.0,12624.0
mean,2025-02-24 11:30:00+00:00,133.224335,15.189797
min,2025-02-23 12:00:00+00:00,1.0,0.0
25%,2025-02-23 23:45:00+00:00,66.0,0.0
50%,2025-02-24 11:30:00+00:00,134.0,1.0
75%,2025-02-24 23:15:00+00:00,200.0,5.0
max,2025-02-25 11:00:00+00:00,265.0,552.0
std,,76.748279,45.30062


## Filtramos datos por el período de interés

In [9]:
# Bounds of the period of interest, made tz-aware (UTC) so they can be compared
# with the tz-aware `pickup_hour` column.
pickup_ts_from = pd.Timestamp(fetch_data_from).tz_localize("UTC")
pickup_ts_to   = pd.Timestamp(fetch_data_to).tz_localize("UTC")

# Keep only rows inside the window and order them by location and time.
# Done as one chained expression (no `inplace=True`) so that we never mutate a
# filtered slice of the original frame, which triggers chained-assignment warnings.
in_window = ts_data["pickup_hour"].between(pickup_ts_from, pickup_ts_to)
ts_data = ts_data.loc[in_window].sort_values(by=["pickup_location_id", "pickup_hour"])

# Validate that the feature store has no gaps: there must be at least one
# location, and every location must contribute exactly `n_features` rows.
# (Checking only the total row count would pass vacuously on empty data and
# could hide a surplus in one location compensating a deficit in another.)
location_ids = ts_data["pickup_location_id"].unique()
print("Ubicaciones únicas:", len(location_ids))

rows_per_location = ts_data.groupby("pickup_location_id").size()
is_complete = (
    len(location_ids) > 0
    and len(ts_data) == n_features * len(location_ids)
    and bool(rows_per_location.eq(n_features).all())
)
assert is_complete, \
    "Time-series data is not complete. Make sure your feature pipeline is up and running."


ts_data.head()

Ubicaciones únicas: 263


Unnamed: 0,pickup_hour,pickup_location_id,rides
10456,2025-02-23 12:00:00+00:00,1,0
12332,2025-02-23 13:00:00+00:00,1,1
7136,2025-02-23 14:00:00+00:00,1,2
768,2025-02-23 15:00:00+00:00,1,1
8953,2025-02-23 16:00:00+00:00,1,3


In [10]:
# Reshape the long time series into one feature vector per `pickup_location_id`:
# one row per location, one column per hour, oldest hour first.
# A single pivot replaces the per-location boolean filtering loop and the
# uninitialised `np.ndarray(...)` buffer.
rides_wide = (
    ts_data
    .pivot(index="pickup_location_id", columns="pickup_hour", values="rides")
    .sort_index(axis=1)      # chronological column order
    .reindex(location_ids)   # row order must match `location_ids`
)

# Every location must provide a value for each of the `n_features` hours;
# otherwise the feature names below would not line up with the data.
assert rides_wide.shape == (len(location_ids), n_features) and rides_wide.notna().all().all(), \
    "Unexpected time-series shape: each location needs exactly n_features hourly values."

x = rides_wide.to_numpy(dtype=np.float32)

# Column names count down from the oldest lag (n_features hours ago) to the most recent (1 hour ago)
feature_columns = [f'rides_previous_{lag}_hour' for lag in range(n_features, 0, -1)]

# numpy array -> pandas DataFrame, tagged with the prediction hour and the location id
features = (
    pd.DataFrame(x, columns=feature_columns)
    .assign(pickup_hour=current_date, pickup_location_id=location_ids)
    .sort_values(by=['pickup_location_id'])
)
features.head()


Unnamed: 0,rides_previous_24_hour,rides_previous_23_hour,rides_previous_22_hour,rides_previous_21_hour,rides_previous_20_hour,rides_previous_19_hour,rides_previous_18_hour,rides_previous_17_hour,rides_previous_16_hour,rides_previous_15_hour,...,rides_previous_8_hour,rides_previous_7_hour,rides_previous_6_hour,rides_previous_5_hour,rides_previous_4_hour,rides_previous_3_hour,rides_previous_2_hour,rides_previous_1_hour,pickup_hour,pickup_location_id
0,0.0,1.0,2.0,1.0,3.0,1.0,2.0,9.0,1.0,1.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,2025-02-24 12:00:00,1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2025-02-24 12:00:00,2
2,0.0,0.0,0.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,2.0,0.0,0.0,1.0,0.0,1.0,0.0,2025-02-24 12:00:00,3
3,4.0,13.0,8.0,7.0,14.0,6.0,13.0,10.0,3.0,6.0,...,1.0,3.0,8.0,13.0,22.0,16.0,8.0,5.0,2025-02-24 12:00:00,4
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2025-02-24 12:00:00,5


## Cargar modelo para predecir

In [21]:
import mlflow.pyfunc

# Registered model to load, identified by name and version
MODEL_NAME = "LinearRegressionTaxiDemandModel"
MODEL_VERSION = 1 

# `models:/<name>/<version>` URI, loaded through MLflow's generic pyfunc interface.
# NOTE(review): no tracking/registry URI is configured in the visible cells;
# presumably it comes from the environment. Confirm.
model_uri = f"models:/{MODEL_NAME}/{MODEL_VERSION}"
model = mlflow.pyfunc.load_model(model_uri)

print(f"Modelo '{MODEL_NAME}' ({MODEL_VERSION}) cargado desde MLflow Registry.")

Modelo 'LinearRegressionTaxiDemandModel' (1) cargado desde MLflow Registry.


[autoreload of src.feature_store_api failed: Traceback (most recent call last):
  File "/Users/cdonairem/Documents/Workspace/proyecto_mlops_prueba_poetry/.venv/lib/python3.10/site-packages/IPython/extensions/autoreload.py", line 276, in check
    superreload(m, reload, self.old_objects)
  File "/Users/cdonairem/Documents/Workspace/proyecto_mlops_prueba_poetry/.venv/lib/python3.10/site-packages/IPython/extensions/autoreload.py", line 475, in superreload
    module = reload(module)
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/importlib/__init__.py", line 169, in reload
    _bootstrap._exec(spec, module)
  File "<frozen importlib._bootstrap>", line 619, in _exec
  File "<frozen importlib._bootstrap_external>", line 883, in exec_module
  File "<frozen importlib._bootstrap>", line 241, in _call_with_frames_removed
  File "/Users/cdonairem/Documents/Workspace/proyecto_mlops_prueba_poetry/src/feature_store_api.py", line 11, in <module>
    import src.config as con

In [12]:
# Remove the timestamp column before the frame is handed to the model.
# `errors='ignore'` keeps this cell idempotent: re-running it after the column
# has already been dropped no longer raises a KeyError.
features = features.drop(columns=['pickup_hour'], errors='ignore')
features

Unnamed: 0,rides_previous_24_hour,rides_previous_23_hour,rides_previous_22_hour,rides_previous_21_hour,rides_previous_20_hour,rides_previous_19_hour,rides_previous_18_hour,rides_previous_17_hour,rides_previous_16_hour,rides_previous_15_hour,...,rides_previous_9_hour,rides_previous_8_hour,rides_previous_7_hour,rides_previous_6_hour,rides_previous_5_hour,rides_previous_4_hour,rides_previous_3_hour,rides_previous_2_hour,rides_previous_1_hour,pickup_location_id
0,0.0,1.0,2.0,1.0,3.0,1.0,2.0,9.0,1.0,1.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
2,0.0,0.0,0.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,2.0,0.0,0.0,1.0,0.0,1.0,0.0,3
3,4.0,13.0,8.0,7.0,14.0,6.0,13.0,10.0,3.0,6.0,...,0.0,1.0,3.0,8.0,13.0,22.0,16.0,8.0,5.0,4
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
258,37.0,51.0,46.0,33.0,33.0,42.0,27.0,15.0,19.0,6.0,...,1.0,0.0,0.0,2.0,8.0,9.0,11.0,13.0,23.0,261
259,130.0,114.0,105.0,80.0,79.0,73.0,63.0,38.0,40.0,16.0,...,1.0,6.0,20.0,85.0,177.0,154.0,156.0,122.0,92.0,262
260,160.0,168.0,124.0,114.0,108.0,120.0,130.0,106.0,104.0,58.0,...,3.0,6.0,24.0,59.0,127.0,165.0,134.0,89.0,107.0,263
261,7.0,8.0,8.0,12.0,15.0,15.0,10.0,8.0,5.0,7.0,...,0.0,0.0,2.0,4.0,12.0,6.0,6.0,14.0,6.0,264


## Hacer predicciones

In [22]:
# --- Step 3: make predictions ---
preds = model.predict(features)

# Demand is a ride count, so it cannot be negative: clip the raw model output
# at zero before rounding. Adding 0.0 normalises any remaining IEEE "-0.0" to
# "0.0" so it does not show up in the displayed table. `preds` itself is left
# untouched for later cells.
predicted_demand = np.clip(np.asarray(preds).ravel(), 0, None).round(0) + 0.0

# Build the result table in a single constructor call
results = pd.DataFrame({
    'pickup_location_id': features['pickup_location_id'].values,
    'predicted_demand': predicted_demand,
    'pickup_hour': current_date,
})
results

Unnamed: 0,pickup_location_id,predicted_demand,pickup_hour
0,1,1.0,2025-02-24 12:00:00
1,2,0.0,2025-02-24 12:00:00
2,3,-0.0,2025-02-24 12:00:00
3,4,6.0,2025-02-24 12:00:00
4,5,0.0,2025-02-24 12:00:00
...,...,...,...
258,261,32.0,2025-02-24 12:00:00
259,262,92.0,2025-02-24 12:00:00
260,263,126.0,2025-02-24 12:00:00
261,264,5.0,2025-02-24 12:00:00


## Guardar predicciones

Guardamos estas predicciones en un fichero CSV local y lo registramos como artefacto en MLflow, para que puedan ser consumidas posteriormente por nuestra aplicación Streamlit.

In [None]:
# --- Step 4: save the predictions locally as a CSV ---
output_path = Path("predicciones_batch.csv")

# One row per location: the model inputs followed by the raw prediction
df_preds = pd.DataFrame(preds, columns=["prediction"])
df_output = pd.concat([features.reset_index(drop=True), df_preds], axis=1)

df_output.to_csv(output_path, index=False)
print(f"Predicciones guardadas en {output_path}")

# Attach the CSV to MLflow. Calling `log_artifact` with no active run silently
# starts a new run and leaves it open; wrap that case in a context manager so
# the run is always closed when the cell finishes.
if mlflow.active_run() is None:
    with mlflow.start_run():
        mlflow.log_artifact(str(output_path))
else:
    mlflow.log_artifact(str(output_path))

Predicciones guardadas en predicciones_batch.csv


In [None]:
# Inference run
# --- Start a dedicated MLflow run for batch inference ---
# Close any run left open by earlier cells so this one starts from a clean state.
if mlflow.active_run() is not None:
    mlflow.end_run()

with mlflow.start_run(run_name="BatchInference"):

    # Load the model from the Model Registry and record which one was used.
    # The version is logged as "model_version": MODEL_VERSION is a registry
    # version number, not a stage, so the previous "stage" key was misleading.
    model_uri = f"models:/{MODEL_NAME}/{MODEL_VERSION}"
    model = mlflow.pyfunc.load_model(model_uri)
    mlflow.log_param("model_name", MODEL_NAME)
    mlflow.log_param("model_version", MODEL_VERSION)

    print(f"Modelo '{MODEL_NAME}' ({MODEL_VERSION}) cargado desde MLflow Registry.")

    # Predict
    preds = model.predict(features)

    # Save the predictions (model inputs plus a `prediction` column) to the CSV
    df_preds = pd.DataFrame(preds, columns=["prediction"])
    df_output = pd.concat([features.reset_index(drop=True), df_preds], axis=1)
    df_output.to_csv(output_path, index=False)
    print(f"Predicciones guardadas en {output_path}")

    # Log the outcome: number of predictions and the CSV itself as an artifact
    mlflow.log_param("num_predictions", len(preds))
    mlflow.log_artifact(str(output_path))

    print("Proceso de inferencia trackeado correctamente con MLflow.")

Modelo 'GradientBoostingTaxiDemandModel' (1) cargado desde MLflow Registry.
Predicciones guardadas en predicciones_batch.csv
Proceso de inferencia trackeado correctamente con MLflow.
