Copiar dataset en una carpeta `data`

In [3]:
# Create the directory if it doesn't exist
!mkdir ../data

# Download files using curl
!curl -o ../data/green_tripdata_2024-01.parquet https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2024-01.parquet
!curl -o ../data/green_tripdata_2024-02.parquet https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2024-02.parquet

La sintaxis del comando no es correcta.
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  2 1330k    2 32768    0     0   111k      0  0:00:11 --:--:--  0:00:11  114k
100 1330k  100 1330k    0     0  2157k      0 --:--:-- --:--:-- --:--:-- 2184k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100 1253k  100 1253k    0     0  2265k      0 --:--:-- --:--:-- --:--:-- 2300k


Importar las librerías necesarias y definir función para importar los datos

In [4]:
import pickle
import pandas as pd
from sklearn.metrics import  root_mean_squared_error
from sklearn.feature_extraction import  DictVectorizer
from sklearn.linear_model import Lasso, Ridge, LinearRegression

In [5]:
def read_dataframe(filename):

    df = pd.read_parquet(filename)

    df['duration'] = df.lpep_dropoff_datetime - df.lpep_pickup_datetime
    df.duration = df.duration.apply(lambda td: td.total_seconds() / 60)

    df = df[(df.duration >= 1) & (df.duration <= 60)]

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df

In [6]:
df_train = read_dataframe('../data/green_tripdata_2024-01.parquet')
df_val = read_dataframe('../data/green_tripdata_2024-02.parquet')

Feature Engineering

In [7]:
df_train['PU_DO'] = df_train['PULocationID'] + '_' + df_train['DOLocationID']
df_val['PU_DO'] = df_val['PULocationID'] + '_' + df_val['DOLocationID']

One Hot Encoding

In [8]:
categorical = ['PU_DO']  #'PULocationID', 'DOLocationID']
numerical = ['trip_distance']
dv = DictVectorizer()

train_dicts = df_train[categorical + numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

val_dicts = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dicts)

In [9]:
target = 'duration'
y_train = df_train[target].values
y_val = df_val[target].values

Definir el `tracking URI` y el nombre del experimento

In [11]:
import dagshub
import mlflow


dagshub.init(url="https://dagshub.com/G4ll4rd0/nyc-taxi-time-prediction.mlflow", mlflow=True)

MLFLOW_TRACKING_URI = mlflow.get_tracking_uri()

print(MLFLOW_TRACKING_URI)

mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment(experiment_name="nyc-taxi-experiment")

Output()



Open the following link in your browser to authorize the client:
https://dagshub.com/login/oauth/authorize?state=c7be1f2a-de45-418b-9519-65e37fe9e889&client_id=32b60ba385aa7cecf24046d8195a71c07dd345d9657977863b52e7748e0f0f28&middleman_request_id=94884bc9fb1ec2763845037aa258b4db5c7bb2b9db9670c3ec85509a590e4234




https://dagshub.com/G4ll4rd0/nyc-taxi-time-prediction.mlflow.mlflow


2024/09/17 21:15:52 INFO mlflow.tracking.fluent: Experiment with name 'nyc-taxi-experiment' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/9c5091aa749047a8b46c900fd0cf03c1', creation_time=1726629358358, experiment_id='0', last_update_time=1726629358358, lifecycle_stage='active', name='nyc-taxi-experiment', tags={}>

Definir los `dataset` como objetos de `mlflow` para poderlos trackear

In [12]:
training_dataset = mlflow.data.from_numpy(X_train.data, targets=y_train, name="green_tripdata_2024-01")
validation_dataset = mlflow.data.from_numpy(X_val.data, targets=y_val, name="green_tripdata_2024-02")

### Subir los dataset al storage que nos brinda `dagshub`

In [18]:
from dagshub import get_repo_bucket_client
import os
# Get a boto3.client object
s3 = get_repo_bucket_client("G4ll4rd0/nyc-taxi-time-prediction")


basepath = os.path.abspath('../data')

for file in os.listdir(basepath):
    filepath = os.path.join(basepath, file)
    # Upload file
    s3.upload_file(
        Bucket = "nyc-taxi-time-prediction",  # name of the repo
        Filename = filepath,  # local path of file to upload
        Key = file,  # remote path where to upload the file
    )

Ahora vamos a entrenar un modelo `xgboost`


In [19]:
import xgboost as xgb
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope
import pathlib

Definir los `dataset` a trabajar.

In [20]:
train = xgb.DMatrix(X_train, label=y_train)
valid = xgb.DMatrix(X_val, label=y_val)

Definir la función objetivo

In [21]:
def objective(params):
    with mlflow.start_run(nested=True):
         
        # Tag model
        mlflow.set_tag("model_family", "xgboost")
        
        # Log parameters
        mlflow.log_params(params)
        
        # Train model
        booster = xgb.train(
            params=params,
            dtrain=train,
            num_boost_round=100,
            evals=[(valid, 'validation')],
            early_stopping_rounds=10
        )
        
        # Log xgboost model with artifact_path
        mlflow.xgboost.log_model(booster, artifact_path="model")

        # Predict in the val dataset
        y_pred = booster.predict(valid)
        
        # Calculate metric
        rmse = root_mean_squared_error(y_val, y_pred)
        
        # Log performance metric
        mlflow.log_metric("rmse", rmse)

    return {'loss': rmse, 'status': STATUS_OK}

Definir el espacio de búsqueda

In [22]:
mlflow.xgboost.autolog()

with mlflow.start_run(run_name="Xgboost Hyper-parameter Optimization", nested=True):
    search_space = {
        'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),
        'learning_rate': hp.loguniform('learning_rate', -3, 0),
        'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
        'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
        'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
        'objective': 'reg:squarederror',
        'seed': 42
    }

    best_params = fmin(
        fn=objective,
        space=search_space,
        algo=tpe.suggest,
        max_evals=10,
        trials=Trials()
    )
    best_params["max_depth"] = int(best_params["max_depth"])
    best_params["seed"] = 42
    best_params["objective"] = "reg:squarederror"

    mlflow.log_params(best_params)

    # Log tags
    mlflow.set_tags(
        tags={
            "project": "NYC Taxi Time Prediction Project",
            "optimizer_engine": "hyper-opt",
            "model_family": "xgboost",
            "feature_set_version": 1,
        }
    )

    # Log a fit model instance
    booster = xgb.train(
        params=best_params,
        dtrain=train,
        num_boost_round=100,
        evals=[(valid, 'validation')],
        early_stopping_rounds=10
    )

    y_pred = booster.predict(valid)

    rmse = root_mean_squared_error(y_val, y_pred)
    mlflow.log_metric("rmse", rmse)

    pathlib.Path("models").mkdir(exist_ok=True)
    with open("models/preprocessor.b", "wb") as f_out:
        pickle.dump(dv, f_out)

    mlflow.log_artifact("models/preprocessor.b", artifact_path="preprocessor")

[0]	validation-rmse:8.35175                           
[1]	validation-rmse:7.72521                           
[2]	validation-rmse:7.21511                           
[3]	validation-rmse:6.79928                           
[4]	validation-rmse:6.46767                           
[5]	validation-rmse:6.20092                           
[6]	validation-rmse:5.99027                           
[7]	validation-rmse:5.82500                           
[8]	validation-rmse:5.69361                           
[9]	validation-rmse:5.59094                           
[10]	validation-rmse:5.50977                          
[11]	validation-rmse:5.44623                          
[12]	validation-rmse:5.39810                          
[13]	validation-rmse:5.35880                          
[14]	validation-rmse:5.32671                          
[15]	validation-rmse:5.29805                          
[16]	validation-rmse:5.27689                          
[17]	validation-rmse:5.26026                          
[18]	valid






2024/09/17 21:30:45 INFO mlflow.tracking._tracking_service.client: 🏃 View run serious-snipe-227 at: https://dagshub.com/G4ll4rd0/nyc-taxi-time-prediction.mlflow.mlflow/#/experiments/0/runs/343f46646abd41b899f7c5b333f3c5d6.

2024/09/17 21:30:45 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/G4ll4rd0/nyc-taxi-time-prediction.mlflow.mlflow/#/experiments/0.



[0]	validation-rmse:8.33183                                                    
[1]	validation-rmse:7.69903                                                    
[2]	validation-rmse:7.18994                                                    
[3]	validation-rmse:6.78474                                                    
[4]	validation-rmse:6.46438                                                    
[5]	validation-rmse:6.21505                                                    
[6]	validation-rmse:6.02050                                                    
[7]	validation-rmse:5.86960                                                    
[8]	validation-rmse:5.75175                                                    
[9]	validation-rmse:5.66125                                                    
[10]	validation-rmse:5.59242                                                   
[11]	validation-rmse:5.53934                                                   
[12]	validation-rmse:5.49760            






2024/09/17 21:32:36 INFO mlflow.tracking._tracking_service.client: 🏃 View run unique-panda-912 at: https://dagshub.com/G4ll4rd0/nyc-taxi-time-prediction.mlflow.mlflow/#/experiments/0/runs/ec71baec008e44a5b6eec083e37e9077.

2024/09/17 21:32:36 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/G4ll4rd0/nyc-taxi-time-prediction.mlflow.mlflow/#/experiments/0.



[0]	validation-rmse:8.36808                                                     
[1]	validation-rmse:7.75941                                                     
[2]	validation-rmse:7.26639                                                     
[3]	validation-rmse:6.86628                                                     
[4]	validation-rmse:6.55265                                                     
[5]	validation-rmse:6.30073                                                     
[6]	validation-rmse:6.09923                                                     
[7]	validation-rmse:5.94757                                                     
[8]	validation-rmse:5.82464                                                     
[9]	validation-rmse:5.72545                                                     
[10]	validation-rmse:5.65038                                                    
[11]	validation-rmse:5.58936                                                    
[12]	validation-rmse:5.54189






2024/09/17 21:33:31 INFO mlflow.tracking._tracking_service.client: 🏃 View run illustrious-doe-844 at: https://dagshub.com/G4ll4rd0/nyc-taxi-time-prediction.mlflow.mlflow/#/experiments/0/runs/20fd6dabb2ed4f16bda07454762a4a24.

2024/09/17 21:33:31 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/G4ll4rd0/nyc-taxi-time-prediction.mlflow.mlflow/#/experiments/0.



[0]	validation-rmse:8.06774                                                     
[1]	validation-rmse:7.29706                                                    
[2]	validation-rmse:6.73375                                                    
[3]	validation-rmse:6.33038                                                    
[4]	validation-rmse:6.04363                                                    
[5]	validation-rmse:5.84048                                                    
[6]	validation-rmse:5.69896                                                    
[7]	validation-rmse:5.60031                                                    
[8]	validation-rmse:5.53031                                                    
[9]	validation-rmse:5.47946                                                    
[10]	validation-rmse:5.44057                                                   
[11]	validation-rmse:5.41112                                                   
[12]	validation-rmse:5.38816           






2024/09/17 21:34:09 INFO mlflow.tracking._tracking_service.client: 🏃 View run wistful-wasp-665 at: https://dagshub.com/G4ll4rd0/nyc-taxi-time-prediction.mlflow.mlflow/#/experiments/0/runs/873502c2a3514290a294bedf85070c33.

2024/09/17 21:34:09 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/G4ll4rd0/nyc-taxi-time-prediction.mlflow.mlflow/#/experiments/0.



[0]	validation-rmse:8.70128                                                    
[1]	validation-rmse:8.33232                                                    
[2]	validation-rmse:8.00098                                                    
[3]	validation-rmse:7.70428                                                    
[4]	validation-rmse:7.43938                                                    
[5]	validation-rmse:7.20337                                                    
[6]	validation-rmse:6.99357                                                    
[7]	validation-rmse:6.80762                                                    
[8]	validation-rmse:6.64317                                                    
[9]	validation-rmse:6.49786                                                    
[10]	validation-rmse:6.37005                                                   
[11]	validation-rmse:6.25772                                                   
[12]	validation-rmse:6.15912            






2024/09/17 21:35:23 INFO mlflow.tracking._tracking_service.client: 🏃 View run inquisitive-dog-92 at: https://dagshub.com/G4ll4rd0/nyc-taxi-time-prediction.mlflow.mlflow/#/experiments/0/runs/77c51c9d58ac49a6982e635d27202450.

2024/09/17 21:35:23 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/G4ll4rd0/nyc-taxi-time-prediction.mlflow.mlflow/#/experiments/0.



[0]	validation-rmse:6.69902                                                    
[1]	validation-rmse:5.78114                                                    
[2]	validation-rmse:5.46386                                                    
[3]	validation-rmse:5.35455                                                    
[4]	validation-rmse:5.31277                                                    
[5]	validation-rmse:5.28949                                                    
[6]	validation-rmse:5.28046                                                    
[7]	validation-rmse:5.27186                                                    
[8]	validation-rmse:5.26626                                                    
[9]	validation-rmse:5.26334                                                    
[10]	validation-rmse:5.25567                                                   
[11]	validation-rmse:5.25162                                                   
[12]	validation-rmse:5.24883            






2024/09/17 21:36:17 INFO mlflow.tracking._tracking_service.client: 🏃 View run hilarious-newt-995 at: https://dagshub.com/G4ll4rd0/nyc-taxi-time-prediction.mlflow.mlflow/#/experiments/0/runs/58460f5cebc34d6abc6c07d336361fdd.

2024/09/17 21:36:17 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/G4ll4rd0/nyc-taxi-time-prediction.mlflow.mlflow/#/experiments/0.



[0]	validation-rmse:5.56469                                                    
[1]	validation-rmse:5.39255                                                    
[2]	validation-rmse:5.37285                                                    
[3]	validation-rmse:5.34577                                                    
[4]	validation-rmse:5.34084                                                    
[5]	validation-rmse:5.33381                                                    
[6]	validation-rmse:5.32531                                                    
[7]	validation-rmse:5.32759                                                    
[8]	validation-rmse:5.32911                                                    
[9]	validation-rmse:5.32400                                                    
[10]	validation-rmse:5.32339                                                   
[11]	validation-rmse:5.32826                                                   
[12]	validation-rmse:5.32765            






2024/09/17 21:37:34 INFO mlflow.tracking._tracking_service.client: 🏃 View run youthful-tern-209 at: https://dagshub.com/G4ll4rd0/nyc-taxi-time-prediction.mlflow.mlflow/#/experiments/0/runs/6adacbd9bc99441eb7fcf8247e5013c0.

2024/09/17 21:37:34 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/G4ll4rd0/nyc-taxi-time-prediction.mlflow.mlflow/#/experiments/0.



[0]	validation-rmse:8.08050                                                    
[1]	validation-rmse:7.30810                                                    
[2]	validation-rmse:6.73950                                                    
[3]	validation-rmse:6.32820                                                    
[4]	validation-rmse:6.03524                                                    
[5]	validation-rmse:5.82849                                                    
[6]	validation-rmse:5.68259                                                    
[7]	validation-rmse:5.58029                                                    
[8]	validation-rmse:5.50933                                                    
[9]	validation-rmse:5.45836                                                    
[10]	validation-rmse:5.42254                                                   
[11]	validation-rmse:5.39680                                                   
[12]	validation-rmse:5.37855            






2024/09/17 21:38:21 INFO mlflow.tracking._tracking_service.client: 🏃 View run skillful-croc-109 at: https://dagshub.com/G4ll4rd0/nyc-taxi-time-prediction.mlflow.mlflow/#/experiments/0/runs/2bb45baa2d2c4b5291b804cdf26b3120.

2024/09/17 21:38:21 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/G4ll4rd0/nyc-taxi-time-prediction.mlflow.mlflow/#/experiments/0.



[0]	validation-rmse:8.34024                                                    
[1]	validation-rmse:7.70951                                                    
[2]	validation-rmse:7.19670                                                    
[3]	validation-rmse:6.78301                                                    
[4]	validation-rmse:6.45430                                                    
[5]	validation-rmse:6.19330                                                    
[6]	validation-rmse:5.98428                                                    
[7]	validation-rmse:5.82305                                                    
[8]	validation-rmse:5.69691                                                    
[9]	validation-rmse:5.59795                                                    
[10]	validation-rmse:5.52151                                                   
[11]	validation-rmse:5.46053                                                   
[12]	validation-rmse:5.41276            






2024/09/17 21:39:27 INFO mlflow.tracking._tracking_service.client: 🏃 View run valuable-toad-475 at: https://dagshub.com/G4ll4rd0/nyc-taxi-time-prediction.mlflow.mlflow/#/experiments/0/runs/6d320e85b70d4359a72e496ff066360a.

2024/09/17 21:39:27 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/G4ll4rd0/nyc-taxi-time-prediction.mlflow.mlflow/#/experiments/0.



[0]	validation-rmse:7.89024                                                    
[1]	validation-rmse:7.03489                                                    
[2]	validation-rmse:6.45531                                                    
[3]	validation-rmse:6.05297                                                    
[4]	validation-rmse:5.80591                                                    
[5]	validation-rmse:5.63978                                                    
[6]	validation-rmse:5.54600                                                    
[7]	validation-rmse:5.46364                                                    
[8]	validation-rmse:5.43058                                                    
[9]	validation-rmse:5.41099                                                    
[10]	validation-rmse:5.39122                                                   
[11]	validation-rmse:5.37275                                                   
[12]	validation-rmse:5.36917            






2024/09/17 21:42:36 INFO mlflow.tracking._tracking_service.client: 🏃 View run loud-penguin-937 at: https://dagshub.com/G4ll4rd0/nyc-taxi-time-prediction.mlflow.mlflow/#/experiments/0/runs/988be4d4e4c747b48e11b862937c3f35.

2024/09/17 21:42:36 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/G4ll4rd0/nyc-taxi-time-prediction.mlflow.mlflow/#/experiments/0.



100%|██████████| 10/10 [13:26<00:00, 80.67s/trial, best loss: 5.163621899488422] 
[0]	validation-rmse:8.35175
[1]	validation-rmse:7.72521
[2]	validation-rmse:7.21511
[3]	validation-rmse:6.79928
[4]	validation-rmse:6.46767
[5]	validation-rmse:6.20092
[6]	validation-rmse:5.99027
[7]	validation-rmse:5.82500
[8]	validation-rmse:5.69361
[9]	validation-rmse:5.59094
[10]	validation-rmse:5.50977
[11]	validation-rmse:5.44623
[12]	validation-rmse:5.39810
[13]	validation-rmse:5.35880
[14]	validation-rmse:5.32671
[15]	validation-rmse:5.29805
[16]	validation-rmse:5.27689
[17]	validation-rmse:5.26026
[18]	validation-rmse:5.24839
[19]	validation-rmse:5.23629
[20]	validation-rmse:5.22620
[21]	validation-rmse:5.21707
[22]	validation-rmse:5.21085
[23]	validation-rmse:5.20537
[24]	validation-rmse:5.20101
[25]	validation-rmse:5.19650
[26]	validation-rmse:5.19272
[27]	validation-rmse:5.18846
[28]	validation-rmse:5.18593
[29]	validation-rmse:5.18400
[30]	validation-rmse:5.18193
[31]	validation-rmse:5.18086


2024/09/17 21:43:55 INFO mlflow.tracking._tracking_service.client: 🏃 View run Xgboost Hyper-parameter Optimization at: https://dagshub.com/G4ll4rd0/nyc-taxi-time-prediction.mlflow.mlflow/#/experiments/0/runs/297da0aef2be4c7fa53b27fbc3d70f76.
2024/09/17 21:43:55 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/G4ll4rd0/nyc-taxi-time-prediction.mlflow.mlflow/#/experiments/0.


In [23]:
best_params

{'learning_rate': np.float64(0.12878326532439502),
 'max_depth': 50,
 'min_child_weight': np.float64(1.4810238036672676),
 'reg_alpha': np.float64(0.3507121168237907),
 'reg_lambda': np.float64(0.023352215418367128),
 'seed': 42,
 'objective': 'reg:squarederror'}

Ahora vamos a registrar el mejor modelo en el `model registry` y usarlo para hacer predicciones

In [25]:
run_id = input("Ingrese el run_id")
run_uri = f"runs:/{run_id}/model"

result = mlflow.register_model(
    model_uri=run_uri,
    name="nyc-taxi-model"
)

Registered model 'nyc-taxi-model' already exists. Creating a new version of this model...
2024/09/17 21:44:54 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: nyc-taxi-model, version 1
Created version '1' of model 'nyc-taxi-model'.


In [26]:
from datetime import datetime
from mlflow import MlflowClient

client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)
client.update_registered_model(
    name="nyc-taxi-model",
    description="Model registry for the NYC Taxi Time Prediction Project",
)

new_alias = "champion"
date = datetime.today()
model_version = "1"

# create "champion" alias for version 1 of model "nyc-taxi-model"
client.set_registered_model_alias(
    name="nyc-taxi-model",
    alias=new_alias,
    version=model_version
)

client.update_model_version(
    name="nyc-taxi-model",
    version=model_version,
    description=f"The model version {model_version} was transitioned to {new_alias} on {date}",
)

<ModelVersion: aliases=['champion'], creation_timestamp=1726631100887, current_stage='None', description='The model version 1 was transitioned to champion on 2024-09-17 21:46:38.880359', last_updated_timestamp=1726631205396, name='nyc-taxi-model', run_id='297da0aef2be4c7fa53b27fbc3d70f76', run_link='', source='mlflow-artifacts:/9c5091aa749047a8b46c900fd0cf03c1/297da0aef2be4c7fa53b27fbc3d70f76/artifacts/model', status='READY', status_message='', tags={}, user_id='', version='1'>

In [27]:
import mlflow.pyfunc

model_name = "nyc-taxi-model"
alias = "champion"

model_uri = f"models:/{model_name}@{alias}"

champion_version = mlflow.pyfunc.load_model(
    model_uri=model_uri
)

champion_version.predict(X_val)

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

array([19.144537, 28.125263,  8.94481 , ..., 44.40944 , 13.726902,
       19.764984], dtype=float32)