In [1]:
# Enable IPython's autoreload extension so edits to imported .py modules are
# picked up live, without restarting the kernel.
# Keep comments on their own lines: text after a line magic is passed to it.
%load_ext autoreload
%autoreload 2

# MLflow Regression Pipeline Notebook

This notebook runs the MLflow Regression Pipeline locally (using the `local` profile) and inspects its results. For more information about the MLflow Regression Pipeline, including usage examples, see the [Regression Pipeline overview documentation](https://mlflow.org/docs/latest/pipelines.html#regression-pipeline) and the [Regression Pipeline API documentation](https://mlflow.org/docs/latest/python_api/mlflow.pipelines.html#module-mlflow.pipelines.regression.v1.pipeline).

In [2]:
from mlflow.pipelines import Pipeline

# Build the pipeline object using the "local" profile.
# `p` is shared state: every later cell in this notebook calls methods on it.
p = Pipeline(profile="local")

2022/09/27 14:46:46 INFO mlflow.pipelines.pipeline: Creating MLflow Pipeline 'mlp-regression-template' with profile: 'local'


In [3]:
# Reset the pipeline before running it step by step.
# NOTE(review): presumably this clears previously cached step outputs so the
# runs below start fresh - confirm against the Pipeline.clean() API docs.
p.clean()

In [6]:
# Inspect the pipeline as a whole (no step name given).
# NOTE(review): expected to render a pipeline overview - TODO confirm.
# NOTE(review): this cell's execution count (6) is higher than the following
# cell's (5), so it was run out of order; verify with Restart & Run All.
p.inspect()

In [5]:
# Run only the "ingest" step of the pipeline.
p.run("ingest")

2022/09/27 14:47:19 INFO mlflow.pipelines.step: Running step ingest...
2022/09/27 14:47:19 INFO mlflow.pipelines.steps.ingest.datasets: Resolving input data from '['/home/mlops/PycharmProjects/mlp-regression-template/data/sample.parquet']'
2022/09/27 14:47:19 INFO mlflow.pipelines.steps.ingest.datasets: Resolved input data to '/tmp/tmpu38cd9k7/sample.parquet'
2022/09/27 14:47:19 INFO mlflow.pipelines.steps.ingest.datasets: Converting dataset to parquet format, if necessary
2022/09/27 14:47:21 INFO mlflow.pipelines.steps.ingest: Successfully stored data in parquet format at '/home/mlops/.mlflow/pipelines/afeb86a26a2540755a99c9caa1230c6c0dc7f007a4c80b78654e43d15c5d5633/steps/ingest/outputs/dataset.parquet'
2022/09/27 14:47:21 INFO mlflow.pipelines.steps.ingest: Profiling ingested dataset
2022/09/27 14:47:22 INFO mlflow.pipelines.steps.ingest: Wrote dataset profile to '/home/mlops/.mlflow/pipelines/afeb86a26a2540755a99c9caa1230c6c0dc7f007a4c80b78654e43d15c5d5633/steps/ingest/outputs/datas

name,type
tpep_pickup_datetime,datetime
tpep_dropoff_datetime,datetime
trip_distance,number
fare_amount,number
pickup_zip,integer
dropoff_zip,integer


In [7]:
# Run the "split" step. The captured log reports the unchanged "ingest" step
# as skipped rather than re-executed.
p.run("split")

2022/09/27 14:48:35 INFO mlflow.pipelines.utils.execution: ingest: No changes. Skipping.


2022/09/27 14:48:36 INFO mlflow.pipelines.step: Running step split...
2022/09/27 14:48:37 INFO mlflow.pipelines.steps.split: Creating hash buckets on input dataset containing 10000 rows consumes 0.11431336402893066 seconds.
2022/09/27 14:48:37 INFO mlflow.pipelines.steps.split: Running process_splits on train, validation and test datasets.


In [8]:
# Run the "transform" step; unchanged upstream steps are reported as skipped
# in the captured log.
p.run("transform")

2022/09/27 14:48:43 INFO mlflow.pipelines.utils.execution: ingest: No changes. Skipping.
2022/09/27 14:48:43 INFO mlflow.pipelines.utils.execution: split: No changes. Skipping.


2022/09/27 14:48:45 INFO mlflow.pipelines.step: Running step transform...


Name,Type
tpep_pickup_datetime,datetime64[ns]
tpep_dropoff_datetime,datetime64[ns]
trip_distance,float64
fare_amount,float64
pickup_zip,int32
dropoff_zip,int32

Name,Type
hour_encoder__pickup_hour_0,float64
hour_encoder__pickup_hour_1,float64
hour_encoder__pickup_hour_2,float64
hour_encoder__pickup_hour_3,float64
hour_encoder__pickup_hour_4,float64
hour_encoder__pickup_hour_5,float64
hour_encoder__pickup_hour_6,float64
hour_encoder__pickup_hour_7,float64
hour_encoder__pickup_hour_8,float64
hour_encoder__pickup_hour_9,float64


In [9]:
# Run the "train" step. The captured output includes training/validation
# metrics and the rows with the largest absolute prediction error.
p.run("train")

2022/09/27 14:48:54 INFO mlflow.pipelines.utils.execution: ingest: No changes. Skipping.
2022/09/27 14:48:54 INFO mlflow.pipelines.utils.execution: split: No changes. Skipping.
2022/09/27 14:48:54 INFO mlflow.pipelines.utils.execution: transform: No changes. Skipping.


2022/09/27 14:48:55 INFO mlflow.pipelines.step: Running step train...
2022/09/27 14:48:56 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2022/09/27 14:48:56 INFO mlflow.store.db.utils: Updating database tables
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Running upgrade  -> 451aebb31d03, add metric step
INFO  [alembic.runtime.migration] Running upgrade 451aebb31d03 -> 90e64c465722, migrate user column to tags
INFO  [alembic.runtime.migration] Running upgrade 90e64c465722 -> 181f10493468, allow nulls for metric values
INFO  [alembic.runtime.migration] Running upgrade 181f10493468 -> df50e92ffc5e, Add Experiment Tags Table
INFO  [alembic.runtime.migration] Running upgrade df50e92ffc5e -> 7ac759974ad8, Update run tags with larger limit
INFO  [alembic.runtime.migration] Running upgrade 7ac759974ad8 -> 89d4b8295536, create latest metrics table
INFO  [8

Metric,training,validation
root_mean_squared_error,3.43472,3.25884
example_count,8019.0,955.0
max_error,212.188,53.1241
mean_absolute_error,1.5208,1.63894
mean_absolute_percentage_error,0.148791,0.146808
mean_on_label,12.3563,13.0743
mean_squared_error,11.7973,10.62
r2_score,0.890054,0.908446
score,0.890054,0.908446
sum_on_label,99085.0,12486.0

Name,Type
tpep_pickup_datetime,datetime
tpep_dropoff_datetime,datetime
trip_distance,double
pickup_zip,integer
dropoff_zip,integer

Name,Type
-,"Tensor('float64', (-1,))"

absolute_error,prediction,fare_amount,tpep_pickup_datetime,tpep_dropoff_datetime,trip_distance,pickup_zip,dropoff_zip
212.188444,62.811556,275.0,2016-02-12 20:55:19,2016-02-12 21:52:38,20.85,10013,7008
51.38178,3.61822,55.0,2016-02-28 04:50:41,2016-02-28 04:52:32,0.18,10115,10027
39.557483,45.442517,85.0,2016-02-11 17:52:13,2016-02-11 18:38:17,14.46,10282,7114
38.458072,13.541928,52.0,2016-01-26 09:04:58,2016-01-26 09:43:15,3.0,11109,10199
36.095139,15.904861,52.0,2016-02-26 16:54:41,2016-02-26 17:06:30,4.02,11371,11367
31.547252,13.452748,45.0,2016-02-06 23:18:10,2016-02-06 23:35:18,3.51,10013,7302
31.402555,20.597445,52.0,2016-01-16 00:12:20,2016-01-16 00:27:06,6.1,11378,10012
29.897227,58.102773,88.0,2016-02-11 12:47:12,2016-02-11 13:16:59,19.02,10119,10710
28.73453,31.23453,2.5,2016-01-16 17:50:50,2016-01-16 17:51:24,9.6,10007,10007
22.171355,24.671355,2.5,2016-01-04 10:20:18,2016-01-04 11:20:43,7.2,11370,11205

Unnamed: 0,Latest,Best
Model Rank,1,1
root_mean_squared_error,3.25884,3.25884
weighted_mean_squared_error,8.18055,8.18055
max_error,53.1241,53.1241
mean_absolute_error,1.63894,1.63894
mean_absolute_percentage_error,0.146808,0.146808
mean_squared_error,10.62,10.62
Run Time,2022-09-27 14:48:58,2022-09-27 14:48:58
Run ID,3ada36e3b9cc445b869f10946fbe6dfe,3ada36e3b9cc445b869f10946fbe6dfe


In [10]:
# Run the "evaluate" step. The captured output compares validation and test
# metrics and lists each metric against its validation threshold.
p.run("evaluate")

2022/09/27 14:49:16 INFO mlflow.pipelines.utils.execution: ingest: No changes. Skipping.
2022/09/27 14:49:16 INFO mlflow.pipelines.utils.execution: split: No changes. Skipping.
2022/09/27 14:49:16 INFO mlflow.pipelines.utils.execution: transform: No changes. Skipping.
2022/09/27 14:49:16 INFO mlflow.pipelines.utils.execution: train: No changes. Skipping.


2022/09/27 14:49:18 INFO mlflow.pipelines.step: Running step evaluate...
2022/09/27 14:49:19 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
X does not have valid feature names, but SGDRegressor was fitted with feature names
2022/09/27 14:49:21 INFO mlflow.models.evaluation.default_evaluator: Shap explainer _PatchedKernelExplainer is used.

  0%|          | 0/10 [00:00<?, ?it/s]X does not have valid feature names, but SGDRegressor was fitted with feature names
X does not have valid feature names, but SGDRegressor was fitted with feature names
X does not have valid feature names, but SGDRegressor was fitted with feature names
X does not have valid feature names, but SGDRegressor was fitted with feature names
X does not have valid feature names, but SGDRegressor was fitted with feature names
X does not have valid feature names, but SGDRegressor was fitted with feature names
X does not have valid feature names, but SGDRegressor was fitted with feature 

Metric,validation,test
root_mean_squared_error,3.25884,2.248191
example_count,955.0,987.0
max_error,53.1241,17.95838
mean_absolute_error,1.63894,1.513182
mean_absolute_percentage_error,0.146808,0.59546
mean_on_label,13.0743,12.180355
mean_squared_error,10.62,5.054361
r2_score,0.908446,0.947664
score,0.908446,0.947664
sum_on_label,12486.0,12022.01

metric,greater_is_better,value,threshold,validated
root_mean_squared_error,False,2.24819,10,✅
mean_absolute_error,False,1.51318,50,✅
weighted_mean_squared_error,False,3.29751,20,✅


In [11]:
# Run the "register" step. The captured log shows a model version being
# created in the model registry.
p.run("register")

2022/09/27 14:49:26 INFO mlflow.pipelines.utils.execution: ingest: No changes. Skipping.
2022/09/27 14:49:26 INFO mlflow.pipelines.utils.execution: split: No changes. Skipping.
2022/09/27 14:49:26 INFO mlflow.pipelines.utils.execution: transform: No changes. Skipping.
2022/09/27 14:49:26 INFO mlflow.pipelines.utils.execution: train: No changes. Skipping.
2022/09/27 14:49:26 INFO mlflow.pipelines.utils.execution: evaluate: No changes. Skipping.


2022/09/27 14:49:29 INFO mlflow.pipelines.step: Running step register...
Successfully registered model 'taxi_fare_regressor'.
2022/09/27 14:49:29 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: taxi_fare_regressor, version 1
Created version '1' of model 'taxi_fare_regressor'.


In [12]:
# Display the results of the "train" step again.
# NOTE(review): the captured output has no "Running step" log lines, so this
# appears to read stored results rather than re-run the step - confirm.
p.inspect("train")

Metric,training,validation
root_mean_squared_error,3.43472,3.25884
example_count,8019.0,955.0
max_error,212.188,53.1241
mean_absolute_error,1.5208,1.63894
mean_absolute_percentage_error,0.148791,0.146808
mean_on_label,12.3563,13.0743
mean_squared_error,11.7973,10.62
r2_score,0.890054,0.908446
score,0.890054,0.908446
sum_on_label,99085.0,12486.0

Name,Type
tpep_pickup_datetime,datetime
tpep_dropoff_datetime,datetime
trip_distance,double
pickup_zip,integer
dropoff_zip,integer

Name,Type
-,"Tensor('float64', (-1,))"

absolute_error,prediction,fare_amount,tpep_pickup_datetime,tpep_dropoff_datetime,trip_distance,pickup_zip,dropoff_zip
212.188444,62.811556,275.0,2016-02-12 20:55:19,2016-02-12 21:52:38,20.85,10013,7008
51.38178,3.61822,55.0,2016-02-28 04:50:41,2016-02-28 04:52:32,0.18,10115,10027
39.557483,45.442517,85.0,2016-02-11 17:52:13,2016-02-11 18:38:17,14.46,10282,7114
38.458072,13.541928,52.0,2016-01-26 09:04:58,2016-01-26 09:43:15,3.0,11109,10199
36.095139,15.904861,52.0,2016-02-26 16:54:41,2016-02-26 17:06:30,4.02,11371,11367
31.547252,13.452748,45.0,2016-02-06 23:18:10,2016-02-06 23:35:18,3.51,10013,7302
31.402555,20.597445,52.0,2016-01-16 00:12:20,2016-01-16 00:27:06,6.1,11378,10012
29.897227,58.102773,88.0,2016-02-11 12:47:12,2016-02-11 13:16:59,19.02,10119,10710
28.73453,31.23453,2.5,2016-01-16 17:50:50,2016-01-16 17:51:24,9.6,10007,10007
22.171355,24.671355,2.5,2016-01-04 10:20:18,2016-01-04 11:20:43,7.2,11370,11205

Unnamed: 0,Latest,Best
Model Rank,1,1
root_mean_squared_error,3.25884,3.25884
weighted_mean_squared_error,8.18055,8.18055
max_error,53.1241,53.1241
mean_absolute_error,1.63894,1.63894
mean_absolute_percentage_error,0.146808,0.146808
mean_squared_error,10.62,10.62
Run Time,2022-09-27 14:48:58,2022-09-27 14:48:58
Run ID,3ada36e3b9cc445b869f10946fbe6dfe,3ada36e3b9cc445b869f10946fbe6dfe


In [13]:
# Fetch the pipeline's "training_data" artifact and show summary statistics.
# The bare last expression lets Jupyter render the describe() table.
training_data = p.get_artifact("training_data")
training_data.describe()

Unnamed: 0,trip_distance,fare_amount,pickup_zip,dropoff_zip
count,8019.0,8019.0,8019.0,8019.0
mean,2.871861,12.356279,10138.396309,10175.045517
std,3.467339,10.359241,337.828943,409.264873
min,0.03,2.5,7002.0,7002.0
25%,1.0,6.5,10012.0,10013.0
50%,1.7,9.0,10022.0,10023.0
75%,3.07,14.0,10110.0,10119.0
max,30.6,275.0,11436.0,11691.0


In [14]:
# Fetch the pipeline's "model" artifact and print its textual summary.
trained_model = p.get_artifact("model")
print(trained_model)

mlflow.pyfunc.loaded_model:
  artifact_path: train/model
  flavor: mlflow.sklearn
  run_id: 3ada36e3b9cc445b869f10946fbe6dfe

