Don't forget to update the Pipenv environment with `mlflow` and the other required packages.

In [1]:
import os
import pickle

import mlflow
import numpy as np
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import root_mean_squared_error, r2_score


%matplotlib inline

In [2]:
# Raw, unfiltered contents of the 2024-08 trip file.
# NOTE(review): `df` is rebound in In [4] to the frame read_data builds from
# this same file, so this load looks redundant — confirm before removing.
df = pd.read_parquet(path="data/yellow_tripdata_2024-08.parquet")

In [3]:
# Column groups used to select the model features.
categories = ["PULocationID", "DOLocationID"]
numerics = ["trip_distance"]


def read_data(filename):
    """Load a trip parquet file and return the frame used for modelling.

    The trip duration is computed in minutes from the pickup and dropoff
    timestamps. Only trips lasting between 0 and 60 minutes (inclusive) are
    kept, and rows with a missing value in any returned column are dropped.

    Parameters
    ----------
    filename : str or path-like
        Parquet file to read; passed straight to ``pd.read_parquet``. It must
        contain ``tpep_pickup_datetime``, ``tpep_dropoff_datetime`` and the
        columns listed in ``categories`` and ``numerics``.

    Returns
    -------
    pandas.DataFrame
        The ``categories`` and ``numerics`` columns plus ``duration``
        (minutes), with the ``categories`` columns cast to ``object`` dtype.
    """
    df = pd.read_parquet(filename)
    df["tpep_pickup_datetime"] = pd.to_datetime(df["tpep_pickup_datetime"])
    df["tpep_dropoff_datetime"] = pd.to_datetime(df["tpep_dropoff_datetime"])

    # Vectorized timedelta -> minutes (no per-row Python lambda).
    elapsed = df["tpep_dropoff_datetime"] - df["tpep_pickup_datetime"]
    df["duration"] = elapsed.dt.total_seconds() / 60

    # Copy the filtered rows so the dtype assignment below writes to an
    # independent frame instead of a slice of the unfiltered one
    # (avoids SettingWithCopyWarning).
    df = df.loc[df["duration"].between(0, 60)].copy()

    # NOTE(review): the cast is to object, not str, so the ID values presumably
    # stay integers; DictVectorizer one-hot encodes string values only —
    # confirm that feeding the IDs as numeric features is intended.
    df[categories] = df[categories].astype(object)

    return df[categories + numerics + ["duration"]].dropna()

In [4]:
# Cleaned training frame built from the 2024-08 file; preview the first rows.
df = read_data(filename="data/yellow_tripdata_2024-08.parquet")
df.head(n=5)

Unnamed: 0,PULocationID,DOLocationID,trip_distance,duration
0,138,80,7.4,15.216667
1,138,239,9.91,21.766667
2,138,88,13.4,23.883333
3,209,137,3.9,6.8
4,148,144,0.4,3.7


In [5]:
# Training target: the duration column as an array.
target = "duration"
y_train = df[target].values

# Fit the vectorizer on one feature dict per training row and build the
# training matrix from the same records.
dv = DictVectorizer()
x_train_dict = df[categories + numerics].to_dict(orient="records")
x_train = dv.fit_transform(x_train_dict)

In [6]:
# Hold-out data from the 2024-01 file, encoded with the already-fitted `dv`
# (transform only, no refit).
test_data = read_data(filename="data/yellow_tripdata_2024-01.parquet")
y_test = test_data[target].values
x_test_dict = test_data[categories + numerics].to_dict(orient="records")
x_test = dv.transform(x_test_dict)

In [10]:
# Train a LinearRegression baseline and record the run in MLflow.
mlflow.set_tracking_uri("http://127.0.0.1:5000/")
mlflow.set_experiment("duration-prediction")

# Enable autologging before the run starts. log_models=False because the model
# is logged explicitly below; with model logging left on, fit() would store the
# same estimator a second time.
mlflow.sklearn.autolog(log_models=False)

with mlflow.start_run() as run:
    # Train the model
    lr = LinearRegression()
    lr.fit(x_train, y_train)

    # Log the RMSE measured on the test set
    y_pred = lr.predict(x_test)
    rmse = root_mean_squared_error(y_test, y_pred)
    mlflow.log_metric("rmse", rmse)

    # Log model and DictVectorizer
    mlflow.sklearn.log_model(lr, "model")

    # The vectorizer is pickled to a local file first because log_artifact
    # uploads an existing file path.
    artifacts_dir = "mlflow_artifacts"
    os.makedirs(artifacts_dir, exist_ok=True)

    dv_path = os.path.join(artifacts_dir, "dv.bin")
    with open(dv_path, "wb") as f_out:
        pickle.dump(dv, f_out)

    mlflow.log_artifact(dv_path)

    print(f"Run ID: {run.info.run_id}")

2024/11/04 18:49:15 INFO mlflow.tracking.fluent: Experiment with name 'duration-prediction' does not exist. Creating a new experiment.


Run ID: 64b1755f573c4439974d9f5ab58a9a06


2024/11/04 18:49:19 INFO mlflow.tracking._tracking_service.client: 🏃 View run bemused-bass-165 at: http://127.0.0.1:5000/#/experiments/1/runs/64b1755f573c4439974d9f5ab58a9a06.
2024/11/04 18:49:19 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1.


Artifact URI: /Users/dna/mlops-zoomcamp/chapter4/webservice/mlflow_artifacts/1/64b1755f573c4439974d9f5ab58a9a06/artifacts
