Don't forget to update the Pipenv environment with mlflow and the other required packages.

In [13]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error
from sklearn.pipeline import make_pipeline
import pickle
import mlflow

In [21]:
# Column groups shared by the loading function and the cells that build the
# model inputs.
categories = ["PULocationID", "DOLocationID"]
numerics = ["trip_distance"]
target = ["duration"]


def read_data(filename):
    """Load a trip parquet file and return the modelling columns.

    The trip duration (in minutes) is derived from the pickup and dropoff
    timestamps, trips outside the 0-60 minute range (inclusive) are
    discarded, and rows with a missing value in any selected column are
    dropped.

    Parameters
    ----------
    filename : str or path-like
        Location of the parquet file; passed straight to ``pd.read_parquet``.

    Returns
    -------
    pandas.DataFrame
        A new frame holding ``categories + numerics + target``, in that
        order. The categorical columns contain strings.
    """
    df = pd.read_parquet(filename)

    pickup = pd.to_datetime(df["tpep_pickup_datetime"])
    dropoff = pd.to_datetime(df["tpep_dropoff_datetime"])
    # Vectorised replacement for a per-row ``total_seconds()`` call.
    df = df.assign(duration=(dropoff - pickup).dt.total_seconds() / 60)

    # ``between`` is inclusive on both ends: 0 <= duration <= 60.
    in_range = df["duration"].between(0, 60)
    prepped = df.loc[in_range, categories + numerics + target].dropna()

    # The IDs must be *strings*: DictVectorizer one-hot-encodes string values
    # only, and would otherwise treat integer IDs as ordinary numeric
    # features. The cast happens after dropna() so missing values are removed
    # rather than turned into the literal string "nan".
    return prepped.astype({column: str for column in categories})

In [22]:
# Training split: the August 2024 file.
df = read_data("data/yellow_tripdata_2024-08.parquet")

# Features become one dict per row; the target stays a 2-D array because
# `target` is a list of column names.
x_train = df[[*categories, *numerics]].to_dict(orient="records")
y_train = df[target].to_numpy()

In [26]:
# Evaluation split: the January 2024 file.
test_data = read_data("data/yellow_tripdata_2024-01.parquet")

# Same layout as the training inputs: one dict per row for the features and a
# 2-D array for the target (`target` is a list of column names).
x_test = test_data[[*categories, *numerics]].to_dict(orient="records")
y_test = test_data[target].to_numpy()

In [28]:
# Chain feature encoding and the regressor so they are fitted, applied and
# saved together as a single estimator.
train_pipeline = make_pipeline(
    DictVectorizer(),
    LinearRegression(),
)

In [29]:
# Send tracking data to the MLflow server at 127.0.0.1:5000 and record runs
# under the "duration-prediction" experiment.
# NOTE(review): the server presumably has to be running before this cell is
# executed - confirm.
mlflow.set_tracking_uri("http://127.0.0.1:5000/")
mlflow.set_experiment("duration-prediction")

# Everything logged inside the `with` block is attached to this one run.
with mlflow.start_run() as run:

    # Fit the pipeline on the training inputs.
    train_pipeline.fit(x_train, y_train)

    # Score the fitted pipeline on the test inputs and log the RMSE.
    y_pred = train_pipeline.predict(x_test)
    rmse = root_mean_squared_error(y_test, y_pred)
    mlflow.log_metric("rmse", rmse)

    # Log the whole pipeline object under the name "model", so any
    # preprocessing steps it contains are stored alongside the regressor.
    mlflow.sklearn.log_model(train_pipeline, "model")

    # Print the run id so the run can be located later.
    print(f"Run ID: {run.info.run_id}")

2024/11/04 19:51:33 INFO mlflow.tracking._tracking_service.client: 🏃 View run unique-bee-473 at: http://127.0.0.1:5000/#/experiments/1/runs/3fb7008b16534ce4b61ea373d96ddf42.
2024/11/04 19:51:33 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1.


Run ID: 3fb7008b16534ce4b61ea373d96ddf42
