Don't forget to update the Pipenv environment with `mlflow` and the other packages this notebook imports.

In [2]:
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error
from sklearn.pipeline import make_pipeline
import mlflow



In [3]:
# Column groups shared by the loader below and the feature-building cells.
categories = ["PULocationID", "DOLocationID"]
numerics = ["trip_distance"]
target = ["duration"]  # derived in read_data, expressed in minutes


def read_data(filename):
    """Load a trip-record parquet file and return the modelling columns.

    The file must provide ``tpep_pickup_datetime`` and
    ``tpep_dropoff_datetime`` plus every column named in ``categories`` and
    ``numerics``.

    Parameters
    ----------
    filename : str or path-like
        Location of the parquet file, passed straight to ``pd.read_parquet``.

    Returns
    -------
    pandas.DataFrame
        Columns ``categories + numerics + target`` (in that order), limited to
        trips whose duration is between 0 and 60 minutes inclusive and with
        rows containing any missing value removed. The original row index is
        preserved.
    """
    df = pd.read_parquet(filename)

    pickup = pd.to_datetime(df["tpep_pickup_datetime"])
    dropoff = pd.to_datetime(df["tpep_dropoff_datetime"])

    # Vectorised conversion to minutes; avoids a Python-level call per row.
    df["duration"] = (dropoff - pickup).dt.total_seconds() / 60

    # Inclusive on both ends; rows with a missing duration compare False
    # and are dropped here.
    in_range = df["duration"].between(0, 60)
    selected = df.loc[in_range, categories + numerics + target]

    # astype returns a new frame, so nothing is assigned onto a filtered
    # slice (the pattern that triggers SettingWithCopyWarning).
    # NOTE(review): under ``object`` dtype the ids keep their numeric values,
    # so a downstream DictVectorizer would treat them as numeric features
    # rather than one-hot categories -- confirm this is intended.
    selected = selected.astype({column: object for column in categories})

    return selected.dropna()

In [5]:
# Training split: one feature dict per trip, plus the matching durations.
df = read_data("webservice/data/yellow_tripdata_2024-08.parquet")
x_train = df[[*categories, *numerics]].to_dict("records")
# `target` is a list, so this is a 2-D array with a single column.
y_train = df[target].to_numpy()

In [6]:
# Evaluation split, prepared the same way as the training split.
test_data = read_data("webservice/data/yellow_tripdata_2024-01.parquet")
x_test = test_data[[*categories, *numerics]].to_dict("records")
# `target` is a list, so this is a 2-D array with a single column.
y_test = test_data[target].to_numpy()

In [7]:
# Vectoriser and regressor bundled together so they are fitted (and later
# logged) as a single estimator.
train_pipeline = make_pipeline(
    DictVectorizer(),
    LinearRegression(),
)

In [9]:
# Send everything below to the local tracking server, under one experiment.
mlflow.set_tracking_uri("http://127.0.0.1:5000/")
mlflow.set_experiment("duration-prediction")

with mlflow.start_run() as run:
    # Fit inside the run so the metric and artifact are recorded against it.
    train_pipeline.fit(x_train, y_train)

    # Score on the held-out month and record the error.
    y_pred = train_pipeline.predict(x_test)
    rmse = root_mean_squared_error(y_test, y_pred)
    mlflow.log_metric(key="rmse", value=rmse)

    # The pipeline carries the fitted DictVectorizer along with the regressor.
    mlflow.sklearn.log_model(train_pipeline, "model")

    print(f"Run ID: {run.info.run_id}")

2024/11/10 11:31:11 INFO mlflow.tracking._tracking_service.client: 🏃 View run thundering-toad-387 at: http://127.0.0.1:5000/#/experiments/1/runs/e308ab2a149249a4b161cb428b4abc23.
2024/11/10 11:31:11 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1.


Run ID: e308ab2a149249a4b161cb428b4abc23
