In [1]:
import polars as pl
import mlflow
import numpy as np

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

from sklearn.metrics import root_mean_squared_error



In [2]:
# Single source of truth for the tracking store: a SQLite database file
# (mlflow.db) resolved relative to the notebook's working directory.
TRACKING_URI = "sqlite:///mlflow.db"

# Point the fluent API at the store, then select the experiment that runs
# started below will be recorded under.
mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment("Trail_13_05")

# Low-level client bound to the same store.
client = mlflow.tracking.MlflowClient(tracking_uri=TRACKING_URI)

In [3]:
# Yellow-taxi trip data: the 2024-01 file is used for training and the
# 2024-02 file for validation.
TRAIN_DATA_URL = "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-01.parquet"
VAL_DATA_URL = "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-02.parquet"

train_dat = pl.read_parquet(TRAIN_DATA_URL)
val_dat = pl.read_parquet(VAL_DATA_URL)

In [4]:
def dat_transform(
    df: "pl.DataFrame",
    categorical: "tuple[str, ...]" = ("PULocationID", "DOLocationID"),
    min_duration: float = 1,
    max_duration: float = 60,
) -> "pl.DataFrame":
    """Add a trip ``duration`` column, drop out-of-range trips, stringify IDs.

    Parameters
    ----------
    df
        Frame containing ``tpep_pickup_datetime`` and
        ``tpep_dropoff_datetime`` columns plus every column named in
        ``categorical``.
    categorical
        Columns to cast to ``pl.Utf8``. Defaults to the pickup and drop-off
        location IDs.
    min_duration, max_duration
        Inclusive bounds, in minutes, on the trips that are kept.

    Returns
    -------
    pl.DataFrame
        A new frame with the extra ``duration`` column (minutes), restricted
        to rows with ``min_duration <= duration <= max_duration``, and with
        the ``categorical`` columns cast to strings.
    """
    # Drop-off minus pick-up yields a Duration; total seconds / 60 -> minutes.
    duration_minutes = (
        (pl.col("tpep_dropoff_datetime") - pl.col("tpep_pickup_datetime"))
        .dt.total_seconds() / 60
    ).alias("duration")

    return (
        df.with_columns(duration_minutes)
        .filter(
            pl.col("duration").is_between(min_duration, max_duration, closed="both")
        )
        .with_columns([pl.col(c).cast(pl.Utf8) for c in categorical])
    )

In [5]:
# Derive `duration`, filter trips by duration and stringify the location IDs
# for the training month.
train = train_dat.pipe(dat_transform)

In [6]:
# Same cleaning as the training frame, applied to the validation month.
val = val_dat.pipe(dat_transform)

In [7]:
# Feature schema: two categorical location IDs, one numeric distance, and the
# regression target.
categorical = ['PULocationID', 'DOLocationID']
numerical = ['trip_distance']
target = "duration"

# DictVectorizer one-hot encodes string-valued features and passes numeric
# ones through, so the location IDs must be strings.
train = train.with_columns(pl.col(categorical).cast(pl.Utf8))

# One {column: value} dict per row, restricted to the model's input columns.
feature_columns = categorical + numerical
train_dicts = train.select(feature_columns).to_dicts()

# Fit the vocabulary on the training rows and build the sparse design matrix.
dv = DictVectorizer(sparse=True)
X_train = dv.fit_transform(train_dicts)
y_train = train.get_column(target).to_numpy()

In [8]:
with mlflow.start_run():
    # Run metadata: who ran it and which files were used.
    mlflow.set_tag("developer","André")
    mlflow.log_param("train_data_path","https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-01.parquet")
    mlflow.log_param("val_data_path","https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-02.parquet")

    lr = LinearRegression().fit(X_train, y_train)

    # "RMSE" is the error on the same rows the model was fitted on. It is
    # kept under its existing name so earlier runs stay comparable.
    y_pred = lr.predict(X_train)
    rmse = root_mean_squared_error(y_train, y_pred)
    mlflow.log_metric("RMSE",rmse)

    # Held-out error on the validation month. The fitted vectorizer is reused
    # (transform, not fit_transform) so the columns line up with X_train.
    val_dicts = val.select(categorical + numerical).to_dicts()
    X_val = dv.transform(val_dicts)
    y_val = val[target].to_numpy()

    y_pred_val = lr.predict(X_val)
    rmse_val = root_mean_squared_error(y_val, y_pred_val)
    mlflow.log_metric("RMSE_val", rmse_val)