In [1]:
import pickle
import pandas as pd
import mlflow
import sklearn
from pathlib import Path
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error, r2_score, mean_absolute_error
import prefect

## 1. Get versions

In [2]:
# Record the library versions this notebook ran with, for reproducibility.
# Note: the "pickle" entry is pickle.format_version (the pickle format version),
# not a package release number.
versions = {
    "pandas": pd.__version__,
    "pickle": pickle.format_version,
    "sklearn": sklearn.__version__,
    "mlflow": mlflow.__version__,
    "prefect": prefect.__version__,
}
for library, version in versions.items():
    print(f"{library} version: {version}")

pandas version: 2.2.3
pickle version: 4.0
sklearn version: 1.6.1
mlflow version: 2.12.2
prefect version: 3.4.4


## 2. Load Data

Yellow Taxi data for March 2023
https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-03.parquet

In [3]:
# Remote parquet file with the March 2023 yellow-taxi trips; pandas reads it
# straight from the URL, so no local copy is required.
file_path = (
    "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-03.parquet"
)

In [4]:
# Q3: How many records did we load?
df = pd.read_parquet(file_path)
print(f"Number of records: {len(df)}")

Number of records: 3403766


In [5]:
# Preview the first five raw rows to check the column layout.
df.head(n=5)

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee
0,2,2023-03-01 00:06:43,2023-03-01 00:16:43,1.0,0.0,1.0,N,238,42,2,8.6,1.0,0.5,0.0,0.0,1.0,11.1,0.0,0.0
1,2,2023-03-01 00:08:25,2023-03-01 00:39:30,2.0,12.4,1.0,N,138,231,1,52.7,6.0,0.5,12.54,0.0,1.0,76.49,2.5,1.25
2,1,2023-03-01 00:15:04,2023-03-01 00:29:26,0.0,3.3,1.0,N,140,186,1,18.4,3.5,0.5,4.65,0.0,1.0,28.05,2.5,0.0
3,1,2023-03-01 00:49:37,2023-03-01 01:01:05,1.0,2.9,1.0,N,140,43,1,15.6,3.5,0.5,4.1,0.0,1.0,24.7,2.5,0.0
4,2,2023-03-01 00:08:04,2023-03-01 00:11:06,1.0,1.23,1.0,N,79,137,1,7.2,1.0,0.5,2.44,0.0,1.0,14.64,2.5,0.0


In [6]:
def read_dataframe(filename):
    """Load a yellow-taxi parquet file and prepare it for modelling.

    Steps:
      1. Read the parquet file at ``filename`` (anything ``pd.read_parquet``
         accepts, e.g. a local path or a URL).
      2. Add a ``duration`` column: drop-off minus pick-up time, in minutes.
      3. Keep only trips whose duration is between 1 and 60 minutes inclusive.
      4. Cast ``PULocationID`` and ``DOLocationID`` to ``str`` so they can be
         treated as categorical features downstream.

    Returns:
        A new, independent DataFrame containing the filtered rows.
    """
    df = pd.read_parquet(filename)

    # Trip duration in minutes.
    elapsed = df.tpep_dropoff_datetime - df.tpep_pickup_datetime
    df['duration'] = elapsed.dt.total_seconds() / 60

    # Explicit copy: the boolean filter yields a slice of the loaded frame, and
    # assigning columns into that slice raises SettingWithCopyWarning on
    # pandas versions without copy-on-write.
    df = df[(df.duration >= 1) & (df.duration <= 60)].copy()

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df

In [7]:
# Q4: What's the size of the result?
data = read_dataframe(file_path)
print(f"Number of records: {len(data)}")

Number of records: 3316216


In [8]:
# Split the data into train and validation sets by row position. The split is
# chronological only if the rows are already ordered by time, or if `sort_by`
# names a timestamp column to order by first.
def train_val_split(df, train_size=0.8, sort_by=None):
    """Split ``df`` into a leading training part and a trailing validation part.

    Args:
        df: DataFrame to split. It is not modified.
        train_size: Fraction of rows that go to the training set. The row
            count is truncated with ``int(len(df) * train_size)``.
        sort_by: Optional column name (or list of names). When given, rows are
            sorted by it (stable sort) before splitting, so passing a timestamp
            column produces a true time-based split. ``None`` (the default)
            keeps the incoming row order.

    Returns:
        ``(train, val)``: the first ``n`` rows and the remaining rows.
    """
    if sort_by is not None:
        # Stable sort keeps the original relative order of rows with equal keys.
        df = df.sort_values(sort_by, kind="stable")

    # Separate name for the row count, so the fraction argument is not
    # overwritten by a value of a different meaning.
    n_train = int(len(df) * train_size)

    # .iloc makes it explicit that the cut is positional, not label-based.
    return df.iloc[:n_train], df.iloc[n_train:]

# Order the trips by pickup time before splitting. The file is not strictly
# sorted by pickup time (the preview above shows 00:49:37 followed by 00:08:04),
# so a purely positional cut would not be a true time-based split.
df_train, df_val = train_val_split(
    data.sort_values('tpep_pickup_datetime', kind='stable')
)

## 3. Train Model

Start the tracking server with `mlflow ui --port 1994` from the current working directory before running the cells below.

In [9]:
# Point MLflow at the local tracking server. It must already be running
# (`mlflow ui --port 1994`), and the URI below must match the URL it prints.
tracking_uri = "http://127.0.0.1:1994/"
experiment_name = "homework-experiment"

mlflow.set_tracking_uri(tracking_uri)
mlflow.set_experiment(experiment_name)

<Experiment: artifact_location='mlflow-artifacts:/856591471687517319', creation_time=1748722844266, experiment_id='856591471687517319', last_update_time=1748722844266, lifecycle_stage='active', name='homework-experiment', tags={}>

### Featurization

In [10]:
# Model inputs: pickup / drop-off location IDs as categorical features and the
# trip distance as the only numerical feature.
categorical = ['PULocationID', 'DOLocationID']
numerical = ['trip_distance']
feature_columns = categorical + numerical

dv = DictVectorizer()

# One dict per row, which is the input format DictVectorizer expects.
train_dicts = df_train[feature_columns].to_dict(orient='records')
val_dicts = df_val[feature_columns].to_dict(orient='records')

# Fit on the training rows only; the validation rows are transformed with the
# already-fitted vectorizer.
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

In [11]:
# Regression target: the `duration` column, extracted as arrays for both splits.
target = 'duration'
y_train, y_val = (split[target].values for split in (df_train, df_val))

### Train and track the model

In [13]:
# Local directory (relative to the working directory) for pickled artifacts.
# exist_ok=True makes the cell safe to re-run when the directory already exists.
models_folder = Path('models')
models_folder.mkdir(parents=False, exist_ok=True)

In [15]:
# Train a linear regression inside a single MLflow run and record its tags,
# parameters, validation metrics, the fitted vectorizer and the model itself.
with mlflow.start_run():
    # Tags identify the model family and the scikit-learn version used.
    mlflow.set_tag("model", "linear_regression")
    mlflow.set_tag("version", sklearn.__version__)

    model = LinearRegression()
    model.fit(X_train, y_train)

    mlflow.log_params({
        "model_type": "linear_regression",
        "n_features": X_train.shape[1],
        # A 1-D target array has no second axis, so it counts as one target.
        "n_targets": y_train.shape[1] if len(y_train.shape) > 1 else 1,
        "n_samples": X_train.shape[0],
        "intercept_": model.intercept_,
    })

    # Evaluate on the held-out validation split.
    y_pred = model.predict(X_val)
    rmse = root_mean_squared_error(y_val, y_pred)
    r2 = r2_score(y_val, y_pred)
    mae = mean_absolute_error(y_val, y_pred)
    mlflow.log_metrics({"rmse": rmse, "r2": r2, "mae": mae})

    # Build the path from `models_folder` (created in the previous cell) rather
    # than repeating the directory name as a string literal.
    preprocessor_path = models_folder / "preprocessor.b"
    with open(preprocessor_path, "wb") as f_out:
        pickle.dump(dv, f_out)
    mlflow.log_artifact(str(preprocessor_path), artifact_path="preprocessor")

    mlflow.sklearn.log_model(model, artifact_path="models_mlflow")