In [45]:
import pickle

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plot

from sklearn.feature_extraction import DictVectorizer #ohe
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [46]:
from sklearn.pipeline import make_pipeline

In [47]:
!python -V

Python 3.9.16


In [48]:
import mlflow

# Point the MLflow client at the local tracking server used by this notebook.
mlflow.set_tracking_uri(uri="http://127.0.0.1:5000")
# Select the experiment by name. The call is kept as the cell's last
# expression so the notebook displays the returned Experiment object.
mlflow.set_experiment(experiment_name="green-taxi-exp_s3")

2023/04/14 13:55:11 INFO mlflow.tracking.fluent: Experiment with name 'green-taxi-exp_s3' does not exist. Creating a new experiment.


<Experiment: artifact_location='s3://mlflow-nyc-taxi-reg-exp/4', creation_time=1681460711669, experiment_id='4', last_update_time=1681460711669, lifecycle_stage='active', name='green-taxi-exp_s3', tags={}>

In [49]:
def read_dataframe(filename):
    """Load a green-taxi trip file and derive the trip duration in minutes.

    Parameters
    ----------
    filename : str or path-like
        Parquet file passed to ``pd.read_parquet``. It must provide the
        columns ``lpep_pickup_datetime``, ``lpep_dropoff_datetime``,
        ``PULocationID`` and ``DOLocationID``.

    Returns
    -------
    pd.DataFrame
        An independent frame holding only the trips whose ``duration`` is
        between 1 and 60 minutes (inclusive), with the two location-ID
        columns converted to strings.
    """
    df = pd.read_parquet(filename)

    # Vectorised timedelta -> minutes conversion (no per-row Python lambda).
    elapsed = df.lpep_dropoff_datetime - df.lpep_pickup_datetime
    df['duration'] = elapsed.dt.total_seconds() / 60

    # .copy() makes the filtered result its own frame, so the column
    # assignment below never writes into a slice of the unfiltered data
    # (which would raise SettingWithCopyWarning).
    df = df[(df.duration >= 1) & (df.duration <= 60)].copy()

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)
    return df

def prepare_data(df: pd.DataFrame):
    """Convert a trips frame into a list of per-row feature dictionaries.

    Side effect: a ``PU_DO`` column (pickup ID, underscore, drop-off ID) is
    written onto the frame that was passed in.

    Returns one dict per row with the keys ``PU_DO`` and ``trip_distance``.
    """
    # Combined pickup/drop-off key, stored on the caller's frame.
    df['PU_DO'] = df['PULocationID'] + '_' + df['DOLocationID']

    feature_columns = ['PU_DO', 'trip_distance']
    records = df[feature_columns].to_dict(orient='records')
    return records


In [50]:
# Load the 2022-01 file as the training frame and the 2022-02 file as the
# validation frame, in that order.
df_train, df_val = map(
    read_dataframe,
    ("data/green_tripdata_2022-01.parquet", "data/green_tripdata_2022-02.parquet"),
)

In [51]:
# Row counts of the training and validation frames, shown as a tuple.
tuple(len(frame) for frame in (df_train, df_val))

(59603, 66097)

In [52]:
# Feature dictionaries for the training and validation frames (same order).
train_dicts, val_dicts = [prepare_data(frame) for frame in (df_train, df_val)]

In [53]:
# Regression target: the trip duration column, extracted as arrays via
# `.values` for the training and validation frames.
target = 'duration'
y_train, y_val = (frame[target].values for frame in (df_train, df_val))

In [56]:
# Train a DictVectorizer + RandomForestRegressor pipeline and record the
# parameters, validation RMSE and fitted pipeline in a single MLflow run.
with mlflow.start_run():

    params = dict(max_depth=20, n_estimators=100, min_samples_leaf=10, random_state=0)
    mlflow.log_params(params)

    pipeline = make_pipeline(
        DictVectorizer(),
        RandomForestRegressor(**params, n_jobs=-1))

    pipeline.fit(train_dicts, y_train)
    y_predicted = pipeline.predict(val_dicts)

    # RMSE is the square root of the MSE. It is computed explicitly because
    # the `squared=False` argument of mean_squared_error was deprecated in
    # scikit-learn 1.4 and removed in 1.6; this form works on every version.
    rmse = float(mean_squared_error(y_val, y_predicted) ** 0.5)
    print(params, rmse)
    mlflow.log_metric("rmse", rmse)
    # The whole pipeline (vectorizer + model) is logged as one artifact.
    mlflow.sklearn.log_model(pipeline, artifact_path='model')


{'max_depth': 20, 'n_estimators': 100, 'min_samples_leaf': 10, 'random_state': 0} 6.101201727016495


END

In [18]:
from mlflow.tracking import MlflowClient

# Address of the MLflow tracking server the client connects to.
MLFLOW_TRACKING_URI = "http://127.0.0.1:5000"
# Identifier of the run whose artifact is downloaded further down.
RUN_ID = 'dca5f3d497454560980e1261963ae7be'

client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)

In [21]:

# Download the 'dict_vectorizer.bin' artifact of run RUN_ID and keep the
# local path it was saved to.
# MlflowClient.download_artifacts is deprecated since MLflow 2.0; the
# module-level mlflow.artifacts.download_artifacts is its replacement. The
# tracking URI is passed explicitly so the same server is queried.
import mlflow.artifacts

path = mlflow.artifacts.download_artifacts(
    run_id=RUN_ID,
    artifact_path='dict_vectorizer.bin',
    tracking_uri=MLFLOW_TRACKING_URI,
)

  path = client.download_artifacts(run_id = RUN_ID, path = 'dict_vectorizer.bin')


In [25]:

# Unpickle the downloaded artifact (expected to be the DictVectorizer).
# NOTE: pickle.load can execute arbitrary code contained in the file; only
# load artifacts that come from a trusted tracking server.
with open(path, mode='rb') as f_in:
    dv = pickle.load(f_in)

In [33]:
from mlflow.entities import ViewType

# Up to five active runs of experiment '1', ordered by ascending `rmse`
# metric (lowest first). No filter is applied.
runs = client.search_runs(
    experiment_ids=['1'],
    filter_string="",
    run_view_type=ViewType.ACTIVE_ONLY,
    max_results=5,
    order_by=["metrics.rmse ASC"],
)

In [35]:
# Print the run id and the `rmse` metric (4 decimal places) of every run
# returned by the search, one line per run.
for run in runs:
    print(f"run id: {run.info.run_id},rmse: {run.data.metrics['rmse']:.4f}")

run id: 57e220105c2e42b08a16595b89588e41,rmse: 5.8231
run id: 93be0212dbd042ada39c32c859245076,rmse: 5.8707
run id: bbb7e00472af433fb6b63729253dba9d,rmse: 5.8718
run id: e341597906b14dda9a1a7d6b64cf83cb,rmse: 5.8906
run id: 8d8d6eb312204d1cb408674ffe5e8499,rmse: 5.9036
