In [4]:
import mlflow

In [81]:
import pandas as pd 
import numpy as np 
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [5]:
!mlflow --version

mlflow, version 1.26.1


In [6]:
!python preprocess_data.py --raw_data_path ./data  --dest_path ./output_folder 

In [24]:
%ls output_folder/ 

dv.pkl     test.pkl   train.pkl  valid.pkl


In [25]:
import pickle 

In [86]:
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("nyc-taxi-experiment")

2022/06/02 14:05:54 INFO mlflow.tracking.fluent: Experiment with name 'nyc-taxi-experiment' does not exist. Creating a new experiment.


<Experiment: artifact_location='./artifacts_local/1', experiment_id='1', lifecycle_stage='active', name='nyc-taxi-experiment', tags={}>

In [87]:
def read_dataframe(filename):
    df = pd.read_parquet(filename)

    df.lpep_dropoff_datetime = pd.to_datetime(df.lpep_dropoff_datetime)
    df.lpep_pickup_datetime = pd.to_datetime(df.lpep_pickup_datetime)

    df['duration'] = df.lpep_dropoff_datetime - df.lpep_pickup_datetime
    df.duration = df.duration.apply(lambda td: td.total_seconds() / 60)

    df = df[(df.duration >= 1) & (df.duration <= 60)]

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)
    
    return df

In [88]:
df_train = read_dataframe('/Users//datalab//Documents//personal//mlops-zoomcamp//green_tripdata_2021-01.parquet')
df_val = read_dataframe('/Users//datalab//Documents//personal//mlops-zoomcamp//green_tripdata_2021-02.parquet')
df_test=read_dataframe('/Users//datalab//Documents//personal//mlops-zoomcamp//green_tripdata_2021-03.parquet')

In [74]:
target = 'duration'
y_train = df_train[target].values
y_valid = df_val[target].values
y_test = df_test[target].values

In [77]:
def preprocess(df: pd.DataFrame, dv: DictVectorizer, fit_dv: bool = False):
    df['PU_DO'] = df['PULocationID'] + '_' + df['DOLocationID']
    categorical = ['PU_DO']
    numerical = ['trip_distance']
    dicts = df[categorical + numerical].to_dict(orient='records')
    if fit_dv:
        X = dv.fit_transform(dicts)
    else:
        X = dv.transform(dicts)
    return X, dv 

In [79]:
dv = DictVectorizer()
X_train, dv = preprocess(df_train, dv, fit_dv=True)
X_valid, _ = preprocess(df_val, dv, fit_dv=False)
X_test, _ = preprocess(df_test, dv, fit_dv=False)

In [82]:
rf = RandomForestRegressor(max_depth=10, random_state=0)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_valid)

rmse = mean_squared_error(y_valid, y_pred, squared=False)


In [83]:
rmse

6.729070933590364

In [89]:
!mkdir models

In [92]:
with open('models/randfo.bin', 'wb') as f_out:
    pickle.dump((dv, rf), f_out)

In [93]:
with mlflow.start_run():
    
    mlflow.set_tag("developer", "Aka")

    mlflow.log_param("train-data-path", "./data/green_tripdata_2021-01.csv")
    mlflow.log_param("valid-data-path", "./data/green_tripdata_2021-02.csv")

    rf = RandomForestRegressor(max_depth=10, random_state=0)
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_valid)

    rmse = mean_squared_error(y_valid, y_pred, squared=False)
    mlflow.log_metric("rmse", rmse)

    mlflow.log_artifact(local_path="models/randfo.bin", artifact_path="models")

In [94]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

In [135]:
def objective(params):
    rf = RandomForestRegressor(**params)
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_valid)
    rmse = mean_squared_error(y_valid, y_pred, squared=False)

    return {'loss': rmse, 'status': STATUS_OK}
    with open("models/preprocessor.b", "wb") as f_out:
        pickle.dump(rf, f_out)
    mlflow.log_artifact("models/preprocessor.b", artifact_path="preprocessor")

    mlflow.sklearn.log_model(booster, artifact_path="models")

In [136]:
search_space = {
        'max_depth': scope.int(hp.quniform('max_depth', 1, 20, 1)),
        'n_estimators': scope.int(hp.quniform('n_estimators', 10, 50, 1)),
        'min_samples_split': scope.int(hp.quniform('min_samples_split', 2, 10, 1)),
        'min_samples_leaf': scope.int(hp.quniform('min_samples_leaf', 1, 4, 1)),
        'random_state': 42
}


rstate = np.random.default_rng(42)  # for reproducible results
fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=50,
    trials=Trials(),
    rstate=rstate
)
mlflow.log_metric("rmse", rmse)



100%|█████████| 50/50 [07:28<00:00,  8.98s/trial, best loss: 6.6284257482044735]


In [120]:
with mlflow.start_run():
    
    

    best_params = {
        'max_depth': 19.0,
        'min_samples_leaf': 3.0,
        'min_samples_split': 5.0,
        'n_estimators': 28.0
    }

    mlflow.log_params(best_params)

    booster = RandomForestRegressor(rf=RandomForestRegressor({'max_depth':19.0,
     'min_samples_leaf': 3.0,
     'min_samples_split': 5.0,
     'n_estimators': 28.0}).fit(X_train,y_train)


    y_pred = booster.predict(X_valid)
    rmse = mean_squared_error(y_val, y_pred, squared=False)
    mlflow.log_metric("rmse", rmse)

    with open("models/preprocessor.b", "wb") as f_out:
        pickle.dump(rf, f_out)
    mlflow.log_artifact("models/preprocessor.b", artifact_path="preprocessor")

    mlflow.sklearn.log_model(booster, artifact_path="models")

SyntaxError: invalid syntax (4194051972.py, line 20)

In [137]:
booster = RandomForestRegressor({'max_depth':19,
     'min_samples_leaf': 3,
     'min_samples_split': 5,
     'n_estimators': 28}).fit(X_train,y_train)

ValueError: n_estimators must be an integer, got <class 'dict'>.

In [None]:
RandomForestRegressor()

In [101]:
mlflow.sklearn.autolog(disable=True)

In [102]:
from mlflow.tracking import MlflowClient

MLFLOW_TRACKING_URI = "http://127.0.0.1:5000"

client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)

In [103]:
from mlflow.entities import ViewType

runs = client.search_runs(
    experiment_ids='1',    # Experiment ID we want
    filter_string="metrics.rmse < 7",
    run_view_type=ViewType.ACTIVE_ONLY,
    max_results=5,
    order_by=["metrics.rmse ASC"]
)

In [104]:
for run in runs:
    print(f"run id: {run.info.run_id}, rmse: {run.data.metrics['rmse']:.4f}")

run id: 427eff90a1d244aab78c4885de842907, rmse: 6.7291
run id: 2646d755b4644df0b45c50e6de7da8e3, rmse: 6.7291


In [None]:
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

run_id = "427eff90a1d244aab78c4885de842907"
model_uri = f"runs:/{run_id}/models"
mlflow.register_model(model_uri=model_uri, name="nyc-taxi-experiment")