In [30]:
!python -V

Python 3.9.12


In [31]:
import pandas as pd 
import seaborn as sns
import pickle
import matplotlib.pyplot as plt
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

In [32]:
import mlflow

mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment("Mlflow_cooking")

<Experiment: artifact_location='/workspaces/MLops-cookin-/02-MLflow/mlruns/1', creation_time=1758188410351, experiment_id='1', last_update_time=1758188410351, lifecycle_stage='active', name='Mlflow_cooking', tags={}>

In [33]:
mlflow.__version__

'3.1.3'

In [34]:
# making it a function for easy access
def read_dataframe(filename):
    df=pd.read_parquet(filename)

    df.lpep_pickup_datetime = pd.to_datetime(df.lpep_pickup_datetime)
    df.lpep_dropoff_datetime = pd.to_datetime(df.lpep_dropoff_datetime)

    df['duration']=df.lpep_dropoff_datetime - df.lpep_pickup_datetime
    df.duration = df.duration.apply(lambda td: td.total_seconds() / 60)

    df=df[(df.duration >= 1.0) & (df.duration <= 62)]

    categorical=['PULocationID','DOLocationID']
    #numerical=['trip_distance']

    df[categorical] = df[categorical].astype(str)
    
    return df

In [35]:
df_train=read_dataframe("https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2021-01.parquet")
df_val=read_dataframe("https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2021-02.parquet")

In [36]:
target="duration"
y_train=df_train[target].values
y_val=df_val[target].values

In [37]:
# trying the feature combination 
df_train["PU_OD"]=df_train["PULocationID"] + '_' + df_train['DOLocationID']
df_val["PU_OD"]=df_val["PULocationID"] + '_' + df_val['DOLocationID']

In [38]:
categorical=["PU_OD"]#'PULocationID','DOLocationID']
numerical=['trip_distance']

dv=DictVectorizer()

train_dicts=df_train[categorical + numerical].to_dict(orient="records")
X_train=dv.fit_transform(train_dicts)

val_dicts=df_val[categorical + numerical].to_dict(orient="records")
X_val=dv.transform(val_dicts)

In [39]:
# trying this model to xgboost
import xgboost as xgb

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

In [40]:
'''with open("models/preprocessor.b", 'wb') as f_out:
    pickle.dump(dv, f_out)'''

'with open("models/preprocessor.b", \'wb\') as f_out:\n    pickle.dump(dv, f_out)'

In [41]:
train= xgb.DMatrix(X_train, label=y_train)
valid=xgb.DMatrix(X_val, label=y_val)

In [42]:
with mlflow.start_run():
    
    train= xgb.DMatrix(X_train, label=y_train)
    valid=xgb.DMatrix(X_val, label=y_val)
    
    
    best_params={
        "learning_rate": 0.14988312150619953,
        "max_depth":66,
        "min_child_weight":1.0531411801474737,
        "objective":"reg:squarederror",
        "reg_alpha":0.01980913796072851,
        "reg_lambda":0.042524169343261004,
        "seed":13}
    #saving the best params in mlflow
    mlflow.log_params(best_params)
    
    booster=xgb.train(
            params=best_params,
            dtrain=train,
            num_boost_round=5,
            evals=[(valid, "validation")],
            early_stopping_rounds=50)
    
    y_pred = booster.predict(valid)
    rmse=mean_squared_error(y_val, y_pred, squared=False)
    #saving the metrics score in mlflow
    mlflow.log_metric("rmse",rmse)
    
    #saving the preprocessing data in mlflow
    mlflow.log_artifact("models/preprocessor.b", artifact_path='preprocessor')
    
    #saving the model in mlflow
    mlflow.xgboost.log_model(
        xgb_model=booster,
        artifact_path="xgboost",
        registered_model_name="models"
    )




[0]	validation-rmse:11.19666
[1]	validation-rmse:10.20824
[2]	validation-rmse:9.41863
[3]	validation-rmse:8.79502
[4]	validation-rmse:8.30440


Registered model 'models' already exists. Creating a new version of this model...
Created version '5' of model 'models'.


In [43]:
xgb.__version__

'2.1.4'

In [44]:
best_params={
        "learning_rate": 0.14988312150619953,
        "max_depth":66,
        "min_child_weight":1.0531411801474737,
        "objective":"reg:squarederror",
        "reg_alpha":0.01980913796072851,
        "reg_lambda":0.042524169343261004,
        "seed":13}

In [45]:
# this is one to save the  both preprocessing and model 

In [46]:
with mlflow.start_run():
    
    train= xgb.DMatrix(X_train, label=y_train)
    valid=xgb.DMatrix(X_val, label=y_val)
    
    
    best_params={
        "learning_rate": 0.14988312150619953,
        "max_depth":66,
        "min_child_weight":1.0531411801474737,
        "objective":"reg:linear",
        "reg_alpha":0.01980913796072851,
        "reg_lambda":0.042524169343261004,
        "seed":13}
    #saving the best params in mlflow
    mlflow.log_params(best_params)
    
    booster=xgb.train(
            params=best_params,
            dtrain=train,
            num_boost_round=10,
            evals=[(valid, "validation")],
            early_stopping_rounds=50)
    
    y_pred = booster.predict(valid)
    rmse=mean_squared_error(y_val, y_pred, squared=False)
    #saving the metrics score in mlflow
    mlflow.log_metric("rmse",rmse)
    
    #with open("models/preprocessor.b", 'wb') as f_out:
        #pickle.dump(dv, f_out)
    #saving the preprocessing data in mlflow
    #mlflow.log_artifact("models/preprocessor.b", artifact_path='preprocessor')
    
    #saving the model in mlflow
    mlflow.xgboost.log_model(booster, name="model_mlflow", code_paths=["models/"])



[0]	validation-rmse:11.19666
[1]	validation-rmse:10.20824
[2]	validation-rmse:9.41863
[3]	validation-rmse:8.79502
[4]	validation-rmse:8.30440
[5]	validation-rmse:7.92164
[6]	validation-rmse:7.62675
[7]	validation-rmse:7.39581
[8]	validation-rmse:7.21975
[9]	validation-rmse:7.08243




In [47]:
mlflow.__version__

'3.1.3'

The loading model and making the prediction

In [48]:
model_uri = "file:///workspaces/MLops-cookin-/02-MLflow/mlruns/1/models/m-f664fd8e209f4c739b361698cd2106c7/artifacts"

# Load the booster back
booster = mlflow.pyfunc.load_model(model_uri)

 - mlflow (current: 3.1.3, required: mlflow==3.1.4)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.


In [49]:
booster

mlflow.pyfunc.loaded_model:
  artifact_path: /workspaces/MLops-cookin-/02-MLflow/mlruns/1/models/m-f664fd8e209f4c739b361698cd2106c7/artifacts
  flavor: mlflow.xgboost
  run_id: 84b53fb51d0f4fc183dd524aefad1b64

In [50]:
xgboost_model= mlflow.xgboost.load_model(model_uri)

In [51]:
xgboost_model

<xgboost.core.Booster at 0x76cc188ca370>

In [52]:
y_pred=xgboost_model.predict(valid)

In [53]:
y_pred[:10]

array([15.161994, 11.486106, 18.373766, 21.084461, 12.870551, 17.09591 ,
       15.24416 , 12.267187, 12.706985, 18.904097], dtype=float32)

In [1]:
# trying different models

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.svm import LinearSVR

mlflow.sklearn.autolog()

for model_class in (RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor, LinearSVR):
    
    with mlflow.start_run():
        mlflow.log_param("train-data-path", "./data/green_tripdata_2021-01.csv")
        mlflow.log_param("valid-data-path", "./data/green_tripdata_2021-02.csv")
        #mlflow.log_artifact("models/preprocessor.b", artifact_path="preprocessor")
        
        
        mlmodel = model_class()
        mlmodel.fit(X_train, y_train)

        y_pred = mlmodel.predict(X_val)
        rmse = mean_squared_error(y_val, y_pred, squared=False)
        mlflow.log_metric("rmse", rmse)


NameError: name 'mlflow' is not defined