In [None]:
!python -V

Python 3.9.19


In [1]:
import pickle
import os
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

In [2]:
import mlflow
# Point the MLflow client at the local tracking server. The URI is written
# without surrounding whitespace: a leading space only works when the URL
# parser happens to strip it, and otherwise the "http" scheme is not recognised.
mlflow.set_tracking_uri(uri="http://127.0.0.1:8080")
# Runs started below are recorded under this experiment.
mlflow.set_experiment("nyc-taxi-experiment")

<Experiment: artifact_location='mlflow-artifacts:/1', creation_time=1716324856379, experiment_id='1', last_update_time=1716324856379, lifecycle_stage='active', name='nyc-taxi-experiment', tags={}>

In [3]:
def read_dataframe(filename):
    """Load a green-taxi trip parquet file and prepare it for modelling.

    Parameters
    ----------
    filename : str or path-like
        Location of the parquet file; passed straight to ``pd.read_parquet``.

    Returns
    -------
    pandas.DataFrame
        Only the trips whose duration lies between 1 and 60 minutes
        (inclusive), with an added ``duration`` column expressed in minutes
        and ``PULocationID`` / ``DOLocationID`` converted to strings.
    """
    print(f"Reading file: {filename}")
    df = pd.read_parquet(filename)
    print("File read successfully")

    df['lpep_dropoff_datetime'] = pd.to_datetime(df['lpep_dropoff_datetime'])
    df['lpep_pickup_datetime'] = pd.to_datetime(df['lpep_pickup_datetime'])

    # Trip duration in minutes, computed with the vectorised timedelta accessor
    # instead of a Python-level lambda applied row by row.
    trip_time = df['lpep_dropoff_datetime'] - df['lpep_pickup_datetime']
    df['duration'] = trip_time.dt.total_seconds() / 60

    # Filter for trip durations between 1 and 60 minutes. The explicit copy
    # makes the result an independent frame, so the column assignment below
    # does not write into a slice of the original (SettingWithCopyWarning).
    df = df[(df['duration'] >= 1) & (df['duration'] <= 60)].copy()

    # Convert categorical columns to string type
    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df

In [4]:
# Data source: https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page
# The 2024-01 file is loaded as the training frame, the 2024-02 file as the
# validation frame.
df_train = read_dataframe(
    filename='/workspaces/experiment-tracking-zoomcamp/data/green_tripdata_2024-01.parquet'
)
df_val = read_dataframe(
    filename='/workspaces/experiment-tracking-zoomcamp/data/green_tripdata_2024-02.parquet'
)

Reading file: /workspaces/experiment-tracking-zoomcamp/data/green_tripdata_2024-01.parquet
File read successfully
Reading file: /workspaces/experiment-tracking-zoomcamp/data/green_tripdata_2024-02.parquet
File read successfully


In [5]:
# Number of rows kept in the training and validation frames.
df_train.shape[0], df_val.shape[0]

(54373, 51497)

In [6]:
# Combine pickup and drop-off zone ids into one "pickup_dropoff" feature,
# built the same way for the training and the validation frame.
for frame in (df_train, df_val):
    frame['PU_DO'] = frame['PULocationID'] + '_' + frame['DOLocationID']

In [7]:
# Model inputs: the combined pickup/drop-off pair as the only categorical
# feature (the separate PULocationID / DOLocationID columns are not used)
# and the trip distance as the only numerical feature.
categorical = ['PU_DO']
numerical = ['trip_distance']

# One dict per row, the input format DictVectorizer expects.
train_dicts = df_train[categorical + numerical].to_dict(orient='records')
val_dicts = df_val[categorical + numerical].to_dict(orient='records')

# The vectorizer is fitted on the training rows only; the validation rows
# are transformed with that same fitted vectorizer.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

In [8]:
# Regression target: the trip duration column, extracted as arrays for the
# training and validation frames.
target = 'duration'
y_train, y_val = df_train[target].values, df_val[target].values

In [9]:
# Baseline: plain linear regression on the vectorized features.
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_val)
# Validation RMSE. The square root is taken explicitly because the
# `squared=False` argument of mean_squared_error is deprecated in newer
# scikit-learn releases and later removed.
mean_squared_error(y_val, y_pred) ** 0.5



5.9947992164797

In [10]:
# Persist the fitted vectorizer together with the model so both can be
# restored from a single file.
model_path = '/workspaces/experiment-tracking-zoomcamp/models/lin_reg.bin'
# open(..., 'wb') does not create missing directories, so make sure the
# target folder exists first.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
with open(model_path, 'wb') as f_out:
    pickle.dump((dv, lr), f_out)

In [11]:
# Track one Lasso training run in MLflow: tag, data paths, alpha and the
# validation RMSE.
with mlflow.start_run():

    mlflow.set_tag("developer", "cristian")

    mlflow.log_param("train-data-path", "/workspaces/experiment-tracking-zoomcamp/data/green_tripdata_2024-01.parquet")
    mlflow.log_param("valid-data-path", "/workspaces/experiment-tracking-zoomcamp/data/green_tripdata_2024-02.parquet")

    alpha = 0.12
    mlflow.log_param("alpha", alpha)
    lr = Lasso(alpha=alpha)
    lr.fit(X_train, y_train)

    y_pred = lr.predict(X_val)
    # Explicit square root instead of the deprecated `squared=False` argument.
    rmse = mean_squared_error(y_val, y_pred) ** 0.5
    mlflow.log_metric("rmse", rmse)

    # Log the linear regression model binary file to the "models_pickle" directory in the MLflow artifact store.
    # NOTE(review): lin_reg.bin is the file pickled earlier in the notebook; it
    # does not appear to contain the Lasso model fitted in this run - confirm
    # that logging it here is intended.
    mlflow.log_artifact(local_path="/workspaces/experiment-tracking-zoomcamp/models/lin_reg.bin", artifact_path="models_pickle")



### XGBoost 

In [13]:
import xgboost as xgb
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

In [14]:
# Wrap the feature matrices and their targets in XGBoost's DMatrix container,
# one for training and one for validation.
train = xgb.DMatrix(data=X_train, label=y_train)
valid = xgb.DMatrix(data=X_val, label=y_val)

In [16]:
def objective(params):
    """Train one XGBoost model for a hyperopt trial and record it in MLflow.

    Uses the notebook-level ``train`` / ``valid`` DMatrix objects and ``y_val``.

    Parameters
    ----------
    params : dict
        Booster parameters; logged with ``mlflow.log_params`` and passed
        unchanged to ``xgb.train``.

    Returns
    -------
    dict
        ``{'loss': <validation RMSE>, 'status': STATUS_OK}``, the result
        dictionary handed back to hyperopt.
    """
    with mlflow.start_run():
        mlflow.set_tag("model", "xgboost")
        mlflow.log_params(params)
        booster = xgb.train(
            params=params,
            dtrain=train,
            num_boost_round=1000,
            evals=[(valid, 'validation')],
            early_stopping_rounds=50
        )
        y_pred = booster.predict(valid)
        # Explicit square root instead of the deprecated `squared=False`
        # argument of mean_squared_error.
        rmse = mean_squared_error(y_val, y_pred) ** 0.5
        mlflow.log_metric("rmse", rmse)

    return {'loss': rmse, 'status': STATUS_OK}

In [54]:
# Hyperparameter search space explored by hyperopt.
search_space = {
    # Integer tree depth drawn from 4..100 in steps of 1.
    'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),
    # loguniform(label, low, high) samples exp(uniform(low, high)),
    # e.g. the learning rate lies in [exp(-3), exp(0)].
    'learning_rate': hp.loguniform('learning_rate', -3, 0),
    'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
    'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
    'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
    # 'reg:squarederror' is the current name of the squared-error objective;
    # 'reg:linear' is its deprecated alias.
    'objective': 'reg:squarederror',
    'seed': 42
}
# Minimise the validation RMSE returned by `objective` over 50 TPE trials.
best_result = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=50,
    trials=Trials(),
    show_progressbar=True,  # Show progress bar
)

  0%|          | 0/50 [00:00<?, ?trial/s, best loss=?]




[0]	validation-rmse:8.35064                           
[1]	validation-rmse:7.73652                           
[2]	validation-rmse:7.24008                           
[3]	validation-rmse:6.82830                           
[4]	validation-rmse:6.50300                           
[5]	validation-rmse:6.25306                           
[6]	validation-rmse:6.03607                           
[7]	validation-rmse:5.88127                           
[8]	validation-rmse:5.75533                           
[9]	validation-rmse:5.65056                           
[10]	validation-rmse:5.57893                          
[11]	validation-rmse:5.52428                          
[12]	validation-rmse:5.47842                          
[13]	validation-rmse:5.44227                          
[14]	validation-rmse:5.40553                          
[15]	validation-rmse:5.38326                          
[16]	validation-rmse:5.36629                          
[17]	validation-rmse:5.34430                          
[18]	valid

KeyboardInterrupt: 

In [20]:
# Switch MLflow's XGBoost autologging off (disable=True): with it disabled,
# MLflow does not automatically record parameters from xgboost training calls.
mlflow.xgboost.autolog(disable=True)

In [27]:

# Hyperparameters used for the final XGBoost model.
best_params = {
    'learning_rate': 0.15091974224498975,
    'max_depth': 9,
    'min_child_weight': 2.30962372679798,
    # 'reg:squarederror' is the current name of the squared-error objective;
    # 'reg:linear' is its deprecated alias.
    'objective': 'reg:squarederror',
    'reg_alpha': 0.2240486133622339,
    'reg_lambda': 0.024236491845265277,
    'seed': 42,
}

# Everything is logged inside an explicit run. Calling mlflow.log_* outside a
# `with mlflow.start_run()` block opens an implicit run that stays active, and
# a later mlflow.start_run() then fails because a run is already active.
with mlflow.start_run():
    mlflow.log_params(best_params)
    booster = xgb.train(
        params=best_params,
        dtrain=train,
        num_boost_round=1000,
        evals=[(valid, 'validation')],
        early_stopping_rounds=50
    )
    y_pred = booster.predict(valid)
    # Explicit square root instead of the deprecated `squared=False` argument.
    rmse = mean_squared_error(y_val, y_pred) ** 0.5
    mlflow.log_metric("rmse", rmse)

    # Ensure the 'models' directory exists
    os.makedirs("models", exist_ok=True)

    # Save the fitted DictVectorizer so it can be logged next to the model.
    with open("models/preprocessor.b", "wb") as f_out:
        pickle.dump(dv, f_out)

    # Store the booster under the "models_mlflow" artifact path of this run
    mlflow.xgboost.log_model(booster, artifact_path="models_mlflow")

    mlflow.log_artifact("models/preprocessor.b", artifact_path="preprocessor")



[0]	validation-rmse:8.27342




[1]	validation-rmse:7.61017
[2]	validation-rmse:7.09090
[3]	validation-rmse:6.68632
[4]	validation-rmse:6.37894
[5]	validation-rmse:6.14104
[6]	validation-rmse:5.96349
[7]	validation-rmse:5.82802
[8]	validation-rmse:5.72733
[9]	validation-rmse:5.65055
[10]	validation-rmse:5.59365
[11]	validation-rmse:5.54714
[12]	validation-rmse:5.51368
[13]	validation-rmse:5.48821
[14]	validation-rmse:5.46415
[15]	validation-rmse:5.44762
[16]	validation-rmse:5.43227
[17]	validation-rmse:5.41996
[18]	validation-rmse:5.41035
[19]	validation-rmse:5.40258
[20]	validation-rmse:5.39617
[21]	validation-rmse:5.39134
[22]	validation-rmse:5.38697
[23]	validation-rmse:5.38395
[24]	validation-rmse:5.38128
[25]	validation-rmse:5.37930
[26]	validation-rmse:5.37637
[27]	validation-rmse:5.37410
[28]	validation-rmse:5.37275
[29]	validation-rmse:5.37142
[30]	validation-rmse:5.36927
[31]	validation-rmse:5.36853
[32]	validation-rmse:5.36723
[33]	validation-rmse:5.36579
[34]	validation-rmse:5.36432
[35]	validation-rmse:5.



 Load the model saved as an MLflow artifact as a **Python function** (pyfunc)

In [29]:
# URI of the model stored under "models_mlflow" in the referenced run.
logged_model = 'runs:/96b27371aa0947368c161039a130a585/models_mlflow'

# Load it through the generic pyfunc interface and display the loaded object.
loaded_model = mlflow.pyfunc.load_model(model_uri=logged_model)
loaded_model


Downloading artifacts:   0%|          | 0/9 [00:00<?, ?it/s]



mlflow.pyfunc.loaded_model:
  artifact_path: models_mlflow
  flavor: mlflow.xgboost
  run_id: 96b27371aa0947368c161039a130a585

 Load the model saved as an MLflow artifact as a native **XGBoost object**

In [31]:
# Same run artifact as before, this time loaded with the XGBoost flavour so
# the result is the library's own model object.
logged_model = 'runs:/96b27371aa0947368c161039a130a585/models_mlflow'
xgboost_model = mlflow.xgboost.load_model(model_uri=logged_model)
xgboost_model

Downloading artifacts:   0%|          | 0/9 [00:00<?, ?it/s]



<xgboost.core.Booster at 0x7713bf2de640>

Use the model loaded from the MLflow **artifact** store to make predictions

In [36]:
# Predict on the validation DMatrix with the reloaded booster and show the
# first ten predictions.
y_pred = xgboost_model.predict(data=valid)
y_pred[:10]

array([19.222584 , 28.09318  ,  6.5815735,  8.568088 ,  4.363816 ,
        4.610695 , 13.16152  , 18.233154 , 22.580976 ,  6.7609262],
      dtype=float32)

In [14]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.svm import LinearSVR


# Let MLflow record the scikit-learn estimators automatically.
mlflow.sklearn.autolog()

# Train each regressor with its default settings in a separate MLflow run.
for model_class in (RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor, LinearSVR):

    with mlflow.start_run():

        mlflow.log_param("train-data-path", "/workspaces/experiment-tracking-zoomcamp/data/green_tripdata_2024-01.parquet")
        mlflow.log_param("valid-data-path", "/workspaces/experiment-tracking-zoomcamp/data/green_tripdata_2024-02.parquet")

        # All four estimators use randomness; a fixed seed makes the logged
        # metrics reproducible between notebook executions.
        mlmodel = model_class(random_state=42)
        mlmodel.fit(X_train, y_train)

        y_pred = mlmodel.predict(X_val)
        # Explicit square root instead of the deprecated `squared=False`
        # argument of mean_squared_error.
        rmse = mean_squared_error(y_val, y_pred) ** 0.5
        mlflow.log_metric("rmse", rmse)
        



: 

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import mlflow

# Enable MLflow autologging
mlflow.sklearn.autolog()

# Define paths for training and validation data, and the preprocessor artifact
train_data_path = "/workspaces/experiment-tracking-zoomcamp/data/green_tripdata_2024-01.parquet"
valid_data_path = "/workspaces/experiment-tracking-zoomcamp/data/green_tripdata_2024-02.parquet"
preprocessor_artifact_path = "/workspaces/experiment-tracking-zoomcamp/models/preprocessor.b"

# This cell relies on X_train, y_train, X_val and y_val already existing in
# the notebook namespace, and on the preprocessor file being present on disk.

with mlflow.start_run():
    # Log parameters: train-data-path, valid-data-path, and preprocessor artifact
    mlflow.log_param("train-data-path", train_data_path)
    mlflow.log_param("valid-data-path", valid_data_path)
    mlflow.log_artifact(preprocessor_artifact_path, artifact_path="preprocessor")

    # Train Random Forest model; the fixed seed makes the run reproducible
    rf_model = RandomForestRegressor(random_state=42)
    rf_model.fit(X_train, y_train)

    # Make predictions
    y_pred = rf_model.predict(X_val)

    # Calculate and log RMSE. The square root is taken explicitly because the
    # `squared=False` argument of mean_squared_error is deprecated in newer
    # scikit-learn releases and later removed.
    rmse = mean_squared_error(y_val, y_pred) ** 0.5
    mlflow.log_metric("rmse", rmse)
