In [1]:
import numpy as np 
import mlflow
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from mlflow.models import infer_signature
from urllib.parse import urlparse

In [2]:
# Load the house-price dataset from a CSV file in the working directory.
data = pd.read_csv(filepath_or_buffer="kc_house_data.csv")

# Last expression of the cell: show the first five rows as rich output.
data.head(n=5)

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000.0,4,3.0,1960,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000.0,3,2.0,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


In [3]:
# Label: the sale price we want to predict.
Y = data["price"]

# Features: every column from position 3 onward (the displayed frame suggests
# positions 0-2 are id, date and price -- TODO confirm against the CSV header).
# The drop is a no-op when `price` is already excluded by the slice; it only
# guarantees the target can never leak into the feature matrix.
X = data.iloc[:, 3:].drop(columns="price", errors="ignore")

# Hold out 20% of the rows for evaluation. A fixed random_state makes the split
# (and therefore the logged MSE) reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.20, random_state=42
)

In [4]:
# Derive the model's input/output schema from the training features and labels
# so it can be attached to the logged model later.
signature = infer_signature(model_input=X_train, model_output=y_train)



In [5]:
# Search space handed to the grid search: each keyword is a random-forest
# hyperparameter, each list holds the candidate values to try
# (2 * 3 * 2 * 2 = 24 combinations).
params = dict(
    n_estimators=[100, 200],
    max_depth=[5, 10, None],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
)

In [6]:
def hyperparameters_search(X_train, y_train, params, cv=3, random_state=42):
    """Grid-search random-forest hyperparameters with cross-validation.

    Parameters
    ----------
    X_train : training features, passed straight to ``GridSearchCV.fit``.
    y_train : training targets, passed straight to ``GridSearchCV.fit``.
    params : dict mapping ``RandomForestRegressor`` parameter names to lists
        of candidate values (used as ``param_grid``).
    cv : cross-validation setting forwarded to ``GridSearchCV`` (default 3,
        the value this function always used).
    random_state : seed for the forest so repeated searches on the same data
        give the same result; pass ``None`` for unseeded behaviour.

    Returns
    -------
    The fitted ``GridSearchCV`` object; callers read ``best_params_`` and
    ``best_estimator_`` from it.
    """
    # Seed the estimator: an unseeded forest makes the "best" parameters and
    # the reported error change from run to run.
    rf = RandomForestRegressor(random_state=random_state)
    grid_search = GridSearchCV(
        estimator=rf,
        param_grid=params,
        cv=cv,
        n_jobs=-1,  # use every available core
        verbose=2,  # one log line per fitted candidate/fold
        scoring="neg_mean_squared_error",  # negated MSE, so higher is better
    )
    grid_search.fit(X_train, y_train)
    return grid_search

In [None]:
# Set the tracking URI: send all subsequent MLflow calls to the tracking
# server listening on localhost port 5000.
mlflow.set_tracking_uri(uri="http://127.0.0.1:5000")

In [7]:
# Select the experiment that later runs are recorded under; the returned
# Experiment object is the cell's displayed output.
mlflow.set_experiment(experiment_name="Ml-Flow House Price Experiments")

<Experiment: artifact_location='file:///workspace/MLOPs_explained/ml-flow/mlruns/725470569323882244', creation_time=1730908442107, experiment_id='725470569323882244', last_update_time=1730908442107, lifecycle_stage='active', name='Ml-Flow House Price Experiments', tags={}>

In [None]:
# Tune, evaluate and log the model inside a single MLflow run.
with mlflow.start_run():
    # hyperparameter tuning
    grid_search = hyperparameters_search(X_train, y_train, params)

    # best estimator found by the search
    best_model = grid_search.best_estimator_

    # evaluation on the held-out test split
    y_pred = best_model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)

    # log the winning hyperparameters in one call (keys: best_<param name>)
    # together with the test-set error
    mlflow.log_params(
        {f"best_{name}": value for name, value in grid_search.best_params_.items()}
    )
    mlflow.log_metric("mse", mse)

    # Only *read* the tracking URI here. It must be configured before the run
    # is started; switching it while a run is active would point the rest of
    # the logging calls at a different backend than the one that owns the run.
    tracking_uri_type_store = urlparse(mlflow.get_tracking_uri()).scheme

    # Attach the input/output signature in both branches; the registered-model
    # name is only supplied when the store is not a plain file store.
    if tracking_uri_type_store != "file":
        mlflow.sklearn.log_model(
            best_model,
            "model",
            signature=signature,
            registered_model_name="Best_RF_Estimator",
        )
    else:
        mlflow.sklearn.log_model(best_model, "model", signature=signature)

    print("best hyperparam", grid_search.best_params_)
    print("mean Square error", mse)

Fitting 3 folds for each of 24 candidates, totalling 72 fits
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=  14.9s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=  15.5s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=  15.2s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=  15.2s
[CV] END max_depth=5, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time=  16.3s
[CV] END max_depth=5, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time=  16.2s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=  17.0s
[CV] END max_depth=5, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time=  16.2s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=  18.5s
[CV] END max_depth=5, min_samples_leaf=