In [21]:
# Standard library imports
import argparse
import os
import sys
import urllib.request
import tarfile
from datetime import datetime
import warnings

# Third-party library imports
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# MLflow imports
import mlflow
import mlflow.sklearn
from mlflow.tracking import MlflowClient


In [2]:
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

In [3]:
def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    housing_tgz = tarfile.open(tgz_path)
    housing_tgz.extractall(path=housing_path)
    housing_tgz.close()

fetch_housing_data()

def load_housing_data(housing_path=HOUSING_PATH):
    csv_path = os.path.join(housing_path, "housing.csv")
    return pd.read_csv(csv_path)

housing = load_housing_data()

In [4]:
housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [5]:
# mlflow server     --backend-store-uri sqlite:///mlflow.db     --default-artifact-root ./mlruns     --host localhost     --port 5000

In [6]:
remote_server_uri = "http://localhost:5000" # set to your server URI
mlflow.set_tracking_uri(remote_server_uri)  # or set the MLFLOW_TRACKING_URI in the env
mlflow.tracking.get_tracking_uri()

'http://localhost:5000'

In [7]:
exp_name = "TAMLEP_house_price_pred"
mlflow.set_experiment(exp_name)

<Experiment: artifact_location='/home/carolyn/mlflow/mlruns/6', creation_time=1723199548561, experiment_id='6', last_update_time=1723199548561, lifecycle_stage='active', name='TAMLEP_house_price_pred', tags={}>

In [8]:

def eval_metrics(actual, pred):
    # compute relevant metrics
    rmse = np.sqrt(mean_squared_error(actual, pred))
    mae = mean_absolute_error(actual, pred)
    r2 = r2_score(actual, pred)
    return rmse, mae, r2



def load_data(data_path):
    data = pd.read_csv(data_path)

    # Identify the target variable
    target_variable = "median_house_value"

    # Separate features and target
    X = data.drop(columns=[target_variable])
    y = data[[target_variable]]

    # Identify categorical columns
    categorical_columns = X.select_dtypes(include=['object']).columns

    # Create a preprocessing pipeline for the categorical columns
    preprocessor = ColumnTransformer(
        transformers=[
            ('cat', OneHotEncoder(), categorical_columns)
        ],
        remainder='passthrough'  # Keep other columns (numerical ones) as they are
    )

    # Apply the preprocessing pipeline to the features
    X_transformed = preprocessor.fit_transform(X)

    # Split the data into training and test sets (75% training, 25% test)
    train_x, test_x, train_y, test_y = train_test_split(X_transformed, y, test_size=0.25, random_state=42)

    return train_x, train_y, test_x, test_y


from sklearn.ensemble import HistGradientBoostingRegressor

def train():
    warnings.filterwarnings("ignore")
    np.random.seed(40)

    # Path to the house price dataset
    data_path = os.path.join("datasets", "housing", "housing.csv")
    train_x, train_y, test_x, test_y = load_data(data_path)

    with mlflow.start_run():
        # Use a model that can handle missing values
        model = HistGradientBoostingRegressor()
        model.fit(train_x, train_y.values.ravel())

        # Evaluate Metrics
        predicted_prices = model.predict(test_x)
        (rmse, mae, r2) = eval_metrics(test_y, predicted_prices)

        # Print out metrics
        print("HistGradientBoostingRegressor model:")
        print("  RMSE: %s" % rmse)
        print("  MAE: %s" % mae)
        print("  R2: %s" % r2)

        # Log metrics and model to MLflow
        mlflow.log_metric(key="rmse", value=rmse)
        mlflow.log_metrics({"mae": mae, "r2": r2})
        mlflow.log_artifact(data_path)
        print("Save to: {}".format(mlflow.get_artifact_uri()))
        
        mlflow.sklearn.log_model(model, "model")




In [9]:
train()

HistGradientBoostingRegressor model:
  RMSE: 47647.9895691026
  MAE: 32163.79550434798
  R2: 0.8284235303652846
Save to: /home/carolyn/mlflow/mlruns/6/a5ee36edd15648098e0211270e00d9c5/artifacts


2024/08/09 18:55:52 INFO mlflow.tracking._tracking_service.client: 🏃 View run calm-gull-460 at: http://localhost:5000/#/experiments/6/runs/a5ee36edd15648098e0211270e00d9c5.
2024/08/09 18:55:52 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/6.


In [11]:
import mlflow
from mlflow.tracking import MlflowClient

client = MlflowClient()

# Use search_experiments to get a list of experiments
experiments = client.search_experiments()

for exp in experiments:
    print(f"ID: {exp.experiment_id}, Name: {exp.name}")


ID: 6, Name: TAMLEP_house_price_pred
ID: 0, Name: Default


In [15]:
from mlflow.tracking import MlflowClient

client = MlflowClient()
experiments = client.search_experiments()
for experiment in experiments:
    print(f"Name: {experiment.name}, ID: {experiment.experiment_id}")


Name: TAMLEP_house_price_pred, ID: 6
Name: Default, ID: 0


In [17]:
# get the run
_run = client.get_run(run_id="a5ee36edd15648098e0211270e00d9c5")
print(_run)

<Run: data=<RunData: metrics={'mae': 32163.79550434798, 'r2': 0.8284235303652846, 'rmse': 47647.9895691026}, params={}, tags={'mlflow.log-model.history': '[{"run_id": "a5ee36edd15648098e0211270e00d9c5", '
                             '"artifact_path": "model", "utc_time_created": '
                             '"2024-08-09 13:25:48.612222", "flavors": '
                             '{"python_function": {"model_path": "model.pkl", '
                             '"predict_fn": "predict", "loader_module": '
                             '"mlflow.sklearn", "python_version": "3.10.12", '
                             '"env": {"conda": "conda.yaml", "virtualenv": '
                             '"python_env.yaml"}}, "sklearn": '
                             '{"pickled_model": "model.pkl", '
                             '"sklearn_version": "1.5.1", '
                             '"serialization_format": "cloudpickle", "code": '
                             'null}}, "model_uuid": '
              

In [20]:
# add a tag to the run
dt = datetime.now().strftime("%d-%m-%Y (%H:%M:%S.%f)")
client.set_tag(_run.info.run_id, "deployed", dt)