In [None]:
!pip install pandas scikit-learn mlflow


 # Training & Registration Script

In [1]:
import os
import pandas as pd
import mlflow
import mlflow.sklearn
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import numpy as np

In [2]:
from mlflow.tracking import MlflowClient

# MlflowClient resolves its tracking URI when it is constructed. This cell runs
# before mlflow.set_tracking_uri("databricks") in the configuration cell, so the
# URI is passed explicitly; otherwise the client would be bound to the default
# tracking store rather than the Databricks workspace the rest of the notebook uses.
client = MlflowClient(tracking_uri="databricks")

In [3]:
from dotenv import load_dotenv

# Load variables from a .env file into the process environment so that the
# os.getenv(...) lookups in the configuration cell can find them. The boolean
# returned by load_dotenv() is displayed as this cell's output.
load_dotenv()

True

In [4]:
# =======================
# CONFIGURATION
# =======================

# Databricks credentials are read from the environment (see the load_dotenv() cell).
DATABRICKS_HOST = os.getenv("DATABRICKS_HOST")
DATABRICKS_TOKEN = os.getenv("DATABRICKS_TOKEN")

# Surface missing credentials here instead of as an opaque authentication error
# on the first MLflow call. This only warns (does not raise) because other
# Databricks authentication mechanisms may still be configured.
missing_credentials = [
    name
    for name, value in (
        ("DATABRICKS_HOST", DATABRICKS_HOST),
        ("DATABRICKS_TOKEN", DATABRICKS_TOKEN),
    )
    if not value
]
if missing_credentials:
    print(f"WARNING: {', '.join(missing_credentials)} not set in the environment or .env file.")

# Track runs in the Databricks workspace. To use a local tracking server instead,
# call mlflow.set_tracking_uri("http://127.0.0.1:5000").
mlflow.set_tracking_uri("databricks")

# Same three-part name that the registration cell registers under, so the
# notebook carries a single value for MODEL_NAME.
MODEL_NAME = "mlops.default.California_Housing_Best_Model"

# Kept as the last expression so the resolved Experiment is shown as the cell output.
mlflow.set_experiment("/Users/arunabh919@gmail.com/California_Housing_Experiment")

<Experiment: artifact_location='dbfs:/databricks/mlflow-tracking/853290876873368', creation_time=1754814044883, experiment_id='853290876873368', last_update_time=1754815200693, lifecycle_stage='active', name='/Users/arunabh919@gmail.com/California_Housing_Experiment', tags={'mlflow.databricks.filesystem.experiment_permissions_check': 'test',
 'mlflow.experiment.sourceName': '/Users/arunabh919@gmail.com/California_Housing_Experiment',
 'mlflow.experimentKind': 'custom_model_development',
 'mlflow.experimentType': 'MLFLOW_EXPERIMENT',
 'mlflow.ownerEmail': 'arunabh919@gmail.com',
 'mlflow.ownerId': '77168391110064'}>

In [5]:
# =======================
# LOAD DATA (from features.csv + labels.csv)
# =======================
# Feature matrix read from the Part 1 processed/ directory.
X = pd.read_csv("../Part 1 Repository and Data Versioning/processed/features.csv")
# squeeze("columns") collapses only the column axis, so a single-column labels
# file always becomes a Series (a bare squeeze() would yield a scalar for a one-row file).
y = pd.read_csv("../Part 1 Repository and Data Versioning/processed/labels.csv").squeeze("columns")

# Features and labels live in separate files; make sure they still line up row for row.
assert len(X) == len(y), f"features ({len(X)} rows) and labels ({len(y)} rows) are misaligned"

# 80/20 split with a fixed seed so the held-out set is the same on every run
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
# =======================
# FUNCTION TO TRAIN & LOG
# =======================
from mlflow.models.signature import infer_signature

def train_and_log(model, model_name, params=None):
    """Fit a model, score it on the held-out split, and record it in one MLflow run.

    Uses the notebook-level X_train, X_test, y_train and y_test.

    Args:
        model: Estimator exposing fit(X, y) and predict(X).
        model_name: Used as the MLflow run name and in the printed summary.
        params: Optional dict of hyperparameters to log; skipped when falsy.

    Returns:
        dict with keys "model" (the fitted estimator), "rmse", "r2" and
        "run_id" (ID of the MLflow run that holds the logged model and artifacts).
    """
    with mlflow.start_run(run_name=model_name) as run:
        if params:
            mlflow.log_params(params)

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # Evaluation metrics on the held-out split
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        r2 = r2_score(y_test, y_pred)

        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("r2", r2)

        # A few training rows and their predictions are enough to infer the
        # input/output schema stored with the model.
        input_sample = X_train.head(5)
        output_sample = model.predict(input_sample)
        signature = infer_signature(input_sample, output_sample)

        # Log model with signature
        mlflow.sklearn.log_model(model, "model", signature=signature)

        # Attach the preprocessing artifacts (scaler and feature column list) to the same run
        scaler_path = os.path.abspath("../Part 1 Repository and Data Versioning/processed/scaler.pkl")
        feature_columns_path = os.path.abspath("../Part 1 Repository and Data Versioning/processed/feature_columns.pkl")

        mlflow.log_artifact(scaler_path)
        mlflow.log_artifact(feature_columns_path)

        print(f"{model_name} -> RMSE: {rmse:.4f}, R2: {r2:.4f}")

        # run_id lets callers refer back to this run (e.g. to register its model)
        # without logging the model a second time.
        return {"model": model, "rmse": rmse, "r2": r2, "run_id": run.info.run_id}



In [10]:
# =======================
# TRAIN MODELS
# =======================
# Hyperparameters for the decision tree; the same dict is logged to MLflow.
dt_params = {"max_depth": 10, "random_state": 42}

lr_model = LinearRegression()
dt_model = DecisionTreeRegressor(**dt_params)

# (estimator, run name, params to log) for each candidate, in training order.
candidates = [
    (lr_model, "LinearRegression", None),
    (dt_model, "DecisionTree", dt_params),
]

results = []
for estimator, run_label, logged_params in candidates:
    results.append(train_and_log(estimator, run_label, params=logged_params))



LinearRegression -> RMSE: 69297.7167, R2: 0.6488
🏃 View run LinearRegression at: dbc-52ca4c3c-7d43.cloud.databricks.com/ml/experiments/853290876873368/runs/ae44bfe5072d4fb484a5e347e2858b9e
🧪 View experiment at: dbc-52ca4c3c-7d43.cloud.databricks.com/ml/experiments/853290876873368




DecisionTree -> RMSE: 60747.6952, R2: 0.7301
🏃 View run DecisionTree at: dbc-52ca4c3c-7d43.cloud.databricks.com/ml/experiments/853290876873368/runs/fb5a3a1d0a9b43ecafe26522bfbbca97
🧪 View experiment at: dbc-52ca4c3c-7d43.cloud.databricks.com/ml/experiments/853290876873368


In [8]:
# =======================
# SELECT BEST MODEL
# =======================
# Pick the entry with the smallest "rmse"; min() keeps the earliest entry on ties.
best_model_info = min(results, key=lambda entry: entry["rmse"])
best_model = best_model_info["model"]

print(f"\nBest Model: {best_model} with RMSE={best_model_info['rmse']:.4f}")



Best Model: DecisionTreeRegressor(max_depth=10, random_state=42) with RMSE=60747.6952


In [14]:
# Preprocessing artifacts that are logged next to the registered model.
scaler_path = os.path.abspath("../Part 1 Repository and Data Versioning/processed/scaler.pkl")
feature_columns_path = os.path.abspath("../Part 1 Repository and Data Versioning/processed/feature_columns.pkl")

# Fail here, before the registration run is opened, if either file is missing.
for artifact_path in (scaler_path, feature_columns_path):
    if not os.path.isfile(artifact_path):
        raise FileNotFoundError(f"Required artifact not found: {artifact_path}")

In [15]:
# =======================
# REGISTER MODEL
# =======================
mlflow.set_tracking_uri("databricks")
# MODEL_NAME uses the <catalog>.<schema>.<model> form of Unity Catalog, so point
# the registry at it explicitly rather than relying on the installed MLflow's default.
mlflow.set_registry_uri("databricks-uc")
MODEL_NAME  = "mlops.default.California_Housing_Best_Model"

# Small input/output example used only to infer the model signature.
input_sample = X_train.head(5)
output_sample = best_model.predict(input_sample)

# Infer signature
signature = infer_signature(input_sample, output_sample)

with mlflow.start_run(run_name="Register_Best_Model"):
    # Metrics and preprocessing artifacts are logged first: if any of these
    # calls fails, no new model version has been created yet.
    mlflow.log_metric("rmse", best_model_info["rmse"])
    mlflow.log_metric("r2", best_model_info["r2"])
    mlflow.log_artifact(scaler_path)
    mlflow.log_artifact(feature_columns_path)

    # Passing registered_model_name creates a new version of MODEL_NAME, so this runs last.
    mlflow.sklearn.log_model(best_model, "model", registered_model_name=MODEL_NAME, signature=signature)

Registered model 'mlops.default.California_Housing_Best_Model' already exists. Creating a new version of this model...


Uploading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Created version '2' of model 'mlops.default.california_housing_best_model'.


🏃 View run Register_Best_Model at: dbc-52ca4c3c-7d43.cloud.databricks.com/ml/experiments/853290876873368/runs/2d10a651626641a2bcd8648c4770b612
🧪 View experiment at: dbc-52ca4c3c-7d43.cloud.databricks.com/ml/experiments/853290876873368
