In [None]:
!pip install pandas scikit-learn mlflow


Defaulting to user installation because normal site-packages is not writeable
Collecting mlflow
  Downloading mlflow-3.2.0-py3-none-any.whl.metadata (29 kB)
Collecting mlflow-skinny==3.2.0 (from mlflow)
  Downloading mlflow_skinny-3.2.0-py3-none-any.whl.metadata (30 kB)
Collecting mlflow-tracing==3.2.0 (from mlflow)
  Downloading mlflow_tracing-3.2.0-py3-none-any.whl.metadata (19 kB)
Collecting alembic!=1.10.0,<2 (from mlflow)
  Downloading alembic-1.16.4-py3-none-any.whl.metadata (7.3 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting waitress<4 (from mlflow)
  Downloading waitress-3.0.2-py3-none-any.whl.metadata (5.8 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==3.2.0->mlflow)
  Downloading databricks_sdk-0.62.0-py3-none-any.whl.metadata (39 kB)
Collecting fastapi<1 (from mlflow-skinny==3.2.0->mlflo

 # Training & Registration Script

In [5]:
import os
import pandas as pd
import mlflow
import mlflow.sklearn
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import numpy as np

In [2]:
# =======================
# CONFIGURATION
# =======================
# Tracking server: honour MLFLOW_TRACKING_URI when it is set (e.g. to point at a
# remote MLflow server) and fall back to the local default otherwise, so the
# notebook does not have to be edited to switch servers. `or` also covers the
# case where the variable is set but empty.
TRACKING_URI = os.environ.get("MLFLOW_TRACKING_URI") or "http://127.0.0.1:5000"
EXPERIMENT_NAME = "California_Housing_Models"
# Name under which the winning model is registered in the model registry.
MODEL_NAME = "California_Housing_Best_Model"

mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)

2025/08/09 10:47:55 INFO mlflow.tracking.fluent: Experiment with name 'California_Housing_Models' does not exist. Creating a new experiment.


In [None]:
# =======================
# LOAD DATA (from features.csv + labels.csv)
# =======================
PROCESSED_DIR = "../Part 1 Repository and Data Versioning/processed"
TEST_SIZE = 0.2   # 80/20 train/test split
SPLIT_SEED = 42   # fixed seed so the split is reproducible across runs

# Load training data
X = pd.read_csv(f"{PROCESSED_DIR}/features.csv")
# squeeze("columns") turns a single-column frame into a Series but, unlike a bare
# squeeze(), never collapses a one-row file all the way down to a scalar.
y = pd.read_csv(f"{PROCESSED_DIR}/labels.csv").squeeze("columns")

# Hold out TEST_SIZE of the rows for evaluation
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=SPLIT_SEED
)

In [7]:
# =======================
# FUNCTION TO TRAIN & LOG
# =======================
def train_and_log(model, model_name, params=None):
    """Fit ``model``, score it on the hold-out split and record the result in one MLflow run.

    The data is not passed in: the function reads the notebook-level globals
    ``X_train``, ``y_train``, ``X_test`` and ``y_test``, so the data-loading
    cell must have been executed first.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        model_name: Used as the MLflow run name and in the printed summary.
        params: Optional dict of hyperparameters to log on the run. Nothing is
            logged when it is ``None`` or empty.

    Returns:
        dict with keys ``"model"`` (the fitted estimator), ``"rmse"``, ``"r2"``
        and ``"run_id"`` (id of the MLflow run that holds the logged model).
    """
    with mlflow.start_run(run_name=model_name) as run:
        if params:
            mlflow.log_params(params)

        # Train
        model.fit(X_train, y_train)

        # Predict
        y_pred = model.predict(X_test)

        # Metrics
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        r2 = r2_score(y_test, y_pred)

        # Log metrics
        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("r2", r2)

        # Log model together with its input/output schema so the stored
        # artifact documents what it expects and returns.
        signature = mlflow.models.infer_signature(X_test, y_pred)
        mlflow.sklearn.log_model(model, "model", signature=signature)

        print(f"{model_name} -> RMSE: {rmse:.4f}, R2: {r2:.4f}")
        # run_id lets later cells refer back to this exact run and its artifacts.
        return {"model": model, "rmse": rmse, "r2": r2, "run_id": run.info.run_id}

In [8]:

# =======================
# TRAIN MODELS
# =======================
# Hyperparameters for the tree; the same dict is used to build the estimator
# and is logged on its MLflow run.
dt_params = {"max_depth": 10, "random_state": 42}

lr_model = LinearRegression()
dt_model = DecisionTreeRegressor(**dt_params)

# (run name, estimator, params to log) -- trained in this order
candidates = [
    ("LinearRegression", lr_model, None),
    ("DecisionTree", dt_model, dt_params),
]

# One result dict per candidate, in the same order as `candidates`
results = [
    train_and_log(estimator, run_label, params=logged_params)
    for run_label, estimator, logged_params in candidates
]



LinearRegression -> RMSE: 69297.7167, R2: 0.6488
🏃 View run LinearRegression at: http://127.0.0.1:5000/#/experiments/466548277255127213/runs/33f539647cd34c4eba177701e7e96a7f
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/466548277255127213




DecisionTree -> RMSE: 60747.6952, R2: 0.7301
🏃 View run DecisionTree at: http://127.0.0.1:5000/#/experiments/466548277255127213/runs/13e98ee457504270987524de298172a5
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/466548277255127213


In [None]:
# =======================
# SELECT BEST MODEL
# =======================
# The candidate with the lowest test RMSE wins; on a tie the earliest entry
# in `results` is kept.
rmse_scores = [entry["rmse"] for entry in results]
best_model_info = results[rmse_scores.index(min(rmse_scores))]
best_model = best_model_info["model"]

print(f"\nBest Model: {best_model} with RMSE={best_model_info['rmse']:.4f}")



Best Model: DecisionTreeRegressor(max_depth=10, random_state=42) with RMSE=60747.6952


In [10]:
# =======================
# REGISTER MODEL
# =======================
# A dedicated run stores the winning estimator, registers it under MODEL_NAME
# and then records the scores it achieved during training.
with mlflow.start_run(run_name="Register_Best_Model"):
    mlflow.sklearn.log_model(
        sk_model=best_model,
        artifact_path="model",
        registered_model_name=MODEL_NAME,
    )
    for metric_key in ("rmse", "r2"):
        mlflow.log_metric(metric_key, best_model_info[metric_key])

Successfully registered model 'California_Housing_Best_Model'.
2025/08/09 10:54:01 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: California_Housing_Best_Model, version 1
Created version '1' of model 'California_Housing_Best_Model'.


🏃 View run Register_Best_Model at: http://127.0.0.1:5000/#/experiments/466548277255127213/runs/d2e557ffe4aa4307a049c13fbd3e8ac7
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/466548277255127213
