In [None]:
!pip install pandas scikit-learn mlflow


 # Training & Registration Script

In [1]:
import os
import pandas as pd
import mlflow
import mlflow.sklearn
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import numpy as np

In [6]:
from mlflow.tracking import MlflowClient

# Point MLflow at the tracking server BEFORE creating a client or selecting an
# experiment. Both calls resolve against whatever tracking URI is active when
# they run, so on a fresh kernel (Restart & Run All) doing this in a later cell
# would resolve the experiment against the default store instead of the server
# that the training runs are logged to.
mlflow.set_tracking_uri("http://127.0.0.1:5000")

client = MlflowClient()
# Creates the experiment on first use and makes it the active one for later runs.
mlflow.set_experiment("California_Housing_Experiment")

2025/08/10 11:27:13 INFO mlflow.tracking.fluent: Experiment with name 'California_Housing_Experiment' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/340664372522611120', creation_time=1754805433696, experiment_id='340664372522611120', last_update_time=1754805433696, lifecycle_stage='active', name='California_Housing_Experiment', tags={}>

In [7]:
# =======================
# CONFIGURATION
# =======================
# Tracking server endpoint, and the registry name the best model is published under.
TRACKING_URI = "http://127.0.0.1:5000"
MODEL_NAME = "California_Housing_Best_Model"

mlflow.set_tracking_uri(TRACKING_URI)

In [8]:
# =======================
# LOAD DATA (from features.csv + labels.csv)
# =======================
# Read the processed features and labels written by Part 1.
# squeeze() collapses the single-column labels frame into a Series.
processed_dir = "../Part 1 Repository and Data Versioning/processed"
X = pd.read_csv(f"{processed_dir}/features.csv")
y = pd.read_csv(f"{processed_dir}/labels.csv").squeeze()

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [10]:
# =======================
# FUNCTION TO TRAIN & LOG
# =======================
def train_and_log(model, model_name, params=None):
    """Fit ``model``, score it on the hold-out split, and record everything in one MLflow run.

    The function reads the notebook-level ``X_train``, ``y_train``, ``X_test`` and
    ``y_test`` variables; it applies no preprocessing of its own, so the data is
    used exactly as those variables hold it.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        model_name: Used as the MLflow run name and in the printed summary.
        params: Optional mapping logged with ``mlflow.log_params``. Skipped when
            ``None`` or empty.

    Returns:
        dict with the keys
        ``model`` (the estimator after ``fit``), ``rmse`` and ``r2`` (hold-out
        metrics), ``model_name`` (as passed in) and ``run_id`` (the MLflow run
        that holds the logged metrics and model artifact), so later cells can
        trace a selected model back to its run.
    """
    with mlflow.start_run(run_name=model_name) as run:
        if params:
            mlflow.log_params(params)

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # RMSE is reported in the units of the target; R2 is unitless.
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        r2 = r2_score(y_test, y_pred)

        mlflow.log_metrics({"rmse": rmse, "r2": r2})

        # Stored under the run's "model" artifact path.
        mlflow.sklearn.log_model(model, "model")

        print(f"{model_name} -> RMSE: {rmse:.4f}, R2: {r2:.4f}")

        return {
            "model": model,
            "rmse": rmse,
            "r2": r2,
            "model_name": model_name,
            "run_id": run.info.run_id,
        }


In [11]:

# =======================
# TRAIN MODELS
# =======================
# Candidate estimators. The tree's hyperparameters live in their own dict so the
# same values can be passed to the constructor and logged to MLflow.
lr_model = LinearRegression()
dt_params = dict(max_depth=10, random_state=42)
dt_model = DecisionTreeRegressor(**dt_params)

# One MLflow run per candidate, in the order listed; each entry is the dict
# returned by train_and_log.
results = [
    train_and_log(lr_model, "LinearRegression"),
    train_and_log(dt_model, "DecisionTree", params=dt_params),
]



LinearRegression -> RMSE: 69297.7167, R2: 0.6488
🏃 View run LinearRegression at: http://127.0.0.1:5000/#/experiments/340664372522611120/runs/206e329dcacd492992e6707ba65ddccd
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/340664372522611120




DecisionTree -> RMSE: 60747.6952, R2: 0.7301
🏃 View run DecisionTree at: http://127.0.0.1:5000/#/experiments/340664372522611120/runs/6d72ea1909804bcf938bc3d2e412484b
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/340664372522611120


In [12]:
# =======================
# SELECT BEST MODEL
# =======================
# The winner is the candidate with the lowest hold-out RMSE (first one wins on ties).
best_model_info = min(results, key=lambda entry: entry["rmse"])
best_model = best_model_info["model"]
best_rmse = best_model_info["rmse"]

print(f"\nBest Model: {best_model} with RMSE={best_rmse:.4f}")



Best Model: DecisionTreeRegressor(max_depth=10, random_state=42) with RMSE=60747.6952


In [13]:
# Absolute locations of the two pickles in Part 1's processed/ folder; they are
# attached to the registration run as artifacts.
_processed_dir = "../Part 1 Repository and Data Versioning/processed"
scaler_path = os.path.abspath(f"{_processed_dir}/scaler.pkl")
feature_columns_path = os.path.abspath(f"{_processed_dir}/feature_columns.pkl")

In [14]:
# =======================
# REGISTER MODEL
# =======================
# Fail fast if a preprocessing artifact is missing, so that no run is started
# and no model version is registered without its scaler and column list.
_missing_artifacts = [
    path for path in (scaler_path, feature_columns_path) if not os.path.isfile(path)
]
if _missing_artifacts:
    raise FileNotFoundError(
        f"Preprocessing artifact(s) not found: {_missing_artifacts}"
    )

with mlflow.start_run(run_name="Register_Best_Model"):
    # Copy the winning candidate's hold-out metrics onto the registration run.
    mlflow.log_metric("rmse", best_model_info["rmse"])
    mlflow.log_metric("r2", best_model_info["r2"])

    # Attach the preprocessing files to the same run as the model.
    mlflow.log_artifact(scaler_path)
    mlflow.log_artifact(feature_columns_path)

    # Register last: a new version of MODEL_NAME is only created once the
    # metrics and artifacts above have been logged successfully.
    mlflow.sklearn.log_model(best_model, "model", registered_model_name=MODEL_NAME)

Successfully registered model 'California_Housing_Best_Model'.
2025/08/10 11:27:39 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: California_Housing_Best_Model, version 1
Created version '1' of model 'California_Housing_Best_Model'.


🏃 View run Register_Best_Model at: http://127.0.0.1:5000/#/experiments/340664372522611120/runs/c0de05e031dd44a2912a465a841efbf8
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/340664372522611120
