In [4]:
!pip install mlflow
!pip install -U scikit-learn pandas joblib





[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip

[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip
ERROR: Invalid requirement: 'pandas\xa0joblib': Expected end or semicolon (after name and no valid version specifier)
    pandas joblib
          ^


In [None]:
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
import joblib
import os
import mlflow
import mlflow.sklearn
from mlflow.models.signature import infer_signature
from pathlib import Path
import numpy as np
from sklearn.preprocessing import StandardScaler

# Artifact / data locations, all resolved relative to the directory the
# notebook is run from. Models live under ./models, datasets under ../data.
REPO_ROOT = Path.cwd()
MODELS_DIR = REPO_ROOT / "models"
DATA_DIR_RAW = REPO_ROOT / ".." / "data/raw"
DATA_DIR_PROCESSED = REPO_ROOT / ".." / "data/processed"

# Create every location up front; exist_ok keeps the cell safe to re-run.
for _required_dir in (MODELS_DIR, DATA_DIR_RAW, DATA_DIR_PROCESSED):
    _required_dir.mkdir(parents=True, exist_ok=True)

print(f"Artifacts will be stored in: {MODELS_DIR}")


# Load the California housing data as a single DataFrame (features + target).
housing = fetch_california_housing(as_frame=True)
df = housing.frame

# Standardise every feature column; the target "MedHouseVal" is left as-is.
# NOTE(review): the scaler is fitted on the full frame, before the train/test
# split below, and that split is taken from the unscaled `df` — confirm the
# processed CSV is only meant as a saved artefact, not as training input.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df.drop(columns="MedHouseVal"))

# Rebuild a labelled frame from the scaled array and re-attach the target.
df_processed = pd.DataFrame(scaled_features, columns=housing.feature_names).assign(
    MedHouseVal=df["MedHouseVal"]
)

# Persist both versions of the dataset.
df.to_csv(DATA_DIR_RAW / "california_housing_raw.csv", index=False)
df_processed.to_csv(DATA_DIR_PROCESSED / "california_housing_processed.csv", index=False)

print("Data loaded and preprocessed successfully.")


# Features / target taken from the raw frame, then an 80/20 split with a
# fixed seed so the partition is the same on every run.
y = df["MedHouseVal"]
X = df.drop(columns="MedHouseVal")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)




# Check and create default experiment if needed
if not mlflow.get_experiment_by_name("Default"):
    mlflow.create_experiment(name="Default")

# ------------------------
# Linear Regression - MLflow Run
# ------------------------

# Optional: Check where it's tracking
print("Tracking URI:", mlflow.get_tracking_uri())

# The estimator below is LinearRegression, so the experiment, run and console
# label are named after it (they were previously mislabelled as "Logistic").
mlflow.set_experiment("LinearRegressionExperiment")

with mlflow.start_run(run_name="LinearRegressionRun") as lr_run:
    lr_model = LinearRegression()
    lr_model.fit(X_train, y_train)
    lr_preds = lr_model.predict(X_test)

    # Small sample of test rows used as the model's input example.
    # Seeded so the logged example is the same on every Restart & Run All.
    input_example = X_test.sample(5, random_state=42)
    predicted = lr_model.predict(input_example)

    # Infer model signature (input schema -> output schema) from that sample
    signature = infer_signature(input_example, predicted)

    # mean_squared_error returns the MSE; take the root to report RMSE.
    lr_rmse = np.sqrt(mean_squared_error(y_test, lr_preds))
    lr_r2 = r2_score(y_test, lr_preds)

    print("🔹 Linear Regression:")
    print(f"RMSE: {lr_rmse:.4f}")
    print(f"R² Score: {lr_r2:.4f}")

# ------------------------
# Decision Tree - MLflow Run
# ------------------------
mlflow.set_experiment("DecisionTreeExperiment")
with mlflow.start_run(run_name="DecisionTreeRun") as dt_run:
    # Depth-limited tree with a fixed seed so training is repeatable.
    dt_model = DecisionTreeRegressor(random_state=42, max_depth=10)
    dt_model.fit(X_train, y_train)
    dt_preds = dt_model.predict(X_test)

    # mean_squared_error returns the MSE; take the root to report RMSE.
    dt_rmse = np.sqrt(mean_squared_error(y_test, dt_preds))
    dt_r2 = r2_score(y_test, dt_preds)

    # Small sample of test rows used as the model's input example.
    # Seeded (like the split and the tree) so the example is reproducible.
    # These module-level names are overwritten here and are the ones read
    # later by dumpModelAndRegister.
    input_example = X_test.sample(5, random_state=42)
    predicted = dt_model.predict(input_example)

    # Infer model signature
    signature = infer_signature(input_example, predicted)

    print("\n🔹 Decision Tree Regressor:")
    print(f"RMSE: {dt_rmse:.4f}")
    print(f"R² Score: {dt_r2:.4f}")


def dumpModelAndRegister(model_var_list):
    """Save the chosen model to disk, then log and register it in MLflow.

    The model is pickled to ``MODELS_DIR / "betterModel.pkl"``, the MLflow run
    it was trained in is re-opened, and inside that run the model is logged
    under the name ``"model"``, registered as ``"BestModel"``, and its
    ``rmse`` / ``r2`` metrics are recorded.

    Args:
        model_var_list: Sequence laid out as ``[model, rmse, r2, run]`` where
            ``model`` is the fitted estimator, ``rmse`` and ``r2`` are its
            evaluation metrics, and ``run`` is the object yielded by
            ``mlflow.start_run`` for the training run (its ``info.run_id``
            is used to resume the run).

    Note:
        Reads the module-level ``input_example`` and ``MODELS_DIR``.
    """
    # Only the first four entries are used, as in the positional layout above.
    model, rmse, r2, run = model_var_list[:4]
    run_id = run.info.run_id

    # Save model locally
    joblib.dump(model, MODELS_DIR / "betterModel.pkl")

    # Build the signature from the model that is actually being logged rather
    # than reusing whatever `signature` the most recent training cell left
    # behind (which may belong to the other candidate).
    model_signature = infer_signature(input_example, model.predict(input_example))

    # Context manager guarantees the resumed run is closed even if logging or
    # registration raises; a bare start_run()/end_run() pair would leave the
    # run active and make the next start_run() call fail.
    with mlflow.start_run(run_id=run_id):
        # Log model
        mlflow.sklearn.log_model(
            model,
            name="model",
            input_example=input_example,
            signature=model_signature,
        )

        # Register model
        mlflow.register_model(
            model_uri=f"runs:/{run_id}/model",
            name="BestModel"
        )

        # Log metrics
        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("r2", r2)



# Bundle each candidate as [model, rmse, r2, run] — the positional layout
# that dumpModelAndRegister reads.
lr_modelparams = [lr_model, lr_rmse, lr_r2, lr_run]
dt_modelparams = [dt_model, dt_rmse, dt_r2, dt_run]

# Choose the best model.
# The decision tree is picked exactly when its R² is strictly higher than the
# linear model's; in every other case (including a tie) the linear model wins.
# This is the complete outcome of comparing rmse and r2 together and then
# falling back to r2 alone when the two metrics disagree.
better_model = dt_modelparams if dt_r2 > lr_r2 else lr_modelparams

# Dump the winner to disk and register it in MLflow.
dumpModelAndRegister(better_model)

AttributeError: partially initialized module 'mlflow' has no attribute 'version' (most likely due to a circular import)