In [12]:
import os
from pathlib import Path

import mlflow
import mlflow.sklearn
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [13]:
# Find the repository root by probing the working directory and its two nearest
# ancestors, so the notebook behaves the same when started from the repo root,
# from a sub-folder, or through nbconvert.
def _looks_like_repo_root(path: Path) -> bool:
    """Return True if `path` has `notebooks/` plus one of `models/`, `artifacts/`, `src/`."""
    if not (path / "notebooks").exists():
        return False
    return any((path / marker).exists() for marker in ("models", "artifacts", "src"))


CWD = Path.cwd().resolve()
candidates = [CWD, CWD.parent, CWD.parent.parent]
# First matching candidate wins; fall back to the working directory otherwise.
REPO_ROOT = next(filter(_looks_like_repo_root, candidates), CWD)

# Environment variables take precedence; the defaults are a file store under
# <repo>/mlruns and the project's experiment name.
default_tracking_uri = (REPO_ROOT / "mlruns").resolve().as_uri()
tracking_uri = os.environ.get("MLFLOW_TRACKING_URI", default_tracking_uri)
experiment_name = os.environ.get("MLFLOW_EXPERIMENT_NAME", "predictive-maintenance-cmapss")

mlflow.set_tracking_uri(tracking_uri)
mlflow.set_experiment(experiment_name)

# Echo the effective configuration (the third line is what MLflow reports back).
for _label, _value in (
    ("REPO_ROOT =", REPO_ROOT),
    ("tracking_uri =", tracking_uri),
    ("MLFLOW_TRACKING_URI =", mlflow.get_tracking_uri()),
    ("Experiment =", experiment_name),
):
    print(_label, _value)

REPO_ROOT = C:\Users\abc\Downloads\PredictiveMaintenantanceProject
tracking_uri = file:///C:/Users/abc/Downloads/PredictiveMaintenantanceProject/mlruns
MLFLOW_TRACKING_URI = file:///C:/Users/abc/Downloads/PredictiveMaintenantanceProject/mlruns
Experiment = predictive-maintenance-cmapss


In [14]:
# Load the validation split. VAL_CSV overrides the default location under the repo.
default_val_path = REPO_ROOT / "artifacts" / "processed" / "val.csv"
val_path = Path(os.environ.get("VAL_CSV", default_val_path))
val_df = pd.read_csv(val_path)

# The target is the first of these names that is present among the columns.
candidate_targets = ["RUL", "rul", "target", "y"]
present_targets = [name for name in candidate_targets if name in val_df.columns]
target_col = present_targets[0] if present_targets else None
if target_col is None:
    raise ValueError(f"Could not infer target column in val.csv; tried {candidate_targets}")

# Identifier columns are excluded from the feature matrix along with the target.
known_id_names = ["unit", "engine_id", "id", "cycle", "time_cycles"]
id_cols = [name for name in known_id_names if name in val_df.columns]
excluded_cols = set(id_cols) | {target_col}
feature_cols = [name for name in val_df.columns if name not in excluded_cols]

X_val = val_df.loc[:, feature_cols].copy()
y_val = val_df[target_col].copy()

print("val_path =", val_path)
print("target_col =", target_col)
print("id_cols =", id_cols)
print("n_features =", len(feature_cols))

val_path = C:\Users\abc\Downloads\PredictiveMaintenantanceProject\artifacts\processed\val.csv
target_col = RUL
id_cols = ['engine_id', 'cycle']
n_features = 26


In [15]:
# Metric used to rank candidate runs (overridable through the environment).
selection_metric = os.getenv("MODEL_SELECTION_METRIC", "mae")
if not selection_metric.strip():
    raise ValueError("MODEL_SELECTION_METRIC must not be empty.")

# NOTE: despite its name, `ascending` holds the ORDER BY keyword ("ASC" or "DESC"),
# not a boolean. Use "DESC" for higher-is-better metrics.
# The value is normalised and validated here so that a typo fails with a clear
# message instead of being passed into the order_by clause unchecked.
ascending = os.getenv("MODEL_SELECTION_DIRECTION", "ASC").strip().upper()
if ascending not in ("ASC", "DESC"):
    raise ValueError(
        f"MODEL_SELECTION_DIRECTION must be 'ASC' or 'DESC', got {ascending!r}"
    )

# Fetch every run of the experiment, already sorted by the selection metric.
runs_df = mlflow.search_runs(
    experiment_names=[experiment_name],
    order_by=[f"metrics.{selection_metric} {ascending}"],
)

if runs_df.empty:
    raise RuntimeError("No runs found. Run 01_mlflow_experiments.ipynb first.")

def _has_model_artifact(run_id: str) -> bool:
    """Report whether the run has at least one entry under the "model" artifact path.

    The notebook loads models from ``runs:/<run_id>/model``, so runs without
    that folder cannot be selected. Any exception raised while listing the
    artifacts is treated as "no model" and yields False.
    """
    try:
        model_entries = mlflow.artifacts.list_artifacts(run_id=run_id, artifact_path="model")
        found = len(model_entries) > 0
    except Exception:
        found = False
    return found

# Keep only the runs that actually contain a "model" artifact folder; the
# row order produced by the search (sorted by the selection metric) is preserved.
mask = runs_df["run_id"].apply(_has_model_artifact)
runs_with_model = runs_df[mask].copy()

print("Total runs found:", len(runs_df))
print("Runs with model artifact:", len(runs_with_model))

if runs_with_model.empty:
    raise RuntimeError(
        "Runs exist but none contain artifact_path='model'. "
        "Ensure Notebook 01 logs the model with mlflow.sklearn.log_model(..., artifact_path='model')."
    )

# Columns shown in the leaderboard, in display order. dict.fromkeys removes
# duplicates while keeping order: when the selection metric is "rmse" the
# metric column would otherwise be listed twice and appear twice in the frame.
# Columns that the search result does not contain are skipped.
wanted_cols = dict.fromkeys([
    "run_id",
    "status",
    "start_time",
    f"metrics.{selection_metric}",
    "metrics.rmse",
    "tags.mlflow.runName",
])
cols = [c for c in wanted_cols if c in runs_with_model.columns]

leaderboard = runs_with_model[cols].copy()
leaderboard.head(20)


Total runs found: 2
Runs with model artifact: 1


Unnamed: 0,run_id,status,start_time,metrics.mae,metrics.rmse,tags.mlflow.runName
0,fb22caa1b93341aa92dc4cf4c4d349e8,FINISHED,2026-01-26 15:24:28.550000+00:00,118.58622,145.076562,baseline-cmapss


In [16]:
import re  # not referenced in this cell; kept in case other cells rely on it

# Replace `fd_set` with the integer formed by the first run of digits in each
# value (a value such as "FD001" would become 1).
if "fd_set" in val_df.columns:
    fd_digits = val_df["fd_set"].astype(str).str.extract(r"(\d+)", expand=False)
    missing_digits = fd_digits.isna()
    if missing_digits.any():
        # Without this check the integer cast below fails with an opaque
        # "cannot convert NaN to integer" style error.
        bad_values = val_df.loc[missing_digits, "fd_set"].unique().tolist()
        raise ValueError(
            f"fd_set contains values without any digits (showing up to 5): {bad_values[:5]}"
        )
    # Rebind to a new frame instead of mutating the one loaded earlier.
    val_df = val_df.assign(fd_set=fd_digits.astype(int))

# Re-derive the column roles and the feature/target split from the updated frame.
id_cols = [c for c in ["unit", "engine_id", "id", "cycle", "time_cycles"] if c in val_df.columns]
target_col = next((c for c in ["RUL", "rul", "target", "y"] if c in val_df.columns), None)
if target_col is None:
    # Fail here rather than with a confusing KeyError(None) on the lookup below.
    raise ValueError("Could not infer target column; tried ['RUL', 'rul', 'target', 'y']")
feature_cols = [c for c in val_df.columns if c not in id_cols + [target_col]]

X_val = val_df[feature_cols].copy()
y_val = val_df[target_col].copy()

In [17]:
import numpy as np

# The first leaderboard row is the best run under the configured ordering.
best_run_id = leaderboard["run_id"].iloc[0]
model_uri = f"runs:/{best_run_id}/model"
best_model = mlflow.sklearn.load_model(model_uri)

# Score the loaded model on the validation features.
y_pred = best_model.predict(X_val)

mae = mean_absolute_error(y_val, y_pred)
mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)  # root of the mean squared error

eval_summary = {"mae": mae, "rmse": rmse}
print("best_run_id =", best_run_id)
print(eval_summary)



best_run_id = fb22caa1b93341aa92dc4cf4c4d349e8
{'mae': 118.58621978759766, 'rmse': 145.0765624915858}


In [18]:
# Write the leaderboard under <repo>/notebooks/_artifacts. The directory is
# anchored on REPO_ROOT rather than the current working directory, so the file
# lands in the same place whether the notebook runs from the repo root or from
# notebooks/ (a CWD-relative path would create notebooks/notebooks/_artifacts
# in the latter case).
art_dir = REPO_ROOT / "notebooks" / "_artifacts"
art_dir.mkdir(parents=True, exist_ok=True)

leaderboard_path = art_dir / "runs_leaderboard.csv"
leaderboard.to_csv(leaderboard_path, index=False)

# Record the selection in its own MLflow run: which run was chosen, how it was
# chosen, the data layout used for evaluation, and the re-computed metrics.
with mlflow.start_run(run_name="model-selection") as run:
    mlflow.log_param("selected_run_id", best_run_id)
    mlflow.log_param("selection_metric", selection_metric)
    mlflow.log_param("selection_direction", ascending)
    mlflow.log_param("target_col", target_col)
    mlflow.log_param("id_cols", ",".join(id_cols))
    mlflow.log_param("n_features", int(len(feature_cols)))

    mlflow.log_metric("selected_mae", float(mae))
    mlflow.log_metric("selected_rmse", float(rmse))

    # Attach the leaderboard CSV to the selection run.
    mlflow.log_artifact(str(leaderboard_path))

print("selection_run_id =", run.info.run_id)

selection_run_id = 1f39ddec07c44e1db081b16438941454
