In [1]:
import os
from pathlib import Path

import mlflow
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [2]:
REPO_ROOT = Path.cwd().parent  
tracking_uri = (REPO_ROOT / "mlruns").resolve().as_uri()  

mlflow.set_tracking_uri(tracking_uri)

experiment_name = "predictive-maintenance-cmapss"
mlflow.set_experiment(experiment_name)

print("tracking_uri =", tracking_uri)
print("MLFLOW_TRACKING_URI =", mlflow.get_tracking_uri())
print("Experiment =", experiment_name)

tracking_uri = file:///C:/Users/abc/Downloads/PredictiveMaintenantanceProject/mlruns
MLFLOW_TRACKING_URI = file:///C:/Users/abc/Downloads/PredictiveMaintenantanceProject/mlruns
Experiment = predictive-maintenance-cmapss


  return FileStore(store_uri, store_uri)


In [3]:
REPO_ROOT = Path.cwd().parent
val_path = Path(os.getenv("VAL_CSV", REPO_ROOT / "artifacts" / "processed" / "val.csv"))
val_df = pd.read_csv(val_path)

candidate_targets = ["RUL", "rul", "target", "y"]
target_col = next((c for c in candidate_targets if c in val_df.columns), None)
if target_col is None:
    raise ValueError(f"Could not infer target column in val.csv; tried {candidate_targets}")

id_cols = [c for c in ["unit", "engine_id", "id", "cycle", "time_cycles"] if c in val_df.columns]
feature_cols = [c for c in val_df.columns if c not in id_cols + [target_col]]

X_val = val_df[feature_cols]
y_val = val_df[target_col]

print("target_col =", target_col)
print("id_cols =", id_cols)
print("n_features =", len(feature_cols))

target_col = RUL
id_cols = ['engine_id', 'cycle']
n_features = 26


In [4]:
selection_metric = os.getenv("MODEL_SELECTION_METRIC", "mae")
ascending = os.getenv("MODEL_SELECTION_DIRECTION", "ASC")  # use DESC for higher-is-better metrics

runs_df = mlflow.search_runs(
    experiment_names=[experiment_name],
    order_by=[f"metrics.{selection_metric} {ascending}"],
)

if runs_df.empty:
    raise RuntimeError("No runs found. Run 01_mlflow_experiments.ipynb (and/or additional runs) first.")

cols = [c for c in [
    "run_id",
    "status",
    "start_time",
    f"metrics.{selection_metric}",
    "metrics.rmse",
    "tags.mlflow.runName",
] if c in runs_df.columns]

leaderboard = runs_df[cols].copy()
leaderboard.head(20)

Unnamed: 0,run_id,status,start_time,metrics.mae,metrics.rmse,tags.mlflow.runName
0,d528d7fe00894c9c9df8320d86b739ee,FINISHED,2026-01-23 12:39:28.852000+00:00,118.58622,145.076562,baseline-cmapss
1,6fc7f90ac60949dfad981cf759a51d2c,FINISHED,2026-01-23 08:58:34.025000+00:00,118.58622,145.076562,baseline-cmapss
2,f528d12c3ebb4aabbcf2b969c403fbd0,FINISHED,2026-01-23 08:46:03.717000+00:00,118.58622,145.076562,baseline-cmapss
3,d94fdfb873774069b53e8e7f2f696380,FINISHED,2026-01-23 09:19:45.504000+00:00,,,model-selection
4,262edcb362a344d6a182642b3e23f723,FINISHED,2026-01-23 09:18:00.191000+00:00,,,model-selection


In [5]:
import re

if "fd_set" in val_df.columns:
    val_df["fd_set"] = val_df["fd_set"].astype(str).str.extract(r"(\d+)", expand=False).astype(int)

# rebuild feature_cols / X_val after the fix (important)
id_cols = [c for c in ["unit", "engine_id", "id", "cycle", "time_cycles"] if c in val_df.columns]
target_col = next((c for c in ["RUL", "rul", "target", "y"] if c in val_df.columns), None)
feature_cols = [c for c in val_df.columns if c not in id_cols + [target_col]]

X_val = val_df[feature_cols].copy()
y_val = val_df[target_col].copy()

In [6]:
import numpy as np

best_run_id = leaderboard.iloc[0]["run_id"]
model_uri = f"runs:/{best_run_id}/model"  # artifact_path="model" from notebook 01

best_model = mlflow.sklearn.load_model(model_uri)

y_pred = best_model.predict(X_val)

mae = mean_absolute_error(y_val, y_pred)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))

print("best_run_id =", best_run_id)
print({"mae": mae, "rmse": rmse})

  from .autonotebook import tqdm as notebook_tqdm


Downloading artifacts:   0%|                                                 | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|                                                 | 0/1 [00:00<?, ?it/s]




Downloading artifacts:   0%|                                                 | 0/5 [00:00<?, ?it/s]

Downloading artifacts:  20%|███████▊                               | 1/5 [00:00<00:00, 6786.90it/s]

Downloading artifacts:  40%|████████████████                        | 2/5 [00:00<00:00, 250.45it/s]

Downloading artifacts:  60%|████████████████████████                | 3/5 [00:00<00:00, 258.65it/s]

Downloading artifacts:  80%|████████████████████████████████        | 4/5 [00:00<00:00, 306.93it/s]

Downloading artifacts: 100%|████████████████████████████████████████| 5/5 [00:00<00:00, 330.43it/s]

Downloading artifacts: 100%|████████████████████████████████████████| 5/5 [00:00<00:00, 302.19it/s]






best_run_id = d528d7fe00894c9c9df8320d86b739ee
{'mae': 118.58621978759766, 'rmse': 145.0765624915858}


In [7]:
art_dir = Path("notebooks/_artifacts")
art_dir.mkdir(parents=True, exist_ok=True)

leaderboard_path = art_dir / "runs_leaderboard.csv"
leaderboard.to_csv(leaderboard_path, index=False)

with mlflow.start_run(run_name="model-selection") as run:
    mlflow.log_param("selected_run_id", best_run_id)
    mlflow.log_param("selection_metric", selection_metric)
    mlflow.log_param("selection_direction", ascending)
    mlflow.log_param("target_col", target_col)
    mlflow.log_param("id_cols", ",".join(id_cols))
    mlflow.log_param("n_features", int(len(feature_cols)))

    mlflow.log_metric("selected_mae", float(mae))
    mlflow.log_metric("selected_rmse", float(rmse))

    mlflow.log_artifact(str(leaderboard_path))

print("selection_run_id =", run.info.run_id)

The git executable must be specified in one of the following ways:
    - be included in your $PATH
    - be set via $GIT_PYTHON_GIT_EXECUTABLE
    - explicitly set via git.refresh(<full-path-to-git-executable>)

All git commands will error until this is rectified.

This initial message can be silenced or aggravated in the future by setting the
$GIT_PYTHON_REFRESH environment variable. Use one of the following values:
    - quiet|q|silence|s|silent|none|n|0: for no message or exception
    - error|e|exception|raise|r|2: for a raised exception

Example:
    export GIT_PYTHON_REFRESH=quiet



selection_run_id = 303817b8f32645efb02e2961924572dc
