In [1]:
import os
from pathlib import Path

import joblib
import mlflow
import mlflow.sklearn
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [2]:
# --- MLflow tracking configuration -------------------------------------------
# Resolve the repository root the same way the data-loading cell does: start at
# the parent of the working directory and, if no "artifacts" folder is found
# there but one exists a level higher, move up one level. Using the same rule
# in both cells keeps the "mlruns" store next to the artifacts/models that are
# being evaluated instead of in a different directory.
REPO_ROOT = Path.cwd().parent
if not (REPO_ROOT / "artifacts").exists() and (REPO_ROOT.parent / "artifacts").exists():
    REPO_ROOT = REPO_ROOT.parent

# GitPython prints a long banner when no git executable is on PATH.
# Its own message documents GIT_PYTHON_REFRESH=quiet as the way to silence it;
# setdefault() keeps any value the user has already chosen.
# NOTE(review): this only helps if GitPython has not been imported yet.
os.environ.setdefault("GIT_PYTHON_REFRESH", "quiet")

# Local file-based tracking store, expressed as a file:// URI.
tracking_uri = (REPO_ROOT / "mlruns").resolve().as_uri()
mlflow.set_tracking_uri(tracking_uri)

experiment_name = "predictive-maintenance-cmapss"
mlflow.set_experiment(experiment_name)

# Echo the effective configuration so the run location is visible in the output.
print("tracking_uri =", tracking_uri)
print("MLFLOW_TRACKING_URI =", mlflow.get_tracking_uri())
print("Experiment =", experiment_name)

tracking_uri = file:///C:/Users/abc/Downloads/PredictiveMaintenantanceProject/mlruns
MLFLOW_TRACKING_URI = file:///C:/Users/abc/Downloads/PredictiveMaintenantanceProject/mlruns
Experiment = predictive-maintenance-cmapss


  return FileStore(store_uri, store_uri)


In [3]:
import re

# --- Locate inputs ------------------------------------------------------------
# Start at the parent of the working directory; if "artifacts" is not there but
# exists one level higher, use that level as the repository root instead.
REPO_ROOT = Path.cwd().parent

if not (REPO_ROOT / "artifacts").exists() and (REPO_ROOT.parent / "artifacts").exists():
    REPO_ROOT = REPO_ROOT.parent

# Each path can be overridden through an environment variable. `or` (rather
# than a getenv default) makes a variable that is set but empty fall back to
# the repository default instead of turning into Path("") == ".".
train_path = Path(os.getenv("TRAIN_CSV") or (REPO_ROOT / "artifacts" / "processed" / "train.csv"))
val_path = Path(os.getenv("VAL_CSV") or (REPO_ROOT / "artifacts" / "processed" / "val.csv"))
model_path = Path(os.getenv("MODEL_PATH") or (REPO_ROOT / "models" / "best_model.joblib"))

# Fail fast with a clear message. is_file() also rejects a directory given by
# mistake, which exists() would have let through to a less helpful error later.
for _label, _path in (
    ("train CSV", train_path),
    ("val CSV", val_path),
    ("model file", model_path),
):
    if not _path.is_file():
        raise FileNotFoundError(f"Missing {_label} at: {_path}")

# --- Load data ------------------------------------------------------------------
train_df = pd.read_csv(train_path)
val_df = pd.read_csv(val_path)

# --- Infer target / id / feature columns from the training frame ----------------
# Common target names in CMAPSS pipelines; the first one present wins.
candidate_targets = ["RUL", "rul", "target", "y"]
target_col = next((c for c in candidate_targets if c in train_df.columns), None)
if target_col is None:
    raise ValueError(
        f"Could not infer target column. Expected one of {candidate_targets}. "
        f"Columns: {list(train_df.columns)[:30]}..."
    )

# Exclude non-feature identifier columns commonly present in CMAPSS
id_cols = [c for c in ["unit", "engine_id", "id", "cycle", "time_cycles"] if c in train_df.columns]

# Everything that is neither an identifier nor the target is a feature;
# the order follows the column order of train.csv.
feature_cols = [c for c in train_df.columns if c not in id_cols + [target_col]]

print("REPO_ROOT =", REPO_ROOT)
print("train_path =", train_path)
print("val_path   =", val_path)
print("model_path =", model_path)

print("target_col =", target_col)
print("id_cols =", id_cols)
print("n_features =", len(feature_cols))

# --- Validate the validation frame against the inferred schema ------------------
# Ensure val has all required feature columns
missing_in_val = [c for c in feature_cols if c not in val_df.columns]
if missing_in_val:
    raise ValueError(f"val.csv missing expected feature columns: {missing_in_val[:10]}")

# The target was inferred from train.csv only; check val.csv as well so a
# missing label column produces an explicit error rather than a bare KeyError
# when y_val is built.
if target_col not in val_df.columns:
    raise ValueError(
        f"val.csv missing target column {target_col!r}. "
        f"Columns: {list(val_df.columns)[:30]}..."
    )

def _coerce_fd_like_column(df: pd.DataFrame, col: str) -> bool:
    if col not in df.columns:
        return False
    # Convert 'FD001' -> 1, etc. If values don't match, fail fast with examples.
    extracted = df[col].astype(str).str.extract(r"(\d+)", expand=False)
    if extracted.isna().any():
        bad = df.loc[extracted.isna(), col].head(5).tolist()
        raise ValueError(f"{col} contains values that cannot be parsed as FD###: {bad}")
    df[col] = extracted.astype(int)
    return True

# Convert the 'fd_set' label column to integers in both frames (train first,
# then val) when it is one of the features, and remember that it was coerced.
coerced_cols = []
if "fd_set" in feature_cols:
    for _frame in (train_df, val_df):
        _coerce_fd_like_column(_frame, "fd_set")
    coerced_cols.append("fd_set")

# Validation inputs/labels, with features in the same order as feature_cols.
# Copies are taken so later edits to X_val/y_val do not touch val_df.
X_val = val_df[feature_cols].copy()
y_val = val_df[target_col].copy()

# Hard stop if any feature is still non-numeric after coercion; include a few
# example values per offending column (at most 10 columns, 5 values each).
non_numeric = X_val.select_dtypes(exclude="number").columns.tolist()
if non_numeric:
    sample_info = {
        c: X_val[c].dropna().astype(str).unique()[:5].tolist()
        for c in non_numeric[:10]
    }
    raise ValueError(
        "Non-numeric feature columns still present (model pipeline likely expects numeric). "
        f"Columns={non_numeric}. Samples={sample_info}"
    )

print("Coerced columns:", coerced_cols)
print("X_val shape:", X_val.shape)

REPO_ROOT = C:\Users\abc\Downloads\PredictiveMaintenantanceProject
train_path = C:\Users\abc\Downloads\PredictiveMaintenantanceProject\artifacts\processed\train.csv
val_path   = C:\Users\abc\Downloads\PredictiveMaintenantanceProject\artifacts\processed\val.csv
model_path = C:\Users\abc\Downloads\PredictiveMaintenantanceProject\models\best_model.joblib
target_col = RUL
id_cols = ['engine_id', 'cycle']
n_features = 26


Coerced columns: ['fd_set']
X_val shape: (31914, 26)


In [4]:
# Diagnostic: list the feature columns of X_val that pandas does not classify
# as numeric, then show up to five distinct non-null values for each of the
# first ten such columns.
non_numeric = X_val.select_dtypes(exclude="number").columns.tolist()
print("Non-numeric feature cols:", non_numeric)

# Iterating an empty list prints nothing, so no separate emptiness check is needed.
for c in non_numeric[:10]:
    print(c, "sample values:", X_val[c].dropna().astype(str).unique()[:5])

Non-numeric feature cols: []


In [5]:
import numpy as np

# Load the persisted model.
# NOTE(review): joblib.load unpickles arbitrary Python objects; only point
# MODEL_PATH at files from a trusted source.
model = joblib.load(model_path)

# Score the validation set. The features are passed as a plain ndarray, in
# feature_cols order.
y_pred = model.predict(X_val[feature_cols].to_numpy())

# Validation error metrics: MAE, and RMSE as the square root of the MSE.
mae = mean_absolute_error(y_val, y_pred)
_mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(_mse)

# Parameters and metrics recorded on the MLflow run.
_run_params = {
    "model_path": str(model_path),
    "target_col": target_col,
    "id_cols": ",".join(id_cols),
    "n_features": int(len(feature_cols)),
    "features": ",".join(feature_cols[:50]),  # keep logs compact
}
_run_metrics = {"mae": float(mae), "rmse": float(rmse)}

with mlflow.start_run(run_name="baseline-cmapss"):
    for _name, _value in _run_params.items():
        mlflow.log_param(_name, _value)

    for _name, _value in _run_metrics.items():
        mlflow.log_metric(_name, _value)

    # Store the evaluated model itself alongside the metrics.
    mlflow.sklearn.log_model(model, artifact_path="model")

print({"mae": mae, "rmse": rmse})

The git executable must be specified in one of the following ways:
    - be included in your $PATH
    - be set via $GIT_PYTHON_GIT_EXECUTABLE
    - explicitly set via git.refresh(<full-path-to-git-executable>)

All git commands will error until this is rectified.

This initial message can be silenced or aggravated in the future by setting the
$GIT_PYTHON_REFRESH environment variable. Use one of the following values:
    - quiet|q|silence|s|silent|none|n|0: for no message or exception
    - error|e|exception|raise|r|2: for a raised exception

Example:
    export GIT_PYTHON_REFRESH=quiet





{'mae': 118.58621978759766, 'rmse': 145.0765624915858}
