## Step 1: Primary Imports

In [18]:
import os
import yaml
import mlflow
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from tqdm.auto import tqdm

# --- Import key functions from our training script ---
# Make sure 'bearing_model_training.py' is in the same directory or accessible in the Python path
from bearing_model_training import (
    discover_available_bins,
    calculate_rms_dynamic,
    bpfo_bpfi_from_geometry,
    band_energy_around_dynamic,
    neighbor_triplet,
    nearest_bin,
    spectral_kurtosis,
    spectral_crest_factor,
    thd_centered_triplet,
    sideband_energy_ratio,
    sideband_modulation_index,
)

# --- MLflow Base Class (for connection) ---
# Ensure mlflow_base.py is also accessible
from mlflow_base import MLflowBase

print("Setup Complete. Libraries and functions imported.")

Setup Complete. Libraries and functions imported.


## Step 2: Define Constants and Connect to MLflow

In [22]:
# --- Configuration ---
# TENANT_ID and MACHINE_ID select both the MLflow experiment and the dataset
# file, so switching machines (e.g. tenant "28" / machine "257") only requires
# editing these two values.
TENANT_ID = "13"
MACHINE_ID = "247"
# The dataset used for training. Derived from MACHINE_ID so the file name and
# the experiment can never point at different machines.
DATASET_FILENAME = f"iotts.harmonics_{MACHINE_ID}.csv"
DATA_DIR = "data"
# Name of the YAML artifact that holds the bearing configuration.
ARTIFACT_CONFIG_NAME = "bearing_config.yaml"

# --- MLflow Connection ---
EXPERIMENT_NAME = f"bearing_fault_monitoring/{TENANT_ID}/{MACHINE_ID}"

print(f"Connecting to MLflow experiment: '{EXPERIMENT_NAME}'")
# MLflowBase is the project helper imported above; later cells read its
# `client` and `experiment_id` attributes.
mlbase = MLflowBase(experiment_name=EXPERIMENT_NAME)
client = mlbase.client

Connecting to MLflow experiment: 'bearing_fault_monitoring/13/247'


## Step 3: Fetch Latest Configuration from MLflow Artifacts

In [23]:
print("Searching for the most recent run...")
# Ask the tracking server for only the newest run (by start time) in this experiment.
# NOTE(review): the newest run is used regardless of its status; if failed runs
# can exist without the config artifact, consider filtering on run status.
runs = client.search_runs(
    experiment_ids=[mlbase.experiment_id],
    order_by=["attributes.start_time DESC"],
    max_results=1,
)

if not runs:
    # RuntimeError is more specific than a bare Exception and is still caught
    # by any existing `except Exception` handler.
    raise RuntimeError(f"No runs found in experiment '{EXPERIMENT_NAME}'. Please run the training script first.")

latest_run = runs[0]
run_id = latest_run.info.run_id
print(f"Found latest run with ID: {run_id}")

# --- Download the config artifact ---
# Use the constant from the configuration cell instead of repeating the literal.
local_artifact_path = client.download_artifacts(run_id, ARTIFACT_CONFIG_NAME)
print(f"Configuration artifact downloaded to: {local_artifact_path}")

# --- Load the configuration ---
# Explicit encoding so parsing does not depend on the platform default code page.
with open(local_artifact_path, 'r', encoding="utf-8") as f:
    config = yaml.safe_load(f)

# An empty or malformed artifact would otherwise surface later as an opaque
# TypeError/KeyError when the config is indexed.
if not isinstance(config, dict):
    raise ValueError(
        f"Artifact '{ARTIFACT_CONFIG_NAME}' from run {run_id} did not contain a YAML mapping "
        f"(got {type(config).__name__})."
    )

print("\nSuccessfully loaded configuration:")
print(yaml.dump(config, indent=2))

Searching for the most recent run...
Found latest run with ID: 9e3a976c871249b0b758c5dd6ea599c0


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Configuration artifact downloaded to: C:\Users\ghosh\AppData\Local\Temp\tmp16hcgisz\bearing_config.yaml

Successfully loaded configuration:
ball_diameter: 8.0
bdf_threshold: 0.7
contact_angle_deg: 0.0
freq_resolution: 50.0
normalization:
  minmax:
    bpfi_band_energy:
      max: 2.4968064195730943
      min: 0.0
    bpfo_band_energy:
      max: 2.446163380175349
      min: 0.0
    sideband_energy_ratio_rms_3:
      max: 1.2385913967491034e+41
      min: 1.0
    sideband_modulation_index_rms_4:
      max: 1.2385913967491034e+41
      min: 1.0
    spectral_crest_factor_rms_1:
      max: 5.477225575051661
      min: 1.2248783362567577
    spectral_kurtosis_rms_0:
      max: 30.00000000000003
      min: -1.9964628921212475
    total_harmonic_distortion_rms_2:
      max: 1.2385913967491034e+41
      min: 1.0
num_rolling_elements: 8
pitch_diameter: 60.0
shaft_rpm: 1296.0
weights:
  bpfi_band_energy: 0.1
  bpfo_band_energy: 0.1
  sideband_energy_ratio_rms_3: 0.1
  sideband_modulation_index_r

## Step 4: Load the Dataset

In [24]:
# Resolve the CSV location from the configuration constants.
dataset_path = os.path.join(DATA_DIR, DATASET_FILENAME)

# Fail early with a clear message when the file is not where we expect it.
dataset_missing = not os.path.exists(dataset_path)
if dataset_missing:
    raise FileNotFoundError(f"Dataset not found at {dataset_path}. Make sure it's in the '{DATA_DIR}' directory.")

print(f"Loading dataset from: {dataset_path}")
df = pd.read_csv(dataset_path)

# When a timestamp column is present, parse it and put the rows in
# chronological order with a fresh 0..n-1 index.
has_timestamp = "timestamp" in df.columns
if has_timestamp:
    df = (
        df.assign(timestamp=pd.to_datetime(df["timestamp"]))
        .sort_values("timestamp")
        .reset_index(drop=True)
    )

print(f"Dataset loaded successfully. Shape: {df.shape}")
df.head()

Loading dataset from: data\iotts.harmonics_247.csv
Dataset loaded successfully. Shape: (2073, 184)


Unnamed: 0,timestamp,metaData.machine_id,metaData.tenant_id,vh3_21,vh2_21,ch1_16,ch3_1,ch3_14,ch3_24,ch3_7,...,vh1_1,vh1_26,vh3_3,ch2_0,ch3_29,vh2_6,ch3_21,ch2_29,ch2_6,vh1_11
0,2025-08-04 08:17:51+00:00,247,13,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2025-08-04 08:21:29+00:00,247,13,,,,2.103125,2.890625,2.1,3.6733639999999996e-41,...,0.0,0.0,,0.0,2.890625,,2.9,2.1,0.0,0.0
2,2025-08-04 08:22:00+00:00,247,13,,,2.0,2.1,2.1,2.896875,5.510186e-41,...,0.0,0.0,,2.00625,2.896875,,2.896875,0.0,3.6733639999999996e-41,0.0
3,2025-08-04 08:22:31+00:00,247,13,,,0.0,3.0,0.0,3.0,2.09375,...,0.0,0.0,,2.0,3.00625,,3.00625,0.0,0.0,0.0
4,2025-08-04 08:23:02+00:00,247,13,,,2.890625,5.510186e-41,2.0,2.890625,5.510186e-41,...,0.0,0.0,,2.0,0.0,,0.0,0.0,2.09375,0.0


## Step 5: BDF Calculation Logic

In [25]:
def normalize_value_minmax(x: float, stats: dict) -> float:
    """Scale ``x`` into [0, 1] using pre-computed min/max statistics.

    Args:
        x: Raw feature value.
        stats: Mapping read via the ``"min"`` and ``"max"`` keys.

    Returns:
        ``0.0`` when either bound is missing or both bounds are equal;
        otherwise ``x`` clipped to ``[min, max]`` and rescaled linearly.
    """
    lower = stats.get("min")
    upper = stats.get("max")

    # Without a usable, non-zero range there is nothing to scale against.
    unusable_range = upper is None or lower is None or upper == lower
    if unusable_range:
        return 0.0

    # Values outside the learned bounds saturate at the nearest bound.
    bounded = np.clip(x, lower, upper)
    span = upper - lower
    return float((bounded - lower) / span)

def compute_bdf_row(row: pd.Series, weights: dict, norm_cfg: dict) -> float:
    """Return the weighted average of the normalized features in one row.

    Args:
        row: One row of the feature frame.
        weights: Mapping of feature name to weight.
        norm_cfg: Mapping of feature name to its ``{"min", "max"}`` stats;
            features without an entry fall back to the range 0.0-1.0.

    Returns:
        The weight-normalized sum over features that are present in ``row``
        and not null, or ``0.0`` when the accumulated weight is not positive.
    """
    fallback_stats = {"min": 0.0, "max": 1.0}
    weighted_sum = 0.0
    weight_total = 0.0

    for feat, w in weights.items():
        # Features that are absent or null for this row contribute nothing,
        # including to the weight denominator.
        if feat not in row.index:
            continue
        raw = row[feat]
        if not pd.notna(raw):
            continue

        stats_for_feat = norm_cfg.get(feat, fallback_stats)
        scaled = normalize_value_minmax(float(raw), stats_for_feat)
        weighted_sum += scaled * w
        weight_total += w

    if weight_total > 0:
        return float(weighted_sum / weight_total)
    return 0.0

print("BDF calculation functions are defined.")

BDF calculation functions are defined.


## Step 6: Perform Feature Engineering

In [26]:
# Work on a copy so the raw frame loaded earlier stays untouched and this cell
# can be re-run without accumulating state.
df_features = df.copy()

# 1. Discover bins and calculate RMS
print("Step 1: Calculating RMS values...")
available_bins = discover_available_bins(df_features)
df_features, rms_cols = calculate_rms_dynamic(df_features, available_bins)

# 2. Calculate fault frequencies from config
print("Step 2: Calculating fault frequencies (BPFO/BPFI)...")
bpfo_hz, bpfi_hz = bpfo_bpfi_from_geometry(config)

# 3. Calculate band energies
print("Step 3: Calculating band energies...")
df_features["bpfo_band_energy"] = band_energy_around_dynamic(df_features[rms_cols], bpfo_hz, config["freq_resolution"], available_bins)
df_features["bpfi_band_energy"] = band_energy_around_dynamic(df_features[rms_cols], bpfi_hz, config["freq_resolution"], available_bins)

# 4. Calculate stateless spectral features
print("Step 4: Calculating stateless spectral features...")
# Shaft rotation frequency in Hz, mapped to the closest available spectral bin.
fr_hz = config["shaft_rpm"] / 60.0
target_bin = int(round(fr_hz / config["freq_resolution"]))
fundamental_bin = nearest_bin(target_bin, available_bins)
trip_bins = neighbor_triplet(fundamental_bin, available_bins)
trip_cols = [f"rms_{k}" for k in trip_bins]

ordered_rms_cols = [f"rms_{k}" for k in available_bins]
# Pull both inputs out as arrays once; the loop then indexes purely by
# position, so it no longer depends on the frame having a 0..n-1 index.
rms_arr = df_features[ordered_rms_cols].values
trip_arr = df_features[trip_cols].values.astype(float)

# Output column -> function, grouped by which input the function receives.
# Dict order fixes both the per-row call order and the final column order.
full_spectrum_features = {
    "spectral_kurtosis_rms_0": spectral_kurtosis,
    "spectral_crest_factor_rms_1": spectral_crest_factor,
}
triplet_features = {
    "total_harmonic_distortion_rms_2": thd_centered_triplet,
    "sideband_energy_ratio_rms_3": sideband_energy_ratio,
    "sideband_modulation_index_rms_4": sideband_modulation_index,
}

# Collect per-row results in plain lists and attach them as whole columns
# afterwards; writing scalars into the frame row by row is slow and mixed
# positional reads with label-based writes.
row_features = {name: [] for name in (*full_spectrum_features, *triplet_features)}
for i in tqdm(range(len(df_features)), desc="Calculating features per row"):
    row_spec = rms_arr[i, :]
    trip_vals = trip_arr[i, :]
    for name, feature_fn in full_spectrum_features.items():
        row_features[name].append(feature_fn(row_spec))
    for name, feature_fn in triplet_features.items():
        row_features[name].append(feature_fn(trip_vals))

# Positional assignment; this also creates the columns when the frame is empty.
for name, values in row_features.items():
    df_features[name] = np.asarray(values, dtype=float)

# NOTE(review): in the recorded run the three triplet-based features reach
# ~1e41, which dwarfs their typical values and the min/max bounds in the
# config; presumably caused by near-zero readings (e.g. 5.5e-41) in the
# raw data - confirm and consider guarding in the training script.

print("✅ Feature engineering complete.")
df_features[list(config['weights'].keys())].describe()

Step 1: Calculating RMS values...
Step 2: Calculating fault frequencies (BPFO/BPFI)...
Step 3: Calculating band energies...
Step 4: Calculating stateless spectral features...


Calculating features per row:   0%|          | 0/2073 [00:00<?, ?it/s]

✅ Feature engineering complete.


Unnamed: 0,spectral_kurtosis_rms_0,spectral_crest_factor_rms_1,total_harmonic_distortion_rms_2,sideband_energy_ratio_rms_3,sideband_modulation_index_rms_4,bpfo_band_energy,bpfi_band_energy
count,1945.0,1945.0,1759.0,1759.0,1759.0,2073.0,2073.0
mean,1.280645,1.883539,9.961393e+39,9.961393e+39,9.961393e+39,1.125758,1.130052
std,5.737147,0.834473,2.604462e+40,2.604462e+40,2.604462e+40,0.623388,0.663816
min,-2.148127,1.054093,1.0,1.0,1.0,0.0,0.0
25%,-1.228049,1.468482,1.158017,1.158017,1.583955,0.694023,0.595392
50%,-0.232122,1.630153,1.572223,1.572223,2.213213,1.171239,1.191988
75%,1.300073,1.927744,3.505428,3.505428,4.35976,1.57996,1.637278
max,30.0,5.477226,2.619074e+41,2.619074e+41,2.619074e+41,3.559124,2.995595


## Step 7: Calculate BDF Score and Detect Faults

In [27]:
print("Calculating BDF scores for all rows...")

# Extract weights, normalization stats and the fault threshold from the loaded config
weights = config["weights"]
norm_stats = config["normalization"]["minmax"]
bdf_threshold = config["bdf_threshold"]

# Score every row: compute_bdf_row returns the weighted average of the
# normalized features that are present and non-null in that row.
df_features["bdf_score"] = df_features.apply(
    lambda row: compute_bdf_row(row, weights, norm_stats),
    axis=1
)

# A row is flagged when its score meets or exceeds the threshold.
df_features["fault_detected"] = df_features["bdf_score"] >= bdf_threshold

# The message states ">=" so it matches the comparison actually applied above.
print(f"BDF calculation complete. Threshold for fault is >= {bdf_threshold}")
print(f"Total faults detected: {df_features['fault_detected'].sum()} out of {len(df_features)} data points.")

# Display results
df_features[['bdf_score', 'fault_detected']].head()

Calculating BDF scores for all rows...
BDF calculation complete. Threshold for fault is > 0.7
Total faults detected: 0 out of 2073 data points.


Unnamed: 0,bdf_score,fault_detected
0,0.0,False
1,0.144566,False
2,0.181372,False
3,0.207811,False
4,0.152421,False


## Step 8: Visualize the BDF Results

In [28]:
print("Generating Plotly visualization...")

# Determine the x-axis: use 'timestamp' if available, otherwise use index
has_timestamp = 'timestamp' in df_features.columns
x_axis_data = df_features['timestamp'] if has_timestamp else df_features.index
x_axis_title = 'Timestamp' if has_timestamp else 'Data Point Index'


# Create the main figure
fig = go.Figure()

# Add the BDF score as a line plot
fig.add_trace(go.Scatter(
    x=x_axis_data,
    y=df_features['bdf_score'],
    mode='lines',
    name='BDF Score',
    line=dict(color='blue', width=2)
))

# Add the detected fault points as markers
fault_points = df_features[df_features['fault_detected']]
# Pick the fault x-values from the same source as the main trace. Looking the
# column up via `x_axis_data.name` fails in the index fallback, because an
# unnamed index has name None and that is not a column.
fault_x = fault_points['timestamp'] if has_timestamp else fault_points.index
fig.add_trace(go.Scatter(
    x=fault_x,
    y=fault_points['bdf_score'],
    mode='markers',
    name='Fault Detected',
    marker=dict(color='red', size=8, symbol='x')
))

# Add a horizontal line for the threshold
fig.add_hline(
    y=bdf_threshold,
    line_width=2,
    line_dash="dash",
    line_color="orange",
    annotation_text=f"Fault Threshold ({bdf_threshold})",
    annotation_position="bottom right"
)

# Update layout for better readability
fig.update_layout(
    title=f'Bearing Degradation Factor (BDF) for Machine {MACHINE_ID}',
    xaxis_title=x_axis_title,
    yaxis_title='BDF Score',
    legend_title='Legend',
    template='plotly_white',
    height=600
)

fig.show()

Generating Plotly visualization...
