In [1]:
# --- Cell 1: Setup & MLflow ---
import os, joblib, yaml
import numpy as np
import pandas as pd
import mlflow

from mlflow_base import MLflowBase  # your wrapper

# IDs & dataset — defined first so the experiment name below is derived from
# them instead of repeating the same literals.
dataset_path = "iot.harmonics_m78_may26.csv"
tenant_id    = "16"
machine_id   = "78"

# Experiment / run naming
# NOTE(review): assumes the experiment path layout is
# "rotor_fault_monitoring/<tenant_id>/<machine_id>"; with the IDs above this
# evaluates to the previously hard-coded "rotor_fault_monitoring/16/78".
EXPERIMENT_NAME = f"rotor_fault_monitoring/{tenant_id}/{machine_id}"
RUN_NAME        = "pca_ocsvm_sidebands_v1"   # algorithm/approach name

# Core training parameters
n_components = 5
svm_nu       = 0.05
svm_gamma    = "scale"

# End any existing active run (notebook safety: re-running this cell must not
# nest a new run inside a stale one)
if mlflow.active_run() is not None:
    mlflow.end_run()

# Start run
mlbase = MLflowBase(EXPERIMENT_NAME)
run = mlbase.start_run(run_name=RUN_NAME)
print("MLflow run:", mlflow.active_run().info.run_id)

# Record the configuration of this run; only the file name of the dataset is
# logged, not its directory.
mlbase.log_params({
    "algorithm_name": RUN_NAME,
    "dataset_path": os.path.basename(dataset_path),
    "tenant_id": tenant_id,
    "machine_id": machine_id,
    "pca_n_components": n_components,
    "svm_nu": svm_nu,
    "svm_gamma": svm_gamma,
})


MLflow run: 3ca55ecbcf484916ba288aeb803f7ffd


In [2]:
# --- Cell 2: Load & preprocess data ---
# Read the CSV and parse the timestamp column in one chain
# (errors="coerce" is passed through to pd.to_datetime unchanged).
df = pd.read_csv(dataset_path).assign(
    timestamp=lambda frame: pd.to_datetime(frame["timestamp"], errors="coerce")
)

# Remove bookkeeping columns, but only the ones present in this export.
unwanted = ("_id", "tenant_id", "machine_id", "type")
drop_cols = [name for name in unwanted if name in df.columns]
df = df.drop(columns=drop_cols)

print("Data shape:", df.shape)
display(df.head(3))

# Everything except the timestamp is handed on to the trainer.
harmonics_data = df.drop(columns="timestamp")


Data shape: (219396, 181)


Unnamed: 0,timestamp,vh2_5,vh2_4,vh1_20,vh2_23,vh2_9,vh1_14,vh1_26,vh1_12,vh1_0,...,ch1_7,ch2_6,ch1_1,ch3_0,ch3_26,ch1_2,ch2_15,ch1_23,ch2_5,ch2_3
0,2024-03-01 00:00:04+00:00,0.0,5.5,4.173536e-08,0.0,0.0,-1.5949370000000002e-23,-1.5949370000000002e-23,4.173536e-08,0.0,...,,,,,,,,,,
1,2024-03-01 00:00:26+00:00,5.59375,1.8366819999999998e-41,0.0,0.0,7.346868e-41,0.0,0.0,0.0,0.0,...,,,,,,,,,,
2,2024-03-01 00:00:49+00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,


In [3]:
# --- Cell 3: Define AdvancedRotorFaultTrainer ---
from sklearn.decomposition import PCA
from sklearn.svm import OneClassSVM
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import time

class AdvancedRotorFaultTrainer:
    """Unsupervised rotor-fault model: impute -> standardise -> PCA -> One-Class SVM.

    The trainer selects a fixed set of harmonic columns from a DataFrame
    (see ``extract_sideband_features``) and fits the four stages in sequence.
    The fitted stages remain available as ``imputer``, ``scaler``, ``pca`` and
    ``svm`` so callers can score new data or persist them.
    """

    # Expected column names follow "<prefix><phase>_<harmonic>", e.g. "ch1_7".
    # NOTE(review): "ch"/"vh" presumably denote current/voltage harmonics — confirm.
    PHASE_PREFIXES = ("ch", "vh")
    PHASE_NUMBERS = (1, 2, 3)
    # NOTE(review): the dataset preview also shows index-0 columns (e.g. vh1_0),
    # which this 1..30 range does not select — confirm that is intended.
    HARMONIC_INDICES = tuple(range(1, 31))

    def __init__(self, n_components, svm_nu, svm_gamma):
        """Create the (unfitted) pipeline stages.

        Args:
            n_components: forwarded to ``PCA(n_components=...)``.
            svm_nu: forwarded to ``OneClassSVM(nu=...)``.
            svm_gamma: forwarded to ``OneClassSVM(gamma=...)``.
        """
        self.pca = PCA(n_components=n_components)
        self.scaler = StandardScaler()
        self.svm = OneClassSVM(nu=svm_nu, gamma=svm_gamma)
        self.imputer = SimpleImputer(strategy='mean')

    def extract_sideband_features(self, harmonics_data, slip_freq=5, fundamental=50):
        """Build the samples × features matrix from the harmonic columns.

        Columns are taken in a fixed order: prefix ("ch", then "vh"), then
        harmonic index 1..30, then phase 1..3 — 180 features in total. A column
        that is absent from ``harmonics_data`` is replaced by an all-NaN column
        so the feature layout stays the same for every input.

        Args:
            harmonics_data: DataFrame holding the harmonic columns.
            slip_freq: currently unused; kept for interface compatibility.
            fundamental: currently unused; kept for interface compatibility.

        Returns:
            2-D ``numpy`` array with one row per input row and one column per
            expected harmonic column.
        """
        n_rows = len(harmonics_data)
        available = harmonics_data.columns
        feature_cols = []
        missing = []

        for prefix in self.PHASE_PREFIXES:
            for h in self.HARMONIC_INDICES:
                for ph in self.PHASE_NUMBERS:   # 1–3 phases
                    col = f"{prefix}{ph}_{h}"
                    if col in available:
                        feature_cols.append(harmonics_data[col].values)
                    else:
                        missing.append(col)
                        feature_cols.append(np.full(n_rows, np.nan))

        # One summary line instead of one print per missing column (up to 180).
        if missing:
            print(
                f"Warning: {len(missing)} expected column(s) missing, "
                f"filling with NaN: {', '.join(missing)}"
            )

        # Stack into samples × features matrix
        return np.column_stack(feature_cols)

    def fit(self, harmonics_data):
        """Fit imputer, scaler, PCA and One-Class SVM on ``harmonics_data``.

        Each stage is fitted on the output of the previous one and its
        wall-clock duration is printed.

        Args:
            harmonics_data: DataFrame passed to ``extract_sideband_features``.

        Returns:
            The PCA-projected training data (output of ``pca.fit_transform``),
            i.e. the exact matrix the SVM was fitted on.
        """
        # perf_counter is monotonic, so the intervals cannot be skewed by
        # system clock adjustments the way time.time() differences can.
        clock = time.perf_counter

        t0 = clock()
        sideband_data = self.extract_sideband_features(harmonics_data)
        print(f"Feature extraction done in {clock()-t0:.2f}s")

        t1 = clock()
        imputed_data = self.imputer.fit_transform(sideband_data)
        print(f"Imputation done in {clock()-t1:.2f}s")

        t2 = clock()
        scaled_data = self.scaler.fit_transform(imputed_data)
        print(f"Scaling done in {clock()-t2:.2f}s")

        t3 = clock()
        pca_data = self.pca.fit_transform(scaled_data)
        print(f"PCA done in {clock()-t3:.2f}s")

        t4 = clock()
        self.svm.fit(pca_data)
        print(f"SVM fit done in {clock()-t4:.2f}s")

        print(f"✅ Total training time: {clock()-t0:.2f}s")
        return pca_data

In [4]:
# --- Cell 4: Train model ---
trainer = AdvancedRotorFaultTrainer(n_components, svm_nu, svm_gamma)

# Number of rows used for fitting. 5000 is a quick pipeline-validation setting
# only — set this to None to train on the full dataset for a real run.
TRAIN_SAMPLE_SIZE = 5000

if TRAIN_SAMPLE_SIZE is None or TRAIN_SAMPLE_SIZE >= len(harmonics_data):
    # Full data (also avoids the ValueError .sample() raises when n > rows).
    harmonics_subset = harmonics_data
else:
    harmonics_subset = harmonics_data.sample(n=TRAIN_SAMPLE_SIZE, random_state=42)

# Record how much data the logged model actually saw, so a smoke-test run is
# never mistaken for a full training run. Tags (unlike params) may be
# overwritten, which keeps this cell safe to re-run within the same run.
mlflow.set_tag("train_rows", str(len(harmonics_subset)))
mlflow.set_tag("train_is_subsample", str(len(harmonics_subset) < len(harmonics_data)))

pca_data = trainer.fit(harmonics_subset)

print("✅ Model trained. PCA variance explained:", trainer.pca.explained_variance_ratio_.sum())


Feature extraction done in 0.02s
Imputation done in 0.08s
Scaling done in 0.04s




PCA done in 0.07s
SVM fit done in 0.41s
✅ Total training time: 0.63s
✅ Model trained. PCA variance explained: 0.48903782820937


In [5]:
# --- Cell 5: Log metrics & save model ---
from sklearn.pipeline import Pipeline  # local import: only this cell needs it

# Percentile of the training decision scores stored as the anomaly threshold.
THRESHOLD_PERCENTILE = 5

# Decision scores of the training samples (pca_data is what the SVM was fitted on)
scores = trainer.svm.decision_function(pca_data)

metrics = {
    "explained_variance": float(trainer.pca.explained_variance_ratio_.sum()),
    "score_mean": float(np.mean(scores)),
    "score_std": float(np.std(scores)),
}
mlbase.log_metrics(metrics)

# Save model locally. The tuple layout (imputer, scaler, pca, svm) is kept
# unchanged so existing loaders of this pickle keep working.
model_path = f"rotor_fault_detector_{machine_id}.pkl"
joblib.dump((trainer.imputer, trainer.scaler, trainer.pca, trainer.svm), model_path)

# Log model in MLflow. The SVM alone cannot score anything: it was fitted on
# imputed, scaled, PCA-projected data. Wrap the already-fitted stages in one
# Pipeline so the logged model carries its preprocessing with it.
# Its input is the feature matrix from trainer.extract_sideband_features(),
# not the raw DataFrame.
fitted_pipeline = Pipeline([
    ("imputer", trainer.imputer),
    ("scaler", trainer.scaler),
    ("pca", trainer.pca),
    ("svm", trainer.svm),
])
mlflow.sklearn.log_model(fitted_pipeline, artifact_path="model")
mlbase.log_artifact(run.info.run_id, local_path=model_path, artifact_path="artifacts")

# Save config YAML
config = {
    "tenant_id": tenant_id,
    "machine_id": machine_id,
    "pca_n_components": n_components,
    "svm_nu": svm_nu,
    "svm_gamma": svm_gamma,
    "decision_threshold": float(np.percentile(scores, THRESHOLD_PERCENTILE)),  # anomaly threshold
    "model_path": model_path,
}
yaml_path = f"rotor_fault_config_{machine_id}.yaml"
with open(yaml_path, "w", encoding="utf-8") as f:
    yaml.dump(config, f, sort_keys=False)

mlbase.log_artifact(run.info.run_id, local_path=yaml_path, artifact_path="configs")
print("✅ Model + Config logged to MLflow")


INFO:botocore.credentials:Found credentials in environment variables.


✅ Model + Config logged to MLflow


In [6]:
# --- Cell 5b: Attach run documentation (note) ---
import mlflow
import numpy as np

def _fmt(x, nd=4):
    if x is None or (isinstance(x, float) and (np.isnan(x) or np.isinf(x))):
        return "NaN"
    try:
        return f"{float(x):.{nd}f}"
    except Exception:
        return str(x)

# Markdown summary shown as the run description in the MLflow UI
# ("mlflow.note.content" is the tag MLflow uses for the run note).
# The two trailing spaces on the "Usage" lines are Markdown hard line breaks.
run_note = f"""
# 🌀 Rotor Fault Detection — Run Summary

### Parameters
- **Algorithm**: `{RUN_NAME}`
- **Tenant / Machine**: `{tenant_id}` / `{machine_id}`
- **Dataset**: `{os.path.basename(dataset_path)}`
- **PCA components**: `{n_components}`
- **SVM parameters**: ν = `{svm_nu}`, γ = `{svm_gamma}`

### Key Metrics
- **Explained variance (PCA)**: `{_fmt(metrics['explained_variance'])}`
- **SVM score mean**: `{_fmt(metrics['score_mean'])}`
- **SVM score std**: `{_fmt(metrics['score_std'])}`

### Artifacts
- **Model file**: `{model_path}`
- **Config YAML**: `{yaml_path}`

---

**Usage**:  
This run trained a PCA + OneClassSVM rotor fault model and logged it to MLflow.  
"""

mlflow.set_tag("mlflow.note.content", run_note)


In [7]:
# --- Cell 6: End MLflow run ---
# Ends the currently active MLflow run (the one started in Cell 1) and prints
# a confirmation. Run this last: the earlier cells log to the active run.
mlflow.end_run()
print("MLflow run ended.")


🏃 View run pca_ocsvm_sidebands_v1 at: https://mlops.zolnoi.app/#/experiments/15/runs/3ca55ecbcf484916ba288aeb803f7ffd
🧪 View experiment at: https://mlops.zolnoi.app/#/experiments/15
MLflow run ended.
