In [1]:
# Import necessary libraries
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# For ONNX conversion
import skl2onnx
from skl2onnx.common.data_types import FloatTensorType

# --- Configuration ---
# Input CSV; the path is relative to the directory the notebook runs from.
DATASET_PATH = "../datasets/MASTER_DATASET.csv"

# Every output lands in one artifacts directory. Two model files are written:
# an ONNX export for fast inference and a joblib dump for interpretability.
_ARTIFACTS_DIR = "../artifacts"
# NOTE(review): the ONNX filename says "isolation_forest" while the joblib
# file is named for a random forest — presumably kept so existing consumers
# of the old filename keep working; confirm before renaming.
ONNX_MODEL_PATH = f"{_ARTIFACTS_DIR}/isolation_forest_model.onnx"
SKLEARN_MODEL_PATH = f"{_ARTIFACTS_DIR}/rtm_random_forest.joblib"
# Scaler parameters are stored as two separate NumPy arrays.
MEAN_PATH = f"{_ARTIFACTS_DIR}/rtm_scaler_mean.npy"
SCALE_PATH = f"{_ARTIFACTS_DIR}/rtm_scaler_scale.npy"

# --- 1. Load Data ---
# Read the whole CSV into one DataFrame; all later sections operate on `df`.
print("Loading data...")
df = pd.read_csv(filepath_or_buffer=DATASET_PATH)
print("✅ Data loaded successfully.")


# --- 2. Feature Engineering ---
print("\n--- Starting Feature Engineering ---")
# Number of consecutive rows covered by each rolling window.
WINDOW_SIZE = 5

# Columns used as features exactly as they appear in the dataset.
_raw_feature_columns = [
    "Pressure_In",
    "Temperature_In",
    "Flow_Rate",
    "Pressure_Out",
    "Temperature_Out",
    "Efficiency",
    "Power_Consumption",
    "Vibration",
    "Ambient_Temperature",
    "Humidity",
    "Air_Pollution",
    "Frequency",
    "Amplitude",
    "Phase_Angle",
    "Velocity",
    "Stiffness",
]

# Rolling mean and rolling standard deviation of the two signals below, each
# stored as "<signal>_roll_<stat>". The loop is statistic-major so the new
# columns are created, and listed, as: both means first, then both stds.
_rolling_feature_columns = []
for _stat in ("mean", "std"):
    for _signal in ("Vibration", "Power_Consumption"):
        _column = f"{_signal}_roll_{_stat}"
        _windowed = df[_signal].rolling(window=WINDOW_SIZE)
        df[_column] = getattr(_windowed, _stat)()
        _rolling_feature_columns.append(_column)

# Rows without a full window yield NaN for the rolling columns. Back-fill and
# then forward-fill remove them; note both calls act on every column of `df`,
# not only the rolling ones.
df.bfill(inplace=True)
df.ffill(inplace=True)

# Final model input: raw columns followed by the engineered rolling columns.
FEATURE_COLUMNS = _raw_feature_columns + _rolling_feature_columns
print(f"✅ Feature engineering complete. Total features: {len(FEATURE_COLUMNS)}")


# --- 3. Prepare Data for Supervised Learning ---
print("\n--- Preparing data for supervised learning ---")
# Feature matrix: only the columns named in FEATURE_COLUMNS, in that order.
X = df[FEATURE_COLUMNS]
# Target: +1 where Status is exactly "Normal", -1 for every other value.
_is_normal = df["Status"].eq("Normal")
y = _is_normal.map({True: 1, False: -1})

# Hold out 20% for testing; stratifying on y keeps the class ratio the same
# in both partitions, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

# The scaler learns its mean/scale from the training rows only and is then
# applied unchanged to both partitions.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
print("✅ Data split and scaled successfully.")


# --- 4. Train New RandomForest Model ---
print("\n--- Training New RandomForest Model ---")
# 100 trees, a fixed seed for repeatable results, and n_jobs=-1 to use all
# available CPU cores.
_forest_params = {"n_estimators": 100, "random_state": 42, "n_jobs": -1}
model = RandomForestClassifier(**_forest_params)
# Trained on the scaled training features, so inference inputs must be scaled
# with the same scaler before prediction.
model.fit(X_train_scaled, y_train)
print("✅ New model trained successfully.")


# --- 5. Evaluate the New Model ---
print("\n--- Evaluating New Model Performance on Test Data ---")
y_pred = model.predict(X_test_scaled)

# Pin the label order explicitly instead of relying on whichever labels
# happen to appear in y_test / y_pred. This guarantees each display name is
# attached to the intended class and that the confusion matrix is always 2x2
# with rows/columns ordered [anomaly, normal], even if one class is absent.
class_labels = [-1, 1]
class_names = ["Anomaly (-1)", "Normal (1)"]

print("\n--- Model Performance Metrics ---")
print(
    classification_report(
        y_test, y_pred, labels=class_labels, target_names=class_names
    )
)
print("\n--- Confusion Matrix ---")
# Rows are true labels, columns are predicted labels, both in class_labels order.
cm = confusion_matrix(y_test, y_pred, labels=class_labels)
print(cm)


# --- 6. Save New Models (ONNX & Sklearn) and Scaler ---
print("\n--- Saving new models and scaler ---")

# Create the output directories up front so a missing artifacts folder does
# not throw away a finished training run at the first write.
for artifact_path in (ONNX_MODEL_PATH, SKLEARN_MODEL_PATH, MEAN_PATH, SCALE_PATH):
    Path(artifact_path).parent.mkdir(parents=True, exist_ok=True)

# Save the model in ONNX format for fast prediction.
# Opsets are pinned for the default ONNX domain and the ai.onnx.ml domain.
target_opset = {"": 15, "ai.onnx.ml": 3}
# Single float input named "input": any number of rows, one column per feature.
initial_type = [("input", FloatTensorType([None, len(FEATURE_COLUMNS)]))]
onnx_model = skl2onnx.convert_sklearn(
    model, initial_types=initial_type, target_opset=target_opset
)
with open(ONNX_MODEL_PATH, "wb") as f:
    f.write(onnx_model.SerializeToString())
print(f"✅ New ONNX model saved to: {ONNX_MODEL_PATH}")

# Save the original scikit-learn model using joblib for future use (e.g., interpretability)
joblib.dump(model, SKLEARN_MODEL_PATH)
print(f"✅ New Sklearn model saved to: {SKLEARN_MODEL_PATH}")

# Save the scaler parameters as plain arrays. Only the classifier is exported
# above, so consumers must apply (x - mean) / scale themselves before
# calling either saved model.
np.save(MEAN_PATH, scaler.mean_)
np.save(SCALE_PATH, scaler.scale_)
print(f"✅ New scaler parameters saved to: {MEAN_PATH} and {SCALE_PATH}")

Loading data...
✅ Data loaded successfully.

--- Starting Feature Engineering ---
✅ Feature engineering complete. Total features: 20

--- Preparing data for supervised learning ---
✅ Data split and scaled successfully.

--- Training New RandomForest Model ---
✅ New model trained successfully.

--- Evaluating New Model Performance on Test Data ---

--- Model Performance Metrics ---
              precision    recall  f1-score   support

Anomaly (-1)       0.98      0.95      0.96     12082
  Normal (1)       0.93      0.97      0.95      7918

    accuracy                           0.96     20000
   macro avg       0.95      0.96      0.96     20000
weighted avg       0.96      0.96      0.96     20000


--- Confusion Matrix ---
[[11465   617]
 [  235  7683]]

--- Saving new models and scaler ---
✅ New ONNX model saved to: ../artifacts/isolation_forest_model.onnx
✅ New Sklearn model saved to: ../artifacts/rtm_random_forest.joblib
✅ New scaler parameters saved to: ../artifacts/rtm_scaler_