In [2]:
pip install matplotlib seaborn scikit-learn


Collecting matplotlib
  Using cached matplotlib-3.10.6-cp311-cp311-win_amd64.whl.metadata (11 kB)
Collecting seaborn
  Using cached seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Collecting scikit-learn
  Using cached scikit_learn-1.7.2-cp311-cp311-win_amd64.whl.metadata (11 kB)
Collecting contourpy>=1.0.1 (from matplotlib)
  Using cached contourpy-1.3.3-cp311-cp311-win_amd64.whl.metadata (5.5 kB)
Collecting cycler>=0.10 (from matplotlib)
  Using cached cycler-0.12.1-py3-none-any.whl.metadata (3.8 kB)
Collecting fonttools>=4.22.0 (from matplotlib)
  Downloading fonttools-4.60.0-cp311-cp311-win_amd64.whl.metadata (113 kB)
     ---------------------------------------- 0.0/113.8 kB ? eta -:--:--
     --- ------------------------------------ 10.2/113.8 kB ? eta -:--:--
     --------- --------------------------- 30.7/113.8 kB 435.7 kB/s eta 0:00:01
     ------------------- ----------------- 61.4/113.8 kB 465.5 kB/s eta 0:00:01
     ------------------------------------ 113.8/113.8 kB 734.


[notice] A new release of pip is available: 24.0 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
pip install pandas numpy


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [9]:
!pip install xgboost


Collecting xgboost
  Using cached xgboost-3.0.5-py3-none-win_amd64.whl.metadata (2.1 kB)
Using cached xgboost-3.0.5-py3-none-win_amd64.whl (56.8 MB)
Installing collected packages: xgboost
Successfully installed xgboost-3.0.5



[notice] A new release of pip is available: 24.0 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import roc_auc_score, roc_curve, confusion_matrix, ConfusionMatrixDisplay
import numpy as np
import os


In [3]:
from pathlib import Path
import pandas as pd
import joblib
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.model_selection import train_test_split

# Step 1: Set Correct Project Path (1 level up from notebooks/)
PROJECT_ROOT = Path().resolve().parent
DATA_DIR = PROJECT_ROOT / "data"
MODELS_DIR = PROJECT_ROOT / "models"
OUTPUTS_DIR = PROJECT_ROOT / "outputs"

# Step 2: Load model and data
clf = joblib.load(MODELS_DIR / "xgb_binary_classifier.pkl")
df = pd.read_csv(DATA_DIR / "processed_sensor_data.csv")
df['label'] = (df['RUL'] < 20).astype(int)

# Step 3: Split test data
units = df['unit'].unique()
_, test_units = train_test_split(units, test_size=0.2, random_state=42)
test_df = df[df['unit'].isin(test_units)]

exclude_cols = ['unit', 'cycle', 'RUL', 'label']
features = [col for col in df.columns if col not in exclude_cols]
X_test = test_df[features]
y_test = test_df['label']

# Step 4: Predict + ROC-AUC Plot
y_probs = clf.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_probs)
roc_auc = roc_auc_score(y_test, y_probs)

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label=f"ROC Curve (AUC = {roc_auc:.2f})", color="navy")
plt.plot([0, 1], [0, 1], linestyle='--', color='gray')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC-AUC Curve – Failure Prediction")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.savefig(OUTPUTS_DIR / "roc_auc_curve.png", dpi=300)
plt.clf()
print("✅ ROC-AUC plot saved.")


✅ ROC-AUC plot saved.


<Figure size 800x600 with 0 Axes>

In [4]:
# ✅ What: Visualizes true/false positives and negatives
# 🤔 Why: Gives insight into model’s classification accuracy
# 🛠️ How: Use sklearn's confusion_matrix and heatmap

# Predict labels
y_pred = clf.predict(X_test)

# Generate matrix
cm = confusion_matrix(y_test, y_pred)
labels = ["Healthy", "Will Fail Soon"]

# Plot
plt.figure(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=labels, yticklabels=labels)
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.title("Confusion Matrix – Failure Classification")

# Save
plt.tight_layout()
plt.savefig("outputs/confusion_matrix.png", dpi=300)
plt.clf()
print("✅ Confusion matrix plot saved.")


✅ Confusion matrix plot saved.


<Figure size 600x500 with 0 Axes>

In [9]:
# ✅ What: Bar plot of most important sensor features
# 🤔 Why: Shows which features the model used most in decision-making
# 🛠️ How: Use XGBoost's built-in feature_importances_ attribute

# ---------------------------------------------
# ✅ Step 0: Prepare Train Data for Reference
# ---------------------------------------------
from sklearn.model_selection import train_test_split

# Reload full dataset
df = pd.read_csv(DATA_DIR / "processed_sensor_data.csv")
df['label'] = (df['RUL'] < 20).astype(int)

# Split units
units = df['unit'].unique()
train_units, test_units = train_test_split(units, test_size=0.2, random_state=42)

# Filter rows
train_df = df[df['unit'].isin(train_units)]

# Features/target
target = 'label'
exclude_cols = ['unit', 'cycle', 'RUL', 'label']
features = [col for col in df.columns if col not in exclude_cols]

X_train = train_df[features]


# Extract feature importance
importances = clf.feature_importances_
feature_names = X_train.columns
importance_df = pd.DataFrame({
    "Feature": feature_names,
    "Importance": importances
}).sort_values(by="Importance", ascending=False)

# Plot
plt.figure(figsize=(10, 6))
sns.barplot(x="Importance", y="Feature", data=importance_df.head(15), palette="viridis")
plt.title("Top 15 Important Features – XGBoost Classifier")
plt.tight_layout()
plt.savefig(OUTPUTS_DIR / "feature_importance.png", dpi=300)
plt.clf()
print("✅ Feature importance plot saved.")




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x="Importance", y="Feature", data=importance_df.head(15), palette="viridis")


✅ Feature importance plot saved.


<Figure size 1000x600 with 0 Axes>