In [62]:
import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import (
    accuracy_score, confusion_matrix, classification_report,
)
from functions import (clean_col_names, evaluate_metrics)


In [64]:
# Evaluation dataset used by every cell below.
df = pd.read_csv('data/eval_data.csv')

# Persisted artifacts: the Stage 1 model, the Stage 2 model, and the dict of
# Stage 1 decision thresholds.
# NOTE: joblib.load unpickles its input, so only load files from a trusted source.
voting1 = joblib.load("models/voting_stage1_model.pkl")
voting2 = joblib.load("models/voting_stage2_model.pkl")
thresholds_loaded = joblib.load("models/voting_stage1_thresholds.pkl")

# Pull out the two stored thresholds: the F1-optimized one first, then the
# recall-optimized one.
t_f1, t_recall = (
    thresholds_loaded["best_threshold_f1"],
    thresholds_loaded["best_threshold_recall"],
)

In [3]:
# Integer-encode the 'Type' column, overwriting it in `df`.
# NOTE(review): the encoder is fitted on this evaluation frame rather than
# loaded from training; confirm the resulting codes match what the models
# were trained on.
le = LabelEncoder()
df['Type'] = le.fit_transform(df['Type'])

# Columns selected as Stage 1 model inputs, one per line for easy diffing.
feature_columns = [
    'Air temperature [K]',
    'Process temperature [K]',
    'Rotational speed [rpm]',
    'Torque [Nm]',
    'Tool wear [min]',
    'Temp_diff',
    'Torque_per_rpm',
    'Wear_per_rpm',
    'Type',
]

# Binary ground truth for Stage 1.
y1 = df['Target']

# Feature matrix, with column names passed through the project helper.
X1 = clean_col_names(df[feature_columns])


In [55]:
# Stage 1: binary failure detection over the full evaluation set, using the
# model's own decision rule (no custom threshold applied here).
y_pred_stage1 = voting1.predict(X1)
# Column 1 of predict_proba is kept as the failure score; the threshold cells
# compare it against t_recall / t_f1.
y_proba_stage1 = voting1.predict_proba(X1)[:, 1]

# Pin the label order so the matrix is always 2x2. Without `labels`, a run in
# which only one class appears in both y1 and the predictions yields a 1x1
# matrix and the four-way unpack below raises ValueError.
cm_stage1 = confusion_matrix(y1, y_pred_stage1, labels=[0, 1])

# Rows are true labels, columns are predicted labels: [[TN, FP], [FN, TP]].
TN, FP, FN, TP = cm_stage1.ravel()

# Stage 1 metrics
acc_stage1 = accuracy_score(y1, y_pred_stage1)
fnr_stage1 = FN / (FN + TP)   # False negative rate: share of real failures missed
fpr_stage1 = FP / (FP + TN)   # False positive rate: share of healthy rows flagged
tp_rate_stage1 = TP / (TP + FN)  # Recall; reused by the system summary cell

# The two "False ..." lines print rates (FNR / FPR), not raw counts.
print(f"Stage 1 Accuracy: {acc_stage1:.3f}")
print(f"False Negatives (missed failures): {fnr_stage1:.3%}")
print(f"False Positives (false alarms): {fpr_stage1:.3%}")
print(cm_stage1)

Stage 1 Accuracy: 0.998
False Negatives (missed failures): 3.030%
False Positives (false alarms): 0.093%
[[9634    9]
 [  10  320]]


In [30]:
# Per-class precision / recall / F1 for the default Stage 1 predictions.
print(classification_report(y_true=y1, y_pred=y_pred_stage1))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      9643
           1       0.97      0.97      0.97       330

    accuracy                           1.00      9973
   macro avg       0.99      0.98      0.99      9973
weighted avg       1.00      1.00      1.00      9973



In [57]:
# Stage 1 re-scored with the recall-optimized threshold: a row is flagged as a
# failure when its failure score reaches t_recall.
y_pred_recall = (y_proba_stage1 >= t_recall).astype(int)

print("\n=== Using Recall Threshold ===")
print(classification_report(y1, y_pred_recall))

# Pin the label order so the matrix is always 2x2. Without `labels`, a run in
# which only one class appears in both y1 and the predictions yields a 1x1
# matrix and the four-way unpack below raises ValueError.
cm_stage1r = confusion_matrix(y1, y_pred_recall, labels=[0, 1])

# Rows are true labels, columns are predicted labels: [[TN, FP], [FN, TP]].
TN_r, FP_r, FN_r, TP_r = cm_stage1r.ravel()

# Stage 1 metrics at the recall threshold
acc_stage1r = accuracy_score(y1, y_pred_recall)
fnr_stage1r = FN_r / (FN_r + TP_r)   # False negative rate
fpr_stage1r = FP_r / (FP_r + TN_r)   # False positive rate
tp_rate_stage1r = TP_r / (TP_r + FN_r)  # Recall; reused by the system summary cell

# The two "False ..." lines print rates (FNR / FPR), not raw counts.
print(f"Stage 1 Accuracy: {acc_stage1r:.3f}")
print(f"False Negatives (missed failures): {fnr_stage1r:.3%}")
print(f"False Positives (false alarms): {fpr_stage1r:.3%}")
print(cm_stage1r)


=== Using Recall Threshold ===
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      9643
           1       0.98      0.97      0.97       330

    accuracy                           1.00      9973
   macro avg       0.99      0.98      0.99      9973
weighted avg       1.00      1.00      1.00      9973

Stage 1 Accuracy: 0.998
False Negatives (missed failures): 3.333%
False Positives (false alarms): 0.073%
[[9636    7]
 [  11  319]]


In [58]:
# Stage 1 re-scored with the F1-optimized threshold: a row is flagged as a
# failure when its failure score reaches t_f1.
y_pred_f1 = (y_proba_stage1 >= t_f1).astype(int)


print("=== Using F1 Threshold ===")
print(classification_report(y1, y_pred_f1))

# Pin the label order so the matrix is always 2x2. Without `labels`, a run in
# which only one class appears in both y1 and the predictions yields a 1x1
# matrix and the four-way unpack below raises ValueError.
cm_stage1f = confusion_matrix(y1, y_pred_f1, labels=[0, 1])

# Rows are true labels, columns are predicted labels: [[TN, FP], [FN, TP]].
TN_f1, FP_f1, FN_f1, TP_f1 = cm_stage1f.ravel()

# Stage 1 metrics at the F1 threshold
acc_stage1f = accuracy_score(y1, y_pred_f1)
fnr_stage1f = FN_f1 / (FN_f1 + TP_f1)   # False negative rate
fpr_stage1f = FP_f1 / (FP_f1 + TN_f1)   # False positive rate
tp_rate_stage1f = TP_f1 / (TP_f1 + FN_f1)  # Recall; reused by the system summary cell

# The two "False ..." lines print rates (FNR / FPR), not raw counts.
print(f"Stage 1 Accuracy: {acc_stage1f:.3f}")
print(f"False Negatives (missed failures): {fnr_stage1f:.3%}")
print(f"False Positives (false alarms): {fpr_stage1f:.3%}")
print(cm_stage1f)

=== Using F1 Threshold ===
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      9643
           1       0.98      0.97      0.97       330

    accuracy                           1.00      9973
   macro avg       0.99      0.98      0.99      9973
weighted avg       1.00      1.00      1.00      9973

Stage 1 Accuracy: 0.998
False Negatives (missed failures): 3.333%
False Positives (false alarms): 0.073%
[[9636    7]
 [  11  319]]


In [None]:
# Keep only rows that are real failures AND were flagged by the default
# Stage 1 prediction.
idx_correct_failures = y1.eq(1) & (y_pred_stage1 == 1)

X_failures_correct = X1[idx_correct_failures]
df_failures_correct = df.loc[idx_correct_failures].copy()
# Despite its name, this holds the selected rows of y1 (the 'Target' column),
# not the failure-type labels.
y_true_fail_type = y1[idx_correct_failures]

# Encode the failure-cause labels as integers for Stage 2 scoring.
# NOTE(review): this refits the shared `le` (earlier fitted on 'Type') on the
# labels present in this subset only; confirm the resulting codes line up with
# the encoding the Stage 2 model was trained with.
y2 = le.fit_transform(df_failures_correct['Failure Type'])


# Real failures minus the ones Stage 1 caught = failures Stage 1 missed.
diff = len(df[df['Target'] == 1]) - len(X_failures_correct)

print(f"Number of Missed Failures without thresholds: {diff}")

Number of Missed Failures without thresholds: 10


In [23]:
# Show both stored Stage 1 thresholds, one per line.
print(
    f"t_f1 = {t_f1:.4f}",
    f"t_recall = {t_recall:.4f}",
    sep="\n",
)


t_f1 = 0.5149
t_recall = 0.5149


In [51]:
# Stage 2: failure-cause classification, run only on the failures that
# Stage 1 detected correctly.
y_pred_stage2 = voting2.predict(X_failures_correct)
y_proba_stage2 = voting2.predict_proba(X_failures_correct)

# Score the predictions: the project helper's metric dict, plus accuracy and
# the per-class report from scikit-learn.
metrics_voting2 = evaluate_metrics(y2, y_pred_stage2, y_proba_stage2)
acc_stage2 = accuracy_score(y2, y_pred_stage2)
report_stage2 = classification_report(y2, y_pred_stage2)

print("\nVoting Classifier Metrics:")
for metric_name, metric_value in metrics_voting2.items():
    print(f"{metric_name}: {metric_value}")

# Headline accuracy, a blank line, then the per-class report.
print(
    f"\nStage 2 (Failure Cause) Accuracy: {acc_stage2:.3f}",
    "",
    report_stage2,
    sep="\n",
)



Voting Classifier Metrics:
Accuracy: 0.98125
Precision: 0.9811940208321788
Recall: 0.98125
F1: 0.9811647516414503
MCC: 0.9740089400817524
ROC_AUC: 0.9979183199834033
Confusion_Matrix: [[112   0   0   0]
 [  1  75   1   1]
 [  1   0  91   0]
 [  0   2   0  36]]
AUC_PR: 0.9904033000803643

Stage 2 (Failure Cause) Accuracy: 0.981

              precision    recall  f1-score   support

           0       0.98      1.00      0.99       112
           1       0.97      0.96      0.97        78
           2       0.99      0.99      0.99        92
           3       0.97      0.95      0.96        38

    accuracy                           0.98       320
   macro avg       0.98      0.97      0.98       320
weighted avg       0.98      0.98      0.98       320



In [61]:
# Total samples
total_samples = len(y1)
failures_actual = np.sum(y1 == 1)
failures_predicted = np.sum(y_pred_stage1 == 1)

# Stage 1 effectiveness
correct_detections = TP
missed_failures = FN
false_alarms = FP

# Stage 1 effectiveness recall focused
correct_detections_r = TP_r
missed_failures_r = FN_r
false_alarms_r = FP_r

# Stage 1 effectiveness f1 focused
correct_detections_f1 = TP_f1
missed_failures_f1 = FN_f1
false_alarms_f1 = FP_f1

# Stage 2 effectiveness (among correctly detected failures)
correct_cause_preds = acc_stage2 * correct_detections

print(f"""
System Summary:
---------------
Total records: {total_samples}
Actual failures: {failures_actual}
Predicted failures: {failures_predicted}

Stage 1 (Failure Detection):
- Accuracy: {acc_stage1:.3f}
- Correct detections: {correct_detections} ({tp_rate_stage1:.2%})
- Missed failures (false negatives): {missed_failures} ({fnr_stage1:.2%})
- False alarms (false positives): {false_alarms} ({fpr_stage1:.2%})

Stage 1 (Failure Detection Recall Focused):
- Accuracy: {acc_stage1r:.3f}
- Correct detections: {correct_detections_r} ({tp_rate_stage1r:.2%})
- Missed failures (false negatives): {missed_failures_r} ({fnr_stage1r:.2%})
- False alarms (false positives): {false_alarms_r} ({fpr_stage1r:.2%})

Stage 1 (Failure Detection F1 Focused):
- Accuracy: {acc_stage1f:.3f}
- Correct detections: {correct_detections_f1} ({tp_rate_stage1f:.2%})
- Missed failures (false negatives): {missed_failures_f1} ({fnr_stage1f:.2%})
- False alarms (false positives): {false_alarms_f1} ({fpr_stage1f:.2%})

Stage 2 (Failure Cause Diagnosis):
- Accuracy: {acc_stage2:.3f}
- Correct cause predictions among detected failures: {acc_stage2*100:.2f}%

End-to-End:
- Overall system correctly predicted {acc_stage2 * tp_rate_stage1 * 100:.2f}% of all failure causes end-to-end.
""")





System Summary:
---------------
Total records: 9973
Actual failures: 330
Predicted failures: 329

Stage 1 (Failure Detection):
- Accuracy: 0.998
- Correct detections: 320 (96.97%)
- Missed failures (false negatives): 10 (3.03%)
- False alarms (false positives): 9 (0.09%)

Stage 1 (Failure Detection Recall Focused):
- Accuracy: 0.998
- Correct detections: 319 (96.67%)
- Missed failures (false negatives): 11 (3.33%)
- False alarms (false positives): 7 (0.07%)

Stage 1 (Failure Detection F1 Focused):
- Accuracy: 0.998
- Correct detections: 319 (96.67%)
- Missed failures (false negatives): 11 (3.33%)
- False alarms (false positives): 7 (0.07%)

Stage 2 (Failure Cause Diagnosis):
- Accuracy: 0.981
- Correct cause predictions among detected failures: 98.12%

End-to-End:
- Overall system correctly predicted 95.15% of all failure causes end-to-end.

