In [8]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
import matplotlib.pyplot as plt
import seaborn as sns
import joblib  # for saving model

In [9]:
# Step 1: Read the balanced predictive-maintenance CSV into a DataFrame.
# The path is relative to the notebook's working directory.
dataset_path = "Dataset/3 Balanced_Predictive_Maintenance.csv"
df = pd.read_csv(dataset_path)

In [10]:
# Step 2: Remove the columns that are not used as model inputs.
# NOTE(review): TWF/HDF/PWF/OSF/RNF look like per-failure-mode flags that
# would leak the target - confirm against the dataset description.
columns_to_drop = ['UDI', 'Product ID', 'Type', 'TWF', 'HDF', 'PWF', 'OSF', 'RNF']
df = df.drop(columns=columns_to_drop)

In [11]:
# Step 3: Separate the target column (y) from the remaining feature columns (X)
target_column = 'Machine failure'
y = df[target_column]
X = df.drop(columns=[target_column])

In [12]:
# Step 4: Hold out 20% of the rows for testing, stratified on the target
# so both splits keep the same class proportions; seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [13]:
# Step 5: Define the model - a random forest with fixed hyperparameters
# and a fixed seed (no parameter grid is searched in this notebook).
rf_params = {"n_estimators": 300, "max_depth": 10, "random_state": 42}
rf = RandomForestClassifier(**rf_params)

In [14]:
# Stratified K-Fold splitter: k shuffled folds, seeded so the fold
# assignment is the same on every run.
k = 10
cv = StratifiedKFold(
    n_splits=k,
    random_state=42,
    shuffle=True,
)

In [15]:
# Evaluate the model using cross-validation.
# Only the training split is passed in, so the held-out test rows stay
# unseen until the final evaluation. cross_val_score fits clones of `rf`,
# which means `rf` itself is still unfitted after this cell.
scores = cross_val_score(rf, X_train, y_train, cv=cv, scoring='f1')

# Report the result; previously the scores were computed but never shown.
print(f"Cross-validated F1 over {len(scores)} folds: "
      f"{scores.mean():.4f} ± {scores.std():.4f}")

In [17]:
# Step 8: Fit on the training split, then evaluate on the test set.
# cross_val_score only fits internal clones of the estimator, so without
# this explicit fit the predict call raises
# "NotFittedError: This RandomForestClassifier instance is not fitted yet."
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

In [None]:
# No grid search is run in this notebook, so `grid_search` does not exist;
# report the hyperparameters of the forest itself instead.
print("🔎 Model Parameters:", rf.get_params())
# Test-set metrics for the predictions in y_pred.
print("\n🧾 Classification Report:\n", classification_report(y_test, y_pred))
print("✅ Accuracy:", accuracy_score(y_test, y_pred))
print("🎯 F1 Score:", f1_score(y_test, y_pred))

In [None]:
# Step 9: Confusion matrix of the test-set predictions, drawn as an
# annotated heatmap with integer cell counts.
cm = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_title("Confusion Matrix")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
plt.show()

In [None]:
# Step 10: Feature importance
# Read the importances from `rf` (the notebook never defines `best_rf`).
# `rf` must already have been fitted for feature_importances_ to exist.
importances = rf.feature_importances_
feature_names = X.columns
# Pair each importance with its column name, largest first.
feat_imp = pd.Series(importances, index=feature_names).sort_values(ascending=False)

In [None]:
# Horizontal bar chart of the sorted feature importances.
fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(x=feat_imp.to_numpy(), y=feat_imp.index, ax=ax)
ax.set_title("Feature Importances (Random Forest)")
plt.show()

In [None]:
# Save the model
# Persist `rf` (the notebook never defines `best_rf`). The file is written
# relative to the current working directory.
joblib.dump(rf, 'best_machine_failure_model.pkl')
print("\n✅ Model saved as 'best_machine_failure_model.pkl'")