In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Step 1: Read the CSV (path is relative to the notebook's working directory)
df = pd.read_csv(
    filepath_or_buffer="Dataset/3 Balanced_Predictive_Maintenance.csv",
)

In [3]:
# Step 2: Discard the columns that are not used as predictors.
# NOTE: df is rebound here, so running this cell a second time raises KeyError
# because the listed columns are already gone.
df = df.drop(
    ['UDI', 'Product ID', 'Type', 'TWF', 'HDF', 'PWF', 'OSF', 'RNF'],
    axis=1,
)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,Machine failure
2,Target type,Binary
3,Original data shape,"(15457, 14)"
4,Transformed data shape,"(15457, 5)"
5,Transformed train set shape,"(10819, 5)"
6,Transformed test set shape,"(4638, 5)"
7,Ignore features,9
8,Numeric features,4
9,Preprocess,True


In [4]:
# Step 3: Separate the label column from the predictor columns
y = df['Machine failure']                 # target
X = df.drop(columns=['Machine failure'])  # every remaining column

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)


Processing:   0%|          | 0/65 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# Step 4: Hold out 20% of the rows as a test set.
# stratify=y keeps the class proportions of y in both partitions;
# random_state pins the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
# Step 5: Define model and parameter grid
# Base estimator; random_state is fixed so repeated runs are reproducible.
rf = RandomForestClassifier(random_state=42)

# Hyper-parameter grid consumed by GridSearchCV in Step 6.
# This has to be a real dict assignment: when it was wrapped in a
# triple-quoted string it was just a discarded string expression, `param_grid`
# was never bound, and the grid-search cell failed with NameError.
# 3 * 4 * 3 * 3 = 108 candidate combinations.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [5, 10, 15, None],   # None = no depth limit
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

In [None]:
# Step 6: Exhaustive search over param_grid with 5-fold CV, ranked by F1
grid_search = GridSearchCV(
    rf,
    param_grid,
    scoring='f1',
    cv=5,
    n_jobs=-1,   # -1 = use all available processors
    verbose=1,
)

grid_search.fit(X_train, y_train)

In [None]:
# Step 7: Get the best estimator
# best_estimator_ is the model GridSearchCV refit on the full training data
# with the winning parameter combination (refit is left at its default here).
best_rf = grid_search.best_estimator_

In [None]:
# Step 8: Evaluate on test set
# Predict class labels for the held-out rows; y_pred is compared against
# y_test by the metric and confusion-matrix cells that follow.
y_pred = best_rf.predict(X_test)

In [None]:
# Compute the held-out test metrics, then print them with the winning parameters
report_text = classification_report(y_test, y_pred)
test_accuracy = accuracy_score(y_test, y_pred)
test_f1 = f1_score(y_test, y_pred)

print("🔎 Best Parameters:", grid_search.best_params_)
print("\n🧾 Classification Report:\n", report_text)
print("✅ Accuracy:", test_accuracy)
print("🎯 F1 Score:", test_f1)

In [None]:
# Step 9: Confusion matrix (rows = actual class, columns = predicted class)
cm = confusion_matrix(y_test, y_pred)

fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)  # fmt='d': integer counts
ax.set_title("Confusion Matrix")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
plt.show()

In [None]:
# Step 10: Pair each predictor column with its importance from the fitted
# forest and order the result from most to least important
feature_names = X.columns
importances = best_rf.feature_importances_
feat_imp = (
    pd.Series(data=importances, index=feature_names)
    .sort_values(ascending=False)
)

In [None]:
# Bar chart of the importances, one bar per feature in feat_imp's order
fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(x=feat_imp.values, y=feat_imp.index, ax=ax)
ax.set_title("Feature Importances (Random Forest)")
plt.show()