In [22]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, classification_report

In [23]:
# Load data
# The feature CSV carries the saved row index as an extra 'Unnamed: 0' column.
# It is a row identifier, not a measurement; leaving it in lets the models key
# on row position, so it is removed before any modelling. errors='ignore'
# keeps this cell working if the file is later re-exported without the index.
# NOTE(review): the file names suggest the data was resampled before being
# split below -- confirm that validation/test rows are not synthetic copies
# of training rows.
X = pd.read_csv('X_resampled.csv').drop(columns=['Unnamed: 0'], errors='ignore')
y = pd.read_csv('y_resampled.csv').values.ravel()  # Flatten y to 1D

In [24]:
# Show the feature frame as the cell output; the notebook renders a truncated
# preview (first and last rows) rather than every row.
X

Unnamed: 0,temperature,pressure,vibration,humidity,equipment_Compressor,equipment_Pump,equipment_Turbine,location_Atlanta,location_Chicago,location_Houston,location_New York,location_San Francisco
0,58.180180,25.029278,0.606516,45.694907,False,False,True,True,False,False,False,False
1,75.740712,22.954018,2.338095,41.867407,True,False,False,False,True,False,False,False
2,71.358594,27.276830,1.389198,58.954409,False,False,True,False,False,False,False,True
3,71.616985,32.242921,1.770690,40.565138,False,True,False,True,False,False,False,False
4,66.506832,45.197471,0.345398,43.253795,False,True,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...
13529,91.887807,35.758490,1.845431,37.897940,False,True,True,False,True,False,True,False
13530,62.911367,52.885485,1.560898,74.720669,False,True,True,True,False,False,True,False
13531,57.217928,23.037562,1.721068,42.791245,False,False,True,False,False,True,False,True
13532,78.145840,43.248404,2.691827,49.754771,True,False,False,False,False,True,True,False


In [25]:
# Stratified 80/10/10 split: hold out 20% of the rows, then cut that holdout
# in half to obtain the validation and test sets. Both calls use the same seed.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    stratify=y_temp,
    random_state=42,
)

In [26]:
# Base models
# The target has two classes (0 and 1), so XGBoost uses the binary 'logloss'
# metric instead of the multiclass 'mlogloss'. The `use_label_encoder` flag is
# omitted: it is deprecated and ignored by current XGBoost releases.
# All three learners share the same seed for reproducibility.
xgb = XGBClassifier(eval_metric='logloss', verbosity=0, random_state=42)
lgbm = LGBMClassifier(random_state=42)
catboost = CatBoostClassifier(verbose=0, random_state=42)

In [27]:
# Meta model: the random forest passed to the stack as its final estimator.
rf = RandomForestClassifier(
    random_state=42,
)

In [28]:
# Stacked ensemble: the three boosted models are the base learners and the
# random forest combines their cross-validated (5-fold) predictions.
base_learners = [
    ('xgb', xgb),
    ('lgbm', lgbm),
    ('catboost', catboost),
]
stack = StackingClassifier(
    estimators=base_learners,
    final_estimator=rf,
    passthrough=True,  # meta model also receives the original feature columns
    cv=5,
    n_jobs=-1,
)

In [29]:
# Search space for the random-forest meta model. The `final_estimator__`
# prefix routes each setting to the stack's final estimator.
# 2 x 3 x 2 = 12 candidate combinations.
param_grid = dict(
    final_estimator__n_estimators=[100, 200],
    final_estimator__max_depth=[5, 10, None],
    final_estimator__min_samples_split=[2, 5],
)


In [30]:

# Exhaustive search over the meta-model grid with 3-fold cross-validation,
# fitted on the training split only. Every candidate refits the whole stack,
# so this is the expensive cell of the notebook.
grid = GridSearchCV(
    estimator=stack,
    param_grid=param_grid,
    cv=3,
    verbose=1,
    n_jobs=-1,
)
grid.fit(X_train, y_train)

Fitting 3 folds for each of 12 candidates, totalling 36 fits


In [31]:
# Report accuracy and the per-class classification report for each split,
# using the fitted grid search to predict.
evaluation_splits = [
    ('Train', X_train, y_train),
    ('Validation', X_val, y_val),
    ('Test', X_test, y_test),
]
for split_name, X_split, y_split in evaluation_splits:
    y_pred = grid.predict(X_split)
    print(f"\n--- {split_name} Set Evaluation ---")
    split_accuracy = accuracy_score(y_split, y_pred)
    print("Accuracy:", split_accuracy)
    split_report = classification_report(y_split, y_pred)
    print("Classification Report:\n", split_report)

# Hyperparameter combination selected by the search
print("\nBest Parameters for Meta Model:\n", grid.best_params_)


--- Train Set Evaluation ---
Accuracy: 0.9993534681813984
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      5413
           1       1.00      1.00      1.00      5414

    accuracy                           1.00     10827
   macro avg       1.00      1.00      1.00     10827
weighted avg       1.00      1.00      1.00     10827


--- Validation Set Evaluation ---
Accuracy: 0.9948263118994827
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.99      0.99       677
           1       0.99      1.00      0.99       676

    accuracy                           0.99      1353
   macro avg       0.99      0.99      0.99      1353
weighted avg       0.99      0.99      0.99      1353


--- Test Set Evaluation ---
Accuracy: 0.9940915805022157
Classification Report:
               precision    recall  f1-score   support

           0       0.99      1.00     