In [14]:
import warnings
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
# Silence every Python warning for the rest of the session.
# NOTE(review): this is a blanket filter, so warnings raised by the libraries
# used below (e.g. solver convergence or deprecation notices) will not be
# shown — consider restricting it to specific categories/modules.
warnings.filterwarnings('ignore')

In [4]:
# Load the pre-computed train/test split from the .npz archive.
# The context manager closes the underlying archive file as soon as the four
# arrays have been read, instead of leaving the handle open on `d`.
# NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which
# can execute arbitrary code — only use it on trusted files. If the stored
# arrays are plain numeric (TODO confirm), allow_pickle=False is safer.
with np.load("../data/training_data/splits.npz", allow_pickle=True) as d:
    X_train = d["X_train"]
    X_test = d["X_test"]
    y_train = d["y_train"]
    y_test = d["y_test"]

In [6]:
# Load the full feature matrix and label vector from the .npz archive.
# The context manager closes the archive file once both arrays have been read.
# NOTE(review): allow_pickle=True can execute arbitrary code while unpickling
# object arrays — only use it on trusted files.
# NOTE(review): X and y are not referenced by the cells visible in this
# notebook section — confirm they are still needed.
with np.load("../data/training_data/features_labels.npz", allow_pickle=True) as d:
    X = d["X"]
    y = d["y"]

In [9]:
# Restore the three fitted base models from the trained_models/ folder.
# NOTE(review): joblib.load unpickles its input, so only load files you trust.

# XGBoost: read back with joblib (presumably how it was saved — confirm)
xgboost_model = joblib.load('trained_models/xgb_model.pkl')
print("XGBoost loaded")

# LightGBM: read back with joblib as well
lgbm_model = joblib.load('trained_models/lgbm_model.pkl')
print("LightGBM loaded")

# CatBoost: create an empty classifier, then fill it from the native .cbm file
catboost_model = CatBoostClassifier()
catboost_model.load_model('trained_models/catboost_model.cbm')
print("CatBoost loaded")

XGBoost loaded
LightGBM loaded
CatBoost loaded


In [25]:
# Voting ensembles built from the three already-fitted base models and scored
# on the held-out test split.

# 1. SIMPLE HARD VOTING (Majority Vote)
print("Hard Voting (Majority Rule):")

# Class predictions from each model. np.ravel forces a 1-D vector in case a
# model returns its labels as an (n_samples, 1) column.
pred_xgb = np.ravel(xgboost_model.predict(X_test))
pred_lgb = np.ravel(lgbm_model.predict(X_test))
pred_cat = np.ravel(catboost_model.predict(X_test))

# Vectorised majority vote: for every sample, count how many models picked
# each label, then keep the label with the highest count.
vote_matrix = np.column_stack([pred_xgb, pred_lgb, pred_cat])  # (n_samples, n_models)
vote_labels = np.unique(vote_matrix)                           # sorted distinct labels
vote_counts = (vote_matrix[:, :, np.newaxis] == vote_labels).sum(axis=1)  # (n_samples, n_labels)

# argmax returns the first maximum, so a tie (e.g. all three models disagree)
# is resolved in favour of the smallest label.
hard_vote_pred = vote_labels[vote_counts.argmax(axis=1)]

# Evaluate
hard_acc = accuracy_score(y_test, hard_vote_pred)
print(f"Hard Voting Accuracy: {hard_acc:.4f}")
print("Classification report: \n")
print(classification_report(y_test, hard_vote_pred))

# 2. SIMPLE SOFT VOTING (Average Probabilities)
print("\nSoft Voting (Average Probabilities):")

# Per-class probability predictions from each model
prob_xgb = xgboost_model.predict_proba(X_test)
prob_lgb = lgbm_model.predict_proba(X_test)
prob_cat = catboost_model.predict_proba(X_test)

# Average the probabilities (unweighted mean over the three models)
avg_probs = (prob_xgb + prob_lgb + prob_cat) / 3

# Take the column with the highest average probability.
# NOTE(review): this yields a column index, which equals the class label only
# when the labels are 0..K-1 and all three models order their probability
# columns the same way — confirm, or map through a model's `classes_`.
soft_vote_pred = np.argmax(avg_probs, axis=1)

# Evaluate
soft_acc = accuracy_score(y_test, soft_vote_pred)
print(f"Soft Voting Accuracy: {soft_acc:.4f}")
print("Classification report: \n")
print(classification_report(y_test, soft_vote_pred))

# Compare with individual models, reusing the predictions computed above
# instead of running every model on the test set a second time.
print("\nComparison:")
for name, pred in [('XGBoost', pred_xgb), ('LightGBM', pred_lgb), ('CatBoost', pred_cat)]:
    acc = accuracy_score(y_test, pred)
    print(f"{name:12}: {acc:.4f}")
print(f"Hard Voting  : {hard_acc:.4f}")
print(f"Soft Voting  : {soft_acc:.4f}")

Hard Voting (Majority Rule):
Hard Voting Accuracy: 0.9788
Classification report: 

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       425
           1       0.99      0.96      0.98       379
           2       0.96      0.97      0.97       397
           3       0.96      0.98      0.97       399

    accuracy                           0.98      1600
   macro avg       0.98      0.98      0.98      1600
weighted avg       0.98      0.98      0.98      1600


Soft Voting (Average Probabilities):
Soft Voting Accuracy: 0.9800
Classification report: 

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       425
           1       0.99      0.97      0.98       379
           2       0.97      0.97      0.97       397
           3       0.96      0.98      0.97       399

    accuracy                           0.98      1600
   macro avg       0.98      0.98      0.98      1600
weighted a

In [26]:
# Stacking ensemble: the class probabilities of the three base models become
# the input features of a logistic-regression meta-learner.
#
# NOTE(review): the meta-learner is fitted on the base models' predictions for
# X_train. If the base models were themselves fitted on X_train (not visible
# in this notebook section — TODO confirm), those predictions are in-sample
# and more confident than the ones seen at test time. Out-of-fold predictions
# (e.g. sklearn's cross_val_predict or StackingClassifier) avoid this.

# 1. Class probabilities of every base model on the TRAINING data
print("1. Getting probability predictions from training data...")

prob_xgb_train, prob_lgb_train, prob_cat_train = [
    model.predict_proba(X_train)
    for model in (xgboost_model, lgbm_model, catboost_model)
]

# 2. Put the probability blocks side by side to form the meta-features
print("2. Stacking predictions to create new training features...")

# Column layout: all class columns of XGBoost, then LightGBM, then CatBoost
stacked_train = np.hstack((prob_xgb_train, prob_lgb_train, prob_cat_train))

print(f"   Original training shape: {X_train.shape}")
print(f"   Stacked training shape: {stacked_train.shape}")

# 3. Same construction for the TEST data
print("3. Doing the same for test data...")

prob_xgb_test, prob_lgb_test, prob_cat_test = [
    model.predict_proba(X_test)
    for model in (xgboost_model, lgbm_model, catboost_model)
]

stacked_test = np.hstack((prob_xgb_test, prob_lgb_test, prob_cat_test))
print(f"   Stacked test shape: {stacked_test.shape}")

# 4. Fit the meta-learner on the stacked features
print("4. Training meta-learner on stacked features...")

# Logistic regression as meta-learner; max_iter is raised to 1000 to give the
# solver room to converge, random_state is fixed at 42.
meta_model = LogisticRegression(max_iter=1000, random_state=42, n_jobs=-1)
meta_model.fit(stacked_train, y_train)
print("   Meta-learner trained!")

# 5. Predict the test labels with the meta-learner and score them
print("5. Making final predictions...")

stacking_predictions = meta_model.predict(stacked_test)
stacking_accuracy = accuracy_score(y_test, stacking_predictions)

print(f"\nStacking Ensemble Accuracy: {stacking_accuracy:.4f}")
print(f"Classification report: \n")
print(classification_report(y_test, stacking_predictions))

# 6. Compare with the individual base models
print("\nComparison")

# Test-set predictions and accuracy of each base model on its own
xgb_pred, lgb_pred, cat_pred = [
    model.predict(X_test)
    for model in (xgboost_model, lgbm_model, catboost_model)
]
xgb_acc, lgb_acc, cat_acc = [
    accuracy_score(y_test, pred) for pred in (xgb_pred, lgb_pred, cat_pred)
]
indiv_results = [
    ('XGBoost', xgb_acc),
    ('LightGBM', lgb_acc),
    ('CatBoost', cat_acc),
]

# Display results
for name, acc in indiv_results:
    print(f"{name:10}: {acc:.4f}")

print(f"Stacking Ensemble: {stacking_accuracy:.4f}")

# Pick the entry with the highest test accuracy (first one wins on a tie)
all_results = {**dict(indiv_results), 'Stacking': stacking_accuracy}
best_name, best_acc = max(all_results.items(), key=lambda item: item[1])

print(f"\nBest accuracy: {best_name} ({best_acc:.4f})")

# Only report a gain when the stacked model is the overall winner
if best_name == 'Stacking':
    improvement = best_acc - max(xgb_acc, lgb_acc, cat_acc)
    print(f"Improvement over best individual model: {improvement:.4f}")

1. Getting probability predictions from training data...
2. Stacking predictions to create new training features...
   Original training shape: (6400, 16)
   Stacked training shape: (6400, 12)
3. Doing the same for test data...
   Stacked test shape: (1600, 12)
4. Training meta-learner on stacked features...
   Meta-learner trained!
5. Making final predictions...

Stacking Ensemble Accuracy: 0.9800
Classification report: 

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       425
           1       0.99      0.97      0.98       379
           2       0.97      0.97      0.97       397
           3       0.96      0.98      0.97       399

    accuracy                           0.98      1600
   macro avg       0.98      0.98      0.98      1600
weighted avg       0.98      0.98      0.98      1600


Comparison
XGBoost   : 0.9750
LightGBM  : 0.9844
CatBoost  : 0.9756
Stacking Ensemble: 0.9800

Best accuracy: LightGBM (0.9844)


In [29]:
# Save the classification report of every ensemble as a CSV file
# (one row per class plus the aggregate rows, one column per metric).
REPORT_DIR = Path("performance_metrics/ensemble_classification_reports")

# to_csv does not create missing parent folders, so make sure the output
# directory exists; exist_ok=True keeps re-running this cell safe.
REPORT_DIR.mkdir(parents=True, exist_ok=True)

for name, pred in [('hard_voting', hard_vote_pred),
                   ('soft_voting', soft_vote_pred),
                   ('stacking', stacking_predictions)]:
    report = classification_report(y_test, pred, output_dict=True)
    # The report dict is keyed by class/aggregate name; transpose so those
    # become the rows of the table.
    report_df = pd.DataFrame(report).transpose()
    report_df.to_csv(REPORT_DIR / f"{name}_report.csv")
    print(f"{name}_report.csv saved")

hard_voting_report.csv saved
soft_voting_report.csv saved
stacking_report.csv saved
