# Ensemble Model for Salary Prediction

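This notebook builds three ensembles (hard voting, soft voting, and stacking) on top of XGBoost, LightGBM, and CatBoost base models, compares them by F1 score, and uses the best one to predict on the live salary data. The target is a test F1 score of at least 0.90.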


In [None]:
# Import libraries
import json
import warnings

import joblib
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from catboost import CatBoostClassifier
from sklearn.ensemble import VotingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
# Silence every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so deprecation and convergence
# warnings raised by the modelling libraries are hidden as well — consider
# narrowing it while debugging.
warnings.filterwarnings('ignore')

# Render matplotlib figures inline, beneath the cell that produces them
%matplotlib inline


## 1. Load and Prepare Data


In [None]:
# Read the three pre-processed CSV splits; each one is indexed by its 'id' column
train_data, test_data, live_data = [
    pd.read_csv(csv_path, index_col='id')
    for csv_path in (
        './for_cursur/Data/salary.train.processed.csv',
        './for_cursur/Data/salary.test.processed.csv',
        './for_cursur/Data/salary.live.processed.csv',
    )
]

print(f"Training data shape: {train_data.shape}")
print(f"Test data shape: {test_data.shape}")
print(f"Live data shape: {live_data.shape}")

# Separate the 'label' target from the feature columns for the labelled splits
X_train, y_train = train_data.drop(columns=['label']), train_data['label']
X_test, y_test = test_data.drop(columns=['label']), test_data['label']

# The live frame is used as the feature matrix without dropping any column
X_live = live_data

print(f"\nFeature matrix shape: {X_train.shape}")
print(f"Target distribution: {y_train.value_counts().to_dict()}")


## 2. Define Base Models


In [None]:
# Define optimized base models
#
# All three gradient-boosting learners share the same broad shape
# (400 rounds, depth 6, learning rate 0.03, 85% row/column sampling) and each
# one up-weights the positive class through its own library-specific option.

# XGBoost: positive class weighted 2.5x via scale_pos_weight
xgb_model = xgb.XGBClassifier(
    n_estimators=400,
    max_depth=6,
    learning_rate=0.03,
    subsample=0.85,
    colsample_bytree=0.85,
    reg_alpha=0.5,
    reg_lambda=1.5,
    scale_pos_weight=2.5,
    random_state=42,
    eval_metric='logloss',
    verbosity=0
)

# LightGBM: class weights derived from label frequencies ('balanced')
lgb_model = lgb.LGBMClassifier(
    n_estimators=400,
    max_depth=6,
    learning_rate=0.03,
    subsample=0.85,
    # LightGBM only performs row subsampling when the bagging frequency is
    # non-zero; with the default subsample_freq=0 the subsample=0.85 setting
    # above is silently ignored. Resample on every iteration so it takes effect.
    subsample_freq=1,
    colsample_bytree=0.85,
    reg_alpha=0.5,
    reg_lambda=1.5,
    class_weight='balanced',
    random_state=42,
    verbose=-1,
    force_col_wise=True
)

# CatBoost: explicit per-class weights [negative, positive] = [1, 2.5]
catboost_model = CatBoostClassifier(
    iterations=400,
    depth=6,
    learning_rate=0.03,
    subsample=0.85,
    colsample_bylevel=0.85,
    l2_leaf_reg=4,
    class_weights=[1, 2.5],
    random_state=42,
    verbose=False,
    thread_count=-1
)

print("Base models defined successfully!")
print("Models: XGBoost, LightGBM, CatBoost")


## 3. Create Ensemble Models


In [None]:
def _named_base_models():
    """Return a fresh list of (name, estimator) pairs for one ensemble."""
    return [
        ('xgb', xgb_model),
        ('lgb', lgb_model),
        ('catboost', catboost_model),
    ]


# Majority vote over the predicted class labels
voting_hard = VotingClassifier(estimators=_named_base_models(), voting='hard')

# Vote over the averaged predicted class probabilities
voting_soft = VotingClassifier(estimators=_named_base_models(), voting='soft')

# Stacking: a class-balanced logistic regression is trained on the base
# models' 5-fold out-of-fold predict_proba outputs
meta_learner = LogisticRegression(random_state=42, class_weight='balanced')
stacking_model = StackingClassifier(
    estimators=_named_base_models(),
    final_estimator=meta_learner,
    cv=5,
    stack_method='predict_proba',
)

print("Ensemble models created successfully!")
print("Models: Hard Voting, Soft Voting, Stacking")


## 4. Train and Evaluate Ensemble Models


In [None]:
# Train and evaluate all ensemble models
models = {
    'Hard Voting': voting_hard,
    'Soft Voting': voting_soft,
    'Stacking': stacking_model
}

# Model selection is based on cross-validated F1 computed on the training data
# only. Picking the winner by its test-set F1 and then reporting that same
# number would make the reported score optimistically biased; keeping the test
# set out of the selection step leaves its F1 an honest estimate.
# Each extra fold refits every ensemble once, so this cell is noticeably slower
# than a single fit per model; adjust N_SELECTION_FOLDS to trade time for
# a steadier estimate.
N_SELECTION_FOLDS = 3
selection_cv = StratifiedKFold(n_splits=N_SELECTION_FOLDS, shuffle=True, random_state=42)

results = {}

print("Training and evaluating ensemble models...")
for name, model in models.items():
    print(f"\nTraining {name}...")

    # cross_val_score fits clones of the estimator, so `model` is still
    # unfitted afterwards and is trained on the full training set below
    cv_f1 = cross_val_score(
        model, X_train, y_train, cv=selection_cv, scoring='f1'
    ).mean()

    # Train the model on all training data
    model.fit(X_train, y_train)

    # Make predictions
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)

    # Calculate metrics
    train_f1 = f1_score(y_train, y_pred_train)
    test_f1 = f1_score(y_test, y_pred_test)

    # Store results ('cv_f1' drives selection; 'test_f1' is for reporting)
    results[name] = {
        'model': model,
        'cv_f1': cv_f1,
        'train_f1': train_f1,
        'test_f1': test_f1
    }

    print(f"{name} - CV F1 ({N_SELECTION_FOLDS}-fold, train only): {cv_f1:.4f}")
    print(f"{name} - Train F1: {train_f1:.4f}, Test F1: {test_f1:.4f}")

# Find best model by cross-validated F1, then report its held-out test F1
best_model_name = max(results.keys(), key=lambda x: results[x]['cv_f1'])
best_model = results[best_model_name]['model']
best_f1 = results[best_model_name]['test_f1']

print("\n=== BEST ENSEMBLE MODEL ===")
print(f"Best Model: {best_model_name}")
print(f"Selection CV F1: {results[best_model_name]['cv_f1']:.4f}")
print(f"Best F1 Score: {best_f1:.4f}")
print(f"Target achieved: {'✅ YES' if best_f1 >= 0.90 else '❌ NO'}")


## 5. Detailed Evaluation of Best Model


In [None]:
# Detailed evaluation of best model
y_pred_test = best_model.predict(X_test)

# A VotingClassifier with voting='hard' does not expose predict_proba, so
# calling it unconditionally crashes whenever that ensemble is selected.
# Compute AUC only when class probabilities are available.
if hasattr(best_model, 'predict_proba'):
    y_pred_proba_test = best_model.predict_proba(X_test)[:, 1]
    test_auc = roc_auc_score(y_test, y_pred_proba_test)
else:
    y_pred_proba_test = None
    test_auc = None

print(f"=== DETAILED EVALUATION - {best_model_name} ===")
print(f"Test F1 Score: {best_f1:.4f}")
if test_auc is not None:
    print(f"Test AUC Score: {test_auc:.4f}")
else:
    print("Test AUC Score: n/a (selected model does not provide predict_proba)")

# Detailed classification report
print("\n=== DETAILED CLASSIFICATION REPORT ===")
print(classification_report(y_test, y_pred_test))

# Confusion matrix (rows: true labels, columns: predicted labels)
cm = confusion_matrix(y_test, y_pred_test)
class_names = ['Low Income', 'High Income']
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Purples',
            xticklabels=class_names,
            yticklabels=class_names,
            ax=ax)
ax.set_title(f'Confusion Matrix - {best_model_name} Model')
ax.set_ylabel('True Label')
ax.set_xlabel('Predicted Label')
plt.show()

# Model comparison: test F1 of every ensemble that was trained
print("\n=== MODEL COMPARISON ===")
for name, result in results.items():
    print(f"{name:15s}: F1 = {result['test_f1']:.4f}")


## 6. Make Predictions on Live Data


In [None]:
# Make predictions on live data using best model
live_predictions = best_model.predict(X_live)

# A VotingClassifier with voting='hard' does not expose predict_proba, so
# calling it unconditionally crashes whenever that ensemble is selected.
# Fall back to NaN so the output keeps the same columns either way.
if hasattr(best_model, 'predict_proba'):
    live_probabilities = best_model.predict_proba(X_live)[:, 1]
else:
    print(f"{best_model_name} does not provide probabilities; "
          "'probability_high_income' will be NaN.")
    live_probabilities = np.full(len(X_live), np.nan)

# Create prediction dataframe: one row per live record, keyed by its id
live_results = pd.DataFrame({
    'id': X_live.index,
    'predicted_label': live_predictions,
    'probability_high_income': live_probabilities
})

# Summing the predictions counts the rows predicted as label 1
n_high_income = live_predictions.sum()

print(f"Live data predictions completed using {best_model_name}!")
print(f"Number of predictions: {len(live_results)}")
print(f"High income predictions: {n_high_income}")
print(f"Low income predictions: {len(live_predictions) - n_high_income}")

# Show sample predictions
print("\nSample predictions:")
print(live_results.head(10))


## 7. Save Model and Results


In [None]:
# Save the best ensemble model
joblib.dump(best_model, './for_cursur/ensemble_model.joblib')

# Save predictions ('id' is already a regular column, so the index is skipped)
live_results.to_csv('./for_cursur/ensemble_predictions.csv', index=False)

# Save model configuration: which ensemble was chosen, every ensemble's test
# F1, and the sizes of the data the models were trained and evaluated on
model_config = {
    'model_type': f'Ensemble - {best_model_name}',
    'best_model': best_model_name,
    'all_results': {name: result['test_f1'] for name, result in results.items()},
    'test_f1_score': best_f1,
    'test_auc_score': test_auc,
    'feature_count': X_train.shape[1],
    'training_samples': len(X_train),
    'test_samples': len(X_test),
    'base_models': ['XGBoost', 'LightGBM', 'CatBoost']
}

with open('./for_cursur/ensemble_model_config.json', 'w') as f:
    json.dump(model_config, f, indent=2)

print("Ensemble model and results saved successfully!")
print(f"\nFinal Test F1 Score: {best_f1:.4f}")
print(f"Target achieved: {'✅ YES' if best_f1 >= 0.90 else '❌ NO'}")
print(f"\nBest performing ensemble: {best_model_name}")
