# 04 - Ensemble Models

This notebook demonstrates heterogeneous ensemble models for software effort estimation.

## Contents:
1. Ensemble architecture
2. Combination rules
3. Training ensembles
4. Comparison of ensemble variants and combination rules

In [None]:
import sys
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
warnings.filterwarnings('ignore')
sys.path.append('..')

from src.data.data_loader import DataLoader
from src.data.preprocessor import DataPreprocessor
from src.models.ensemble_model import EnsembleModel, create_all_ensembles
from src.models.cbr_model import CBRModel
from src.models.cocomo_model import COCOMOModel
from src.models.ml_models import XGBoostModel
from src.evaluation.metrics import calculate_all_metrics
from sklearn.model_selection import train_test_split

%matplotlib inline
plt.style.use('seaborn-v0_8-whitegrid')

## Load and Preprocess Data

In [None]:
# Read the raw 'cocomo81' dataset through the project's DataLoader.
loader = DataLoader('cocomo81')
df = loader.load_raw_data()

# Run the project's preprocessing pipeline with scale=True; it hands back
# the feature set X and the target y.
preprocessor = DataPreprocessor()
X, y = preprocessor.preprocess_pipeline(df, scale=True)

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split identical across re-runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Report the size of each partition.
n_train, n_test = len(X_train), len(X_test)
print(f"Training samples: {n_train}")
print(f"Test samples: {n_test}")

## 1. Ensemble Architecture

Our heterogeneous ensemble combines three different types of models:
- **CBR (Case-Based Reasoning)**: Memory-based, uses similar past projects
- **COCOMO II**: Algorithmic, based on expert knowledge
- **ML Model**: Data-driven (XGBoost, ANN, KNN, or SVR)

In [None]:
# Build the ensemble with XGBoost as its ML member and the median as the
# rule that merges the component predictions.
ensemble = EnsembleModel(combination_rule="median", ml_model_name="XGBoost")

# Show the three component models held by the ensemble, then the rule in use.
print("Ensemble Components:")
component_lines = (
    f"  1. CBR Model: {ensemble.cbr_model}",
    f"  2. COCOMO Model: {ensemble.cocomo_model}",
    f"  3. ML Model: {ensemble.ml_model}",
)
for component_line in component_lines:
    print(component_line)
print(f"\nCombination Rule: {ensemble.combination_rule}")

## 2. Combination Rules

In [None]:
# Toy example: three prediction vectors, labelled below as CBR, COCOMO and
# XGBoost, each holding estimates for the same three projects.
pred1 = np.array([100, 200, 300])
pred2 = np.array([120, 180, 350])
pred3 = np.array([110, 210, 280])

print("Example Predictions:")
print(f"  CBR: {pred1}")
print(f"  COCOMO: {pred2}")
print(f"  XGBoost: {pred3}")

# Stack once: rows are models, columns are projects. The median and mean
# rules both reduce over axis 0 (the model axis).
stacked = np.stack([pred1, pred2, pred3])

# Median rule: per-project middle value of the three predictions.
median_result = np.median(stacked, axis=0)
print(f"\nMedian Combination: {median_result}")

# Mean rule: per-project arithmetic average.
mean_result = stacked.mean(axis=0)
print(f"Mean Combination: {mean_result}")

# Linear rule: fixed per-model weights (they sum to 1), applied in the
# same model order as above.
weights = [0.4, 0.3, 0.3]
linear_result = sum(w * p for w, p in zip(weights, (pred1, pred2, pred3)))
print(f"Linear Combination (0.4, 0.3, 0.3): {linear_result}")

## 3. Training Ensembles

In [None]:
# Fit the ensemble on the training split.
ensemble.fit(X_train, y_train)
print("Ensemble trained successfully!")

# Ask the ensemble for its per-component training times and list them.
# The values are printed with an "s" suffix, i.e. presented as seconds.
print("\nComponent Training Times:")
times = ensemble.get_component_training_times()
for component, seconds in times.items():
    print(f"  {component}: {seconds:.4f}s")

In [None]:
# Combined prediction from the ensemble, plus the per-component predictions
# (indexed below by the keys 'CBR', 'COCOMO' and 'XGBoost').
ensemble_predictions = ensemble.predict(X_test)
individual_predictions = ensemble.predict_individual(X_test)

# Same slice as [:5]; named once so every row previews the same window.
first_five = slice(5)

print("Sample Predictions (first 5):")
print(f"  Actual: {y_test[first_five]}")
print(f"  CBR: {individual_predictions['CBR'][first_five]}")
print(f"  COCOMO: {individual_predictions['COCOMO'][first_five]}")
print(f"  XGBoost: {individual_predictions['XGBoost'][first_five]}")
print(f"  Ensemble: {ensemble_predictions[first_five]}")

In [None]:
# Score the combined predictions against the held-out targets using the
# project's metric helper.
ensemble_metrics = calculate_all_metrics(y_test, ensemble_predictions)

# Print every returned metric to four decimal places.
print("\nEnsemble Results:")
for metric_name, metric_value in ensemble_metrics.items():
    print(f"  {metric_name}: {metric_value:.4f}")

## 4. Compare All Ensemble Variants

In [None]:
# Train one median-rule ensemble per ML member and collect its test metrics.
ml_models = ['XGBoost', 'ANN', 'KNN', 'SVR']
ensemble_results = []

# Metrics copied from the full metric dict into the comparison table.
reported_metrics = ('MAE', 'MMRE', 'PRED(0.25)')

for ml_name in ml_models:
    print(f"\nTraining Ensemble with {ml_name}...")

    variant = EnsembleModel(ml_model_name=ml_name, combination_rule='median')
    variant.fit(X_train, y_train)
    variant_preds = variant.predict(X_test)
    variant_scores = calculate_all_metrics(y_test, variant_preds)

    # One table row per ensemble: label, selected metrics, then the
    # training time the ensemble object reports for itself.
    row = {'Ensemble': f'CBR+COCOMO+{ml_name}'}
    for key in reported_metrics:
        row[key] = variant_scores[key]
    row['Training Time'] = variant.training_time
    ensemble_results.append(row)

# Build the frame once from the collected rows.
ensemble_df = pd.DataFrame(ensemble_results)
print("\n=== Ensemble Comparison ===")
print(ensemble_df.to_string(index=False))

In [None]:
# Hold the ML member fixed (XGBoost) and vary only the combination rule.
rules = ['median', 'mean', 'linear']
rule_results = []

for rule in rules:
    rule_model = EnsembleModel(ml_model_name='XGBoost', combination_rule=rule)
    rule_model.fit(X_train, y_train)
    rule_scores = calculate_all_metrics(y_test, rule_model.predict(X_test))

    # Row label is the rule name with its first letter upper-cased.
    summary = {'Combination Rule': rule.capitalize()}
    summary.update((key, rule_scores[key]) for key in ('MAE', 'MMRE', 'PRED(0.25)'))
    rule_results.append(summary)

# Build the frame once from the collected rows.
rule_df = pd.DataFrame(rule_results)
print("\n=== Combination Rule Comparison ===")
print(rule_df.to_string(index=False))

In [None]:
# Visualize ensemble comparison: one bar per ensemble variant, one panel per metric.
figure_path = Path('../reports/figures/ensemble_comparison.png')
# savefig does not create missing directories; on a fresh checkout the
# reports/figures folder may not exist yet, so create it up front instead
# of failing at the very end of the notebook.
figure_path.parent.mkdir(parents=True, exist_ok=True)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# One Set2 colour per ensemble, reused in both panels so a given ensemble
# has the same colour in each chart.
colors = plt.cm.Set2(np.linspace(0, 1, len(ensemble_df)))

# Left panel: MAE. Right panel: PRED(0.25). Both are drawn the same way.
for ax, metric in zip(axes, ('MAE', 'PRED(0.25)')):
    ax.bar(ensemble_df['Ensemble'], ensemble_df[metric], color=colors)
    ax.set_ylabel(metric)
    ax.set_title(f'Ensemble {metric} Comparison')
    # Ensemble labels are long ("CBR+COCOMO+..."), so tilt them to avoid overlap.
    ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
fig.savefig(figure_path, dpi=150)
plt.show()

## Summary

### Key Findings:
1. **Median combination** is the most robust of the three rules to an outlying prediction from a single component
2. **Ensemble models** combine strengths of different approaches
3. **XGBoost-based ensemble** often performs best