# 03 - Standalone Models

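This notebook trains and evaluates six standalone models for software effort estimation on the `cocomo81` dataset, then compares them on MAE, MMRE, PRED(0.25), and training time.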

## Models Covered:
1. Case-Based Reasoning (CBR)
2. COCOMO II
3. XGBoost
4. Artificial Neural Network (ANN)
5. K-Nearest Neighbors (KNN)
6. Support Vector Regression (SVR)

In [None]:
import sys
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Kept between the import groups on purpose: the warning filter and the path
# tweak must run before the imports below, exactly as they did originally.
warnings.filterwarnings('ignore')
sys.path.append('..')

from sklearn.model_selection import train_test_split

from src.data.data_loader import DataLoader
from src.data.preprocessor import DataPreprocessor
from src.models.cbr_model import CBRModel
from src.models.cocomo_model import COCOMOModel, PureCOCOMO
from src.models.ml_models import XGBoostModel, ANNModel, KNNModel, SVRModel
from src.evaluation.metrics import calculate_all_metrics

%matplotlib inline
plt.style.use('seaborn-v0_8-whitegrid')

## Load and Preprocess Data

In [None]:
# Load the raw 'cocomo81' dataset through the project DataLoader
loader = DataLoader('cocomo81')
df = loader.load_raw_data()

# Run the preprocessing pipeline with scaling enabled; it returns the X, y pair
# that every model below is fitted and evaluated on.
# NOTE(review): scale=True is applied to the full dataset before the split below.
# If the scaler is fitted inside preprocess_pipeline, test rows influence the
# scaling statistics -- confirm against DataPreprocessor.
preprocessor = DataPreprocessor()
X, y = preprocessor.preprocess_pipeline(df, scale=True)

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
split_options = dict(test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Report the size of each partition
for split_label, split_features in (("Training", X_train), ("Test", X_test)):
    print(f"{split_label} samples: {len(split_features)}")

## 1. Case-Based Reasoning (CBR)

In [None]:
# Configure CBR (k=5, Euclidean similarity, distance weighting) and fit it
# on the training split
cbr = CBRModel(
    k=5,
    similarity_metric='euclidean',
    weighting_scheme='distance',
)
cbr.fit(X_train, y_train)

# Predict on the held-out split and score the predictions against y_test
cbr_predictions = cbr.predict(X_test)
cbr_metrics = calculate_all_metrics(y_test, cbr_predictions)

# Report the recorded training time, then every metric to four decimals
print("CBR Results:")
print(f"  Training time: {cbr.training_time:.4f}s")
for metric_name, metric_value in cbr_metrics.items():
    print(f"  {metric_name}: {metric_value:.4f}")

## 2. COCOMO II

In [None]:
# Fit the COCOMO model with the neural-network correction switched off
# (use_nn_correction=False).
# NOTE(review): this cell was previously labelled "with NN correction", which
# contradicts the flag passed here -- confirm which configuration is intended.
cocomo = COCOMOModel(use_nn_correction=False)
cocomo.fit(X_train, y_train)

# Predict on the held-out split and score the predictions against y_test
cocomo_predictions = cocomo.predict(X_test)
cocomo_metrics = calculate_all_metrics(y_test, cocomo_predictions)

# Report the recorded training time, then every metric to four decimals
print("COCOMO II Results:")
print(f"  Training time: {cocomo.training_time:.4f}s")
for metric_name, metric_value in cocomo_metrics.items():
    print(f"  {metric_name}: {metric_value:.4f}")

## 3. XGBoost

In [None]:
# Fit the project's XGBoost wrapper with its default settings
xgb = XGBoostModel()
xgb.fit(X_train, y_train)

# Predict on the held-out split and score the predictions against y_test
xgb_predictions = xgb.predict(X_test)
xgb_metrics = calculate_all_metrics(y_test, xgb_predictions)

# Report the recorded training time, then every metric to four decimals
print("XGBoost Results:")
print(f"  Training time: {xgb.training_time:.4f}s")
for metric_name, metric_value in xgb_metrics.items():
    print(f"  {metric_name}: {metric_value:.4f}")

## 4. Artificial Neural Network (ANN)

In [None]:
# Fit the project's neural-network wrapper with its default settings
ann = ANNModel()
ann.fit(X_train, y_train)

# Predict on the held-out split and score the predictions against y_test
ann_predictions = ann.predict(X_test)
ann_metrics = calculate_all_metrics(y_test, ann_predictions)

# Report the recorded training time, then every metric to four decimals
print("ANN Results:")
print(f"  Training time: {ann.training_time:.4f}s")
for metric_name, metric_value in ann_metrics.items():
    print(f"  {metric_name}: {metric_value:.4f}")

## 5. K-Nearest Neighbors (KNN)

In [None]:
# Fit the project's k-nearest-neighbours wrapper with its default settings
knn = KNNModel()
knn.fit(X_train, y_train)

# Predict on the held-out split and score the predictions against y_test
knn_predictions = knn.predict(X_test)
knn_metrics = calculate_all_metrics(y_test, knn_predictions)

# Report the recorded training time, then every metric to four decimals
print("KNN Results:")
print(f"  Training time: {knn.training_time:.4f}s")
for metric_name, metric_value in knn_metrics.items():
    print(f"  {metric_name}: {metric_value:.4f}")

## 6. Support Vector Regression (SVR)

In [None]:
# Fit the project's support-vector-regression wrapper with its default settings
svr = SVRModel()
svr.fit(X_train, y_train)

# Predict on the held-out split and score the predictions against y_test
svr_predictions = svr.predict(X_test)
svr_metrics = calculate_all_metrics(y_test, svr_predictions)

# Report the recorded training time, then every metric to four decimals
print("SVR Results:")
print(f"  Training time: {svr.training_time:.4f}s")
for metric_name, metric_value in svr_metrics.items():
    print(f"  {metric_name}: {metric_value:.4f}")

## Model Comparison

In [None]:
# One (label, metrics dict, fitted model) triple per estimator, in display order
model_runs = [
    ('CBR', cbr_metrics, cbr),
    ('COCOMO', cocomo_metrics, cocomo),
    ('XGBoost', xgb_metrics, xgb),
    ('ANN', ann_metrics, ann),
    ('KNN', knn_metrics, knn),
    ('SVR', svr_metrics, svr),
]

# Build the column-oriented table: the label, three metrics read from each
# metrics dict, and each model's recorded training time. Keys are inserted in
# the order the columns should appear.
results = {'Model': [label for label, _, _ in model_runs]}
for metric_key in ('MAE', 'MMRE', 'PRED(0.25)'):
    results[metric_key] = [metrics[metric_key] for _, metrics, _ in model_runs]
results['Training Time'] = [model.training_time for _, _, model in model_runs]

# Tabulate and print without the positional index
results_df = pd.DataFrame(results)
print("\n=== Model Comparison ===")
print(results_df.to_string(index=False))

In [None]:
# Visualize comparison: one bar chart per metric, with the models along the x-axis.
# Each entry is (results_df column, panel title); the column name doubles as the
# y-axis label.
metric_panels = [
    ('MAE', 'Mean Absolute Error'),
    ('MMRE', 'Mean Magnitude of Relative Error'),
    ('PRED(0.25)', 'Prediction Accuracy (within 25%)'),
]

fig, axes = plt.subplots(1, len(metric_panels), figsize=(15, 5))

# One colour per model row, sized from the table instead of a hard-coded count
# so the palette stays in step if models are added or removed.
colors = plt.cm.viridis(np.linspace(0, 1, len(results_df)))

for ax, (metric_key, panel_title) in zip(axes, metric_panels):
    ax.bar(results_df['Model'], results_df[metric_key], color=colors)
    ax.set_ylabel(metric_key)
    ax.set_title(panel_title)
    ax.tick_params(axis='x', rotation=45)

plt.tight_layout()

# savefig does not create missing directories, so make sure the target folder
# exists before writing (a fresh checkout may not have reports/figures yet).
figure_path = Path('../reports/figures/standalone_comparison.png')
figure_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(figure_path, dpi=150)
plt.show()

In [None]:
# Actual vs Predicted plot: one scatter panel per model on a 2x3 grid.
# Points on the dashed red diagonal are perfect predictions.
all_predictions = [
    ('CBR', cbr_predictions),
    ('COCOMO', cocomo_predictions),
    ('XGBoost', xgb_predictions),
    ('ANN', ann_predictions),
    ('KNN', knn_predictions),
    ('SVR', svr_predictions)
]

fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.flatten()

# Endpoints of the y = x reference line; they depend only on y_test, so compute
# them once instead of on every loop iteration.
effort_range = [y_test.min(), y_test.max()]

for ax, (name, preds) in zip(axes, all_predictions):
    ax.scatter(y_test, preds, alpha=0.7, edgecolor='black')
    ax.plot(effort_range, effort_range, 'r--', lw=2)
    ax.set_xlabel('Actual Effort')
    ax.set_ylabel('Predicted Effort')
    ax.set_title(f'{name}')

plt.tight_layout()

# savefig does not create missing directories, so make sure the target folder
# exists before writing (a fresh checkout may not have reports/figures yet).
figure_path = Path('../reports/figures/actual_vs_predicted.png')
figure_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(figure_path, dpi=150)
plt.show()

## Summary

### Key Findings:
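1. **Best MAE**: The model with the lowest MAE varies by dataset and by train/test split; see the Model Comparison table above for this run (lower is better).
2. **Training Time**: CBR and KNN are typically the fastest to train and ANN the slowest; the `Training Time` column above gives the values measured in this run.
3. **PRED(0.25)**: The share of test projects whose predicted effort falls within 25% of the actual effort; higher is better.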