# Model Training (Alternative Workflow)
## Enrollment: 23113099

---

## ⚠️ NOTE: This notebook is not used in the primary workflow


## Primary Implementation

All model training and evaluation are contained in:
**→ `PropertyValuation_SatelliteImagery_23113099.ipynb`**

This single notebook includes:
- ✅ Baseline XGBoost training
- ✅ Neural network fusion training
- ✅ Enhanced XGBoost training (tabular + images)
- ✅ Grad-CAM explainability
- ✅ Comprehensive model comparison
- ✅ Final predictions (23113099_final.csv)

---

## Why This File Exists

This file is included only to satisfy the recommended three-file structure mentioned in the guidelines. Following the clarification that a single notebook is acceptable, the full implementation lives in `PropertyValuation_SatelliteImagery_23113099.ipynb` for better integration and analysis.

---

## To Run the Project

1. Open `PropertyValuation_SatelliteImagery_23113099.ipynb`
2. Run all cells sequentially
3. All results are generated, including:
   - Baseline results
   - Neural fusion results
   - Enhanced XGBoost results
   - Final predictions file

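**There is no need to run this file.** If you do run it, the cells below train the baseline and enhanced XGBoost models and write their results under separate `*_alternative` file names (`23113099_final_alternative.csv`, `outputs/model_comparison_alternative.csv`, `outputs/model_comparison_alternative.png`).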

In [None]:
# Import libraries
# Everything the notebook uses is imported once here so that a fresh
# "Restart Kernel -> Run All" resolves every name referenced further down.
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import matplotlib.pyplot as plt
# NOTE(review): seaborn and pickle are not referenced by any cell visible in
# this notebook; kept because removing imports is out of scope here.
import seaborn as sns
import pickle
import json
import warnings
# Suppresses every warning category for the rest of the kernel session,
# which also hides deprecation warnings raised by the libraries above.
warnings.filterwarnings('ignore')

# Simple confirmation that the cell ran to completion.
print('Libraries imported successfully')

In [None]:
# Read the enhanced train/test tables produced by the preprocessing step
train_data = pd.read_csv('data/processed/train_enhanced.csv')
test_data = pd.read_csv('data/processed/test_enhanced.csv')

# Preprocessing metadata; only the three feature-count entries are read below
with open('outputs/preprocessing_metadata.json', 'r') as metadata_file:
    metadata = json.load(metadata_file)

# Row counts first, then the feature counts recorded in the metadata
print(f'\nTraining samples: {len(train_data):,}')
print(f'Test samples: {len(test_data):,}')
for label, key in (
    ('Total features', 'total_features'),
    ('  - Tabular', 'tabular_features'),
    ('  - Image PCA', 'pca_components'),
):
    # Looked up one at a time so a missing key fails on its own line
    print(f'{label}: {metadata[key]}')

print('\n✓ Data loaded successfully')

In [None]:
# Identifier, date and target columns are never used as model inputs
non_feature_cols = {'id', 'date', 'price'}
# Columns holding image-derived PCA components share this name prefix
image_prefix = 'cnn_pc'

# Single pass over the training columns, preserving their original order:
# every input column goes to feature_cols and to exactly one of the two groups.
feature_cols, tabular_cols, pca_cols = [], [], []
for col in train_data.columns:
    if col in non_feature_cols:
        continue
    feature_cols.append(col)
    if col.startswith(image_prefix):
        pca_cols.append(col)
    else:
        tabular_cols.append(col)

print('Feature columns:')
print(f'  Tabular: {len(tabular_cols)}')
print(f'  Image PCA: {len(pca_cols)}')
print(f'  Total: {len(feature_cols)}')

# Training matrices (all inputs / tabular-only inputs) and the target vector
X_all_features, X_tabular_only = train_data[feature_cols].values, train_data[tabular_cols].values
y_train = train_data['price'].values

# Test matrices use the column lists derived from the training table,
# so the test table must contain every one of those columns.
X_test_all, X_test_tabular = test_data[feature_cols].values, test_data[tabular_cols].values
test_ids = test_data['id'].values

print('\n✓ Features prepared')

In [None]:
# Standardise the full (tabular + image PCA) feature set.
# NOTE(review): both scalers are fit on the whole training table, and the
# hold-out split happens in a later cell, so validation rows contribute to
# the mean/std used here. Confirm this is acceptable for the comparison.
scaler_all = StandardScaler().fit(X_all_features)
X_all_scaled = scaler_all.transform(X_all_features)
X_test_all_scaled = scaler_all.transform(X_test_all)

# Separate scaler for the tabular-only feature set used by the baseline
scaler_tabular = StandardScaler().fit(X_tabular_only)
X_tabular_scaled = scaler_tabular.transform(X_tabular_only)
X_test_tabular_scaled = scaler_tabular.transform(X_test_tabular)

print('✓ Features scaled')

In [None]:
# Hold out 20% of the rows for validation.
# Both feature matrices and the target go through one call, so all three
# are partitioned with the same row indices (seeded for reproducibility).
(
    X_train_all, X_val_all,
    X_train_tab, X_val_tab,
    y_train_split, y_val_split,
) = train_test_split(
    X_all_scaled, X_tabular_scaled, y_train,
    test_size=0.2, random_state=42,
)

n_train_rows, n_val_rows = X_train_all.shape[0], X_val_all.shape[0]
print(f'Training samples: {n_train_rows:,}')
print(f'Validation samples: {n_val_rows:,}')
print('\n✓ Data split complete')

In [None]:
# Baseline: XGBoost regressor fit on the tabular-only features
baseline_params = {
    'n_estimators': 200,
    'max_depth': 6,
    'learning_rate': 0.05,
    'random_state': 42,
    'n_jobs': -1,
}
baseline_model = xgb.XGBRegressor(**baseline_params)

print('Training baseline XGBoost...')
baseline_model.fit(X_train_tab, y_train_split)

# Score on the held-out validation rows
val_pred_baseline = baseline_model.predict(X_val_tab)
baseline_mse = mean_squared_error(y_val_split, val_pred_baseline)
baseline_rmse = np.sqrt(baseline_mse)
baseline_r2 = r2_score(y_val_split, val_pred_baseline)
baseline_mae = mean_absolute_error(y_val_split, val_pred_baseline)

# One line per metric, emitted in a single call
print(
    '\nBaseline Results:',
    f'  RMSE: ${baseline_rmse:,.2f}',
    f'  R² Score: {baseline_r2:.4f}',
    f'  MAE: ${baseline_mae:,.2f}',
    sep='\n',
)

In [None]:
print('\n' + '='*70)
print('MODEL 2: ENHANCED XGBOOST (TABULAR + IMAGE PCA)')
print('='*70)

# Train enhanced model
# Same depth and learning rate as the baseline cell, but 300 trees and fit on
# the combined tabular + image-PCA feature matrix.
enhanced_model = xgb.XGBRegressor(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.05,
    random_state=42,
    n_jobs=-1
)

print('Training enhanced XGBoost...')
enhanced_model.fit(X_train_all, y_train_split)

# Evaluate on the same validation rows used for the baseline
val_pred_enhanced = enhanced_model.predict(X_val_all)
enhanced_rmse = np.sqrt(mean_squared_error(y_val_split, val_pred_enhanced))
enhanced_r2 = r2_score(y_val_split, val_pred_enhanced)
enhanced_mae = mean_absolute_error(y_val_split, val_pred_enhanced)

print('\nEnhanced Results:')
print(f'  RMSE: ${enhanced_rmse:,.2f}')
print(f'  R² Score: {enhanced_r2:.4f}')
print(f'  MAE: ${enhanced_mae:,.2f}')

# Relative change in R² versus the baseline, in percent.
# R² can be zero or negative, so divide by its magnitude (keeps the sign of
# the change meaningful) and skip the ratio entirely when the baseline is 0.
r2_gain = enhanced_r2 - baseline_r2
if baseline_r2 != 0:
    improvement = (r2_gain / abs(baseline_r2)) * 100
    print(f'\nImprovement over baseline: {improvement:+.2f}%')
else:
    improvement = float('nan')
    print(f'\nImprovement over baseline: n/a (baseline R² is 0; absolute change {r2_gain:+.4f})')

In [None]:
rule = '=' * 70
print('\n' + rule)
print('MODEL COMPARISON')
print(rule)

# One record per model; key order determines the column order of the table
comparison_rows = [
    {
        'Model': 'Baseline XGBoost (Tabular Only)',
        'RMSE': baseline_rmse,
        'R² Score': baseline_r2,
        'MAE': baseline_mae,
    },
    {
        'Model': 'Enhanced XGBoost (Tabular + Image PCA)',
        'RMSE': enhanced_rmse,
        'R² Score': enhanced_r2,
        'MAE': enhanced_mae,
    },
]
comparison_df = pd.DataFrame(comparison_rows)

print('\n' + comparison_df.to_string(index=False))

# The enhanced model wins only on a strictly higher validation R²;
# a tie is reported as the baseline.
enhanced_wins = enhanced_r2 > baseline_r2
if enhanced_wins:
    best_model_name, best_r2 = 'Enhanced XGBoost', enhanced_r2
else:
    best_model_name, best_r2 = 'Baseline XGBoost', baseline_r2

print(f'\n{rule}')
print(f'BEST MODEL: {best_model_name}')
print(f'R² Score: {best_r2:.4f}')
print(rule)

# Persist the comparison table next to the other outputs
comparison_df.to_csv('outputs/model_comparison_alternative.csv', index=False)
print('\n✓ Comparison saved to outputs/model_comparison_alternative.csv')

In [None]:
print('\n' + '='*70)
print('CREATING VISUALIZATIONS')
print('='*70)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Shared across both panels
models = ['Baseline\n(Tabular)', 'Enhanced\n(Tab+Img PCA)']
colors = ['steelblue', 'green']
r2_scores = [baseline_r2, enhanced_r2]
rmse_scores = [baseline_rmse, enhanced_rmse]

# Each panel: axis, bar heights, y-label, title, and how to label a bar
panels = [
    (axes[0], r2_scores, 'R² Score', 'Model Comparison: R² Score',
     lambda value: f'{value:.4f}'),
    (axes[1], rmse_scores, 'RMSE ($)', 'Model Comparison: RMSE (Lower is Better)',
     lambda value: f'${value/1000:.0f}K'),
]

for ax, values, y_label, title, format_value in panels:
    bars = ax.bar(range(2), values, color=colors, alpha=0.8, edgecolor='black')
    ax.set_xticks(range(2))
    ax.set_xticklabels(models)
    ax.set_ylabel(y_label)
    ax.set_title(title)
    ax.grid(True, alpha=0.3, axis='y')

    # Write each value centred just above its bar
    for bar, value in zip(bars, values):
        x_center = bar.get_x() + bar.get_width()/2.
        ax.text(x_center, bar.get_height(),
                format_value(value),
                ha='center', va='bottom', fontweight='bold')

plt.tight_layout()
plt.savefig('outputs/model_comparison_alternative.png', dpi=300, bbox_inches='tight')
plt.show()

print('✓ Visualization saved to outputs/model_comparison_alternative.png')

In [None]:
# Report the model name chosen in the comparison cell
print(f'Using: {best_model_name}')

# Predict with whichever model has the strictly higher validation R²
# (a tie falls back to the baseline), pairing it with its own scaled test matrix.
if enhanced_r2 > baseline_r2:
    model_used, chosen_model, chosen_inputs = 'Enhanced XGBoost', enhanced_model, X_test_all_scaled
else:
    model_used, chosen_model, chosen_inputs = 'Baseline XGBoost', baseline_model, X_test_tabular_scaled
test_predictions = chosen_model.predict(chosen_inputs)

# Two-column submission table: row id and predicted price
submission = pd.DataFrame({'id': test_ids}).assign(predicted_price=test_predictions)

submission_path = '23113099_final_alternative.csv'
submission.to_csv(submission_path, index=False)

banner = '=' * 70
print(f'\n{banner}')
print('✓ PREDICTIONS COMPLETE')
print(banner)
print(f'Model used: {model_used}')
print(f'File created: {submission_path}')
print(f'Samples: {len(submission):,}')

# Summary statistics of the predictions, computed one line at a time
print('\nPrediction statistics:')
summary_stats = (
    ('Mean', test_predictions.mean),
    ('Median', lambda: np.median(test_predictions)),
    ('Min', test_predictions.min),
    ('Max', test_predictions.max),
)
for label, compute in summary_stats:
    print(f'  {label}: ${compute():,.0f}')

print('\nFirst 5 predictions:')
print(submission.head())

In [None]:
# Recap of the validation metrics computed in the earlier cells
print('\nResults Summary:')
print(
    f'  Baseline XGBoost: R² {baseline_r2:.4f}, RMSE ${baseline_rmse:,.0f}',
    f'  Enhanced XGBoost: R² {enhanced_r2:.4f}, RMSE ${enhanced_rmse:,.0f}',
    f'  Best Model: {best_model_name} (R² {best_r2:.4f})',
    sep='\n',
)

# Files written by the cells above
print(
    '\nFiles Generated:',
    '  • 23113099_final_alternative.csv (predictions)',
    '  • outputs/model_comparison_alternative.csv',
    '  • outputs/model_comparison_alternative.png',
    sep='\n',
)

closing_rule = '=' * 70
print('\n' + closing_rule)
print('✓ ALL TASKS COMPLETE')
print(closing_rule)

# Pointer back to the primary notebook for the full analysis
print(
    '\nNote: This alternative workflow provides baseline results.',
    'For comprehensive analysis including:',
    '  - Neural network fusion architecture',
    '  - Grad-CAM visual explainability',
    '  - Comprehensive EDA',
    '  - Detailed feature analysis',
    '\nPlease use PropertyValuation_SatelliteImagery_23113099.ipynb instead.',
    sep='\n',
)