# Data Preprocessing (Alternative Workflow)
## Enrollment: 23113099

---

## ⚠️ NOTE: This notebook is not used in the primary workflow


## Primary Implementation

All preprocessing, model training, and analysis are contained in:
**→ `PropertyValuation_SatelliteImagery_23113099.ipynb`**

This single notebook includes:
- ✅ Data loading and preprocessing
- ✅ Satellite image acquisition
- ✅ CNN feature extraction
- ✅ Exploratory data analysis
- ✅ Baseline model training (XGBoost)
- ✅ Neural network fusion
- ✅ Enhanced XGBoost (tabular + image PCA)
- ✅ Grad-CAM explainability
- ✅ Comprehensive performance comparison
- ✅ Final predictions generation

---

## Why This File Exists

This file is included only to satisfy the three-file structure recommended in the guidelines. Following the clarification that a single notebook is acceptable, the entire implementation lives in `PropertyValuation_SatelliteImagery_23113099.ipynb`, which provides:
- Complete workflow visibility
- Easier reproducibility
- Better documentation
- Comprehensive analysis

---

## To Run the Project

1. Open `PropertyValuation_SatelliteImagery_23113099.ipynb`
2. Run all cells sequentially
3. All outputs, models, and predictions are generated automatically

**No need to run this file.**

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import torch
import torchvision.models as models
from torchvision import transforms
from PIL import Image
from pathlib import Path
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import pickle
import warnings
# Blanket filter: with no category or module given, this hides every warning
# (including deprecation notices from the libraries imported above) until the
# warning filters are changed again.
warnings.filterwarnings('ignore')

# Confirmation that the import cell ran to completion
print('Libraries imported successfully')

### **Configuration**

In [None]:
# Pipeline settings read by the cells below
SAMPLE_SIZE = None  # None -> full dataset; an integer keeps only the first N rows
ZOOM_LEVEL = 17     # handed to the image fetchers as `zoom`
IMAGE_SIZE = 224    # handed to the image fetchers as `image_size`

# Echo the active settings between two horizontal rules
banner = '='*70
sample_label = SAMPLE_SIZE if SAMPLE_SIZE is not None else "Full dataset"
print('\n'.join([
    banner,
    'CONFIGURATION',
    banner,
    f'Sample size: {sample_label}',
    f'Zoom level: {ZOOM_LEVEL}',
    f'Image size: {IMAGE_SIZE}x{IMAGE_SIZE}',
    banner,
]))

### **Load Data**

In [None]:
# Read the raw train/test tables from disk
train_data = pd.read_csv('data/raw/train.csv')
test_data = pd.read_csv('data/raw/test.csv')

# A truthy SAMPLE_SIZE trims both tables to their first SAMPLE_SIZE rows
if SAMPLE_SIZE:
    train_data, test_data = train_data.head(SAMPLE_SIZE), test_data.head(SAMPLE_SIZE)

n_train_rows, n_test_rows = len(train_data), len(test_data)
n_train_columns = train_data.shape[1]

print(f'\nTraining samples: {n_train_rows:,}')
print(f'Test samples: {n_test_rows:,}')
print(f'Features: {n_train_columns}')

print('\n✓ Data loaded successfully')

### **Download Satellite Images**

In [None]:
from data_fetcher import SatelliteImageFetcher

# One fetcher per split; both share the zoom level and image size from the
# configuration cell and differ only in their output directory.
fetcher_settings = {'zoom': ZOOM_LEVEL, 'image_size': IMAGE_SIZE}
train_fetcher = SatelliteImageFetcher(output_dir='data/images/train', **fetcher_settings)
test_fetcher = SatelliteImageFetcher(output_dir='data/images/test', **fetcher_settings)

# fetch_dataset is given a CSV path, so the (possibly truncated) tables are
# written to temporary CSV files first.
train_data.to_csv('temp_train_sample.csv', index=False)
test_data.to_csv('temp_test_sample.csv', index=False)

# Keyword arguments common to both download calls
fetch_options = {
    'method': 'esri',
    'lat_col': 'lat',
    'lon_col': 'long',
    'id_col': 'id',
    'delay': 0.1,
}

# Training split
print('\nDownloading training images...')
train_results_df = train_fetcher.fetch_dataset(csv_path='temp_train_sample.csv', **fetch_options)

# Test split
print('\nDownloading test images...')
test_results_df = test_fetcher.fetch_dataset(csv_path='temp_test_sample.csv', **fetch_options)

# Report how many rows each fetch returned
separator = '='*70
n_train_results, n_test_results = len(train_results_df), len(test_results_df)
print('\n' + separator)
print(f'✓ Training images: {n_train_results:,}')
print(f'✓ Test images: {n_test_results:,}')
print(separator)

### **Extract CNN Features**

In [None]:
# Run on the GPU when CUDA is available, otherwise on the CPU
use_cuda = torch.cuda.is_available()
device = torch.device('cuda') if use_cuda else torch.device('cpu')
print(f'Using device: {device}')

# Load ResNet50 and keep every child module except the last one, so the
# network returns features rather than the output of its final layer.
# NOTE(review): newer torchvision releases favour `weights=` over
# `pretrained=` — confirm the installed version before changing this call.
print('\nLoading ResNet50...')
feature_layers = list(models.resnet50(pretrained=True).children())[:-1]
resnet = torch.nn.Sequential(*feature_layers).eval().to(device)
print('✓ ResNet50 loaded')

# Image transforms: resize, convert to tensor, then per-channel normalisation
channel_mean = [0.485, 0.456, 0.406]
channel_std = [0.229, 0.224, 0.225]
preprocessing_steps = [
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=channel_mean, std=channel_std),
]
transform = transforms.Compose(preprocessing_steps)

# Feature extraction function
def extract_cnn_features(results_df, desc="Extracting"):
    """Run every readable image listed in ``results_df`` through ``resnet``.

    Uses the notebook-level ``transform``, ``resnet`` and ``device`` objects.

    Args:
        results_df: DataFrame with an ``image_path`` and an ``id`` column,
            one row per image.
        desc: Label shown on the progress bar and in the skip summary.

    Returns:
        Tuple ``(features, valid_ids)`` of NumPy arrays. ``features[i]`` is the
        squeezed network output for the image whose id is ``valid_ids[i]``.
        Rows whose image file is missing or cannot be processed are left out
        of both arrays; a summary of the skipped rows is printed.
    """
    features = []
    valid_ids = []
    missing_count = 0
    failures = []  # (path, exception) for images that exist but could not be processed

    for _, row in tqdm(results_df.iterrows(), total=len(results_df), desc=desc):
        img_path = Path(row['image_path'])
        if not img_path.exists():
            missing_count += 1
            continue

        try:
            # The context manager releases the file handle as soon as the
            # pixels have been copied into the converted image.
            with Image.open(img_path) as raw_img:
                img = raw_img.convert('RGB')
            img_tensor = transform(img).unsqueeze(0).to(device)

            with torch.no_grad():
                feature = resnet(img_tensor).squeeze().cpu().numpy()

            # Read the id before appending so the two lists cannot get out of step
            property_id = row['id']
            features.append(feature)
            valid_ids.append(property_id)
        except Exception as exc:
            # Best effort: one bad image must not abort the whole run, but it
            # is recorded so that it shows up in the summary below.
            failures.append((img_path, exc))
            continue

    if missing_count or failures:
        print(f'{desc}: skipped {missing_count} missing and {len(failures)} unprocessable image(s)')
        # Show only the first few errors to keep the cell output short
        for failed_path, exc in failures[:5]:
            print(f'  {failed_path}: {type(exc).__name__}: {exc}')

    return np.array(features), np.array(valid_ids)

# Run the extractor over both splits; each call returns (features, ids)
print('\nExtracting training features...')
train_cnn_features, train_valid_ids = extract_cnn_features(train_results_df, desc="Training")

print('Extracting test features...')
test_cnn_features, test_valid_ids = extract_cnn_features(test_results_df, desc="Test")

# Report the shape of each feature array
shape_rule = '='*70
print('\n' + shape_rule)
print(f'✓ Training features: {train_cnn_features.shape}')
print(f'✓ Test features: {test_cnn_features.shape}')
print(shape_rule)

### **Normalize CNN Features**

In [None]:
# Normalize: fit the scaler on the training features only, then apply the same
# scaling to the test features.
# Caveat: this cell overwrites train_cnn_features / test_cnn_features, so
# re-running it without re-running the extraction cell scales already-scaled data.
cnn_scaler = StandardScaler()
train_cnn_features = cnn_scaler.fit_transform(train_cnn_features)
test_cnn_features = cnn_scaler.transform(test_cnn_features)

print('Normalized to:')
print(f'  Mean: {train_cnn_features.mean():.4f}')
print(f'  Std: {train_cnn_features.std():.4f}')

# Save scaler. The output directory is created first so a fresh checkout does
# not fail here after the expensive extraction step.
scaler_path = Path('outputs/cnn_scaler.pkl')
scaler_path.parent.mkdir(parents=True, exist_ok=True)
with scaler_path.open('wb') as f:
    pickle.dump(cnn_scaler, f)

print('\n✓ CNN scaler saved to outputs/cnn_scaler.pkl')

### **Apply PCA**

In [None]:
requested_components = 20
# PCA cannot produce more components than min(n_samples, n_features), which
# matters when SAMPLE_SIZE is set to a small number for a test run.
n_components = min(requested_components, *train_cnn_features.shape)
if n_components < requested_components:
    print(f'Note: using {n_components} PCA components '
          f'(requested {requested_components}, limited by data shape {train_cnn_features.shape})')

pca = PCA(n_components=n_components, random_state=42)

# Fit on the training features only; the test features reuse the fitted projection
train_cnn_pca = pca.fit_transform(train_cnn_features)
test_cnn_pca = pca.transform(test_cnn_features)

print(f'Original dimensions: {train_cnn_features.shape[1]}')
print(f'Reduced to: {train_cnn_pca.shape[1]}')
print(f'Variance explained: {pca.explained_variance_ratio_.sum():.1%}')

# Save PCA. The output directory is created first so the write cannot fail on
# a fresh checkout.
pca_path = Path('outputs/cnn_pca.pkl')
pca_path.parent.mkdir(parents=True, exist_ok=True)
with pca_path.open('wb') as f:
    pickle.dump(pca, f)

print('\n✓ PCA transformer saved to outputs/cnn_pca.pkl')

### **Prepare Tabular Features**

In [None]:
# Every training column except the identifier, the date and the target
excluded_cols = {'id', 'date', 'price'}
feature_cols = [col for col in train_data.columns if col not in excluded_cols]

print(f'Tabular features: {len(feature_cols)}')

# Pull the feature matrices and the target out as NumPy arrays
X_train_tab = train_data[feature_cols].to_numpy()
X_test_tab = test_data[feature_cols].to_numpy()
y_train = train_data['price'].to_numpy()

# Scale tabular features: statistics come from the training split only
scaler = StandardScaler()
scaler.fit(X_train_tab)
X_train_scaled = scaler.transform(X_train_tab)
X_test_scaled = scaler.transform(X_test_tab)

print('\n✓ Tabular features scaled')

### **Create Enhanced Datasets**

In [None]:
pca_cols = [f'cnn_pc{i+1}' for i in range(n_components)]


def attach_pca_features(data, valid_ids, pca_values, columns):
    """Return the rows of ``data`` that have image features, with the features appended.

    Args:
        data: DataFrame with an ``id`` column.
        valid_ids: Sequence of property ids; ``valid_ids[i]`` owns ``pca_values[i]``.
        pca_values: 2-D array with one row per entry of ``valid_ids`` and one
            column per entry of ``columns``.
        columns: Names for the appended feature columns.

    Returns:
        A new DataFrame holding only the rows of ``data`` whose ``id`` occurs
        in ``valid_ids``, in their original order, with ``columns`` added.
        ``data`` itself is not modified.
    """
    # One dictionary lookup per row replaces a full-column comparison per id.
    # When an id occurs more than once in valid_ids the last occurrence wins,
    # exactly as it would if the values were assigned one id at a time.
    row_of_id = {property_id: row for row, property_id in enumerate(valid_ids)}
    pca_row = data['id'].map(row_of_id)
    has_image = pca_row.notna()

    enhanced = data.loc[has_image].copy()
    # float64 keeps the CSV output independent of the dtype the PCA step produced
    selected = np.asarray(pca_values, dtype=np.float64)[pca_row[has_image].to_numpy(dtype=int)]
    for col_idx, col_name in enumerate(columns):
        enhanced[col_name] = selected[:, col_idx]

    # Drop any row that still has a missing feature value
    return enhanced.dropna(subset=columns)


# Create enhanced training and test data
train_data_enhanced = attach_pca_features(train_data, train_valid_ids, train_cnn_pca, pca_cols)
test_data_enhanced = attach_pca_features(test_data, test_valid_ids, test_cnn_pca, pca_cols)

print(f'Training samples with images: {len(train_data_enhanced):,}')
print(f'Test samples with images: {len(test_data_enhanced):,}')

# Save enhanced datasets (the target directory is created if it is missing)
Path('data/processed').mkdir(parents=True, exist_ok=True)
train_data_enhanced.to_csv('data/processed/train_enhanced.csv', index=False)
test_data_enhanced.to_csv('data/processed/test_enhanced.csv', index=False)

print('\n✓ Enhanced datasets saved to data/processed/')

In [None]:
import json

rule = '='*70
print('\n' + rule)
print('PREPROCESSING COMPLETE')
print(rule)

# Counts reported both in the metadata file and in the printed summary
n_train_rows = len(train_data_enhanced)
n_test_rows = len(test_data_enhanced)
n_tabular = len(feature_cols)
n_total = n_tabular + n_components

# Save metadata
metadata = {
    'train_samples': n_train_rows,
    'test_samples': n_test_rows,
    'tabular_features': n_tabular,
    'pca_components': n_components,
    'total_features': n_total,
    'zoom_level': ZOOM_LEVEL,
    'image_size': IMAGE_SIZE
}

with open('outputs/preprocessing_metadata.json', 'w') as f:
    f.write(json.dumps(metadata, indent=2))

# Artefacts written by this notebook
created_files = [
    'data/processed/train_enhanced.csv',
    'data/processed/test_enhanced.csv',
    'outputs/cnn_scaler.pkl',
    'outputs/cnn_pca.pkl',
    'outputs/preprocessing_metadata.json',
]
print('\nFiles created:')
for created_file in created_files:
    print(f'  • {created_file}')

print('\nDataset Summary:')
print(f'  Training samples: {n_train_rows:,}')
print(f'  Test samples: {n_test_rows:,}')
print(f'  Tabular features: {n_tabular}')
print(f'  Image features (PCA): {n_components}')
print(f'  Total features: {n_total}')

print('\n' + rule)
print('✓ Ready for model training!')
print('  → Run model_training.ipynb next')
print(rule)