# Feature Generation for Bathymetry Modeling

This notebook generates additional features from the preprocessed Sentinel-2 data for bathymetry modeling.

Steps:
1. Load preprocessed features and water mask
2. Generate additional spectral features
3. Generate synthetic depth labels and create the scaled training dataset
4. Save features for model training

## Parameters

In [None]:
# Parameters
# Run-level inputs. Any value left as None is replaced by a default in a later cell.
region_name = "Lakshadweep"  # region label; lower-cased and underscored later to build directory names
features_path = None  # None -> <processed data dir>/features.npy
output_dir = None  # None -> <project dir>/outputs/<region slug>

## Setup and Configuration

In [None]:
import os
import sys
import json
import numpy as np
import matplotlib.pyplot as plt
import logging
from pathlib import Path
from sklearn.preprocessing import StandardScaler

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
logger = logging.getLogger(__name__)


def _read_json(path):
    """Parse the JSON file at ``path`` and return the decoded object.

    The file is opened as UTF-8 explicitly: without an ``encoding`` argument
    ``open`` uses the platform's locale encoding, which can mis-decode or
    reject non-ASCII configuration values on some systems.
    """
    with open(path, encoding='utf-8') as fh:
        return json.load(fh)


# Add project root to path
project_dir = Path().absolute().parent
if project_dir.name != 'sdb_project':
    project_dir = project_dir / 'sdb_project'
# Only append once so that re-running this cell does not keep growing sys.path
if str(project_dir) not in sys.path:
    sys.path.append(str(project_dir))

# Load configurations (this file is required: a missing file raises)
config_path = project_dir / 'config' / 'location_config.json'
config = _read_json(config_path)

# Load notebook configuration with SAFE file information (optional)
notebook_config_path = project_dir / 'config' / 'notebook_config.json'
notebook_config = {}
if notebook_config_path.exists():
    notebook_config = _read_json(notebook_config_path)
    logger.info(f"Loaded notebook config for region: {notebook_config.get('region', 'Unknown')}")
    logger.info(f"Using SAFE file: {notebook_config.get('safe_file_name', 'Not set')}")

# Load band extraction configuration (optional)
band_config_path = project_dir / 'config' / 'band_extraction_config.json'
band_config = {}
if band_config_path.exists():
    band_config = _read_json(band_config_path)
    logger.info(f"Loaded band config with {len(band_config.get('bands', []))} bands")

# Use parameters if provided, otherwise use configuration
if region_name is None:
    region_name = notebook_config.get('region', config.get('region_name', 'unknown'))

# Set up paths using the new configuration
# NOTE: data_root and output_root are only reported in the log below; the
# directories actually used by this notebook are derived from project_dir and
# the region slug further down.
safe_file_path = notebook_config.get('safe_file_path', '')
data_root = notebook_config.get('data_root', f'data/sentinel/{region_name}')
output_root = notebook_config.get('output_root', f'outputs/{region_name}')

logger.info(f"Working with region: {region_name}")
logger.info(f"Data root: {data_root}")
logger.info(f"Output root: {output_root}")
if safe_file_path:
    logger.info(f"SAFE file: {Path(safe_file_path).name}")

# Set up paths
region_slug = region_name.lower().replace(' ', '_')
processed_data_dir = project_dir / 'data' / 'sentinel' / region_slug / 'processed'
if output_dir is None:
    output_dir = project_dir / 'outputs' / region_slug

# output_dir may be a str (parameter) or a Path (default); Path() accepts both
Path(output_dir).mkdir(parents=True, exist_ok=True)

print(f"‚úÖ Feature generation for: {region_name}")
print(f"üìÅ Data directory: {processed_data_dir}")
print(f"üìÅ Output directory: {output_dir}")

Using data from: test_mangalore\data\sentinel\mangalore
Output directories created at: data\processed


## Load Preprocessed Data

Load the features and water mask from the preprocessing step.

In [None]:
# Load preprocessed data
if features_path is None:
    features_path = processed_data_dir / 'features.npy'
# A features_path supplied through the parameters cell may be a plain string;
# normalise it so the Path methods used below work either way.
features_path = Path(features_path)

water_mask_path = processed_data_dir / 'water_mask.npy'
metadata_path = processed_data_dir / 'processing_metadata.json'

# Check if files exist
if not features_path.exists():
    raise FileNotFoundError(f"Features file not found: {features_path}")
if not water_mask_path.exists():
    raise FileNotFoundError(f"Water mask not found: {water_mask_path}")

# Load data
features = np.load(features_path)
water_mask = np.load(water_mask_path)

print(f"‚úÖ Loaded features with shape: {features.shape}")
print(f"‚úÖ Loaded water mask with shape: {water_mask.shape}")

# Load metadata if available
feature_names = ['B02', 'B03', 'B04', 'B08']  # Default names
if metadata_path.exists():
    with open(metadata_path, encoding='utf-8') as f:
        metadata = json.load(f)
    # 'processing_info' is optional: keep the defaults instead of raising
    # KeyError when the metadata file does not carry it.
    processing_info = metadata.get('processing_info') or {}
    if 'feature_names' in processing_info:
        feature_names = list(processing_info['feature_names'])

# Later cells index bands on the last axis of `features`, so the name list must
# have exactly one entry per channel. Reconcile it here so the saved metadata
# cannot disagree with the saved array.
n_channels = features.shape[-1]
if len(feature_names) != n_channels:
    logger.warning(
        f"Got {len(feature_names)} feature names for {n_channels} feature channels; "
        "adjusting the name list to match the data"
    )
    feature_names = list(feature_names[:n_channels]) + [
        f'feature_{idx}' for idx in range(len(feature_names), n_channels)
    ]

print(f"üìä Feature names: {feature_names}")

Extracting B02...
Found B02 at test_mangalore\data\sentinel\mangalore\43PGQ_B02.jp2
Extracting B03...
Found B03 at test_mangalore\data\sentinel\mangalore\43PGQ_B03.jp2
Extracting B03...
Found B03 at test_mangalore\data\sentinel\mangalore\43PGQ_B03.jp2
Extracting B04...
Found B04 at test_mangalore\data\sentinel\mangalore\43PGQ_B04.jp2
Extracting B04...
Found B04 at test_mangalore\data\sentinel\mangalore\43PGQ_B04.jp2
Extracting B08...
Found B08 at test_mangalore\data\sentinel\mangalore\43PGQ_B08.jp2
Extracting B08...
Found B08 at test_mangalore\data\sentinel\mangalore\43PGQ_B08.jp2
Band extraction complete!
Band extraction complete!


## Generate Additional Features

Create additional spectral features that are useful for bathymetry estimation.

In [None]:
# Extract water pixels only: result is a 2-D array with one row per water pixel
# and one column per feature channel.
# Force a boolean mask first. A mask stored as 0/1 integers would otherwise be
# interpreted by NumPy as integer (fancy) indices and silently select the wrong data.
water_mask_bool = np.asarray(water_mask).astype(bool)
if water_mask_bool.shape != features.shape[:water_mask_bool.ndim]:
    raise ValueError(
        f"Water mask shape {water_mask_bool.shape} does not match "
        f"features shape {features.shape}"
    )
water_pixels = features[water_mask_bool]
print(f"üìä Water pixels for analysis: {len(water_pixels)}")

if len(water_pixels) == 0:
    raise ValueError("No water pixels found! Check water mask.")

# Remove rows containing NaN or +/-inf; infinite values would make the
# StandardScaler fit in the training-data cell fail.
valid_mask = np.isfinite(water_pixels).all(axis=1)
water_pixels = water_pixels[valid_mask]
print(f"üìä Valid water pixels: {len(water_pixels)}")

if len(water_pixels) == 0:
    raise ValueError("No valid water pixels left after removing NaN/inf values.")

# Generate additional features from the spectral bands
def generate_spectral_features(pixels):
    """Derive band-ratio and log-band features from per-pixel band values.

    Args:
        pixels: 2-D array with one row per pixel. Columns 0-3 are read as
            B02, B03, B04 and B08 in that order; any further columns are ignored.

    Returns:
        A ``(features, names)`` tuple. ``features`` holds one column per
        retained derived feature and ``names`` lists them in column order.
        A derived feature is retained only if all of its values are finite.
        When nothing is retained, ``features`` is an empty array of shape
        ``(len(pixels), 0)`` and ``names`` is an empty list.

    Raises:
        ValueError: if ``pixels`` has fewer than 4 columns.
    """
    band_count = pixels.shape[1]
    if band_count < 4:
        raise ValueError(f"Expected at least 4 bands, got {band_count}")

    # Column order is assumed to be B02, B03, B04, B08, [indices...]
    blue, green, red, nir = (pixels[:, col] for col in range(4))

    ratio_eps = 1e-8  # keeps the ratio denominators away from exactly zero
    log_eps = 1e-6    # keeps log() defined for zero-valued pixels

    # Candidate features in output order: band ratios first, then log bands
    candidates = [
        ('B02_B04_ratio', blue / (red + ratio_eps)),
        ('B03_B04_ratio', green / (red + ratio_eps)),
        ('B02_B03_ratio', blue / (green + ratio_eps)),
        ('B08_B04_ratio', nir / (red + ratio_eps)),
        ('log_B02', np.log(blue + log_eps)),
        ('log_B03', np.log(green + log_eps)),
        ('log_B04', np.log(red + log_eps)),
    ]

    # A single NaN or inf anywhere in a candidate discards that whole feature
    kept = [(label, column) for label, column in candidates
            if np.isfinite(column).all()]

    if not kept:
        return np.empty((len(pixels), 0)), []

    kept_names = [label for label, _ in kept]
    stacked = np.hstack([column[:, np.newaxis] for _, column in kept])
    return stacked, kept_names

# Derive the extra spectral features for the valid water pixels
additional_features, additional_names = generate_spectral_features(water_pixels)
print(f"‚úÖ Generated {len(additional_names)} additional features: {additional_names}")

# Start from the original bands and widen the matrix only when at least one
# derived feature was retained.
all_features = water_pixels
all_feature_names = feature_names
has_derived = additional_features.size > 0
if has_derived:
    all_features = np.concatenate((water_pixels, additional_features), axis=1)
    all_feature_names = feature_names + additional_names

n_total_features = all_features.shape[1]
print(f"üìä Total features: {n_total_features}")
print(f"üìä Feature names: {all_feature_names}")

QA60 band not available, skipping cloud masking...
Cloud masking applied to all bands
Cloud masking applied to all bands


## Generate Synthetic Depth Data

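Since we don't have real bathymetry data, generate synthetic depth values to use as placeholder training targets. These depths are computed from the spectral bands themselves (plus random noise), so they are only suitable for exercising the pipeline, not for estimating real water depth.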

In [None]:
# Generate synthetic depth data for training
# This is a placeholder - in reality you would use ICESat-2, sonar, or other depth measurements
# Seed NumPy's global random state so the noise drawn with np.random.normal in
# generate_synthetic_depths (defined below) is the same on every fresh run.
np.random.seed(42)  # Reproducible results

def generate_synthetic_depths(features):
    """Generate placeholder depth values from spectral features.

    The depth is built from three parts: an inverted, min-max normalised blue
    band scaled to 2-27 m, a green/blue ratio term (when a second column
    exists), and Gaussian noise drawn from NumPy's global random state.
    The sum is clipped to the range 0.5-50 m.

    Args:
        features: 2-D array with one row per pixel. Column 0 is read as the
            blue band (B02) and column 1, if present, as the green band (B03).

    Returns:
        1-D array with one synthetic depth (in meters) per input row.
    """
    # Use blue band (B02) as primary depth indicator
    blue_band = features[:, 0]  # First feature should be B02

    # Min-max normalise the blue band
    blue_min = np.nanmin(blue_band)
    blue_span = np.nanmax(blue_band) - blue_min
    if blue_span > 0:
        blue_norm = (blue_band - blue_min) / blue_span
    else:
        # A constant band (or a single pixel) has no gradient to normalise;
        # dividing by the zero span would turn every depth into NaN. Use the
        # mid-point so the base depth lands in the middle of its range.
        blue_norm = np.full(blue_band.shape, 0.5)

    # Create depth relationship: deeper water = lower reflectance in blue
    # Inverted and scaled: the base depth alone spans 2-27 meters
    base_depth = (1 - blue_norm) * 25.0 + 2.0

    # Use green/blue ratio for additional variation (5 m per unit of ratio)
    if features.shape[1] > 1:
        green_blue_ratio = features[:, 1] / (features[:, 0] + 1e-8)
        base_depth = base_depth + green_blue_ratio * 5.0

    # Add random noise (measurement uncertainty), standard deviation 1.5 m
    noise = np.random.normal(0, 1.5, len(base_depth))
    synthetic_depth = base_depth + noise

    # Keep depths positive and bounded
    synthetic_depth = np.clip(synthetic_depth, 0.5, 50.0)

    return synthetic_depth

# Produce one synthetic depth per row of the combined feature matrix
depths = generate_synthetic_depths(all_features)
print(f"‚úÖ Generated {len(depths)} synthetic depth measurements")
print(f"üìä Depth range: {np.min(depths):.1f} - {np.max(depths):.1f} meters")
print(f"üìä Mean depth: {np.mean(depths):.1f} ¬± {np.std(depths):.1f} meters")

# Two-panel figure: depth histogram (left), depth against blue band (right)
fig, (hist_ax, scatter_ax) = plt.subplots(1, 2, figsize=(12, 4))

hist_ax.hist(depths, bins=30, alpha=0.7, edgecolor='black')
hist_ax.set_xlabel('Depth (meters)')
hist_ax.set_ylabel('Frequency')
hist_ax.set_title('Synthetic Depth Distribution')
hist_ax.grid(True, alpha=0.3)

# Column 0 of the feature matrix is the blue band
scatter_ax.scatter(all_features[:, 0], depths, alpha=0.5, s=1)
scatter_ax.set_xlabel('Blue Band Reflectance (B02)')
scatter_ax.set_ylabel('Depth (meters)')
scatter_ax.set_title('Depth vs Blue Band')
scatter_ax.grid(True, alpha=0.3)

fig.tight_layout()
fig.savefig(Path(output_dir) / 'synthetic_depth_analysis.png', dpi=150, bbox_inches='tight')
plt.show()

Applying atmospheric correction...


INFO:preprocess:Successfully corrected 4 bands


Atmospheric correction complete!


## Prepare Training Dataset

Scale features and prepare the final dataset for model training.

In [None]:
# joblib is used further down to persist the fitted scaler
import joblib

# Feature scaling
scaler = StandardScaler()
scaled_features = scaler.fit_transform(all_features)

print(f"‚úÖ Scaled features to zero mean and unit variance")
print(f"üìä Feature means: {np.mean(scaled_features, axis=0).round(3)}")
print(f"üìä Feature stds: {np.std(scaled_features, axis=0).round(3)}")

# Take the counts from the array that is actually written to disk, so the
# metadata stays correct even if the name list disagrees with the data.
n_samples, n_features = scaled_features.shape
if len(all_feature_names) != n_features:
    logger.warning(
        f"{len(all_feature_names)} feature names for {n_features} feature columns; "
        "feature_names in the metadata will not line up with the saved array"
    )

# Save training data
training_data_dir = processed_data_dir / 'training_data'
training_data_dir.mkdir(parents=True, exist_ok=True)

# Save features and depths
np.save(training_data_dir / 'features.npy', scaled_features)
np.save(training_data_dir / 'depths.npy', depths)

# Save scaler so the same transform can be re-applied to new data
scaler_path = training_data_dir / 'feature_scaler.joblib'
joblib.dump(scaler, scaler_path)

print(f"‚úÖ Saved training features: {training_data_dir / 'features.npy'}")
print(f"‚úÖ Saved depths: {training_data_dir / 'depths.npy'}")
print(f"‚úÖ Saved scaler: {scaler_path}")

# Save feature metadata
feature_metadata = {
    'region_name': region_name,
    'n_samples': n_samples,
    'n_features': n_features,
    'feature_names': all_feature_names,
    # Provenance flag: the depths saved above come from generate_synthetic_depths,
    # not from measurements.
    'depths_are_synthetic': True,
    'depth_stats': {
        'min': float(np.min(depths)),
        'max': float(np.max(depths)),
        'mean': float(np.mean(depths)),
        'std': float(np.std(depths))
    },
    'water_pixel_count': len(water_pixels),
    'training_data_paths': {
        'features': str(training_data_dir / 'features.npy'),
        'depths': str(training_data_dir / 'depths.npy'),
        'scaler': str(scaler_path)
    }
}

# NOTE: this rebinds metadata_path, which earlier pointed at processing_metadata.json
metadata_path = training_data_dir / 'feature_metadata.json'
with open(metadata_path, 'w', encoding='utf-8') as f:
    json.dump(feature_metadata, f, indent=2)

print(f"‚úÖ Saved metadata: {metadata_path}")
print(f"\nüéØ Feature generation complete! Ready for model training.")
print(f"üìä Dataset summary:")
print(f"   - Samples: {feature_metadata['n_samples']:,}")
print(f"   - Features: {feature_metadata['n_features']} ({', '.join(all_feature_names[:5])}...)")
print(f"   - Depth range: {feature_metadata['depth_stats']['min']:.1f} - {feature_metadata['depth_stats']['max']:.1f}m")

Calculating water indices...


INFO:preprocess:Calculated indices: ['NDWI', 'MNDWI', 'SR', 'BR_ratio']


Creating water mask...


INFO:preprocess:Created water mask with 25437 water pixels


Water indices and mask calculation complete!
Available indices: ['NDWI', 'MNDWI', 'SR', 'BR_ratio']
Number of water pixels: 25437
