# CAMELS-SPAT Large Sample Experiment Tutorial

This notebook demonstrates how to run CONFLUENCE over multiple watersheds from the CAMELS-SPAT dataset for large-sample hydrology analysis.

## 1. Setup and Imports

In [None]:
import getpass
import os
import re
import subprocess
import sys
from datetime import datetime
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import yaml

# Add CONFLUENCE to path
confluence_path = Path('../').resolve()
sys.path.append(str(confluence_path))

# Set up plotting style
plt.style.use('default')
%matplotlib inline

print("Setup complete!")

## 2. Configure the Experiment

In [None]:
# Configuration for the large sample experiment.
# The code checkout and the shapefile directory are machine-specific, so both can be
# overridden through environment variables; the defaults are the original locations.
confluence_code_dir = Path(os.environ.get('CONFLUENCE_CODE_DIR', '/home/darri.eythorsson/code/CONFLUENCE'))
camelsspat_shape_dir = os.environ.get(
    'CAMELS_SPAT_SHAPE_DIR',
    '/work/comphyd_lab/data/_to-be-moved/camels-spat-upload/shapefiles/meso-scale/shapes-distributed',
)

# Paths are kept as plain strings so the dict serialises cleanly to YAML below
experiment_config = {
    'dataset': 'camels-spat',  # CAMELS-SPAT dataset
    'max_watersheds': 5,  # Number of watersheds to process
    'dry_run': False,  # Set to True to test without submitting jobs
    'experiment_name': 'camelsspat_tutorial',
    'template_config': str(confluence_code_dir / '0_config_files' / 'config_distributed_basin_template.yaml'),
    'config_dir': str(confluence_code_dir / '0_config_files' / 'camels_spat'),
    'camelsspat_script': str(confluence_code_dir / '9_scripts' / 'run_watersheds_camelsspat.py'),
    'camelsspat_dir': camelsspat_shape_dir,
    'metadata_csv': 'camels-spat-metadata.csv'
}

# Create experiment directory
experiment_dir = Path(f"./experiments/{experiment_config['experiment_name']}")
experiment_dir.mkdir(parents=True, exist_ok=True)

# Save configuration alongside the experiment outputs
with open(experiment_dir / 'experiment_config.yaml', 'w') as f:
    yaml.dump(experiment_config, f)

print(f"Experiment configured: {experiment_config['experiment_name']}")
print(f"Processing {experiment_config['max_watersheds']} watersheds from {experiment_config['dataset']}")

## 3. Extract Watershed Information from CAMELS-SPAT Shapefiles

In [None]:
# Make the directory holding the CAMELS-SPAT script importable, then pull in its shapefile scanner
scripts_dir = Path(experiment_config['camelsspat_script']).parent
sys.path.append(str(scripts_dir))
from run_watersheds_camelsspat import extract_shapefile_info

# The extracted table is cached as a CSV inside the experiment directory
watersheds_csv = experiment_dir / 'camelsspat_watersheds.csv'

if not watersheds_csv.exists():
    # No cache yet: scan the shapefile directory and store the result
    print(f"Extracting watershed information from {experiment_config['camelsspat_dir']}...")
    watersheds_df = extract_shapefile_info(experiment_config['camelsspat_dir'])
    watersheds_df.to_csv(watersheds_csv, index=False)
    print(f"Saved watershed information to {watersheds_csv}")
else:
    # Cache hit: reuse the previously extracted table
    print(f"Loading existing watershed information from {watersheds_csv}")
    watersheds_df = pd.read_csv(watersheds_csv)

print(f"\nFound {len(watersheds_df)} watersheds")
print("\nFirst 5 watersheds:")
preview_columns = ['ID', 'Basin_File', 'River_File', 'Area_km2']
print(watersheds_df.head()[preview_columns])

## 4. Load and Merge CAMELS-SPAT Metadata

In [None]:
# Try to load CAMELS-SPAT metadata if available
metadata_path = experiment_config['metadata_csv']

if 'Metadata_ID' in watersheds_df.columns:
    # This cell already ran: merging again would duplicate the metadata columns.
    print("Metadata already merged into watersheds_df; skipping merge.")
elif os.path.exists(metadata_path):
    print(f"Loading CAMELS-SPAT metadata from {metadata_path}")
    # Read the join key as text so numeric-looking IDs keep any leading zeros
    metadata_df = pd.read_csv(metadata_path, dtype={'ID': str})

    # Clean column names
    metadata_df.columns = [col.strip() for col in metadata_df.columns]
    print(f"Found metadata with {len(metadata_df)} rows and columns: {metadata_df.columns.tolist()}")

    if 'ID' not in metadata_df.columns:
        print(f"Metadata file {metadata_path} has no 'ID' column. Proceeding with shapefile information only.")
    else:
        # Covers the case where the header only became 'ID' after stripping whitespace
        metadata_df['ID'] = metadata_df['ID'].astype(str)

        # Standardized ID for merging: the shapefile ID without its upper-case prefix.
        # astype(str) keeps .str usable even if the cached CSV was read back as numbers.
        watersheds_with_key = watersheds_df.assign(
            Metadata_ID=watersheds_df['ID'].astype(str).str.replace(r'^[A-Z]+_', '', regex=True)
        )

        # Merge metadata with watershed information
        print("\nMerging shapefile information with metadata...")
        watersheds_merged = pd.merge(
            watersheds_with_key,
            metadata_df,
            left_on='Metadata_ID',
            right_on='ID',
            how='left',
            suffixes=('', '_metadata')
        )

        # Save merged data
        watersheds_merged.to_csv(experiment_dir / 'camelsspat_watersheds_merged.csv', index=False)
        print(f"Saved merged watershed information to {experiment_dir / 'camelsspat_watersheds_merged.csv'}")
        watersheds_df = watersheds_merged
else:
    print(f"No metadata file found at {metadata_path}. Proceeding with shapefile information only.")

## 5. Visualize Watershed Locations

In [None]:
# Plot watershed locations on a map
if 'Lat' in watersheds_df.columns and 'Lon' in watersheds_df.columns:
    fig, ax = plt.subplots(figsize=(12, 8))

    # Filter out any missing coordinates
    valid_coords = watersheds_df.dropna(subset=['Lat', 'Lon'])

    # Country code = the upper-case prefix before the first underscore of the ID.
    # IDs without such a prefix give NaN and are left out of the plot.
    country_codes = valid_coords['ID'].astype(str).str.extract(r'^([A-Z]+)_')[0]
    countries = country_codes.dropna().unique()
    colors = plt.cm.Set3(np.linspace(0, 1, len(countries)))

    for country, color in zip(countries, colors):
        # Match the extracted code exactly; a startswith() test would also pick up
        # longer codes that begin with the same letters.
        country_data = valid_coords[country_codes == country]
        # Marker size scales with basin area when that column exists
        ax.scatter(country_data['Lon'], country_data['Lat'],
                   s=country_data.get('Area_km2', 100)/10,
                   c=[color], alpha=0.6, label=country)

    ax.set_xlabel('Longitude')
    ax.set_ylabel('Latitude')
    ax.set_title('CAMELS-SPAT Watershed Locations')
    if len(countries) > 0:
        # Only draw a legend when at least one group was plotted
        ax.legend()
    ax.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()
else:
    print("No coordinate information available for visualization")

## 6. Launch CONFLUENCE for Multiple Watersheds

In [None]:
# Launch the large sample experiment.
# sys.executable runs the script with the same interpreter (and environment) as this
# kernel, rather than whichever `python` happens to be first on PATH.
cmd = [
    sys.executable, experiment_config['camelsspat_script']
]

# Add command-line arguments
if experiment_config['dry_run']:
    cmd.extend(['--dry-run'])
    print("DRY RUN MODE - No jobs will be submitted")

# Add max watersheds limit if specified
if experiment_config['max_watersheds'] > 0:
    cmd.extend(['--max-watersheds', str(experiment_config['max_watersheds'])])

print(f"Launching CONFLUENCE for {experiment_config['max_watersheds']} watersheds...")
print(f"Command: {' '.join(cmd)}")

# Execute from the script's own directory
result = subprocess.run(cmd, capture_output=True, text=True, cwd=str(Path(experiment_config['camelsspat_script']).parent))

print("\nOutput:")
# Long output is truncated to its first 1000 characters for display; the log keeps all of it
print(result.stdout[:1000] + "..." if len(result.stdout) > 1000 else result.stdout)

# Surface failures instead of silently continuing with an empty log
if result.returncode != 0:
    print(f"\nWARNING: submission script exited with code {result.returncode}")
    if result.stderr:
        print("Error output (last 1000 characters):")
        print(result.stderr[-1000:])

# Save submission log
with open(experiment_dir / 'submission.log', 'w') as f:
    f.write(result.stdout)

## 7. Parse Job Submission Results

In [None]:
# Parse the submission log to extract job IDs
submission_log = experiment_dir / 'submission.log'
submitted_jobs = []

if submission_log.exists():
    log_content = submission_log.read_text()

    # Each submission is matched as "Domain: <name>, Job ID: <digits>"
    pattern = r'Domain: ([^,]+), Job ID: (\d+)'
    matches = re.findall(pattern, log_content)

    submitted_jobs = [
        {'domain': domain, 'job_id': job_id}
        for domain, job_id in matches
    ]

# One check covers both "no log" and "log without any job lines", so the message
# below is printed in either case.
if submitted_jobs:
    jobs_df = pd.DataFrame(submitted_jobs)
    print("Submitted Jobs:")
    print(jobs_df)

    # Save job information
    jobs_df.to_csv(experiment_dir / 'submitted_jobs.csv', index=False)
else:
    print("No submission log found or no jobs submitted.")

## 8. Monitor Job Status

In [None]:
# Check SLURM job status
def check_job_status(user=None):
    """Return the `squeue` job listing for one user as text.

    Args:
        user: Account name to query. Defaults to the USER environment variable,
            falling back to getpass.getuser() when that is unset.

    Returns:
        str: squeue's stdout on success; its stderr (or stdout if stderr is empty)
        when it exits non-zero; or a short message when the `squeue` executable
        cannot be found.
    """
    user = user or os.environ.get('USER') or getpass.getuser()
    cmd = ['squeue', '-u', user]
    try:
        result = subprocess.run(cmd, capture_output=True, text=True)
    except FileNotFoundError:
        # No scheduler client on this machine: report it instead of raising
        return "squeue not found - is SLURM available on this machine?"
    if result.returncode != 0:
        return result.stderr or result.stdout
    return result.stdout

print("Current jobs:")
print(check_job_status())

# When this notebook recorded submitted jobs, query the scheduler for just those IDs
if len(submitted_jobs) > 0:
    job_ids = []
    for job in submitted_jobs:
        job_ids.append(job['job_id'])
    # squeue takes the IDs as one comma-separated argument
    cmd = ['squeue', '-j']
    cmd.append(','.join(job_ids))
    result = subprocess.run(cmd, capture_output=True, text=True)
    print("\nStatus of submitted CAMELS-SPAT jobs:")
    print(result.stdout)

## 9. Find Completed Simulations

In [None]:
# Find completed watershed simulations
confluence_data_dir = Path("/work/comphyd_lab/data/CONFLUENCE_data")
camelsspat_dir = confluence_data_dir / "camels_spat"

domain_prefix = 'domain_'

completed = []
if camelsspat_dir.exists():
    # Sorted so the order of `completed` (and any slice taken from it) is the same on every run
    for domain_dir in sorted(camelsspat_dir.glob(f"{domain_prefix}*")):
        # Strip only the leading prefix; the rest of the name is the watershed ID
        watershed_id = domain_dir.name[len(domain_prefix):]
        sim_dir = domain_dir / "simulations"

        # A domain counts as completed once any NetCDF file exists under simulations/;
        # any() stops at the first hit instead of listing the whole tree.
        if sim_dir.exists() and any(sim_dir.rglob("*.nc")):
            completed.append({
                'watershed_id': watershed_id,
                'domain_dir': domain_dir,
                'sim_dir': sim_dir
            })

print(f"Completed simulations: {len(completed)}")
for ws in completed:
    print(f"  - Watershed {ws['watershed_id']}")

# Save completed simulations info
if completed:
    completed_df = pd.DataFrame(completed)
    completed_df.to_csv(experiment_dir / 'completed_simulations.csv', index=False)
    print(f"\nSaved completed simulations info to {experiment_dir / 'completed_simulations.csv'}")

## 10. Load and Visualize Results

In [None]:
# Function to load SUMMA output
def load_summa_output(sim_dir, variable='scalarSWE'):
    """Load one variable from a SUMMA timestep output file of a simulation.

    Args:
        sim_dir: Path to a simulations directory; output is looked up in
            `<sim_dir>/run_1/SUMMA`.
        variable: Name of the NetCDF variable to extract.

    Returns:
        DataFrame with 'time' and 'value' columns, or None when the output
        directory, a matching `*timestep*.nc` file, or the variable is missing.
    """
    # Look for SUMMA output files
    summa_output_dir = sim_dir / 'run_1' / 'SUMMA'
    if not summa_output_dir.exists():
        return None

    # Sorted so the same file is chosen on every run (glob order is arbitrary)
    output_files = sorted(summa_output_dir.glob("*timestep*.nc"))
    if not output_files:
        return None

    # Imported lazily: xarray is only needed once there is a file to open
    import xarray as xr

    # The context manager closes the file; .values reads the data before that happens
    with xr.open_dataset(output_files[0]) as ds:
        if variable not in ds.variables:
            return None
        # NOTE(review): flatten() assumes one value per time step; a variable with
        # extra dimensions (e.g. several HRUs) would not line up with `time` - confirm.
        return pd.DataFrame({
            'time': pd.to_datetime(ds.time.values),
            'value': ds[variable].values.flatten()
        })

# Overlay the SWE series of completed watersheds on one set of axes
if not completed:
    print("No completed simulations found yet.")
else:
    fig, ax = plt.subplots(figsize=(12, 6))

    # Only the first three completed watersheds are drawn
    for ws in completed[:3]:
        data = load_summa_output(ws['sim_dir'])
        if data is None:
            # Nothing loadable for this watershed; move on to the next one
            continue
        ax.plot(
            data['time'],
            data['value'],
            label=f"Watershed {ws['watershed_id']}",
            linewidth=2,
            alpha=0.7,
        )

    ax.set(
        xlabel='Date',
        ylabel='Snow Water Equivalent (mm)',
        title='SWE Comparison - CAMELS-SPAT Watersheds',
    )
    ax.legend()
    ax.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

## 11. Load and Compare Streamflow Data

In [None]:
# Function to load mizuRoute output
def load_streamflow(sim_dir):
    """Load routed runoff from a mizuRoute output file of a simulation.

    Args:
        sim_dir: Path to a simulations directory; output is looked up in
            `<sim_dir>/run_1/mizuRoute`.

    Returns:
        DataFrame with 'time' and 'simulated' columns taken from the first of
        'IRFroutedRunoff', 'routedRunoff', 'discharge' present in the file, or
        None when the directory, any `*.nc` file, or all of those variables are
        missing.
    """
    # Look for mizuRoute output
    mizuroute_dir = sim_dir / 'run_1' / 'mizuRoute'
    if not mizuroute_dir.exists():
        return None

    # Sorted so the same file is chosen on every run (glob order is arbitrary)
    output_files = sorted(mizuroute_dir.glob("*.nc"))
    if not output_files:
        return None

    # Imported lazily: xarray is only needed once there is a file to open
    import xarray as xr

    # The context manager closes the file; .values reads the data before that happens
    with xr.open_dataset(output_files[0]) as ds:
        # Try different variable names for routed runoff
        for var in ['IRFroutedRunoff', 'routedRunoff', 'discharge']:
            if var in ds.variables:
                # NOTE(review): flatten() assumes one value per time step; output with
                # several river segments would not line up with `time` - confirm.
                return pd.DataFrame({
                    'time': pd.to_datetime(ds.time.values),
                    'simulated': ds[var].values.flatten()
                })
    return None

# Function to load observed streamflow
def load_observed_streamflow(domain_dir, watershed_id):
    """Load preprocessed observed streamflow for one watershed.

    Looks in `<domain_dir>/observations/streamflow/preprocessed` for a CSV whose
    name matches `*<watershed_id>*streamflow*.csv`.

    Args:
        domain_dir: Path to the watershed's domain directory.
        watershed_id: Identifier expected to appear in the CSV file name.

    Returns:
        DataFrame with 'time' (parsed to datetime) and 'observed' columns, or None
        when no file matches or the file has no recognised time/discharge column.
    """
    obs_dir = domain_dir / 'observations' / 'streamflow' / 'preprocessed'
    # Sorted so the same file is chosen on every run (glob order is arbitrary)
    obs_files = sorted(obs_dir.glob(f"*{watershed_id}*streamflow*.csv"))
    if not obs_files:
        return None

    obs_df = pd.read_csv(obs_files[0])

    # Accepted column names, in priority order; the already-standard names come last
    time_col = next((c for c in ('datetime', 'date', 'time') if c in obs_df.columns), None)
    flow_col = next((c for c in ('discharge_m3s', 'streamflow', 'observed') if c in obs_df.columns), None)
    if time_col is None or flow_col is None:
        # Unknown layout: treat as "no observations" rather than failing with a KeyError
        return None

    return pd.DataFrame({
        'time': pd.to_datetime(obs_df[time_col]),
        'observed': obs_df[flow_col]
    })

# One panel per watershed (at most three), observed against simulated discharge
if not completed:
    print("No completed simulations found yet.")
else:
    n_panels = min(3, len(completed))
    fig, axes = plt.subplots(n_panels, 1, figsize=(12, 4*n_panels))
    if len(completed) == 1:
        # A single panel comes back as a bare Axes; wrap it so indexing works below
        axes = [axes]

    for i, ws in enumerate(completed[:3]):
        sim_flow = load_streamflow(ws['sim_dir'])
        obs_flow = load_observed_streamflow(ws['domain_dir'], ws['watershed_id'])

        if sim_flow is None or obs_flow is None:
            # Leave this panel empty when either series is unavailable
            continue

        # Keep only the timestamps present in both series
        merged = pd.merge(obs_flow, sim_flow, on='time', how='inner')

        panel = axes[i]
        panel.plot(merged['time'], merged['observed'], label='Observed', color='blue', alpha=0.7)
        panel.plot(merged['time'], merged['simulated'], label='Simulated', color='red', alpha=0.7)
        panel.set_title(f'Streamflow Comparison - {ws["watershed_id"]}')
        panel.set_ylabel('Discharge (m³/s)')
        panel.legend()
        panel.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

## 12. Calculate Performance Metrics

In [None]:
# Calculate performance metrics
def calculate_metrics(obs, sim):
    """Calculate goodness-of-fit metrics between observed and simulated series.

    Pairs where either value is NaN are dropped before computing anything.

    Args:
        obs: Observed values (array-like of numbers).
        sim: Simulated values, same length as `obs`.

    Returns:
        dict with keys 'NSE', 'RMSE', 'PBIAS' and 'Correlation', or an empty dict
        when no valid pairs remain. A metric whose denominator is zero (constant
        observations for NSE, zero-sum observations for PBIAS, a constant series
        or a single pair for Correlation) is reported as NaN.
    """
    # Accept lists / Series as well as arrays
    obs = np.asarray(obs, dtype=float)
    sim = np.asarray(sim, dtype=float)

    # Remove NaN values
    mask = ~(np.isnan(obs) | np.isnan(sim))
    obs, sim = obs[mask], sim[mask]

    if len(obs) == 0:
        return {}

    squared_errors = (obs - sim)**2

    # Nash-Sutcliffe Efficiency: undefined when the observations have no variance
    obs_spread = np.sum((obs - np.mean(obs))**2)
    nse = 1 - np.sum(squared_errors) / obs_spread if obs_spread > 0 else np.nan

    # RMSE
    rmse = np.sqrt(np.mean(squared_errors))

    # Percent Bias: undefined when the observations sum to zero
    obs_total = np.sum(obs)
    pbias = 100 * np.sum(sim - obs) / obs_total if obs_total != 0 else np.nan

    # Correlation: needs at least two pairs and some variance in both series
    if len(obs) > 1 and np.std(obs) > 0 and np.std(sim) > 0:
        corr = np.corrcoef(obs, sim)[0, 1]
    else:
        corr = np.nan

    return {
        'NSE': nse,
        'RMSE': rmse,
        'PBIAS': pbias,
        'Correlation': corr
    }

# Calculate metrics for all completed watersheds
metric_names = ['NSE', 'RMSE', 'PBIAS', 'Correlation']
metrics_list = []

for ws in completed:
    sim_flow = load_streamflow(ws['sim_dir'])
    obs_flow = load_observed_streamflow(ws['domain_dir'], ws['watershed_id'])

    if sim_flow is None or obs_flow is None:
        continue

    # Merge on time so only timestamps present in both series are compared
    merged = pd.merge(obs_flow, sim_flow, on='time', how='inner')

    # Calculate metrics
    metrics = calculate_metrics(merged['observed'].values, merged['simulated'].values)
    if not metrics:
        # Empty dict = no valid overlapping pairs. Skipping avoids a row without
        # metric columns, which would break the table and bar charts below.
        continue
    metrics['watershed_id'] = ws['watershed_id']
    metrics_list.append(metrics)

if metrics_list:
    metrics_df = pd.DataFrame(metrics_list)
    print("Performance Metrics:")
    print(metrics_df)

    # Save metrics
    metrics_df.to_csv(experiment_dir / 'performance_metrics.csv', index=False)

    # Plot metrics: one bar chart per metric, one bar per watershed
    fig, axes = plt.subplots(2, 2, figsize=(12, 10))
    axes = axes.ravel()

    for i, metric in enumerate(metric_names):
        axes[i].bar(range(len(metrics_df)), metrics_df[metric])
        axes[i].set_xlabel('Watershed')
        axes[i].set_ylabel(metric)
        axes[i].set_title(f'{metric} by Watershed')
        axes[i].set_xticks(range(len(metrics_df)))
        axes[i].set_xticklabels(metrics_df['watershed_id'], rotation=45)

    plt.tight_layout()
    plt.show()
else:
    print("No metrics calculated yet.")

## 13. Create Summary Report