# CARAVAN Large Sample Experiment Tutorial

This notebook demonstrates how to run CONFLUENCE over multiple watersheds from the CARAVAN dataset for large-sample hydrology analysis.

## 1. Setup and Imports

In [None]:
import sys
import os
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import subprocess
import yaml
from datetime import datetime

# Add CONFLUENCE to path.
# The parent of the current working directory is resolved to an absolute path
# and appended to sys.path (presumably the CONFLUENCE repository root, i.e. the
# notebook is expected to be run from a direct subdirectory of it — TODO confirm).
confluence_path = Path('../').resolve()
sys.path.append(str(confluence_path))

# Set up plotting style: matplotlib's built-in 'default' style sheet, with
# figures rendered inline in the notebook through the IPython magic below.
plt.style.use('default')
%matplotlib inline

print("Setup complete!")

## 2. Configure the Experiment

In [None]:
# Configuration for the large sample experiment.
#
# Repository-relative locations are derived from the CONFLUENCE checkout rather
# than from one user's home directory, so the notebook runs on any machine.
# The checkout defaults to `confluence_path` (defined in the setup cell);
# export CONFLUENCE_CODE_DIR before starting Jupyter to point somewhere else.
confluence_code_dir = Path(os.environ.get('CONFLUENCE_CODE_DIR', confluence_path)).resolve()
config_files_dir = confluence_code_dir / '0_config_files'

experiment_config = {
    'dataset': 'camels',  # CARAVAN dataset to use
    'max_watersheds': 5,  # Number of watersheds to process
    'dry_run': False,  # Set to True to test without submitting jobs
    'experiment_name': 'caravan_tutorial',
    # Paths are stored as plain strings so they serialize cleanly to YAML and
    # can be handed directly to subprocess as command-line arguments.
    'template_config': str(config_files_dir / 'config_caravan_template.yaml'),
    'config_dir': str(config_files_dir / 'caravan'),
    'caravan_script': str(confluence_code_dir / '9_scripts' / 'run_watersheds_caravan.py')
}

# Warn now if a required input is missing; otherwise the first symptom would be
# empty output from a subprocess call several cells further down.
for required_key in ('template_config', 'caravan_script'):
    if not Path(experiment_config[required_key]).exists():
        print(f"WARNING: {required_key} not found: {experiment_config[required_key]}")
        print("         Set CONFLUENCE_CODE_DIR to the root of your CONFLUENCE checkout.")

# Create experiment directory (relative to the notebook's working directory)
experiment_dir = Path(f"./experiments/{experiment_config['experiment_name']}")
experiment_dir.mkdir(parents=True, exist_ok=True)

# Save configuration alongside the results so the run can be traced later
with open(experiment_dir / 'experiment_config.yaml', 'w') as f:
    yaml.dump(experiment_config, f)

print(f"Experiment configured: {experiment_config['experiment_name']}")
print(f"Processing {experiment_config['max_watersheds']} watersheds from {experiment_config['dataset']}")

## 3. List Available CARAVAN Datasets

In [None]:
# List available CARAVAN datasets.
# sys.executable runs the script with the same interpreter (and therefore the
# same installed packages) as this kernel, instead of whichever `python`
# happens to be first on PATH.
cmd = [sys.executable, experiment_config['caravan_script'], '--list-datasets']
result = subprocess.run(cmd, capture_output=True, text=True)
print(result.stdout)

# Surface failures: stdout is typically empty when the script cannot start,
# so without this the cell would just print a blank line.
if result.returncode != 0:
    print(f"Listing datasets failed with exit code {result.returncode}")
    print(result.stderr)

## 4. Discover Watersheds in Selected Dataset

In [None]:
# Get watershed information for the selected dataset.
# The script is asked to write its watershed table to this CSV, which is then
# loaded into a DataFrame below.
watersheds_csv = experiment_dir / f"{experiment_config['dataset']}_watersheds.csv"

# sys.executable keeps the child process in the same environment as the kernel
cmd = [
    sys.executable, experiment_config['caravan_script'],
    '--dataset', experiment_config['dataset'],
    '--watersheds-csv', str(watersheds_csv)
]

print(f"Discovering watersheds in {experiment_config['dataset']}...")
result = subprocess.run(cmd, capture_output=True, text=True)

# Report failures instead of discarding the exit status and stderr
discovery_failed = result.returncode != 0
if discovery_failed:
    print(f"Watershed discovery failed with exit code {result.returncode}")
    print(result.stderr)

# Load watershed information
if watersheds_csv.exists():
    if discovery_failed:
        # The file cannot be trusted to come from this run
        print(f"NOTE: {watersheds_csv} may be left over from an earlier run")
    watersheds_df = pd.read_csv(watersheds_csv)
    print(f"Found {len(watersheds_df)} watersheds")
    print("\nFirst 5 watersheds:")
    print(watersheds_df.head())
else:
    print(f"No watershed list was written to {watersheds_csv}")

## 5. Launch CONFLUENCE for Multiple Watersheds

In [None]:
# Launch the large sample experiment.
# Reuses `watersheds_csv` from the discovery cell; sys.executable keeps the
# child process in the same Python environment as this kernel.
cmd = [
    sys.executable, experiment_config['caravan_script'],
    '--dataset', experiment_config['dataset'],
    '--template', experiment_config['template_config'],
    '--config-dir', experiment_config['config_dir'],
    '--max-watersheds', str(experiment_config['max_watersheds']),
    '--watersheds-csv', str(watersheds_csv)
]

if experiment_config['dry_run']:
    cmd.append('--dry-run')
    print("DRY RUN MODE - No jobs will be submitted")

print(f"Launching CONFLUENCE for {experiment_config['max_watersheds']} watersheds...")
print(f"Command: {' '.join(cmd)}")

# Execute
result = subprocess.run(cmd, capture_output=True, text=True)

print("\nOutput:")
# Only the first 500 characters are echoed here; the complete output goes to
# the submission log written below.
stdout_preview = result.stdout[:500] + "..." if len(result.stdout) > 500 else result.stdout
print(stdout_preview)

# Make a failed launch visible rather than leaving an empty "Output:" section
if result.returncode != 0:
    print(f"\nLaunch command failed with exit code {result.returncode}")
    print(result.stderr)

# Save submission log (stdout, followed by stderr when there is any)
with open(experiment_dir / 'submission.log', 'w') as f:
    f.write(result.stdout)
    if result.stderr:
        f.write("\n--- stderr ---\n")
        f.write(result.stderr)

## 6. Monitor Job Status

In [None]:
# Check SLURM job status
def check_job_status(user=None):
    """Return the text printed by ``squeue -u <user>``.

    Parameters
    ----------
    user : str, optional
        User whose jobs are listed. When omitted, the ``USER`` environment
        variable is used, falling back to ``getpass.getuser()``.

    Returns
    -------
    str
        The ``squeue`` output. If ``squeue`` is not installed, the user cannot
        be determined, or ``squeue`` exits with an error, a diagnostic message
        is returned instead of raising, so the cell always has something to
        print.
    """
    import getpass  # local import: only needed for the username fallback

    user = user or os.environ.get('USER')
    if not user:
        # USER can be unset (e.g. in containers or some batch environments);
        # passing None to subprocess would raise a TypeError.
        try:
            user = getpass.getuser()
        except (KeyError, OSError, ImportError):
            return "Could not determine the current user; pass user=... explicitly"

    cmd = ['squeue', '-u', user]
    try:
        result = subprocess.run(cmd, capture_output=True, text=True)
    except FileNotFoundError:
        return "squeue not found - SLURM does not appear to be available on this machine"

    if result.returncode != 0:
        # Include stderr so the reason for the failure is not lost
        return result.stdout + result.stderr
    return result.stdout

print("Current jobs:")
print(check_job_status())

## 7. Find Completed Simulations

In [None]:
# Find completed watershed simulations.
# The data root defaults to the original shared location; export
# CONFLUENCE_DATA_DIR to use a different one without editing the notebook.
confluence_data_dir = Path(os.environ.get('CONFLUENCE_DATA_DIR', "/work/comphyd_lab/data/CONFLUENCE_data"))
caravan_dir = confluence_data_dir / "caravan"

# Domain directories are matched as domain_<dataset>_<watershed_id>
domain_prefix = f"domain_{experiment_config['dataset']}_"

completed = []
if caravan_dir.exists():
    # Sorted so that `completed` (and any later `completed[:3]` slice) is the
    # same on every run; glob order depends on the filesystem.
    for domain_dir in sorted(caravan_dir.glob(f"{domain_prefix}*")):
        # Everything after the prefix is the ID, so IDs that themselves
        # contain underscores are kept intact.
        watershed_id = domain_dir.name[len(domain_prefix):]
        sim_dir = domain_dir / "simulations"

        # Check if simulation files exist; next() stops at the first NetCDF
        # file instead of walking the whole tree.
        if sim_dir.exists() and next(sim_dir.rglob("*.nc"), None) is not None:
            completed.append({
                'watershed_id': watershed_id,
                'domain_dir': domain_dir,
                'sim_dir': sim_dir
            })
else:
    print(f"CARAVAN data directory not found: {caravan_dir}")

print(f"Completed simulations: {len(completed)}")
for ws in completed:
    print(f"  - Watershed {ws['watershed_id']}")

## 8. Load and Visualize Results

In [None]:
# Simple function to load results
def load_summa_output(sim_dir, variable='scalarSWE'):
    """Load one variable from a timestep output file found under ``sim_dir``.

    Parameters
    ----------
    sim_dir : pathlib.Path
        Directory searched recursively for files matching ``*timestep*.nc``.
    variable : str, optional
        Name of the variable to extract (default ``'scalarSWE'``).

    Returns
    -------
    pandas.DataFrame or None
        A frame with columns ``'time'`` and ``'value'`` (the variable's values
        flattened to one dimension), or ``None`` when no matching file exists
        or the file does not contain ``variable``.
    """
    # Sorted so the same file is chosen on every run; rglob order depends on
    # the filesystem.
    output_files = sorted(sim_dir.rglob("*timestep*.nc"))
    if not output_files:
        return None

    # Imported only once there is something to open, so the "nothing to load"
    # path does not require xarray.
    import xarray as xr

    # The context manager closes the underlying file handle; all values are
    # read into memory before the block exits.
    with xr.open_dataset(output_files[0]) as ds:
        if variable not in ds.variables:
            return None
        return pd.DataFrame({
            'time': pd.to_datetime(ds.time.values),
            'value': ds[variable].values.flatten()
        })

# Plot results for completed watersheds
if completed:
    fig, ax = plt.subplots(figsize=(12, 6))

    n_plotted = 0
    for ws in completed[:3]:  # Plot first 3
        data = load_summa_output(ws['sim_dir'])
        if data is None:
            # Say which watershed is missing instead of silently skipping it
            print(f"No SWE output could be loaded for watershed {ws['watershed_id']}")
            continue
        ax.plot(data['time'], data['value'],
                label=f"Watershed {ws['watershed_id']}",
                linewidth=2, alpha=0.7)
        n_plotted += 1

    ax.set_xlabel('Date')
    ax.set_ylabel('Snow Water Equivalent (mm)')
    ax.set_title(f'SWE Comparison - {experiment_config["dataset"].upper()} Watersheds')
    # legend() warns when the axes hold no labelled lines, so only add it
    # when at least one series was drawn.
    if n_plotted:
        ax.legend()
    ax.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

## 9. Calculate Simple Performance Metrics

In [None]:
# Calculate NSE for watersheds with streamflow data
def calculate_nse(obs, sim):
    """Calculate Nash-Sutcliffe Efficiency.

    NSE = 1 - sum((obs - sim)^2) / sum((obs - mean(obs))^2), evaluated over
    the positions where both series are non-NaN.

    Parameters
    ----------
    obs, sim : array-like
        Observed and simulated values of identical shape. Lists, NumPy arrays
        and pandas Series are accepted; values are paired by position.

    Returns
    -------
    float
        The NSE, or ``np.nan`` when no valid pairs remain or when the observed
        values have zero variance (the denominator would be zero).

    Raises
    ------
    ValueError
        If ``obs`` and ``sim`` do not have the same shape.
    """
    # Convert up front so plain lists support boolean-mask indexing below
    obs = np.asarray(obs, dtype=float)
    sim = np.asarray(sim, dtype=float)
    if obs.shape != sim.shape:
        raise ValueError(
            f"obs and sim must have the same shape, got {obs.shape} and {sim.shape}"
        )

    # Keep only the pairs where both values are present
    mask = ~(np.isnan(obs) | np.isnan(sim))
    obs, sim = obs[mask], sim[mask]

    if len(obs) == 0:
        return np.nan

    denominator = np.sum((obs - np.mean(obs))**2)
    if denominator == 0:
        # Constant observations: NSE is undefined; avoid a divide-by-zero
        return np.nan

    return 1 - np.sum((obs - sim)**2) / denominator

# Example of loading observed data and calculating metrics
metrics = []
for ws in completed[:3]:  # Process first 3
    try:
        # Load observed streamflow
        obs_file = ws['domain_dir'] / 'observations' / 'streamflow' / 'raw_data' / f"{experiment_config['dataset']}_{ws['watershed_id']}_Discharge.csv"

        if not obs_file.exists():
            # Report the gap instead of silently dropping the watershed
            print(f"No observations found for watershed {ws['watershed_id']}: {obs_file}")
            continue

        obs_df = pd.read_csv(obs_file)
        print(f"Found observations for watershed {ws['watershed_id']}")
        print(f"  {len(obs_df)} observation records loaded")

        # TODO: align obs_df with the simulated streamflow and call
        # calculate_nse(obs, sim). Until that is implemented the metric is
        # recorded as NaN: a random placeholder would be indistinguishable
        # from a real NSE in the tables and summary below.
        metrics.append({
            'watershed_id': ws['watershed_id'],
            'nse': np.nan
        })
    except Exception as e:
        # Keep going with the remaining watersheds, but show what went wrong
        print(f"Error processing {ws['watershed_id']}: {e}")

if metrics:
    metrics_df = pd.DataFrame(metrics)
    print("\nPerformance Metrics:")
    print(metrics_df)

## 10. Create Summary Report

In [None]:
# Generate experiment summary: assemble the report line by line, then emit it
# in a single print call.
report_lines = [
    "=== CARAVAN Large Sample Experiment Summary ===",
    f"Dataset: {experiment_config['dataset']}",
    f"Experiment: {experiment_config['experiment_name']}",
    f"Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
    "\nWatersheds:",
    f"  Requested: {experiment_config['max_watersheds']}",
    f"  Completed: {len(completed)}",
]

# The performance section is only present when metrics were collected
if metrics:
    nse_values = metrics_df['nse']
    report_lines += [
        "\nPerformance:",
        f"  Mean NSE: {nse_values.mean():.3f}",
        f"  Best NSE: {nse_values.max():.3f}",
    ]

print("\n".join(report_lines))

# Save summary: the experiment settings, the IDs of the completed watersheds
# and an ISO-8601 timestamp.
completed_ids = [entry['watershed_id'] for entry in completed]
summary = dict(
    experiment=experiment_config,
    completed_watersheds=completed_ids,
    date=datetime.now().isoformat(),
)

summary_path = experiment_dir / 'summary.yaml'
with summary_path.open('w') as summary_file:
    yaml.dump(summary, summary_file)

print(f"\nResults saved to: {experiment_dir}")

## Next Steps

This tutorial covered the basics of running CONFLUENCE over multiple CARAVAN watersheds. For more advanced analysis:

1. **Regional Analysis**: Group watersheds by climate or geography
2. **Model Comparison**: Test different model structures
3. **Parameter Transfer**: Develop regionalization relationships
4. **Machine Learning**: Predict performance from attributes
5. **Climate Scenarios**: Run future projections

See the full documentation for detailed examples of these analyses.