# CARAVAN Large Sample Experiment Tutorial

This notebook demonstrates how to run CONFLUENCE over multiple watersheds from the CARAVAN dataset for large-sample hydrology analysis.

## 1. Setup and Imports

In [None]:
import sys
import os
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import subprocess
import yaml
from datetime import datetime

# Add CONFLUENCE to path: '../' is resolved against the current working
# directory and appended to sys.path.
# NOTE(review): this presumably expects the notebook to be launched from a
# direct subdirectory of the CONFLUENCE checkout — confirm.
confluence_path = Path('../').resolve()
sys.path.append(str(confluence_path))

# Set up plotting style: matplotlib's 'default' style sheet, with figures
# rendered inline in the notebook output.
plt.style.use('default')
%matplotlib inline

print("Setup complete!")

## 2. Configure the Experiment

In [None]:
# Configuration for the large sample experiment.
#
# The template, config directory and launcher script all live inside one
# CONFLUENCE checkout. Point the CONFLUENCE_CODE_DIR environment variable at
# your own checkout to run this notebook from another account or machine;
# when the variable is unset the original location is used, so existing
# setups behave exactly as before.
confluence_code_dir = Path(
    os.environ.get('CONFLUENCE_CODE_DIR', '/home/darri.eythorsson/code/CONFLUENCE')
)

experiment_config = {
    'dataset': 'camels',  # CARAVAN dataset to use
    'max_watersheds': 5,  # Number of watersheds to process
    'dry_run': False,  # Set to True to test without submitting jobs
    'experiment_name': 'caravan_tutorial',
    # Stored as plain strings so yaml.dump writes ordinary scalars
    'template_config': str(confluence_code_dir / '0_config_files' / 'config_caravan_template.yaml'),
    'config_dir': str(confluence_code_dir / '0_config_files' / 'caravan'),
    'caravan_script': str(confluence_code_dir / 'examples' / 'run_watersheds_caravan.py'),
}

# Surface a wrong code root here rather than as an opaque subprocess failure
# in a later cell.
for required_key in ('template_config', 'caravan_script'):
    if not Path(experiment_config[required_key]).exists():
        print(f"WARNING: {required_key} not found at {experiment_config[required_key]} "
              f"(set CONFLUENCE_CODE_DIR to your CONFLUENCE checkout)")

# Create experiment directory (relative to the current working directory)
experiment_dir = Path(f"./experiments/{experiment_config['experiment_name']}")
experiment_dir.mkdir(parents=True, exist_ok=True)

# Save configuration next to the results so the run can be reproduced
with open(experiment_dir / 'experiment_config.yaml', 'w') as f:
    yaml.dump(experiment_config, f)

print(f"Experiment configured: {experiment_config['experiment_name']}")
print(f"Processing {experiment_config['max_watersheds']} watersheds from {experiment_config['dataset']}")

## 3. List Available CARAVAN Datasets

In [None]:
# List available CARAVAN datasets by calling the launcher script with
# --list-datasets and echoing what it prints.
cmd = ['python', experiment_config['caravan_script'], '--list-datasets']
result = subprocess.run(cmd, capture_output=True, text=True)
print(result.stdout)

# stderr is captured as well, so without this check a failing script would
# only produce an empty line and the reason would be lost.
if result.returncode != 0:
    print(f"Listing datasets failed (exit code {result.returncode}):")
    print(result.stderr)

## 4. Discover Watersheds in Selected Dataset

In [None]:
# Get watershed information for the selected dataset. The launcher script is
# asked to write its watershed table to a CSV inside the experiment directory,
# which is then loaded for inspection.
watersheds_csv = experiment_dir / f"{experiment_config['dataset']}_watersheds.csv"

cmd = [
    'python', experiment_config['caravan_script'],
    '--dataset', experiment_config['dataset'],
    '--watersheds-csv', str(watersheds_csv)
]

print(f"Discovering watersheds in {experiment_config['dataset']}...")
result = subprocess.run(cmd, capture_output=True, text=True)

# Output is captured, so report failures explicitly instead of dropping them.
if result.returncode != 0:
    print(f"Watershed discovery failed (exit code {result.returncode}):")
    print(result.stderr)

# Load watershed information
if watersheds_csv.exists():
    # NOTE: a CSV left over from an earlier run also satisfies this check, so
    # a failure reported above may be followed by stale results here.
    watersheds_df = pd.read_csv(watersheds_csv)
    print(f"Found {len(watersheds_df)} watersheds")
    print("\nFirst 5 watersheds:")
    print(watersheds_df.head())
else:
    print(f"Watershed list was not written to {watersheds_csv}")

## 5. Launch CONFLUENCE for Multiple Watersheds

In [None]:
# NOTE: this cell is disabled. Its whole body is one triple-quoted string
# literal, so none of the code inside it runs; the notebook only evaluates
# the string. Delete the opening and closing triple quotes to enable it.
# When enabled it runs the launcher script for up to 'max_watersheds'
# watersheds (presumably submitting jobs unless 'dry_run' is set — confirm
# against the script) and writes the captured stdout to submission.log.
'''
# Launch the large sample experiment
cmd = [
    'python', experiment_config['caravan_script'],
    '--dataset', experiment_config['dataset'],
    '--template', experiment_config['template_config'],
    '--config-dir', experiment_config['config_dir'],
    '--max-watersheds', str(experiment_config['max_watersheds']),
    '--watersheds-csv', str(watersheds_csv)
]

if experiment_config['dry_run']:
    cmd.append('--dry-run')
    print("DRY RUN MODE - No jobs will be submitted")

print(f"Launching CONFLUENCE for {experiment_config['max_watersheds']} watersheds...")
print(f"Command: {' '.join(cmd)}")

# Execute
result = subprocess.run(cmd, capture_output=True, text=True)

print("\nOutput:")
print(result.stdout[:500] + "..." if len(result.stdout) > 500 else result.stdout)

# Save submission log
with open(experiment_dir / 'submission.log', 'w') as f:
    f.write(result.stdout)
'''

## 6. Monitor Job Status

In [None]:
# NOTE: this cell is disabled. Its whole body is one triple-quoted string
# literal, so check_job_status is never defined or called. Delete the opening
# and closing triple quotes to enable it. When enabled it runs
# `squeue -u <user>` (defaulting to the USER environment variable) and prints
# the command's stdout.
'''
# Check SLURM job status
def check_job_status(user=None):
    user = user or os.environ.get('USER')
    cmd = ['squeue', '-u', user]
    result = subprocess.run(cmd, capture_output=True, text=True)
    return result.stdout

print("Current jobs:")
print(check_job_status())
'''

## 7. Find Completed Simulations

In [None]:
# Find completed watershed simulations.
#
# A watershed counts as completed when its domain directory contains a
# 'simulations' folder with at least one NetCDF file anywhere below it.
# The data root can be overridden with the CONFLUENCE_DATA_DIR environment
# variable; when unset the original location is used.
confluence_data_dir = Path(
    os.environ.get('CONFLUENCE_DATA_DIR', "/work/comphyd_lab/data/CONFLUENCE_data")
)
caravan_dir = confluence_data_dir / "caravan"

completed = []
if caravan_dir.exists():
    # glob() yields entries in filesystem order; sorting keeps the list (and
    # any later slice of it) identical from one run to the next.
    for domain_dir in sorted(caravan_dir.glob(f"domain_{experiment_config['dataset']}_*")):
        # The watershed ID is taken as the text after the last underscore
        watershed_id = domain_dir.name.split('_')[-1]
        sim_dir = domain_dir / "simulations"

        # any() stops at the first match instead of listing every file
        if sim_dir.exists() and any(sim_dir.rglob("*.nc")):
            completed.append({
                'watershed_id': watershed_id,
                'domain_dir': domain_dir,
                'sim_dir': sim_dir
            })
else:
    print(f"CARAVAN data directory not found: {caravan_dir}")

print(f"Completed simulations: {len(completed)}")
for ws in completed:
    print(f"  - Watershed {ws['watershed_id']}")

## 8. Load and Visualize Results

In [None]:
# Simple function to load results
def load_summa_output(sim_dir, variable='averageRoutedRunoff'):
    """Load one variable from a timestep output file found under ``sim_dir``.

    Args:
        sim_dir: ``pathlib.Path`` searched recursively for files matching
            ``*timestep*.nc``. When several match, the first one in sorted
            path order is read.
        variable: Name of the dataset variable to extract.

    Returns:
        DataFrame with columns ``time`` and ``value``, or ``None`` when no
        matching file exists or the file does not contain ``variable``.
    """
    # Sorted so the same file is chosen on every run; rglob order depends on
    # the filesystem.
    output_files = sorted(sim_dir.rglob("*timestep*.nc"))
    if not output_files:
        return None

    # Only needed once there is a file to read
    import xarray as xr

    # The context manager closes the file handle; .values pulls the data into
    # memory before that happens.
    with xr.open_dataset(output_files[0]) as ds:
        if variable not in ds.variables:
            return None
        # NOTE(review): flatten() matches the length of `time` only when the
        # variable holds a single value per time step — confirm for
        # multi-unit domains.
        return pd.DataFrame({
            'time': pd.to_datetime(ds.time.values),
            'value': ds[variable].values.flatten()
        })

# Plot results for completed watersheds.
# The variable is named once so that the data request and the figure labels
# cannot drift apart (the labels previously described snow water equivalent
# while routed runoff was being plotted).
plot_variable = 'averageRoutedRunoff'

if completed:
    fig, ax = plt.subplots(figsize=(12, 6))

    n_plotted = 0
    for ws in completed[:10]:  # Plot first 10
        data = load_summa_output(ws['sim_dir'], variable=plot_variable)
        if data is not None:
            ax.plot(data['time'], data['value'],
                    label=f"Watershed {ws['watershed_id']}",
                    linewidth=2, alpha=0.7)
            n_plotted += 1

    ax.set_xlabel('Date')
    # NOTE(review): units are whatever the output file stores; add them to the
    # label once confirmed.
    ax.set_ylabel(f'Routed Runoff ({plot_variable})')
    ax.set_title(f'Routed Runoff Comparison - {experiment_config["dataset"].upper()} Watersheds')
    # legend() with nothing plotted only emits a warning
    if n_plotted:
        ax.legend()
    ax.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

## 9. Calculate Performance Metrics

In [None]:
# Calculate NSE and KGE for watersheds with streamflow data
import numpy as np

def calculate_nse(obs, sim):
    """Calculate Nash-Sutcliffe Efficiency.

    NSE = 1 - sum((obs - sim)^2) / sum((obs - mean(obs))^2), evaluated over
    the positions where both series are non-NaN.

    Args:
        obs: Observed values (array-like of numbers).
        sim: Simulated values, same shape as ``obs``.

    Returns:
        The NSE as a float, or ``np.nan`` when no valid pairs remain or the
        observations have zero variance (the ratio is undefined there).
    """
    # Accept lists/Series as well as arrays; boolean-mask indexing below
    # requires ndarray inputs.
    obs = np.asarray(obs, dtype=float)
    sim = np.asarray(sim, dtype=float)

    mask = ~(np.isnan(obs) | np.isnan(sim))
    obs, sim = obs[mask], sim[mask]

    if len(obs) == 0:
        return np.nan

    # Constant observations would otherwise divide by zero
    denominator = np.sum((obs - np.mean(obs))**2)
    if denominator == 0:
        return np.nan

    return 1 - np.sum((obs - sim)**2) / denominator

def calculate_kge(obs, sim):
    """Calculate Kling-Gupta Efficiency.

    KGE = 1 - sqrt((r - 1)^2 + (alpha - 1)^2 + (beta - 1)^2), where r is the
    linear correlation, alpha = std(sim) / std(obs) and
    beta = mean(sim) / mean(obs), evaluated over the positions where both
    series are non-NaN.

    Args:
        obs: Observed values (array-like of numbers).
        sim: Simulated values, same shape as ``obs``.

    Returns:
        The KGE as a float, or ``np.nan`` when it is undefined: fewer than two
        valid pairs, a constant observed or simulated series (no correlation),
        or observations whose mean is zero.
    """
    # Accept lists/Series as well as arrays; boolean-mask indexing below
    # requires ndarray inputs.
    obs = np.asarray(obs, dtype=float)
    sim = np.asarray(sim, dtype=float)

    mask = ~(np.isnan(obs) | np.isnan(sim))
    obs, sim = obs[mask], sim[mask]

    # Correlation needs at least two points
    if len(obs) < 2:
        return np.nan

    obs_std, sim_std = np.std(obs), np.std(sim)
    obs_mean = np.mean(obs)

    # Each of these would divide by zero below (or inside corrcoef)
    if obs_std == 0 or sim_std == 0 or obs_mean == 0:
        return np.nan

    # Calculate components
    r = np.corrcoef(obs, sim)[0, 1]  # Correlation
    alpha = sim_std / obs_std  # Relative variability
    beta = np.mean(sim) / obs_mean  # Bias

    # Calculate KGE
    return 1 - np.sqrt((r - 1)**2 + (alpha - 1)**2 + (beta - 1)**2)

# Streamflow loader: prefers mizuRoute output and falls back to SUMMA runoff
# variables read through load_summa_output.
def load_model_streamflow(sim_dir):
    """Load a simulated flow series for one watershed.

    Lookup order:
      1. The first ``*.nc`` file (sorted path order) in a ``mizuRoute``
         directory located next to ``sim_dir``, using the first of
         ``IRFroutedRunoff``, ``flow`` or ``streamflow`` that it contains.
      2. ``load_summa_output`` for ``averageRunoff``, ``scalarTotalRunoff``
         and ``scalarAquiferBaseflow``, in that order.
      3. ``load_summa_output`` with its default variable.

    Args:
        sim_dir: ``pathlib.Path`` of the watershed's simulations directory.

    Returns:
        DataFrame with columns ``time`` and ``value``, or ``None`` when none
        of the sources yields data.
    """
    # Try to find mizuRoute output first (processed streamflow)
    mizu_dir = sim_dir.parent / "mizuRoute"
    if mizu_dir.exists():
        import xarray as xr
        # Sorted so the same file is chosen on every run
        mizu_files = sorted(mizu_dir.glob("*.nc"))
        if mizu_files:
            try:
                # Context manager closes the file once the values are loaded
                with xr.open_dataset(mizu_files[0]) as ds:
                    # Check for common mizuRoute variables
                    for var in ['IRFroutedRunoff', 'flow', 'streamflow']:
                        if var in ds.variables:
                            return pd.DataFrame({
                                'time': pd.to_datetime(ds.time.values),
                                'value': ds[var].values.flatten()
                            })
            except Exception as e:
                # Best effort: report the problem and fall through to SUMMA
                print(f"Error reading mizuRoute output: {e}")

    # Fall back to SUMMA runoff if routing not found
    runoff_vars = ['averageRunoff', 'scalarTotalRunoff', 'scalarAquiferBaseflow']
    for var in runoff_vars:
        df = load_summa_output(sim_dir, variable=var)
        if df is not None:
            return df

    # If no specific streamflow variable found, use the default function
    return load_summa_output(sim_dir)

def load_observation_data(ws_dir, watershed_id, dataset="camels"):
    """Read the streamflow observation CSV that belongs to one watershed.

    A fixed list of candidate files under ``ws_dir/observations/streamflow``
    is tried in order; the first one that exists and parses is returned. A
    candidate that exists but cannot be read is reported and skipped.

    Args:
        ws_dir: Path to the domain directory
        watershed_id: Watershed ID (e.g., '01047000')
        dataset: Dataset name prefix (default: 'camels')

    Returns:
        DataFrame parsed from the CSV (with ``datetime`` parsed as dates), or
        None if no candidate could be loaded
    """
    streamflow_root = ws_dir / 'observations' / 'streamflow'
    preprocessed_dir = streamflow_root / 'preprocessed'
    raw_dir = streamflow_root / 'raw_data'

    # Ordered by preference: dataset-prefixed preprocessed file first, then
    # the raw-data variants, then the un-prefixed preprocessed file.
    candidates = (
        preprocessed_dir / f"{dataset}_{dataset}_{watershed_id}_streamflow_processed.csv",
        raw_dir / f"{dataset}_{watershed_id}_Discharge.csv",
        raw_dir / f"{watershed_id}_streamflow.csv",
        preprocessed_dir / f"{watershed_id}_streamflow_processed.csv",
    )

    for candidate in candidates:
        if not candidate.exists():
            continue
        print(f"Found observation data at: {candidate}")
        try:
            return pd.read_csv(candidate, parse_dates=['datetime'])
        except Exception as e:
            print(f"Error reading {candidate}: {e}")

    # Nothing usable was found
    print(f"No observation data found for watershed {watershed_id}")

    # List what is actually present to help with debugging
    if preprocessed_dir.exists():
        print(f"Available files in preprocessed directory:")
        for entry in preprocessed_dir.glob("*"):
            print(f"  - {entry.name}")

    return None

# Calculate metrics for the completed watersheds.
#
# For every watershed: load simulated and observed flow, average both to
# daily values, keep the days present in both series, then compute NSE/KGE
# and save a comparison figure. A failure in one watershed is reported and
# the loop moves on to the next.
metrics = []

for ws in completed:
    watershed_id = ws['watershed_id']
    try:
        print(f"Processing watershed {watershed_id}...")

        # Load simulated data
        sim_df = load_model_streamflow(ws['sim_dir'])
        if sim_df is None:
            print(f"  No simulation data found for watershed {watershed_id}")
            continue

        # Load observed data
        obs_df = load_observation_data(ws['domain_dir'], watershed_id, experiment_config['dataset'])
        if obs_df is None:
            print(f"  No observation data found for watershed {watershed_id}")
            continue

        # Resample only the two columns that are compared. Averaging every
        # column fails on recent pandas when a frame carries non-numeric
        # columns, and selecting up front also leaves the loaded frames
        # unmodified.
        # NOTE(review): the series are compared as loaded; confirm simulated
        # and observed values are in the same units.
        sim_daily = sim_df.set_index('time')['value'].resample('D').mean()
        obs_daily = obs_df.set_index('datetime')['discharge_cms'].resample('D').mean()

        # Days covered by both series (gaps inside either one remain as NaN)
        merged = pd.concat([obs_daily, sim_daily], axis=1, join='inner')

        # Only days where both values exist contribute to the metrics, so
        # those are the ones to count against the minimum-overlap threshold.
        paired = merged.dropna()

        if len(paired) < 30:
            print(f"  Insufficient matching data points for watershed {watershed_id} ({len(paired)} points)")
            continue

        # Calculate performance metrics
        nse = calculate_nse(paired['discharge_cms'].values, paired['value'].values)
        kge = calculate_kge(paired['discharge_cms'].values, paired['value'].values)

        print(f"  Metrics calculated - NSE: {nse:.3f}, KGE: {kge:.3f}")

        # Store the metrics
        metrics.append({
            'watershed_id': watershed_id,
            'nse': nse,
            'kge': kge,
            'data_points': len(paired),
            'start_date': paired.index.min().strftime('%Y-%m-%d'),
            'end_date': paired.index.max().strftime('%Y-%m-%d')
        })

        # Plot the comparison (using `merged` so data gaps stay visible)
        fig, ax = plt.subplots(figsize=(12, 6))
        try:
            ax.plot(merged.index, merged['discharge_cms'], 'k-', label='Observed', linewidth=1.5)
            ax.plot(merged.index, merged['value'], 'b-', label='Simulated', linewidth=1.5, alpha=0.7)
            ax.set_title(f"Watershed {watershed_id} - NSE: {nse:.3f}, KGE: {kge:.3f}")
            ax.set_xlabel('Date')
            ax.set_ylabel('Streamflow')
            ax.legend()
            ax.grid(True, alpha=0.3)
            fig.tight_layout()

            # Save the figure
            comparison_file = experiment_dir / f"watershed_{watershed_id}_comparison.png"
            fig.savefig(comparison_file)
        finally:
            # Always release the figure, even if drawing or saving failed
            plt.close(fig)

    except Exception as e:
        print(f"Error processing watershed {watershed_id}: {str(e)}")

# Create a dataframe with the metrics, save it, and plot NSE against KGE.
if metrics:
    metrics_df = pd.DataFrame(metrics).sort_values('kge', ascending=False)

    # Display metrics
    print("\nPerformance Metrics (sorted by KGE):")
    display(metrics_df)

    # Save metrics to CSV
    metrics_file = experiment_dir / 'performance_metrics.csv'
    metrics_df.to_csv(metrics_file, index=False)
    print(f"\nMetrics saved to {metrics_file}")

    # Create scatter plot comparing NSE and KGE
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.scatter(metrics_df['nse'], metrics_df['kge'], s=60, alpha=0.7)

    # Add watershed IDs as labels (a point with a NaN/inf score is not drawn,
    # so it gets no label either)
    for row in metrics_df.itertuples(index=False):
        if np.isfinite(row.nse) and np.isfinite(row.kge):
            ax.annotate(row.watershed_id,
                        (row.nse, row.kge),
                        xytext=(5, 5),
                        textcoords='offset points',
                        fontsize=9)

    # Add 1:1 line spanning the finite scores. Python's min()/max() give
    # order-dependent results when NaN is present, so filter first.
    scores = metrics_df[['nse', 'kge']].to_numpy(dtype=float)
    finite_scores = scores[np.isfinite(scores)]
    if finite_scores.size:
        lims = [finite_scores.min() - 0.1, finite_scores.max() + 0.1]
        ax.plot(lims, lims, 'k--', alpha=0.5)

    ax.grid(True, alpha=0.3)
    ax.set_xlabel('Nash-Sutcliffe Efficiency (NSE)')
    ax.set_ylabel('Kling-Gupta Efficiency (KGE)')
    ax.set_title(f'Performance Metrics - {experiment_config["dataset"].upper()} Watersheds')
    fig.tight_layout()

    # Save the plot
    fig.savefig(experiment_dir / 'nse_kge_comparison.png')
    plt.show()

    # Create summary statistics (the frame is sorted by KGE, best first)
    print("\nSummary Statistics:")
    print(f"Number of watersheds: {len(metrics_df)}")
    print(f"Mean NSE: {metrics_df['nse'].mean():.3f}")
    print(f"Mean KGE: {metrics_df['kge'].mean():.3f}")
    print(f"Best watershed (by KGE): {metrics_df.iloc[0]['watershed_id']} (KGE={metrics_df.iloc[0]['kge']:.3f})")
else:
    print("No performance metrics could be calculated.")

## Next Steps

This tutorial covered the basics of running CONFLUENCE over multiple CARAVAN watersheds. For more advanced analysis, consider the following directions:

1. **Regional Analysis**: Group watersheds by climate or geography
2. **Model Comparison**: Test different model structures
3. **Parameter Transfer**: Develop regionalization relationships
4. **Machine Learning**: Predict performance from attributes
5. **Climate Scenarios**: Run future projections

See the full documentation for detailed examples of these analyses.