# CONFLUENCE Tutorial: Elevation-Based HRU Discretization

This notebook demonstrates elevation-based HRU discretization, building on the distributed domain from Tutorial 3. We'll:

1. Use the existing GRUs from Tutorial 3
2. Apply elevation-based discretization
3. Run the model
4. Compare results with lumped and GRU-based approaches

**Prerequisites**: Tutorial 3 must be completed successfully.

## Setup and Import Libraries

In [None]:
# Import required libraries
import sys
import os
from pathlib import Path
import yaml
import pandas as pd
import matplotlib.pyplot as plt
import geopandas as gpd
from datetime import datetime
import numpy as np
import contextily as cx
import xarray as xr
import shutil
from IPython.display import Image, display

# Add CONFLUENCE to path
# '../' is resolved against the current working directory, so this points at
# the parent of wherever the notebook is run from. Appending it to sys.path is
# what lets the `CONFLUENCE` module below be imported.
confluence_path = Path('../').resolve()
sys.path.append(str(confluence_path))

# Import main CONFLUENCE class
# (must come after the sys.path change above)
from CONFLUENCE import CONFLUENCE

# Set up plotting style
plt.style.use('default')
# IPython magic (not plain Python): draw matplotlib figures inline in the notebook
%matplotlib inline

## Initialize CONFLUENCE
First, let's set up our directories and load the configuration. We'll modify the configuration from Tutorial 3 to use elevation-based discretization.

In [None]:
# Code and data roots used by every later cell
CONFLUENCE_CODE_DIR = confluence_path
CONFLUENCE_DATA_DIR = Path('/work/comphyd_lab/data/CONFLUENCE_data')  # ← User should modify this path

# The code directory has to exist already; a missing data directory is created
if not CONFLUENCE_CODE_DIR.exists():
    raise FileNotFoundError(f"CONFLUENCE code directory not found: {CONFLUENCE_CODE_DIR}")

if not CONFLUENCE_DATA_DIR.exists():
    print(f"Data directory doesn't exist. Creating: {CONFLUENCE_DATA_DIR}")
    CONFLUENCE_DATA_DIR.mkdir(parents=True, exist_ok=True)

# The distributed configuration written by Tutorial 3 is the starting point
config_files_dir = CONFLUENCE_CODE_DIR / '0_config_files'
distributed_config_path = config_files_dir / 'config_distributed.yaml'
if not distributed_config_path.exists():
    raise FileNotFoundError("Tutorial 3 must be run first! Distributed config file not found.")

with distributed_config_path.open('r') as config_file:
    config_dict = yaml.safe_load(config_file)

# Settings that differ from Tutorial 3: core paths, a new domain/experiment
# name, and the switch to elevation-based discretization (200 m bands,
# 4 km² minimum HRU size). SPATIAL_MODE is deliberately left untouched.
overrides = {
    'CONFLUENCE_CODE_DIR': str(CONFLUENCE_CODE_DIR),
    'CONFLUENCE_DATA_DIR': str(CONFLUENCE_DATA_DIR),
    'DOMAIN_NAME': 'Bow_at_Banff_elevation',
    'EXPERIMENT_ID': 'elevation_tutorial',
    'DOMAIN_DISCRETIZATION': 'elevation',  # Key change!
    'ELEVATION_BAND_SIZE': 200,
    'MIN_HRU_SIZE': 4,
}
for setting, value in overrides.items():
    config_dict[setting] = value

# Write the modified configuration next to the original one
elevation_config_path = config_files_dir / 'config_elevation.yaml'
with elevation_config_path.open('w') as config_file:
    yaml.dump(config_dict, config_file)

# Build the CONFLUENCE instance from the new configuration file
confluence = CONFLUENCE(elevation_config_path)

# Echo the directories and the key settings as CONFLUENCE loaded them
print("=== Directory Configuration ===")
print(f"Code Directory: {CONFLUENCE_CODE_DIR}")
print(f"Data Directory: {CONFLUENCE_DATA_DIR}")
print("\n=== Key Configuration Settings ===")
summary_rows = (
    ("Domain Name", 'DOMAIN_NAME', ""),
    ("Pour Point", 'POUR_POINT_COORDS', ""),
    ("Discretization Method", 'DOMAIN_DISCRETIZATION', ""),
    ("Elevation Band Size", 'ELEVATION_BAND_SIZE', " m"),
    ("Minimum HRU Size", 'MIN_HRU_SIZE', " km²"),
    ("Spatial Mode", 'SPATIAL_MODE', ""),
    ("Model", 'HYDROLOGICAL_MODEL', ""),
)
for label, key, unit in summary_rows:
    print(f"{label}: {confluence.config[key]}{unit}")
period_start = confluence.config['EXPERIMENT_TIME_START']
period_end = confluence.config['EXPERIMENT_TIME_END']
print(f"Simulation Period: {period_start} to {period_end}")

## Project Setup - Organizing the Modeling Workflow

Next, we'll establish a well-organized project structure, similar to what we did in Tutorial 3.

In [None]:

# Both setup steps are methods of the project manager
project_manager = confluence.managers['project']
project_dir = project_manager.setup_project()
pour_point_path = project_manager.create_pour_point()

# Show the sub-directories now present in the project directory, sorted
created_dirs = [entry for entry in sorted(project_dir.iterdir()) if entry.is_dir()]
print("\nCreated directories:")
for entry in created_dirs:
    print(f"  📁 {entry.name}")

## Copy Domain Data from Tutorial 3

We'll reuse the existing domain and GRUs from Tutorial 3 (distributed model) instead of starting from scratch.

In [None]:
# Source domain from Tutorial 3
source_domain = CONFLUENCE_DATA_DIR / 'domain_Bow_at_Banff_distributed'

# Check if Tutorial 3 domain exists
if not source_domain.exists():
    raise FileNotFoundError("Tutorial 3 domain not found! Please run Tutorial 3 first.")

# Copy necessary directories from Tutorial 3
print("=== Step 2: Copying Domain Data from Tutorial 3 ===")
dirs_to_copy = ['shapefiles', 'attributes']

for dir_name in dirs_to_copy:
    source_dir = source_domain / dir_name
    target_dir = project_dir / dir_name

    if not source_dir.exists():
        print(f"Warning: {source_dir} not found in Tutorial 3 domain.")
        continue

    print(f"Copying {dir_name} from Tutorial 3...")
    # Create target directory if it doesn't exist
    target_dir.mkdir(parents=True, exist_ok=True)

    # Walk the whole source tree and mirror its layout under target_dir
    for src_file in source_dir.glob('**/*'):
        if not src_file.is_file():
            continue

        # Path of the file relative to the directory being copied
        rel_path = src_file.relative_to(source_dir)

        # Swap 'distributed' for 'elevation' in the file name only (not in
        # parent directory names) and copy straight to that final name.
        # Copying first and renaming afterwards fails on Windows when the
        # cell is re-run (Path.rename will not overwrite an existing file)
        # and can leave a stale 'distributed' copy behind if interrupted.
        dest_name = rel_path.name.replace('distributed', 'elevation')
        dest_file = (target_dir / rel_path).with_name(dest_name)

        # Create parent directories if needed
        dest_file.parent.mkdir(parents=True, exist_ok=True)

        shutil.copy2(src_file, dest_file)

print("\n✓ Domain data copied from Tutorial 3")

## Verify GRU Boundaries

Let's check the existing GRU boundaries that we'll use as a basis for our elevation-based HRUs.

In [None]:
# Check river basins (GRUs) and network
basin_path = project_dir / 'shapefiles' / 'river_basins'
network_path = project_dir / 'shapefiles' / 'river_network'

basin_files = list(basin_path.glob('*.shp'))
network_files = list(network_path.glob('*.shp'))

if basin_files and network_files:
    # Load the first shapefile found in each directory
    basins = gpd.read_file(basin_files[0])
    rivers = gpd.read_file(network_files[0])

    # Create visualization
    fig, ax = plt.subplots(figsize=(10, 8))

    # Plot basins (GRUs), coloured by their GRU_ID
    basins.plot(ax=ax, column='GRU_ID', cmap='viridis',
                alpha=0.7, edgecolor='black', linewidth=1)

    # Plot river network on top
    rivers.plot(ax=ax, color='blue', linewidth=1.5)

    # Format plot
    ax.set_title(f'GRU Boundaries for Elevation-Based Discretization\n({len(basins)} Sub-basins)',
                 fontsize=14, fontweight='bold')
    ax.set_xlabel('Longitude')
    ax.set_ylabel('Latitude')

    # Add colorbar for GRU IDs. The mappable is built by hand with the same
    # colormap and the min/max GRU_ID as its range.
    sm = plt.cm.ScalarMappable(cmap='viridis',
                               norm=plt.Normalize(vmin=basins['GRU_ID'].min(),
                                                  vmax=basins['GRU_ID'].max()))
    # Give the mappable an (empty) data array through the public API rather
    # than by writing the private `_A` attribute.
    sm.set_array([])
    cbar = fig.colorbar(sm, ax=ax, shrink=0.8)
    cbar.set_label('GRU ID', fontsize=12)

    plt.tight_layout()
    plt.show()

    print(f"Number of GRUs: {len(basins)}")
else:
    print("GRU files not found. Make sure Tutorial 3 was completed successfully.")

## Create Elevation-Based HRUs

Now we'll apply elevation-based discretization to create HRUs within each GRU.

In [None]:
print(f"Discretization Method: {confluence.config['DOMAIN_DISCRETIZATION']}")
print(f"Elevation Band Size: {confluence.config['ELEVATION_BAND_SIZE']} m")
print(f"Minimum HRU Size: {confluence.config['MIN_HRU_SIZE']} km²")

# Apply discretization
print("\nApplying elevation-based discretization...")
hru_path = confluence.managers['domain'].discretize_domain()

# Look for the resulting HRU shapefile(s). A missing directory and an existing
# directory without any .shp file are both treated as failure, so the cell
# never finishes silently without defining hru_gdf.
catchment_path = project_dir / 'shapefiles' / 'catchment'
hru_files = list(catchment_path.glob('*.shp')) if catchment_path.exists() else []

if hru_files:
    hru_gdf = gpd.read_file(hru_files[0])

    # Store the GRU GeoDataFrame for later use
    # (basin_files comes from the GRU verification cell)
    gru_gdf = gpd.read_file(basin_files[0])

    print(f"\n✓ Created elevation-based HRUs")
    print(f"Number of HRUs: {len(hru_gdf)}")
    print(f"Number of GRUs: {hru_gdf['GRU_ID'].nunique()}")

    # Calculate HRUs per GRU
    hru_counts = hru_gdf.groupby('GRU_ID').size()
    avg_hrus_per_gru = hru_counts.mean()
    print(f"Average HRUs per GRU: {avg_hrus_per_gru:.1f}")

    # Show the first few GRUs with their HRU counts
    print("\nHRUs per GRU (first 10):")
    for gru_id, count in hru_counts.head(10).items():
        print(f"  GRU {gru_id}: {count} HRUs")
else:
    print("Failed to create elevation-based HRUs.")

## Visualize Elevation-Based HRUs

In [None]:
# Draw the HRU map, or say so when the discretization cell produced nothing
if 'hru_gdf' not in locals() or len(hru_gdf) == 0:
    print("No HRUs available to visualize.")
else:
    # Colour by mean elevation when that column exists, otherwise by HRU ID
    has_elevation = 'mean_elev' in hru_gdf.columns

    fig, ax = plt.subplots(figsize=(12, 10))

    # Outline styling shared by both colouring variants
    outline_style = dict(edgecolor='gray', linewidth=0.5, alpha=0.7)
    if not has_elevation:
        hru_gdf.plot(ax=ax, column='HRU_ID', cmap='viridis', **outline_style)
    else:
        hru_gdf.plot(ax=ax, column='mean_elev', cmap='terrain',
                     legend=True, legend_kwds={'label': 'Mean Elevation (m)'},
                     **outline_style)

    # GRU outlines in red on top of the HRUs
    gru_gdf.boundary.plot(ax=ax, color='red', linewidth=1)

    # River network for context, if an earlier cell loaded it
    if 'rivers' in locals():
        rivers.plot(ax=ax, color='blue', linewidth=1)

    # Title reports the HRU and GRU counts
    n_hrus = len(hru_gdf)
    n_grus = hru_gdf["GRU_ID"].nunique()
    ax.set_title(f'Elevation-Based HRUs\n{n_hrus} HRUs in {n_grus} GRUs',
                 fontsize=16, fontweight='bold')
    ax.set_xlabel('Longitude')
    ax.set_ylabel('Latitude')

    plt.tight_layout()
    plt.show()

## Complete the Modeling Workflow

Now we'll complete the model setup and run the simulation using our elevation-based HRUs.

In [None]:
# Workflow step: call the data manager's process_observed_data(), with a
# status message before and after the call.
print("Processing observed streamflow data")

confluence.managers['data'].process_observed_data()

print("\n✓ Observed data processing completed")

In [None]:
# NOTE: this cell is disabled. Everything below is wrapped in a triple-quoted
# string literal, so none of it is executed. To use it, remove the enclosing
# quotes.
'''
# Acquire Forcing Data
# Check if we can reuse forcing data from previous tutorials
forcing_dir = CONFLUENCE_DATA_DIR / 'domain_Bow_at_Banff_distributed' / 'forcing'
if forcing_dir.exists():
    print("Reusing forcing data from Tutorial 3...")
    
    # Copy forcing data
    target_dir = project_dir / 'forcing'
    target_dir.mkdir(parents=True, exist_ok=True)
    
    # Copy all subdirectories
    for src_dir in forcing_dir.iterdir():
        if src_dir.is_dir():
            dest_dir = target_dir / src_dir.name
            if not dest_dir.exists():
                shutil.copytree(src_dir, dest_dir)
            print(f"Copied {src_dir.name} forcing data")
    
    # Run model-agnostic preprocessing to handle the new HRU structure
    print("\nRunning model-agnostic preprocessing for new HRU structure...")
    confluence.managers['data'].run_model_agnostic_preprocessing()
else:
    print("Acquiring forcing data from scratch...")
    confluence.managers['data'].acquire_forcings()
'''

In [None]:
# Workflow step: call the data manager's run_model_agnostic_preprocessing(),
# with a status message before and after the call.
print("\nRunning model-agnostic preprocessing...")

confluence.managers['data'].run_model_agnostic_preprocessing()

print("\n✓ Forcing data processing completed")

In [None]:
# Workflow step: call the model manager's preprocess_models(); the status
# message names the model configured under HYDROLOGICAL_MODEL.
print(f"Preparing {confluence.config['HYDROLOGICAL_MODEL']} input files...")

confluence.managers['model'].preprocess_models()

print("\n✓ Model-specific preprocessing completed")

In [None]:
# Workflow step: call the model manager's run_models(); the status message
# names the model configured under HYDROLOGICAL_MODEL.
print(f"Running {confluence.config['HYDROLOGICAL_MODEL']} with elevation-based HRUs...")
print("Note: This may take some time depending on the number of HRUs.")

confluence.managers['model'].run_models()

In [None]:
# Visualize Observed vs. Simulated Streamflow

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import xarray as xr
import matplotlib.dates as mdates
from pathlib import Path

# Locate the NetCDF output under the experiment's mizuRoute directory.
# Sorting makes the choice of file deterministic when several exist, and an
# empty result is reported explicitly instead of as a bare IndexError.
sim_path = project_dir / 'simulations' / confluence.config['EXPERIMENT_ID'] / 'mizuRoute'
sim_files = sorted(sim_path.glob('*.nc'))
if not sim_files:
    raise FileNotFoundError(f"No simulation output (*.nc) found in: {sim_path}")

# Load simulation data
print(f"Loading simulation data from: {sim_files[0]}")
sim_data = xr.open_dataset(sim_files[0])

# Load observation data
obs_path = project_dir / 'observations' / 'streamflow' / 'preprocessed' / f"{confluence.config['DOMAIN_NAME']}_streamflow_processed.csv"

if not obs_path.exists():
    # Fall back to any processed streamflow CSV under the domain directory
    # inside the configured data directory.
    print(f"Warning: Observation data not found at expected path: {obs_path}")
    print("Checking for alternative locations...")
    alt_obs_paths = list(Path(config_dict['CONFLUENCE_DATA_DIR']).glob(
        f"domain_{config_dict['DOMAIN_NAME']}/observations/streamflow/preprocessed/*_streamflow_processed.csv"))

    if alt_obs_paths:
        obs_path = alt_obs_paths[0]
        print(f"Found alternative observation data at: {obs_path}")
    else:
        print("No observation data found. Only simulations will be displayed.")

if obs_path.exists():
    # Index the observations by their 'datetime' column
    print(f"Loading observation data from: {obs_path}")
    obs_df = pd.read_csv(obs_path)
    obs_df['datetime'] = pd.to_datetime(obs_df['datetime'])
    obs_df.set_index('datetime', inplace=True)
    print(f"Observation period: {obs_df.index.min()} to {obs_df.index.max()}")
else:
    # obs_df = None signals "plot simulation only" to the code that follows
    obs_df = None
    
# Find the segment ID for the outlet
reach_id = int(confluence.config.get('SIM_REACH_ID', 0))
print(f"Using reach ID for outlet: {reach_id}")

# try/finally guarantees the dataset is closed even if extraction, the metric
# calculation or the plotting raises part-way through.
try:
    # Find the index where reachID matches the target reach_id
    segment_indices = np.where(sim_data.reachID.values == reach_id)[0]

    if len(segment_indices) == 0:
        print(f"Error: Reach ID {reach_id} not found in simulation data")
        print(f"Available reach IDs: {sim_data.reachID.values}")
    else:
        # Extract flow at the outlet segment using the index
        segment_index = segment_indices[0]
        sim_flow = sim_data.IRFroutedRunoff.isel(seg=segment_index).to_series()
        sim_df = pd.DataFrame(sim_flow)
        sim_df.columns = ['discharge_cms']

        # Determine common time period if observations exist
        if obs_df is not None:
            # Align to daily timestep for comparison
            obs_daily = obs_df.resample('D').mean()
            sim_daily = sim_df.resample('D').mean()

            # Find overlapping time period
            start_date = max(obs_daily.index.min(), sim_daily.index.min())
            end_date = min(obs_daily.index.max(), sim_daily.index.max())

            # Advance start date by 1 month to skip initial spinup
            start_date = start_date + pd.DateOffset(months=1)

            print(f"Common data period: {start_date} to {end_date}")

            # Filter to common period (used as-is for the time-series plot)
            obs_period = obs_daily.loc[start_date:end_date]
            sim_period = sim_daily.loc[start_date:end_date]

            # Keep only the days on which BOTH series have a value. Daily
            # resampling leaves NaN on days without data; without pairing,
            # sums, means and standard deviations of the two series would be
            # taken over different sets of days and distort PBIAS, NSE and KGE.
            paired = pd.concat(
                [obs_period['discharge_cms'].rename('obs'),
                 sim_period['discharge_cms'].rename('sim')],
                axis=1, join='inner').dropna()
            obs_valid = paired['obs']
            sim_valid = paired['sim']

            # Calculate performance metrics
            rmse = np.sqrt(((obs_valid - sim_valid)**2).mean())

            # Calculate Nash-Sutcliffe Efficiency (NSE)
            mean_obs = obs_valid.mean()
            numerator = ((obs_valid - sim_valid)**2).sum()
            denominator = ((obs_valid - mean_obs)**2).sum()
            nse = 1 - (numerator / denominator)

            # Calculate Percent Bias (PBIAS)
            pbias = 100 * (sim_valid.sum() - obs_valid.sum()) / obs_valid.sum()

            # Calculate Kling-Gupta Efficiency (KGE)
            r = obs_valid.corr(sim_valid)  # Correlation
            alpha = sim_valid.std() / obs_valid.std()  # Relative variability
            beta = sim_valid.mean() / obs_valid.mean()  # Bias ratio
            kge = 1 - ((r - 1)**2 + (alpha - 1)**2 + (beta - 1)**2)**0.5

            print("Performance metrics:")
            print(f"  - RMSE: {rmse:.2f} m³/s")
            print(f"  - NSE: {nse:.2f}")
            print(f"  - PBIAS: {pbias:.2f}%")
            print(f"  - KGE: {kge:.2f}")

            # Create figure with two subplots for time series and flow duration curve
            fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(15, 16))
            fig.suptitle(f"Distributed Model Results - {confluence.config['DOMAIN_NAME'].replace('_', ' ').title()}",
                         fontsize=16, fontweight='bold')

            # Plot time series
            ax1.plot(obs_period.index, obs_period['discharge_cms'],
                     'b-', label='Observed', linewidth=1.5, alpha=0.7)
            ax1.plot(sim_period.index, sim_period['discharge_cms'],
                     'r-', label='Simulated (Distributed)', linewidth=1.5, alpha=0.7)

            ax1.set_xlabel('Date', fontsize=12)
            ax1.set_ylabel('Discharge (m³/s)', fontsize=12)
            ax1.set_title('Streamflow Comparison', fontsize=14)
            ax1.legend(loc='upper right', fontsize=10)
            ax1.grid(True, linestyle=':', alpha=0.6)
            ax1.set_facecolor('#f0f0f0')

            # Format x-axis: one tick per year
            ax1.xaxis.set_major_locator(mdates.YearLocator())
            ax1.xaxis.set_major_formatter(mdates.DateFormatter('%Y'))

            # Add metrics as text
            ax1.text(0.02, 0.95,
                     f"RMSE: {rmse:.2f} m³/s\nNSE: {nse:.2f}\nPBIAS: {pbias:.2f}%\nKGE: {kge:.2f}",
                     transform=ax1.transAxes,
                     fontsize=12,
                     bbox=dict(facecolor='white', alpha=0.8, boxstyle='round,pad=0.5'))

            # Plot flow duration curve from the paired values, so NaN days
            # are not counted in the ranks and both curves cover the same days.
            # Sort values in descending order
            obs_sorted = obs_valid.sort_values(ascending=False)
            sim_sorted = sim_valid.sort_values(ascending=False)

            # Calculate exceedance probabilities
            obs_ranks = np.arange(1., len(obs_sorted) + 1) / len(obs_sorted)
            sim_ranks = np.arange(1., len(sim_sorted) + 1) / len(sim_sorted)

            # Plot Flow Duration Curves
            ax2.loglog(obs_ranks * 100, obs_sorted, 'b-', label='Observed', linewidth=2)
            ax2.loglog(sim_ranks * 100, sim_sorted, 'r-', label='Simulated', linewidth=2)

            ax2.set_xlabel('Exceedance Probability (%)', fontsize=12)
            ax2.set_ylabel('Discharge (m³/s)', fontsize=12)
            ax2.set_title('Flow Duration Curve', fontsize=14)
            ax2.grid(True, which='both', linestyle=':', alpha=0.6)
            ax2.set_facecolor('#f0f0f0')

            # Add flow regime regions
            # NOTE(review): the first span starts at 0 on a log-scaled axis;
            # confirm it renders as intended.
            ax2.axvspan(0, 20, alpha=0.2, color='blue', label='High Flows')
            ax2.axvspan(20, 70, alpha=0.2, color='green', label='Medium Flows')
            ax2.axvspan(70, 100, alpha=0.2, color='red', label='Low Flows')

            # Build the legend only after the labelled regions exist,
            # otherwise their labels are never shown.
            ax2.legend(loc='best', fontsize=10)

            # Save the plot to file
            plot_folder = project_dir / "plots" / "results"
            plot_folder.mkdir(parents=True, exist_ok=True)
            plot_filename = plot_folder / f"{confluence.config['EXPERIMENT_ID']}_streamflow_comparison.png"

            plt.tight_layout()
            plt.subplots_adjust(top=0.93)  # leave room for the suptitle
            plt.savefig(plot_filename, dpi=300, bbox_inches='tight')
            print(f"Plot saved to: {plot_filename}")

            plt.show()
        else:
            # If no observations, just plot simulation
            fig, ax = plt.subplots(figsize=(14, 6))
            ax.plot(sim_df.index, sim_df['discharge_cms'],
                    color='red', linewidth=1.5, label='Simulated (Distributed)')

            ax.set_xlabel('Date', fontsize=12)
            ax.set_ylabel('Discharge (m³/s)', fontsize=12)
            ax.set_title(f'Distributed Model Results - {confluence.config["DOMAIN_NAME"].replace("_", " ").title()}',
                         fontsize=14, fontweight='bold')
            ax.grid(True, alpha=0.3)
            ax.legend(fontsize=10)

            plt.tight_layout()
            plt.show()
finally:
    # Close the dataset
    sim_data.close()