# FLUXNET Large Sample Experiment Tutorial

This notebook demonstrates how to run CONFLUENCE over multiple FLUXNET tower sites for point-scale large-sample analysis.

## Overview

FLUXNET represents a global network of eddy covariance tower sites that measure exchanges of carbon dioxide, water vapor, and energy between the biosphere and atmosphere. Running CONFLUENCE at these point locations allows us to:

- Validate model physics against direct observations
- Compare model performance across diverse climates and ecosystems
- Develop parameter estimation relationships
- Test model transferability across sites

## 1. Setup and Imports

In [None]:
import sys
import os
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import subprocess
import yaml
from datetime import datetime
import seaborn as sns

# Add CONFLUENCE to path
# The parent of the current working directory is treated as the CONFLUENCE checkout
# (NOTE(review): confirm this matches your layout). `confluence_path` is reused later
# in the notebook to locate the helper scripts directory.
confluence_path = Path('../').resolve()
sys.path.append(str(confluence_path))

# Set up plotting style
# Start from matplotlib's default style, then use seaborn's "husl" colour palette.
plt.style.use('default')
sns.set_palette("husl")
# IPython magic: render figures inline in the notebook output.
%matplotlib inline

print("Setup complete!")

## 2. Configure the Experiment

In [None]:
# Settings for the FLUXNET large sample experiment; later cells read these keys.
experiment_config = dict(
    experiment_name='fluxnet_tutorial',
    max_sites=5,      # Number of FLUXNET sites to process
    dry_run=False,    # Set to True to test without submitting jobs
    template_config='../CONFLUENCE/0_config_files/config_point_template.yaml',
    config_dir='../CONFLUENCE/0_config_files/fluxnet',
    fluxnet_script='../CONFLUENCE/9_scripts/run_towers_fluxnet.py',
    fluxnet_csv='fluxnet_transformed.csv',
)

# Every experiment gets its own working directory under ./experiments
experiment_dir = Path("./experiments") / experiment_config['experiment_name']
experiment_dir.mkdir(parents=True, exist_ok=True)

# Keep a YAML snapshot of the settings next to the experiment outputs
config_snapshot = experiment_dir / 'experiment_config.yaml'
with config_snapshot.open('w') as f:
    yaml.dump(experiment_config, f)

print(f"Experiment configured: {experiment_config['experiment_name']}")
print(f"Processing {experiment_config['max_sites']} FLUXNET sites")

## 3. Load and Explore FLUXNET Sites

In [None]:
# Read the FLUXNET site table named in the experiment configuration
fluxnet_df = pd.read_csv(experiment_config['fluxnet_csv'])

print(f"Loaded {len(fluxnet_df)} FLUXNET sites")

# List every column so the reader can see what metadata is available
print("\nColumns in dataset:")
for col in list(fluxnet_df.columns):
    print(f"  - {col}")

# Preview the identifying and descriptive columns for the first few sites
preview_columns = ['ID', 'Watershed_Name', 'KG', 'Dominant_LC', 'Area_km2']
print("\nFirst 5 sites:")
display(fluxnet_df.loc[:, preview_columns].head())

## 4. Visualize FLUXNET Site Distribution

In [None]:
# POUR_POINT_COORDS holds "<lat>/<lon>" strings; split them into numeric columns
coords = fluxnet_df['POUR_POINT_COORDS'].str.split('/', expand=True)
fluxnet_df['lat'] = coords[0].astype(float)
fluxnet_df['lon'] = coords[1].astype(float)

# Scatter every site on a plain longitude/latitude canvas
fig, ax = plt.subplots(figsize=(15, 8))
ax.scatter(fluxnet_df['lon'], fluxnet_df['lat'],
           c='red', alpha=0.6, s=50, edgecolors='black', linewidth=0.5)

ax.set_title('Global Distribution of FLUXNET Sites', fontsize=16, fontweight='bold')
ax.set_xlabel('Longitude', fontsize=12)
ax.set_ylabel('Latitude', fontsize=12)
ax.grid(True, alpha=0.3)
ax.set_xlim(-180, 180)
ax.set_ylim(-60, 80)

# Rough continent labels for orientation: (longitude, latitude, text)
continent_labels = [
    (-100, 40, 'North\nAmerica'),
    (-60, -10, 'South\nAmerica'),
    (15, 50, 'Europe'),
    (20, 0, 'Africa'),
    (100, 30, 'Asia'),
    (135, -25, 'Australia'),
]
for label_lon, label_lat, continent in continent_labels:
    ax.text(label_lon, label_lat, continent,
            fontsize=10, ha='center', style='italic', alpha=0.7)

fig.tight_layout()
plt.show()

## 5. Analyze Site Characteristics

In [None]:
# Side-by-side bar charts: sites per Köppen-Geiger class and per dominant land cover
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

climate_counts = fluxnet_df['KG'].value_counts()
lc_counts = fluxnet_df['Dominant_LC'].value_counts()

# (axes, counts, bar colour, title, x-axis label) for each panel
panels = [
    (ax1, climate_counts, 'skyblue',
     'Distribution of Köppen-Geiger Climate Types', 'Climate Type'),
    (ax2, lc_counts, 'lightgreen',
     'Distribution of Dominant Land Cover Types', 'Land Cover Type'),
]
for panel_ax, counts, bar_color, panel_title, x_label in panels:
    counts.plot(kind='bar', ax=panel_ax, color=bar_color, edgecolor='black')
    panel_ax.set_title(panel_title, fontsize=14)
    panel_ax.set_xlabel(x_label, fontsize=12)
    panel_ax.set_ylabel('Number of Sites', fontsize=12)
    panel_ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

# Summary statistics
print("Site Characteristics Summary:")
print(f"Total sites: {len(fluxnet_df)}")
print(f"Climate types: {len(climate_counts)} unique")
print(f"Land cover types: {len(lc_counts)} unique")
print(f"Average site area: {fluxnet_df['Area_km2'].mean():.2f} km²")

## 6. Select Sites for Processing

In [None]:
# Select sites based on criteria or randomly
# For this tutorial, we'll select diverse climate types: the first site listed for
# each Köppen-Geiger class, in order of first appearance, capped at max_sites.

# Sites with a missing climate class are left out. A NaN class never compares equal
# to itself, so it cannot be matched back to a row.
sites_with_climate = fluxnet_df.dropna(subset=['KG'])

# Get unique climate types
climate_types = sites_with_climate['KG'].unique()

# Select one site from each climate type (up to max_sites).
# drop_duplicates keeps the first row per class and preserves both the original row
# index and the column dtypes; head() also copes with an empty selection.
selected_df = (
    sites_with_climate
    .drop_duplicates(subset='KG', keep='first')
    .head(experiment_config['max_sites'])
)

print(f"Selected {len(selected_df)} sites for processing:")
display(selected_df[['ID', 'Watershed_Name', 'KG', 'Dominant_LC', 'Area_km2']])

# Save selected sites
selected_df.to_csv(experiment_dir / 'selected_sites.csv', index=False)

## 7. Generate Configuration Files

In [None]:
# Create config directory if it doesn't exist
config_dir = Path(experiment_config['config_dir'])
config_dir.mkdir(parents=True, exist_ok=True)

# Small program executed in a child interpreter. All site-specific values arrive via
# sys.argv instead of being pasted into the source text, so quotes or backslashes in a
# site name or path cannot break (or alter) the generated code.
#   argv[1]  -> directory containing run_towers_fluxnet.py
#   argv[2:] -> positional arguments for generate_config_file, all passed as strings
generate_snippet = (
    "import sys\n"
    "sys.path.append(sys.argv[1])\n"
    "from run_towers_fluxnet import generate_config_file\n"
    "generate_config_file(*sys.argv[2:])\n"
)

# NOTE(review): this directory is derived from confluence_path, while
# experiment_config['fluxnet_script'] points at '../CONFLUENCE/9_scripts' - confirm
# which of the two actually holds run_towers_fluxnet.py.
scripts_dir = f"{str(confluence_path)}/9_scripts"

# Generate configs for selected sites
generated_configs = []

for _, site in selected_df.iterrows():
    site_name = site['Watershed_Name']
    pour_point = site['POUR_POINT_COORDS']
    bounding_box = site['BOUNDING_BOX_COORDS']

    # Create config file name
    config_name = f"config_{site_name}.yaml"
    config_path = config_dir / config_name

    # Generate config using the script function.
    # sys.executable runs the helper with the same interpreter as this kernel.
    cmd = [
        sys.executable, '-c', generate_snippet,
        scripts_dir,
        str(experiment_config['template_config']),
        str(config_path),
        str(site_name),
        str(pour_point),
        str(bounding_box),
    ]

    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.returncode == 0:
        generated_configs.append(config_path)
        print(f"Generated config for {site_name}")
    else:
        print(f"Failed to generate config for {site_name}: {result.stderr}")

print(f"\nGenerated {len(generated_configs)} configuration files")

## 8. Launch CONFLUENCE for Selected Sites

In [None]:
# Launch CONFLUENCE runs
# sys.executable runs the script with the same interpreter (and packages) as this kernel
cmd = [
    sys.executable, experiment_config['fluxnet_script']
]

# For dry run, add appropriate option
if experiment_config['dry_run']:
    print("DRY RUN MODE - No jobs will be submitted")
    # Note: the fluxnet script asks interactively, so we'll need to handle that

print("Launching CONFLUENCE for FLUXNET sites...")
print(f"Command: {' '.join(cmd)}")

# Execute the script
# Note: The script requires user input, so for notebook usage we feed the answer on
# stdin: 'n' for a dry run, 'y' otherwise.
prompt_answer = 'n\n' if experiment_config['dry_run'] else 'y\n'
result = subprocess.run(cmd, input=prompt_answer, capture_output=True, text=True)

print("\nOutput:")
print(result.stdout[:1000] + "..." if len(result.stdout) > 1000 else result.stdout)

# Report failures instead of silently continuing with an empty log
if result.returncode != 0:
    print(f"\nWARNING: launch script exited with code {result.returncode}")
if result.stderr:
    print("\nMessages on stderr (last 1000 characters):")
    print(result.stderr[-1000:])

# Save submission log (stdout only, so job-ID parsing is not affected by stderr text)
with open(experiment_dir / 'submission.log', 'w') as f:
    f.write(result.stdout)

# Keep stderr in a separate file for troubleshooting
if result.stderr:
    with open(experiment_dir / 'submission.err', 'w') as f:
        f.write(result.stderr)

## 9. Monitor Job Status

In [None]:
# Check SLURM job status
def check_job_status(user=None):
    """Return the text printed by ``squeue -u <user>``.

    Args:
        user: Account name to query. When omitted, the ``USER`` environment
            variable is used, falling back to ``getpass.getuser()`` if it is unset.

    Returns:
        str: squeue's standard output. If ``squeue`` is not installed, or it exits
        with an error and prints nothing to stdout, a one-line explanatory message
        is returned instead so callers can still ``print`` the result.
    """
    import getpass

    # Never pass None into the argument list: subprocess would raise TypeError
    user = user or os.environ.get('USER') or getpass.getuser()
    cmd = ['squeue', '-u', user]
    try:
        result = subprocess.run(cmd, capture_output=True, text=True)
    except FileNotFoundError:
        return "squeue not found - SLURM does not appear to be available on this machine"

    if result.returncode != 0 and not result.stdout:
        return f"squeue failed (exit code {result.returncode}): {result.stderr.strip()}"
    return result.stdout

print("Current jobs:")
print(check_job_status())

# Extract job IDs from submission log
import re

# Every "job ID: <digits>" occurrence in the saved launcher output is one submitted job
job_pattern = r'job ID: (\d+)'
submission_text = (experiment_dir / 'submission.log').read_text()
job_ids = re.findall(job_pattern, submission_text)

if not job_ids:
    print("\nNo job IDs found in submission log")
else:
    print(f"\nSubmitted job IDs: {', '.join(job_ids)}")

## 10. Find Completed Simulations

In [None]:
# Find completed FLUXNET simulations
# The data root can be overridden with the CONFLUENCE_DATA_DIR environment variable;
# the previous hard-coded location remains the default.
confluence_data_dir = Path(
    os.environ.get("CONFLUENCE_DATA_DIR", "/work/comphyd_lab/data/CONFLUENCE_data")
)
fluxnet_dir = confluence_data_dir / "fluxnet"

domain_prefix = "domain_"

completed = []
if fluxnet_dir.exists():
    # Sorted so completed[0] / completed[:3] pick the same sites on every run
    for domain_dir in sorted(fluxnet_dir.glob(f"{domain_prefix}*")):
        # Strip only the leading prefix; a site name may itself contain "domain_"
        site_name = domain_dir.name[len(domain_prefix):]
        sim_dir = domain_dir / "simulations"

        # Check if simulation files exist (any() stops at the first NetCDF file found)
        if sim_dir.exists() and any(sim_dir.rglob("*.nc")):
            completed.append({
                'site_name': site_name,
                'domain_dir': domain_dir,
                'sim_dir': sim_dir
            })
else:
    print(f"FLUXNET data directory not found: {fluxnet_dir}")

print(f"Completed simulations: {len(completed)}")
for site in completed:
    print(f"  - {site['site_name']}")

## 11. Load and Analyze Results

In [None]:
# Function to load SUMMA output
def load_summa_output(sim_dir, variable='scalarLatHeatTotal'):
    """Load one variable from a simulation's timestep NetCDF output.

    Searches ``sim_dir`` recursively for files matching ``*timestep*.nc`` and reads
    the first match in sorted path order, so the choice is reproducible.

    Args:
        sim_dir: Directory to search (``str`` or ``pathlib.Path``).
        variable: Name of the NetCDF variable to extract.

    Returns:
        pandas.DataFrame with columns ``time`` and ``value``, or ``None`` when no
        matching file exists or the file does not contain ``variable``.

    Note:
        Values are flattened to one dimension. This assumes the variable holds a
        single value per time step (point-scale run) - TODO confirm for outputs
        with more than one spatial unit.
    """
    output_files = sorted(Path(sim_dir).rglob("*timestep*.nc"))
    if not output_files:
        return None

    # Deferred import: xarray is only needed once there is a file to read
    import xarray as xr

    # The context manager closes the file handle; .values copies the data into
    # memory before the dataset is closed.
    with xr.open_dataset(output_files[0]) as ds:
        if variable not in ds.variables:
            return None
        return pd.DataFrame({
            'time': pd.to_datetime(ds.time.values),
            'value': ds[variable].values.flatten()
        })

# Daily-mean latent heat flux for up to three completed sites on one set of axes
if completed:
    fig, ax = plt.subplots(figsize=(12, 6))

    for site in completed[:3]:  # Plot first 3
        data = load_summa_output(site['sim_dir'], 'scalarLatHeatTotal')
        if data is None:
            # Nothing to draw for this site
            continue

        # Aggregate the model time steps to daily means before plotting
        daily = data.set_index('time').resample('D').mean()
        ax.plot(
            daily.index,
            daily['value'],
            label=f"{site['site_name']}",
            linewidth=2,
            alpha=0.7,
        )

    ax.set(
        xlabel='Date',
        ylabel='Latent Heat Flux (W/m²)',
        title='Latent Heat Flux Comparison - FLUXNET Sites',
    )
    ax.legend()
    ax.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

## 12. Compare Model with FLUXNET Observations

In [None]:
# Function to load FLUXNET observations (placeholder)
def load_fluxnet_obs(site_name, variable='LE', seed=42):
    """Return synthetic daily "observations" for the year 2020 (demo placeholder).

    This would load actual FLUXNET observations; for the demo it builds a sinusoidal
    seasonal cycle plus Gaussian noise.

    Args:
        site_name: Site identifier. Currently unused by the placeholder.
        variable: Observation variable name. Currently unused by the placeholder.
        seed: Seed for the noise generator. The default makes repeated calls return
            identical data so the notebook is reproducible; pass ``None`` for fresh
            noise on every call.

    Returns:
        pandas.DataFrame with columns ``time`` (daily timestamps from 2020-01-01 to
        2020-12-31) and ``obs`` (synthetic values).
    """
    rng = np.random.default_rng(seed)
    dates = pd.date_range('2020-01-01', '2020-12-31', freq='D')

    # Seasonal cycle: mean 50, amplitude 30, period 365 days
    seasonal = 50 + 30 * np.sin(2 * np.pi * dates.dayofyear.to_numpy() / 365)
    # Gaussian noise with standard deviation 10
    noise = rng.normal(0, 10, len(dates))

    return pd.DataFrame({
        'time': dates,
        'obs': seasonal + noise
    })

# Compare model with observations for one site
if completed:
    site = completed[0]

    # Load model output
    model_data = load_summa_output(site['sim_dir'], 'scalarLatHeatTotal')

    # Load observations (simulated)
    obs_data = load_fluxnet_obs(site['site_name'])

    if model_data is None:
        print(f"No model output available for {site['site_name']}")
    else:
        # Resample to daily means
        model_daily = model_data.set_index('time').resample('D').mean()
        obs_daily = obs_data.set_index('time').resample('D').mean()

        # Merge datasets on shared dates and drop days where either side is missing;
        # the metric functions below do not accept NaN values.
        comparison = pd.merge(
            model_daily, obs_daily, left_index=True, right_index=True, how='inner'
        ).dropna(subset=['obs', 'value'])

        if len(comparison) < 2:
            # Without overlapping days there is nothing to plot or score
            print(f"Not enough overlapping model/observation days for {site['site_name']} "
                  f"({len(comparison)} found)")
        else:
            # Plot comparison. gridspec_kw is used for the panel heights because the
            # direct height_ratios keyword needs Matplotlib 3.6 or newer.
            fig, (ax1, ax2) = plt.subplots(
                2, 1, figsize=(12, 8), gridspec_kw={'height_ratios': [2, 1]}
            )

            # Time series
            ax1.plot(comparison.index, comparison['obs'], label='Observed', color='black', linewidth=2)
            ax1.plot(comparison.index, comparison['value'], label='Modeled', color='red', alpha=0.7)
            ax1.set_ylabel('Latent Heat Flux (W/m²)')
            ax1.set_title(f'Model vs Observations - {site["site_name"]}')
            ax1.legend()
            ax1.grid(True, alpha=0.3)

            # Scatter plot with a 1:1 reference line. The line starts at 0, or at the
            # smallest value when fluxes go negative, so it spans all of the points.
            paired = comparison[['obs', 'value']]
            line_min = min(0, paired.min().min())
            line_max = paired.max().max()
            ax2.scatter(comparison['obs'], comparison['value'], alpha=0.5)
            ax2.plot([line_min, line_max], [line_min, line_max], 'k--', alpha=0.5)
            ax2.set_xlabel('Observed (W/m²)')
            ax2.set_ylabel('Modeled (W/m²)')
            ax2.grid(True, alpha=0.3)

            plt.tight_layout()
            plt.show()

            # Calculate metrics
            from sklearn.metrics import r2_score, mean_squared_error
            r2 = r2_score(comparison['obs'], comparison['value'])
            rmse = np.sqrt(mean_squared_error(comparison['obs'], comparison['value']))

            print(f"Performance Metrics for {site['site_name']}:")
            print(f"  R²: {r2:.3f}")
            print(f"  RMSE: {rmse:.1f} W/m²")

## 13. Cross-Site Analysis

In [None]:
# Analyze model performance across different climate types
performance_data = []

# Seeded generator so the placeholder metrics (and the plots) repeat on every run
placeholder_rng = np.random.default_rng(42)

for site in completed:
    # Get site info from original dataframe
    site_matches = fluxnet_df[fluxnet_df['Watershed_Name'] == site['site_name']]
    if site_matches.empty:
        # Domains found on disk are not guaranteed to be listed in the site table
        print(f"Skipping {site['site_name']}: not found in the FLUXNET site table")
        continue
    site_info = site_matches.iloc[0]

    # Placeholder performance metrics
    # In reality, you would calculate these from actual model-obs comparison
    performance_data.append({
        'site': site['site_name'],
        'climate': site_info['KG'],
        'landcover': site_info['Dominant_LC'],
        'r2': placeholder_rng.uniform(0.6, 0.9),  # Placeholder
        'rmse': placeholder_rng.uniform(20, 60)    # Placeholder
    })

performance_df = pd.DataFrame(performance_data)

# Plot performance by climate type
if len(performance_df) > 0:
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

    # R² by climate
    climate_means = performance_df.groupby('climate')['r2'].mean()
    climate_means.plot(kind='bar', ax=ax1, color='skyblue', edgecolor='black')
    ax1.set_title('Model Performance (R²) by Climate Type', fontsize=14)
    ax1.set_ylabel('R²', fontsize=12)
    ax1.grid(axis='y', alpha=0.3)

    # RMSE by land cover
    lc_means = performance_df.groupby('landcover')['rmse'].mean()
    lc_means.plot(kind='bar', ax=ax2, color='lightcoral', edgecolor='black')
    ax2.set_title('Model Error (RMSE) by Land Cover', fontsize=14)
    ax2.set_ylabel('RMSE (W/m²)', fontsize=12)
    ax2.grid(axis='y', alpha=0.3)

    plt.tight_layout()
    plt.show()

## 14. Summary Report