# CMIP Data Exploration

This notebook explores the CMIP climate data files and demonstrates how to work with them using our processing tools.

In [None]:
import os
import sys
import numpy as np
import pandas as pd
import xarray as xr
import matplotlib.pyplot as plt

# Add the scripts directory to the path so we can import our modules
sys.path.append('../scripts')
from cmip_processor import CMIPProcessor, AggregationMethod

## Load and Examine the CMIP Data

First, let's load one of the CMIP datasets and examine its structure.

In [None]:
# Default location of the CMIP data file, used when no project configuration
# is available.
data_path = "./data/macav2metdata_huss_CCSM4_r6i1p1_rcp45_2021_2025_CONUS_monthly.nc"

# Optionally rebuild the path from the project-level `config` module, which is
# looked up one directory above this notebook.
try:
    # `__file__` does not exist inside a notebook kernel, so fall back to the
    # working directory there (assumes the kernel was started in the notebook's
    # own directory - the Jupyter default). `__file__` is still honoured when
    # this code runs as a plain script.
    try:
        notebook_dir = os.path.dirname(os.path.abspath(__file__))
    except NameError:
        notebook_dir = os.getcwd()
    project_root = os.path.dirname(notebook_dir)

    # Guard against piling up duplicate entries when the cell is re-run
    if project_root not in sys.path:
        sys.path.append(project_root)

    import config

    # Override the default only when both settings are present
    if hasattr(config, 'CMIP_CONFIG') and hasattr(config, 'DATA_DIR'):
        # `or` keeps the defaults when a key is missing *or* configured empty,
        # so the [0] lookups below can never raise IndexError
        model = (config.CMIP_CONFIG.get("models") or ["CCSM4"])[0]
        variable = (config.CMIP_CONFIG.get("variables") or ["huss"])[0]
        scenario = (config.CMIP_CONFIG.get("scenarios") or ["rcp45"])[0]
        years = config.CMIP_CONFIG.get("years", "2021_2025")
        region = config.CMIP_CONFIG.get("region", "CONUS")
        frequency = config.CMIP_CONFIG.get("frequency", "monthly")
        file_name = f"macav2metdata_{variable}_{model}_r6i1p1_{scenario}_{years}_{region}_{frequency}.nc"
        data_path = os.path.join(config.DATA_DIR, file_name)
except ImportError:
    print("Configuration module not found. Using default path.")

# Make sure the selected data file exists; otherwise fall back to the first
# available alternative, and fail loudly if none can be found.
import glob

if not os.path.exists(data_path):
    # Candidate replacements in order of preference. An entry may contain a
    # glob wildcard, in which case the first match (sorted by name) is used.
    alt_paths = [
        "./data/macav2metdata_huss_CCSM4_r6i1p1_rcp85_2021_2025_CONUS_monthly.nc",
        "./data/macav2metdata_pr_CCSM4_r6i1p1_rcp45_2021_2025_CONUS_monthly.nc",
        "./data/macav2metdata_tas*_CCSM4_r6i1p1_rcp45_2021_2025_CONUS_monthly.nc"
    ]

    found_alt = False
    for alt_path in alt_paths:
        if "*" in alt_path:
            # glob returns matches in arbitrary order; sort them so the same
            # file is picked on every run
            matches = sorted(glob.glob(alt_path))
        elif os.path.exists(alt_path):
            matches = [alt_path]
        else:
            matches = []

        if matches:
            data_path = matches[0]
            found_alt = True
            print(f"Using alternative data file: {data_path}")
            break

    if not found_alt:
        raise FileNotFoundError(
            f"Required CMIP data file not found: {data_path} or alternatives. "
            "Please ensure you have at least one CMIP data file available in the data directory."
        )

# Build the CMIPProcessor for the selected file. If construction fails with a
# ValueError that mentions time-unit decoding or the calendar, the dataset is
# opened directly with xarray and attached to a processor instead. Any other
# ValueError propagates unchanged.
try:
    # Normal path: let the processor handle the file itself
    processor = CMIPProcessor(data_path)
except ValueError as e:
    # Only these two message fragments are treated as calendar problems
    if "unable to decode time units" in str(e) or "calendar" in str(e):
        print("Warning: Non-standard calendar detected in the data file.")
        print("Attempting to fix by manually loading the dataset...")

        # Open the file here instead of through CMIPProcessor. Nothing on the
        # class is overridden; the dataset is simply assigned further down.
        # (xarray is already imported in the first cell; this re-import is a no-op.)
        import xarray as xr

        try:
            # cftime is imported only to find out whether it is installed
            import cftime
            print("Using cftime for non-standard calendar.")
            dataset = xr.open_dataset(data_path, use_cftime=True)
        except ImportError:
            # Without cftime the time coordinate is left undecoded
            print("cftime not available. Loading without decoding times.")
            dataset = xr.open_dataset(data_path, decode_times=False)

        # Create a processor and replace its dataset with the one loaded above.
        # NOTE(review): this is the same constructor call that raised the
        # ValueError handled here, so it will presumably raise again before the
        # assignment on the next line is reached - confirm against
        # CMIPProcessor.__init__ and bypass its loading step if so.
        processor = CMIPProcessor(data_path)
        processor.dataset = dataset
    else:
        # Re-raise if it's a different ValueError
        raise

# Show the dataset summary (dimensions, coordinates, variables, attributes)
print("Dataset Information:")
print(processor.dataset)

## Explore the Variables

Let's look at the variables in the dataset and check their basic statistics.

In [None]:
# Summarise every data variable: its structure, attributes and basic
# statistics. Statistics ignore NaN cells so that gridded fields with missing
# cells still produce meaningful numbers.
for var_name in processor.dataset.data_vars:
    var = processor.dataset[var_name]
    print(f"Variable: {var_name}")
    print(f"  Dimensions: {var.dims}")
    print(f"  Shape: {var.shape}")
    print(f"  Attributes: {var.attrs}")
    print(f"  Data Statistics:")

    # Materialise the array once instead of once per statistic
    var_values = var.values
    if var_values.size == 0 or not np.issubdtype(var_values.dtype, np.number):
        # Empty or non-numeric variables (e.g. text metadata) have no
        # min/max/mean; skip them rather than abort the whole loop
        print("    (no numeric data to summarise)")
    else:
        print(f"    Min: {np.nanmin(var_values)}")
        print(f"    Max: {np.nanmax(var_values)}")
        print(f"    Mean: {np.nanmean(var_values)}")
        print(f"    Standard Deviation: {np.nanstd(var_values)}")
    print("")

## Get Data Resolution Information

Let's determine the native resolution of the data.

In [None]:
# Native grid spacing, unpacked as a (latitude, longitude) pair
spatial_resolution = processor.get_spatial_resolution()
lat_res, lon_res = spatial_resolution
print(f"Spatial Resolution: {lat_res}° latitude × {lon_res}° longitude")

# Native time step as reported by the processor
time_res = processor.get_temporal_resolution()
print(f"Temporal Resolution: {time_res}")

## Visualize the Data

Let's create some visualizations of the data for a specific time point.

In [None]:
# Draw the first data variable at the first time step. If that fails, retry
# once with a plain title before giving up.
var_names = list(processor.dataset.data_vars)
if var_names:
    var_name = var_names[0]
    var = processor.dataset[var_name]

    try:
        # A usable time axis must exist and contain at least one step
        has_time_steps = 'time' in processor.dataset.dims and len(processor.dataset.time) > 0
        if not has_time_steps:
            print("Dataset does not have a time dimension or it is empty.")
        else:
            plt.figure(figsize=(12, 8))
            first_step = var.isel(time=0)
            first_step.plot()
            plt.title(f"{var_name} at {processor.dataset.time.values[0]}")
            plt.tight_layout()
            plt.show()
    except Exception as plot_error:
        print(f"Error plotting data: {plot_error}")

        # Second attempt: slice on time only if the variable itself has it,
        # and label the figure with the variable name alone
        try:
            plt.figure(figsize=(12, 8))
            fallback_view = var.isel(time=0) if 'time' in var.dims else var
            fallback_view.plot()
            plt.title(f"{var_name}")
            plt.tight_layout()
            plt.show()
        except Exception as fallback_error:
            print(f"Alternative plotting also failed: {fallback_error}")
            print("Cannot create visualization.")
else:
    print("No variables found in the dataset for plotting.")

## Create a Time Series for a Specific Location

Let's extract and plot a time series for a specific location.

In [None]:
# Plot the first data variable through time at the grid cell in the middle of
# the domain. The dataset needs 'lat', 'lon' and 'time' dimensions and more
# than one time step; otherwise a message explains why nothing is plotted.
if 'lat' not in processor.dataset.dims or 'lon' not in processor.dataset.dims:
    print("Dataset does not have lat and/or lon dimensions required for location-based time series.")
elif 'time' not in processor.dataset.dims:
    print("Dataset does not have a time dimension required for time series analysis.")
elif len(processor.dataset.time) <= 1:
    print("Dataset has only one time point. Cannot create time series.")
else:
    try:
        # Find the variable for time series plotting
        var_names = list(processor.dataset.data_vars)
        if not var_names:
            print("No variables found in the dataset for time series plotting.")
        else:
            var_name = var_names[0]
            var = processor.dataset[var_name]

            # Use the central grid cell as the example location
            lat_idx = len(processor.dataset.lat) // 2  # Middle latitude index
            lon_idx = len(processor.dataset.lon) // 2  # Middle longitude index

            # Coordinate values of that cell, used in the plot title
            lat_val = float(processor.dataset.lat[lat_idx])
            lon_val = float(processor.dataset.lon[lon_idx])

            # Extract time series for this location
            time_series = var.isel(lat=lat_idx, lon=lon_idx)

            # Plot time series
            plt.figure(figsize=(12, 6))
            time_series.plot()
            plt.title(f"{var_name} Time Series at Lat: {lat_val:.2f}, Lon: {lon_val:.2f}")
            plt.xlabel('Time')
            plt.ylabel(f"{var_name} ({time_series.attrs.get('units', '')})")
            plt.grid(True)
            plt.tight_layout()
            plt.show()
    except Exception as e:
        print(f"Error creating time series plot: {e}")

        # Retry with the spatial dimensions located by name rather than
        # assuming they are called exactly 'lat' and 'lon'
        try:
            var_name = list(processor.dataset.data_vars)[0]
            lat_dim = next((d for d in processor.dataset.dims if 'lat' in d.lower()), None)
            lon_dim = next((d for d in processor.dataset.dims if 'lon' in d.lower()), None)

            if lat_dim and lon_dim:
                lat_idx = len(processor.dataset[lat_dim]) // 2
                lon_idx = len(processor.dataset[lon_dim]) // 2

                # Keep every time step, pick the central cell
                selection = {'time': slice(None)}
                selection[lat_dim] = lat_idx
                selection[lon_dim] = lon_idx

                # Extract time series
                time_series = processor.dataset[var_name].isel(**selection)

                # Plot time series
                plt.figure(figsize=(12, 6))
                time_series.plot()
                plt.title(f"{var_name} Time Series at center point")
                plt.xlabel('Time')
                plt.ylabel(f"{var_name}")
                plt.grid(True)
                plt.tight_layout()
                plt.show()
            else:
                print("Could not find latitude and longitude dimensions.")
        except Exception as nested_e:
            # The original cell had no handler here, which made the whole cell
            # a SyntaxError; report the failure like the other plotting cells
            print(f"Alternative plotting also failed: {nested_e}")

## Apply Spatial Bucketing

Now let's demonstrate spatial bucketing: aggregating the data onto a coarser grid, here with buckets twice the native latitude/longitude spacing.

In [None]:
# Coarsen the grid to twice its native spacing, then map the first time step.
# If the processor's bucketing fails, fall back to simple subsampling
# (keeping every n-th grid cell).
try:
    lat_res, lon_res = processor.get_spatial_resolution()
    lat_bucket_size = lat_res * 2  # Double the native resolution
    lon_bucket_size = lon_res * 2  # Double the native resolution

    try:
        # Preferred route: aggregate with the processor
        spatially_bucketed = processor.bucket_spatial(
            lat_bucket_size=lat_bucket_size,
            lon_bucket_size=lon_bucket_size,
            agg_method=AggregationMethod.MEAN
        )
    except Exception as bucket_error:
        print(f"Warning: Standard spatial bucketing failed: {bucket_error}")
        print("Using simplified approach with striding...")

        # Number of native cells per bucket along each axis (at least one)
        lat_stride = max(1, int(lat_bucket_size / lat_res))
        lon_stride = max(1, int(lon_bucket_size / lon_res))

        print(f"Using strides: lat={lat_stride}, lon={lon_stride}")

        # Locate the spatial dimensions by name, defaulting to 'lat'/'lon'
        lat_dim = next((d for d in processor.dataset.dims if 'lat' in d.lower()), 'lat')
        lon_dim = next((d for d in processor.dataset.dims if 'lon' in d.lower()), 'lon')

        # Keep every n-th cell along both axes
        selection = {
            lat_dim: slice(0, None, lat_stride),
            lon_dim: slice(0, None, lon_stride),
        }
        spatially_bucketed = processor.dataset.isel(**selection)

        print(f"Reduced resolution from {processor.dataset.dims} to {spatially_bucketed.dims}")

    # Compare dimension sizes before and after
    print(f"Original spatial dimensions: {processor.dataset.dims}")
    print(f"Bucketed spatial dimensions: {spatially_bucketed.dims}")

    # Map the first variable of the bucketed dataset
    var_names = list(spatially_bucketed.data_vars)
    if not var_names:
        print("No variables found in the bucketed dataset for visualization.")
    else:
        var_name = var_names[0] # Make sure we have the right var name
        try:
            plt.figure(figsize=(12, 8))
            # Slice on time only when a non-empty time axis is present
            use_first_step = 'time' in spatially_bucketed.dims and len(spatially_bucketed.time) > 0
            bucketed_field = spatially_bucketed[var_name]
            if use_first_step:
                bucketed_field = bucketed_field.isel(time=0)
            bucketed_field.plot()
            plt.title(f"Spatially Bucketed {var_name} (Lat: {lat_bucket_size}°, Lon: {lon_bucket_size}°)")
            plt.tight_layout()
            plt.show()
        except Exception as plot_error:
            print(f"Error visualizing bucketed data: {plot_error}")
except Exception as outer_error:
    print(f"Error in spatial bucketing process: {outer_error}")
    print("Cannot perform spatial bucketing on this dataset.")

## Apply Temporal Bucketing

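Let's also demonstrate temporal bucketing: aggregating the time steps into quarterly (3-month) means and comparing them with the original series at one location.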

In [None]:
# Aggregate the data into quarterly buckets and compare the original and
# bucketed series at the central grid cell. Falls back to keeping every third
# time step when the processor's bucketing fails.
if 'time' not in processor.dataset.dims:
    print("Dataset does not have a time dimension. Cannot perform temporal bucketing.")
elif len(processor.dataset.time) <= 1:
    print("Dataset has only one time point. Temporal bucketing not meaningful.")
else:
    try:
        time_bucket_size = "3M"  # Quarterly

        try:
            # Preferred route: aggregate with the processor
            temporally_bucketed = processor.bucket_temporal(
                bucket_size=time_bucket_size,
                agg_method=AggregationMethod.MEAN
            )
        except Exception as bucket_error:
            print(f"Warning: Standard temporal bucketing failed: {bucket_error}")
            print("Using simplified approach with striding...")

            # For quarterly bucketing, take every 3rd time step
            stride = 3
            temporally_bucketed = processor.dataset.isel(time=slice(0, None, stride))

            print(f"Reduced time resolution from {len(processor.dataset.time)} to {len(temporally_bucketed.time)}")

        # Compare the number of time steps before and after
        print(f"Original time dimension: {len(processor.dataset.time)} time points")
        print(f"Bucketed time dimension: {len(temporally_bucketed.time)} time points")

        # Plot the first variable, if there is one
        var_names = list(processor.dataset.data_vars)
        if not var_names:
            print("No variables found in the dataset for visualization.")
        else:
            var_name = var_names[0]

            try:
                has_lat_lon = 'lat' in processor.dataset.dims and 'lon' in processor.dataset.dims
                if not has_lat_lon:
                    print("Dataset missing lat/lon dimensions. Cannot create location-based time series.")
                else:
                    # Central grid cell as the comparison location
                    lat_idx = len(processor.dataset.lat) // 2  # Middle latitude index
                    lon_idx = len(processor.dataset.lon) // 2  # Middle longitude index

                    # Same cell from the original and the bucketed data
                    time_series = processor.dataset[var_name].isel(lat=lat_idx, lon=lon_idx)
                    time_series_bucketed = temporally_bucketed[var_name].isel(lat=lat_idx, lon=lon_idx)

                    # Overlay both series
                    plt.figure(figsize=(12, 6))
                    time_series.plot(label="Original")
                    time_series_bucketed.plot(marker='o', linestyle='-', label="Bucketed (Quarterly)")
                    plt.title(f"{var_name} Time Series - Original vs Temporally Bucketed")
                    plt.xlabel('Time')
                    plt.ylabel(f"{var_name} ({time_series.attrs.get('units', '')})")
                    plt.grid(True)
                    plt.legend()
                    plt.tight_layout()
                    plt.show()
            except Exception as plot_error:
                print(f"Error plotting time series comparison: {plot_error}")

                # Retry with the spatial dimensions located by name
                try:
                    lat_dim = next((d for d in processor.dataset.dims if 'lat' in d.lower()), None)
                    lon_dim = next((d for d in processor.dataset.dims if 'lon' in d.lower()), None)

                    if not (lat_dim and lon_dim):
                        print("Could not find suitable spatial dimensions for time series.")
                    else:
                        lat_idx = len(processor.dataset[lat_dim]) // 2
                        lon_idx = len(processor.dataset[lon_dim]) // 2

                        # Index of the central cell along both spatial axes
                        selection = {lat_dim: lat_idx, lon_dim: lon_idx}

                        # Same cell from the original and the bucketed data
                        time_series = processor.dataset[var_name].isel(**selection)
                        time_series_bucketed = temporally_bucketed[var_name].isel(**selection)

                        # Overlay both series
                        plt.figure(figsize=(12, 6))
                        time_series.plot(label="Original")
                        time_series_bucketed.plot(marker='o', linestyle='-', label="Bucketed (Quarterly)")
                        plt.title(f"{var_name} Time Series - Original vs Temporally Bucketed")
                        plt.xlabel('Time')
                        plt.ylabel(f"{var_name}")
                        plt.grid(True)
                        plt.legend()
                        plt.tight_layout()
                        plt.show()
                except Exception as fallback_error:
                    print(f"Alternative plotting also failed: {fallback_error}")
    except Exception as outer_error:
        print(f"Error in temporal bucketing process: {outer_error}")
        print("Cannot perform temporal bucketing on this dataset.")

## Create a Full Datacube

Finally, let's create a complete datacube with both spatial and temporal bucketing.

In [None]:
# Build and save a datacube that is bucketed both spatially and temporally.
# Bucket sizes defined by earlier cells are reused; any that are missing get
# defaults. If the processor cannot build the cube, a subsampled
# ("simplified") cube is written instead.
required_vars = ['lat_bucket_size', 'lon_bucket_size', 'time_bucket_size']

# Look the names up in the notebook (module) namespace. Calling locals()
# inside a list comprehension inspects the comprehension's own scope on
# Python < 3.12, which made every name look missing.
notebook_namespace = globals()
missing_vars = [name for name in required_vars if name not in notebook_namespace]

if 'lat_bucket_size' in missing_vars or 'lon_bucket_size' in missing_vars:
    try:
        # Default: twice the native grid spacing
        lat_res, lon_res = processor.get_spatial_resolution()
        lat_bucket_size = lat_res * 2
        lon_bucket_size = lon_res * 2
        print(f"Using default spatial bucket sizes: lat={lat_bucket_size}°, lon={lon_bucket_size}°")
    except Exception as e:
        # Resolution unavailable: fall back to fixed half-degree buckets
        print(f"Error getting spatial resolution: {e}")
        lat_bucket_size = 0.5
        lon_bucket_size = 0.5
        print(f"Using fallback spatial bucket sizes: lat={lat_bucket_size}°, lon={lon_bucket_size}°")

if 'time_bucket_size' in missing_vars:
    time_bucket_size = "3M"  # Quarterly
    print(f"Using default time bucket size: {time_bucket_size}")

try:
    # Process to datacube with both spatial and temporal bucketing
    datacube = processor.process_to_datacube(
        lat_bucket_size=lat_bucket_size,
        lon_bucket_size=lon_bucket_size,
        time_bucket_size=time_bucket_size,
        spatial_agg_method=AggregationMethod.MEAN,
        temporal_agg_method=AggregationMethod.MEAN
    )

    # Print information about the datacube
    print("Datacube Information:")
    print(datacube)

    # Create output directory if it doesn't exist
    output_path = "./data/processed/processed_cmip_datacube.nc"
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    # Save the datacube
    processor.save_datacube(datacube, output_path)
    print(f"Saved datacube to {output_path}")
except Exception as e:
    print(f"Error creating full datacube: {e}")
    print("Attempting simplified approach...")

    try:
        # Locate the spatial dimensions by name, defaulting to 'lat'/'lon'
        lat_dim = next((d for d in processor.dataset.dims if 'lat' in d.lower()), 'lat')
        lon_dim = next((d for d in processor.dataset.dims if 'lon' in d.lower()), 'lon')

        # Number of native cells / time steps per bucket (at least one)
        lat_res, lon_res = processor.get_spatial_resolution()
        lat_stride = max(1, int(lat_bucket_size / lat_res))
        lon_stride = max(1, int(lon_bucket_size / lon_res))
        time_stride = 3 if time_bucket_size == "3M" else 1

        # Subsample every axis that exists
        selection = {}
        if 'time' in processor.dataset.dims:
            selection['time'] = slice(0, None, time_stride)
        selection[lat_dim] = slice(0, None, lat_stride)
        selection[lon_dim] = slice(0, None, lon_stride)

        # Create simplified datacube
        simplified_datacube = processor.dataset.isel(**selection)

        print("Created simplified datacube:")
        print(simplified_datacube)

        # Save under a different name so it is not mistaken for the aggregated cube
        output_path = "./data/processed/simplified_cmip_datacube.nc"
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        simplified_datacube.to_netcdf(output_path)
        print(f"Saved simplified datacube to {output_path}")
    except Exception as nested_e:
        print(f"Simplified approach also failed: {nested_e}")
        print("Could not create datacube.")

## Compare Different Aggregation Methods

Let's compare how different spatial aggregation methods (mean, median, max and min) affect the bucketed data at the first time point.

In [None]:
# One panel per spatial aggregation method, all at the original time resolution.
# Relies on lat_bucket_size, lon_bucket_size and var_name from the cells above.
agg_methods = [
    AggregationMethod.MEAN,
    AggregationMethod.MEDIAN,
    AggregationMethod.MAX,
    AggregationMethod.MIN
]

fig, axes = plt.subplots(2, 2, figsize=(16, 12))
axes = axes.flatten()

for panel, method in zip(axes, agg_methods):
    # Spatial bucketing only; time_bucket_size=None keeps every time step
    datacube = processor.process_to_datacube(
        lat_bucket_size=lat_bucket_size,
        lon_bucket_size=lon_bucket_size,
        time_bucket_size=None,  # Keep original time resolution for comparison
        spatial_agg_method=method
    )

    # Map the first time step into this method's panel
    first_step = datacube[var_name].isel(time=0)
    first_step.plot(ax=panel)
    panel.set_title(f"{method.value.capitalize()} Aggregation")

plt.tight_layout()
# y > 1 places the overall title above the panel titles
plt.suptitle(f"Comparison of Spatial Aggregation Methods for {var_name}", y=1.02, fontsize=16)
plt.show()

## Conclusion

In this notebook, we've explored the CMIP climate data and demonstrated how to:

1. Load and examine the dataset structure
2. Determine native spatial and temporal resolutions
3. Visualize the data spatially and temporally
4. Apply spatial and temporal bucketing with different aggregation methods
5. Create and save a complete datacube

The CMIPProcessor class provides a powerful and flexible way to work with climate data and prepare it for inclusion in larger datacube systems.