# CMIP Data Exploration

This notebook explores the CMIP climate data files and demonstrates how to work with them using our processing tools.

In [None]:
import os
import sys
import numpy as np
import pandas as pd
import xarray as xr
import matplotlib.pyplot as plt

# Add the scripts directory to the path so we can import our modules
sys.path.append('../scripts')
from cmip_processor import CMIPProcessor, AggregationMethod

## Load and Examine the CMIP Data

First, let's load one of the CMIP datasets and examine its structure.

In [ ]:
# Path to the CMIP data file (relative to the directory the notebook is run from)
data_path = "./data/macav2metdata_huss_CCSM4_r6i1p1_rcp45_2021_2025_CONUS_monthly.nc"

# Fail fast with an actionable message if the notebook was started from the wrong directory.
# `os` and `xr` come from the import cell at the top of the notebook.
if not os.path.exists(data_path):
    raise FileNotFoundError(f"Required CMIP data file not found: {data_path}. Please ensure you're running this notebook from the datacube directory.")

try:
    # Normal path: let the processor open the file itself
    processor = CMIPProcessor(data_path)
except ValueError as e:
    error_text = str(e)
    if "unable to decode time units" not in error_text and "calendar" not in error_text:
        # Not a time-decoding problem: surface the original error unchanged
        raise

    print("Warning: Non-standard calendar detected in the data file.")
    print("Attempting to fix by manually loading the dataset...")

    try:
        # cftime is optional; importing it here only checks that it is installed
        import cftime  # noqa: F401
        print("Using cftime for non-standard calendar.")
        dataset = xr.open_dataset(data_path, use_cftime=True)
    except ImportError:
        print("cftime not available. Loading without decoding times.")
        dataset = xr.open_dataset(data_path, decode_times=False)

    # Calling CMIPProcessor(data_path) again would raise the very ValueError we just
    # caught, so create the instance without running __init__ and attach the manually
    # loaded dataset.
    # NOTE(review): any other attributes __init__ normally sets are NOT populated here;
    # confirm against cmip_processor.py that the methods used below only need `.dataset`.
    processor = CMIPProcessor.__new__(CMIPProcessor)
    processor.dataset = dataset

# Print dataset information
print("Dataset Information:")
print(processor.dataset)

## Explore the Variables

Let's look at the variables in the dataset and check their basic statistics.

In [None]:
# Summarize every data variable: structure, metadata, and basic statistics
for var_name in processor.dataset.data_vars:
    var = processor.dataset[var_name]
    # Materialize the array once instead of once per statistic
    values = var.values
    print(f"Variable: {var_name}")
    print(f"  Dimensions: {var.dims}")
    print(f"  Shape: {var.shape}")
    print(f"  Attributes: {var.attrs}")
    print("  Data Statistics:")
    # NaN-aware reductions: if any grid cell is NaN (e.g. masked / fill-value cells),
    # the plain .min()/.max()/.mean()/.std() would all report nan.
    print(f"    Min: {np.nanmin(values)}")
    print(f"    Max: {np.nanmax(values)}")
    print(f"    Mean: {np.nanmean(values)}")
    print(f"    Standard Deviation: {np.nanstd(values)}")
    print("")

## Get Data Resolution Information

Let's determine the native resolution of the data.

In [None]:
# Native grid spacing; the processor returns it as a (latitude, longitude) pair
spatial_res = processor.get_spatial_resolution()
lat_res, lon_res = spatial_res
print(f"Spatial Resolution: {lat_res}° latitude × {lon_res}° longitude")

# Native spacing along the time axis, as reported by the processor
time_res = processor.get_temporal_resolution()
print(f"Temporal Resolution: {time_res}")

## Visualize the Data

Let's plot a map of the first variable in the dataset at its first time step.

In [None]:
# The first data variable in the dataset is the one we map
var_name = list(processor.dataset.data_vars)[0]
var = processor.dataset[var_name]

# Draw the first time step on an explicitly created figure/axes pair
first_timestamp = processor.dataset.time.values[0]
fig, ax = plt.subplots(figsize=(12, 8))
var.isel(time=0).plot(ax=ax)
ax.set_title(f"{var_name} at {first_timestamp}")
fig.tight_layout()
plt.show()

## Create a Time Series for a Specific Location

Let's extract and plot a time series for a single grid cell — here, the cell at the middle index of the latitude and longitude axes.

In [None]:
# Use the grid cell at the middle index of each spatial axis as the example location
lat_idx = len(processor.dataset.lat) // 2
lon_idx = len(processor.dataset.lon) // 2

# Coordinate values of that cell (only needed for the plot title)
lat_val = float(processor.dataset.lat[lat_idx])
lon_val = float(processor.dataset.lon[lon_idx])

# All time steps of the selected variable at that single cell
time_series = processor.dataset[var_name].isel(lat=lat_idx, lon=lon_idx)

# Line plot on explicit axes; the unit label falls back to an empty string
series_units = time_series.attrs.get('units', '')
fig, ax = plt.subplots(figsize=(12, 6))
time_series.plot(ax=ax)
ax.set_title(f"{var_name} Time Series at Lat: {lat_val:.2f}, Lon: {lon_val:.2f}")
ax.set_xlabel('Time')
ax.set_ylabel(f"{var_name} ({series_units})")
ax.grid(True)
fig.tight_layout()
plt.show()

## Apply Spatial Bucketing

Now let's demonstrate spatially bucketing the data to a coarser resolution.

In [ ]:
# Target bucket size: twice the native grid spacing in each direction
lat_bucket_size = lat_res * 2
lon_bucket_size = lon_res * 2

try:
    # Preferred path: the processor's own spatial bucketing
    spatially_bucketed = processor.bucket_spatial(
        lat_bucket_size=lat_bucket_size,
        lon_bucket_size=lon_bucket_size,
        agg_method=AggregationMethod.MEAN
    )
except Exception as e:
    # Deliberate best-effort fallback so the walkthrough can continue; the failure is
    # reported rather than hidden.
    print(f"Warning: Standard spatial bucketing failed: {e}")
    print("Falling back to xarray block averaging (coarsen + mean)...")

    # Native cells per bucket along each axis. round() instead of int() so a ratio
    # such as 1.9999999 is not truncated to 1.
    lat_factor = max(1, int(round(lat_bucket_size / lat_res)))
    lon_factor = max(1, int(round(lon_bucket_size / lon_res)))

    print(f"Using coarsening factors: lat={lat_factor}, lon={lon_factor}")

    # Average each lat_factor x lon_factor block so the fallback is still a MEAN
    # aggregation (plain striding would only subsample). boundary="trim" drops edge
    # cells that do not fill a complete block.
    spatially_bucketed = processor.dataset.coarsen(
        lat=lat_factor,
        lon=lon_factor,
        boundary="trim"
    ).mean()

# Compare dimension sizes before and after bucketing
# (assumes the bucketed result is an xarray object exposing `.sizes` -- TODO confirm
# for CMIPProcessor.bucket_spatial)
print(f"Original spatial dimensions: {dict(processor.dataset.sizes)}")
print(f"Bucketed spatial dimensions: {dict(spatially_bucketed.sizes)}")

# Visualize the spatially bucketed data for the first time point
fig, ax = plt.subplots(figsize=(12, 8))
var_name = list(spatially_bucketed.data_vars)[0]  # take the name from the bucketed result itself
spatially_bucketed[var_name].isel(time=0).plot(ax=ax)
ax.set_title(f"Spatially Bucketed {var_name} (Lat: {lat_bucket_size}°, Lon: {lon_bucket_size}°)")
fig.tight_layout()
plt.show()

## Apply Temporal Bucketing

Let's also demonstrate temporally bucketing the data.

In [ ]:
# Apply temporal bucketing (e.g., to quarterly data)
time_bucket_size = "3M"  # Quarterly

try:
    # Preferred path: the processor's own temporal bucketing
    temporally_bucketed = processor.bucket_temporal(
        bucket_size=time_bucket_size,
        agg_method=AggregationMethod.MEAN
    )
except Exception as e:
    # Deliberate best-effort fallback so the walkthrough can continue; the failure is
    # reported rather than hidden.
    print(f"Warning: Standard temporal bucketing failed: {e}")
    print("Falling back to xarray block averaging (coarsen + mean)...")

    # The input file is monthly, so one quarterly bucket spans 3 consecutive time steps
    steps_per_bucket = 3

    # Average each run of 3 time steps so the fallback is still a MEAN aggregation
    # (taking every 3rd step would only subsample). boundary="trim" drops trailing
    # steps that do not fill a complete bucket.
    temporally_bucketed = processor.dataset.coarsen(
        time=steps_per_bucket,
        boundary="trim"
    ).mean()

# Compare the length of the time axis before and after bucketing
print(f"Original time dimension: {len(processor.dataset.time)} time points")
print(f"Bucketed time dimension: {len(temporally_bucketed.time)} time points")

# Same grid cell as the earlier time-series cell (lat_idx / lon_idx / time_series
# are defined there), now taken from the temporally bucketed data
time_series_bucketed = temporally_bucketed[var_name].isel(lat=lat_idx, lon=lon_idx)

fig, ax = plt.subplots(figsize=(12, 6))
time_series.plot(ax=ax, label="Original")
time_series_bucketed.plot(ax=ax, marker='o', linestyle='-', label="Bucketed (Quarterly)")
ax.set_title(f"{var_name} Time Series - Original vs Temporally Bucketed")
ax.set_xlabel('Time')
ax.set_ylabel(f"{var_name} ({time_series.attrs.get('units', '')})")
ax.grid(True)
ax.legend()
fig.tight_layout()
plt.show()

## Create a Full Datacube

Finally, let's create a complete datacube with both spatial and temporal bucketing.

In [ ]:
# Bucketing settings for the combined run: the spatial and temporal bucket sizes
# chosen in the earlier cells, mean aggregation on both axes
datacube_options = dict(
    lat_bucket_size=lat_bucket_size,
    lon_bucket_size=lon_bucket_size,
    time_bucket_size=time_bucket_size,
    spatial_agg_method=AggregationMethod.MEAN,
    temporal_agg_method=AggregationMethod.MEAN,
)
datacube = processor.process_to_datacube(**datacube_options)

# Show what the processor produced
print("Datacube Information:")
print(datacube)

# Write the result, creating the output directory first if it is missing
output_path = "./data/processed/processed_cmip_datacube.nc"
output_dir = os.path.dirname(output_path)
os.makedirs(output_dir, exist_ok=True)
processor.save_datacube(datacube, output_path)
print(f"Saved datacube to {output_path}")

## Compare Different Aggregation Methods

Let's compare how different aggregation methods affect the data.

In [None]:
# Spatial aggregation methods to compare, one subplot each
agg_methods = [
    AggregationMethod.MEAN,
    AggregationMethod.MEDIAN,
    AggregationMethod.MAX,
    AggregationMethod.MIN
]

fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# Pair each method with one panel of the 2x2 grid
for ax, method in zip(axes.flat, agg_methods):
    # Loop-local name on purpose: rebinding `datacube` here would silently replace
    # the saved mean/quarterly cube created in the previous cell.
    method_cube = processor.process_to_datacube(
        lat_bucket_size=lat_bucket_size,
        lon_bucket_size=lon_bucket_size,
        time_bucket_size=None,  # Keep original time resolution for comparison
        spatial_agg_method=method
    )

    # Map of the first time step for this aggregation method
    method_cube[var_name].isel(time=0).plot(ax=ax)
    # assumes AggregationMethod values are strings (they are .capitalize()d here)
    ax.set_title(f"{method.value.capitalize()} Aggregation")

# Lay out the panels first, then place the overall title just above them (y > 1)
plt.tight_layout()
fig.suptitle(f"Comparison of Spatial Aggregation Methods for {var_name}", y=1.02, fontsize=16)
plt.show()

## Conclusion

In this notebook, we've explored the CMIP climate data and demonstrated how to:

1. Load and examine the dataset structure
2. Determine native spatial and temporal resolutions
3. Visualize the data spatially and temporally
4. Apply spatial and temporal bucketing with different aggregation methods
5. Create and save a complete datacube

The CMIPProcessor class provides a powerful and flexible way to work with climate data and prepare it for inclusion in larger datacube systems.