# Datacube Exploration

This notebook demonstrates how to combine multiple CMIP datasets into a unified datacube and explore the results.

In [None]:
import os
import sys
import numpy as np
import pandas as pd
import xarray as xr
import matplotlib.pyplot as plt
import seaborn as sns

# Add the scripts directory to the path so we can import our modules
sys.path.append('../scripts')
from cmip_processor import CMIPProcessor, AggregationMethod
from datacube_builder import DatacubeBuilder, InterpolationMethod

## Load CMIP Datasets

First, let's load the available CMIP datasets.

In [ ]:
# Scenario label -> NetCDF file holding that scenario's CMIP data
data_files = {
    "rcp45": "./data/macav2metdata_huss_CCSM4_r6i1p1_rcp45_2021_2025_CONUS_monthly.nc",
    "rcp85": "./data/macav2metdata_huss_CCSM4_r6i1p1_rcp85_2021_2025_CONUS_monthly.nc"
}

# Collect every path that does not exist so the error can report all of them at once
missing_files = [path for path in data_files.values() if not os.path.exists(path)]

if len(missing_files) > 0:
    raise FileNotFoundError(f"Required CMIP data files not found: {', '.join(missing_files)}. Please ensure you're running this notebook from the datacube directory.")

# Helper function to load datasets with non-standard calendars
def load_dataset_with_calendar_handling(file_path):
    """Build a ``CMIPProcessor`` for ``file_path``, with a fallback for calendar errors.

    The processor is first constructed normally. If that raises a ``ValueError``
    whose message contains "unable to decode time units" or "calendar", the file
    is opened directly with xarray instead (with ``use_cftime=True`` when
    ``cftime`` can be imported, otherwise with ``decode_times=False``) and the
    resulting dataset is assigned to the ``dataset`` attribute of a newly
    constructed processor.

    Args:
        file_path: Path of the data file to load.

    Returns:
        The ``CMIPProcessor`` instance for ``file_path``.

    Raises:
        ValueError: Re-raised unchanged when construction fails with a message
            that matches neither calendar-related marker. Any exception raised
            while constructing the processor during the fallback also propagates.
    """
    try:
        # Try the standard CMIPProcessor
        return CMIPProcessor(file_path)
    except ValueError as e:
        message = str(e)
        if "unable to decode time units" not in message and "calendar" not in message:
            # Re-raise if it's a different ValueError
            raise

        print(f"Warning: Non-standard calendar detected in {file_path}")
        print("Attempting to fix by manually loading the dataset...")

        try:
            # Imported only to find out whether cftime is installed
            import cftime  # noqa: F401
            print("Using cftime for non-standard calendar.")
            dataset = xr.open_dataset(file_path, use_cftime=True)
        except ImportError:
            print("cftime not available. Loading without decoding times.")
            dataset = xr.open_dataset(file_path, decode_times=False)

        # NOTE(review): this calls the same constructor that raised above, so it
        # presumably fails the same way unless CMIPProcessor behaves differently
        # on a second call -- confirm against cmip_processor.py.
        try:
            processor = CMIPProcessor(file_path)
        except Exception:
            # Do not leave the manually opened file handle dangling on failure
            dataset.close()
            raise

        # Attach the manually loaded dataset to the processor
        processor.dataset = dataset
        return processor

# Build one processor per scenario, reporting each as soon as it is loaded
processors = {}
for scenario_label, nc_path in data_files.items():
    loaded = load_dataset_with_calendar_handling(nc_path)
    processors[scenario_label] = loaded
    print(f"Loaded {scenario_label} dataset with dimensions: {loaded.dataset.dims}")

## Compare Datasets

Let's compare basic statistics between the datasets.

In [None]:
# Use whichever data variable is listed first in the RCP4.5 dataset
var_name = list(processors["rcp45"].dataset.data_vars)[0]

# One summary row per scenario: min, max, mean and standard deviation of that variable
stats = [
    {
        "Scenario": label,
        "Min": float(field.min()),
        "Max": float(field.max()),
        "Mean": float(field.mean()),
        "Std": float(field.std()),
    }
    for label, field in (
        (name, proc.dataset[var_name]) for name, proc in processors.items()
    )
]

# Tabulate the rows; the bare expression renders the table as the cell output
stats_df = pd.DataFrame(stats)
stats_df

## Visualize Differences Between Scenarios

Let's create maps to visualize the differences between the RCP4.5 and RCP8.5 scenarios.

In [ ]:
# Take the first time step of the selected variable from each scenario
rcp45_data = processors["rcp45"].dataset[var_name].isel(time=0)
rcp85_data = processors["rcp85"].dataset[var_name].isel(time=0)

# Report the grid sizes so a mismatch is visible in the output
print("RCP45 coordinates:", rcp45_data.lat.shape, rcp45_data.lon.shape)
print("RCP85 coordinates:", rcp85_data.lat.shape, rcp85_data.lon.shape)

# xarray arithmetic aligns its operands on coordinate labels and keeps only the
# labels both operands share. Both fields must therefore carry identical lat/lon
# labels before subtracting, otherwise the difference is silently partial or empty.
grids_match = rcp45_data.lat.equals(rcp85_data.lat) and rcp45_data.lon.equals(rcp85_data.lon)
if not grids_match:
    print("Warning: Datasets have different coordinates.")

    same_shape = (
        rcp45_data.lat.shape == rcp85_data.lat.shape
        and rcp45_data.lon.shape == rcp85_data.lon.shape
    )
    if same_shape:
        # Same grid size but different labels: pair cells by position and give
        # RCP85 the RCP45 labels so the subtraction keeps every cell.
        print("Dimensions match but values differ. Pairing cells by position using RCP45 coordinates.")
        rcp45_data, rcp85_data = xr.align(rcp45_data, rcp85_data, join="override")
    elif len(rcp45_data.lat) <= len(rcp85_data.lat) and len(rcp45_data.lon) <= len(rcp85_data.lon):
        # RCP45 is coarser, so downsample RCP85
        print("RCP45 has coarser resolution. Downsampling RCP85...")
        # reindex_like labels the result with the target grid's coordinates;
        # .sel(method="nearest") would keep the source labels and misalign again.
        rcp85_data = rcp85_data.reindex_like(rcp45_data, method="nearest")
    else:
        # RCP85 is coarser, so downsample RCP45
        print("RCP85 has coarser resolution. Downsampling RCP45...")
        rcp45_data = rcp45_data.reindex_like(rcp85_data, method="nearest")

# Both fields now share the same lat/lon labels
diff_data = rcp85_data - rcp45_data

# Create a 1x3 subplot for comparison
fig, axes = plt.subplots(1, 3, figsize=(18, 6))

# Plot RCP4.5
rcp45_data.plot(ax=axes[0], cmap='viridis')
axes[0].set_title(f"RCP4.5 {var_name} (First Month)")

# Plot RCP8.5
rcp85_data.plot(ax=axes[1], cmap='viridis')
axes[1].set_title(f"RCP8.5 {var_name} (First Month)")

# Plot Difference
diff_data.plot(ax=axes[2], cmap='RdBu_r')
axes[2].set_title("Difference (RCP8.5 - RCP4.5)")

plt.tight_layout()
plt.show()

## Process Datasets for Datacube Integration

Now let's process each dataset to prepare for integration into a unified datacube.

In [None]:
# Bucket sizes shared by every scenario
lat_bucket_size = 0.5  # degrees
lon_bucket_size = 0.5  # degrees
time_bucket_size = '1M'  # monthly

# Keyword arguments handed unchanged to every processor
datacube_kwargs = {
    "lat_bucket_size": lat_bucket_size,
    "lon_bucket_size": lon_bucket_size,
    "time_bucket_size": time_bucket_size,
    "spatial_agg_method": AggregationMethod.MEAN,
    "temporal_agg_method": AggregationMethod.MEAN,
}

# Run each scenario through its processor and keep the result under the scenario name
processed_datasets = {}
for scenario_label, scenario_processor in processors.items():
    bucketed = scenario_processor.process_to_datacube(**datacube_kwargs)
    processed_datasets[scenario_label] = bucketed
    print(f"Processed {scenario_label} dataset with new dimensions: {bucketed.dims}")

## Build a Unified Datacube

Now, let's use the DatacubeBuilder to combine the processed datasets into a unified datacube.

In [None]:
# Create the builder and register every processed scenario under its name
builder = DatacubeBuilder()
for scenario_label, scenario_cube in processed_datasets.items():
    builder.add_dataset(scenario_label, scenario_cube)

# Reuse the bucket sizes from the processing step as the target resolutions
build_options = {
    "lat_resolution": lat_bucket_size,
    "lon_resolution": lon_bucket_size,
    "time_resolution": time_bucket_size,
    "interpolation_method": InterpolationMethod.LINEAR,
}
unified_datacube = builder.build_datacube(**build_options)

# Show the text summary of the combined result
print("Unified Datacube Information:")
print(unified_datacube)

## Explore the Unified Datacube

Let's explore the unified datacube to see how it combines the different datasets.

In [None]:
# List all variables in the unified datacube.
# The loop variable is deliberately not named `var_name`: that name already holds
# the variable selected in an earlier cell, and reusing it here would overwrite
# that selection for any cell re-run afterwards.
print("Variables in the unified datacube:")
for cube_var_name in unified_datacube.data_vars:
    print(f"  - {cube_var_name}")

## Compare Variables Across Scenarios

Let's compare the same variable across different scenarios in the unified datacube.

In [None]:
# The variable compared across scenarios: first data variable of the RCP4.5 dataset
common_var = list(processors["rcp45"].dataset.data_vars)[0]

# Names used to look up each scenario's copy of that variable in the unified datacube
rcp45_var = f"rcp45_{common_var}"
rcp85_var = f"rcp85_{common_var}"

# Use the grid cell at the middle latitude and longitude indices
lat_idx = len(unified_datacube.lat) // 2
lon_idx = len(unified_datacube.lon) // 2
center_cell = {"lat": lat_idx, "lon": lon_idx}

# Coordinate values of that cell, used in the plot title
lat_val = float(unified_datacube.lat[lat_idx])
lon_val = float(unified_datacube.lon[lon_idx])

# Time series of each scenario at the chosen cell
rcp45_series = unified_datacube[rcp45_var].isel(center_cell)
rcp85_series = unified_datacube[rcp85_var].isel(center_cell)

# Draw both series on one set of axes
fig, ax = plt.subplots(figsize=(12, 6))
rcp45_series.plot(ax=ax, label="RCP4.5")
rcp85_series.plot(ax=ax, label="RCP8.5", linestyle="--")
ax.set_title(f"{common_var} Time Series at Lat: {lat_val:.2f}, Lon: {lon_val:.2f}")
ax.set_xlabel('Time')
ax.set_ylabel(f"{common_var} (units)")
ax.grid(True)
ax.legend()
fig.tight_layout()
plt.show()

## Calculate and Visualize Scenario Differences

Let's calculate the difference between scenarios across the entire spatial domain over time.

In [None]:
# RCP8.5 minus RCP4.5 for every cell and time step of the unified datacube
scenario_difference = unified_datacube[rcp85_var] - unified_datacube[rcp45_var]

# Wrap the difference in its own dataset, carrying over the datacube's coordinates
difference_name = f"{common_var}_difference"
diff_dataset = xr.Dataset(
    data_vars={difference_name: scenario_difference},
    coords=unified_datacube.coords,
)

# Map the difference at the first time step
first_step_difference = diff_dataset[difference_name].isel(time=0)
fig, ax = plt.subplots(figsize=(12, 8))
first_step_difference.plot(ax=ax, cmap='RdBu_r')
ax.set_title(f"Difference in {common_var} (RCP8.5 - RCP4.5) at {unified_datacube.time.values[0]}")
fig.tight_layout()
plt.show()

## Analyze Temporal Trends

Let's analyze how the difference between scenarios evolves over time.

In [None]:
# Average the scenario difference over lat and lon, leaving one value per time step
difference_field = diff_dataset[f"{common_var}_difference"]
mean_diff_over_time = difference_field.mean(dim=['lat', 'lon'])

# Plot the averaged difference against time
fig, ax = plt.subplots(figsize=(12, 6))
mean_diff_over_time.plot(ax=ax, marker='o')
ax.set_title(f"Mean Difference in {common_var} (RCP8.5 - RCP4.5) Over Time")
ax.set_xlabel('Time')
ax.set_ylabel(f"Mean Difference ({common_var} units)")
ax.grid(True)
fig.tight_layout()
plt.show()

## Create a Multi-Variable Analysis

If the unified datacube contains enough variables, let's analyze the relationship between the first two of them.

In [None]:
# Compare the first two variables of the unified datacube, if it has that many
all_vars = list(unified_datacube.data_vars)
if len(all_vars) >= 2:  # Need at least two variables to compare
    # Select two variables for comparison
    var1, var2 = all_vars[:2]

    # Extract data for a specific time point as flat arrays
    time_idx = 0
    var1_data = unified_datacube[var1].isel(time=time_idx).values.flatten()
    var2_data = unified_datacube[var2].isel(time=time_idx).values.flatten()

    # Keep only the positions where both variables have a value
    mask = ~(np.isnan(var1_data) | np.isnan(var2_data))
    var1_data = var1_data[mask]
    var2_data = var2_data[mask]

    # Create scatter plot
    plt.figure(figsize=(10, 8))
    plt.scatter(var1_data, var2_data, alpha=0.5)
    plt.title(f"Relationship between {var1} and {var2} at {unified_datacube.time.values[time_idx]}")
    plt.xlabel(var1)
    plt.ylabel(var2)
    plt.grid(True)
    plt.tight_layout()
    plt.show()

    # A correlation needs at least two paired points; skip it rather than print NaN
    if var1_data.size >= 2:
        corr = np.corrcoef(var1_data, var2_data)[0, 1]
        print(f"Correlation between {var1} and {var2}: {corr:.4f}")
    else:
        print("Not enough valid data points to compute a correlation")
else:
    print("Not enough variables for multi-variable analysis")

## Save the Unified Datacube

Finally, let's save the unified datacube for future use.

In [ ]:
# Destination files for the two outputs
output_path = "./data/processed/unified_cmip_datacube.nc"
diff_output_path = "./data/processed/scenario_difference_datacube.nc"

# Create the output directory if it does not exist yet
output_dir = os.path.dirname(output_path)
os.makedirs(output_dir, exist_ok=True)

# Write the unified datacube through the builder
builder.save_datacube(output_path)
print(f"Saved unified datacube to {output_path}")

# Write the scenario difference dataset as NetCDF
diff_dataset.to_netcdf(diff_output_path)
print(f"Saved scenario difference dataset to {diff_output_path}")

## Conclusion

In this notebook, we've demonstrated how to:

1. Load and process multiple CMIP climate datasets
2. Compare the datasets and visualize their differences
3. Build a unified datacube that combines multiple datasets
4. Explore the unified datacube through various visualizations and analyses
5. Calculate and visualize differences between climate scenarios
6. Save the unified datacube for future use

This approach can be extended to include additional datasets and variables, creating a comprehensive datacube for climate analysis.