In [1]:
import xarray as xr
from odc.geo.geobox import GeoBox
from odc.geo.geom import Geometry
from odc.geo.xr import crop, xr_reproject
from shapely.geometry import box
from typing import List, Optional, Tuple
import rioxarray as riox
import s3fs
from datetime import datetime
import pandas as pd
import re

import dask.distributed
import arraylake

def extract_years_and_set_time_if_missing_coords(filenames, da, year_range=(1990, 2100)):
    """
    Extract years from filenames and set them as time coordinates
    only if the 'time' dimension exists but does not have coordinates.

    The year is looked up in the final path component (the file name) first,
    so digits in directory or bucket names (e.g. ``runs_2015/gdp1990.tif``)
    are not mistaken for the data year. If the file name itself contains no
    year, the full path is searched as a fallback.

    Note that the pattern only recognises years 1990-2100, so ``year_range``
    can narrow that window but cannot widen it.

    Parameters:
        filenames (list of str): List of filenames containing year information,
            one per step along the 'time' dimension and in the same order.
        da (xarray.DataArray or xarray.Dataset): DataArray or Dataset to update.
        year_range (tuple): Inclusive range of valid years (default: 1990 to 2100).

    Returns:
        xarray.DataArray or xarray.Dataset: The input unchanged when there is no
        'time' dimension or it already has coordinates; otherwise a new object
        whose 'time' coordinate is January 1st of each extracted year.

    Raises:
        ValueError: If a filename contains no recognisable year, or the year
            found lies outside ``year_range``.
    """
    # Nothing to do unless 'time' is a dimension that still lacks coordinates
    if "time" not in da.dims or "time" in da.coords:
        return da

    # Matches 1990-1999, 2000-2099 and 2100
    year_regex = re.compile(r"(19[9][0-9]|20[0-9]{2}|2100)")

    years = []
    for filename in filenames:
        # Prefer the last path component (split on either separator so both
        # POSIX/S3 and Windows-style paths work); fall back to the whole path
        # for layouts where the year only appears in a parent directory.
        basename = re.split(r"[\\/]", filename)[-1]
        match = year_regex.search(basename) or year_regex.search(filename)
        if not match:
            raise ValueError(f"No valid year found in filename: {filename}")

        year = int(match.group())
        if not (year_range[0] <= year <= year_range[1]):
            raise ValueError(f"Year {year} in file '{filename}' is out of the specified range.")
        years.append(year)

    # One timestamp per file, pinned to the first day of the year
    time_coords = pd.to_datetime([f"{year}-01-01" for year in years])

    return da.assign_coords(time=time_coords)


def build_datacube(
    raster_files: List[str],
    bounds: List[float],
    varname: str,
    resolution: float = 0.009,
    src_crs: str = "EPSG:4326",
    dst_crs: str = "EPSG:4326",
    resampling: str = "bilinear",
    startyear: int = 2000,
    endyear: int = 2020,
    time_frequency_years: int = 5
) -> xr.DataArray:
    """
    Build a datacube from a list of raster files, crop it to bounds, reproject it
    onto a regular grid and, when there is a time dimension, interpolate it
    linearly onto a regular time axis.

    The cube is returned, not written anywhere; persisting it (e.g. to zarr)
    is left to the caller.

    Args:
        raster_files: List of paths to raster files to combine (local paths or
            "s3://" URLs). Each file becomes one step along the time dimension
            and must contain a year in its name.
        bounds: List of [xmin, ymin, xmax, ymax] coordinates
        varname: Name of the output variable
        resolution: Output resolution in degrees (default: 0.009)
        src_crs: Coordinate reference system assumed for the inputs when the
            rasters do not carry one themselves (default: "EPSG:4326")
        dst_crs: Destination coordinate reference system (default: "EPSG:4326")
        resampling: Resampling method for reprojection (default: "bilinear")
        startyear: Start year for resampling (default: 2000)
        endyear: End year for resampling (default: 2020)
        time_frequency_years: Frequency of resampling in years (default: 5)
    Returns:
        xr.DataArray: The processed datacube

    Raises:
        ValueError: If raster_files is empty, or a year cannot be derived from
            one of the file names.

    Example:
        >>> raster_files = ["data/gdp/gdp1990.tif", "data/gdp/gdp2000.tif"]
        >>> bounds = [17.58, -35.00, 21.38, -32.23]
        >>> da = build_datacube(
        ...     raster_files=raster_files,
        ...     bounds=bounds,
        ...     varname="gdp"
        ... )
    """
    if not raster_files:
        raise ValueError("raster_files must contain at least one raster path.")

    #TODO extend to accommodate multiple bands/variables
    rasters = []

    # Created on first use so purely local runs never touch S3
    fs = None

    # Check if files are on S3 or local and open accordingly
    for file in raster_files:
        if file.startswith("s3://"):
            if fs is None:
                fs = s3fs.S3FileSystem()
            # NOTE(review): the raster is opened lazily (chunks='auto') but the
            # S3 handle is closed when this `with` block exits - confirm that
            # deferred reads still succeed with the installed rasterio version.
            with fs.open(file) as infile:
                rasters.append(riox.open_rasterio(infile, chunks='auto'))
        else:
            # Open local file
            rasters.append(riox.open_rasterio(file, chunks='auto'))

    # Combine rasters along the time dimension; squeeze drops every length-1
    # dimension (so 'time' itself disappears when only one file is given)
    combined = xr.concat(rasters, dim="time").squeeze()
    # Set time coords from the years embedded in the file names
    combined = extract_years_and_set_time_if_missing_coords(raster_files, combined)

    # Set CRS: prefer the one stored in the rasters, fall back to src_crs
    if combined.rio.crs is not None:
        combined = combined.odc.assign_crs(combined.rio.crs)
    else:
        combined = combined.odc.assign_crs(src_crs)

    # Create the target grid and the matching crop geometry
    template = GeoBox.from_bbox(bounds, crs=dst_crs, resolution=resolution)
    bbox = box(*template.boundingbox)
    geom = Geometry(bbox, dst_crs)

    # Crop to box
    combined = crop(combined, geom)

    # Reproject dataset onto the template grid
    da = xr_reproject(
        combined,
        how=template,
        resampling=resampling,
    )

    # Name the dataarray
    da = da.rename(varname)

    # If there is a time dim, resample it onto a regular axis
    if "time" in da.dims:
        start_date = datetime(startyear, 1, 1)
        end_date = datetime(endyear, 12, 31)

        # Regular target axis: every `time_frequency_years` years from
        # start_date, stopping before end_date
        dseqn = pd.date_range(
            start=start_date,
            end=end_date,
            freq=pd.DateOffset(years=time_frequency_years),
            inclusive='left'  # Exclude end_date
        )

        # The time axis follows the order of raster_files, which is not
        # guaranteed to be chronological; sort so assume_sorted=True holds.
        da = da.sortby("time")

        # Interpolate
        da = da.interp(
            time=dseqn,
            method="linear",
            assume_sorted=True,
        )

    return da

In [2]:
# Initialize dask client
# Reuse the client already registered in this kernel when the cell is re-run;
# creating a new Client() each time starts another local cluster alongside
# the old one.
try:
    client_da = dask.distributed.get_client()
except ValueError:
    # No client exists yet in this process, so start a local one
    client_da = dask.distributed.Client()

Perhaps you already have a cluster running?
Hosting the HTTP server on port 41329 instead


In [3]:
#files to cube
raster_files = ["s3://sagemaker-us-west-2-482277551691/glenn/hm_data/gdp/gdp1990.tif",
                "s3://sagemaker-us-west-2-482277551691/glenn/hm_data/gdp/gdp2000.tif",
                "s3://sagemaker-us-west-2-482277551691/glenn/hm_data/gdp/gdp2010.tif",
                "s3://sagemaker-us-west-2-482277551691/glenn/hm_data/gdp/gdp2020.tif",
                "s3://sagemaker-us-west-2-482277551691/glenn/hm_data/gdp/gdp2030.tif"]
#bounding box as [xmin, ymin, xmax, ymax]
bounds = [17.58, -35.00, 21.38, -32.23]
#chunk sizes along (latitude, longitude)
chunk_shape = (100,100)

#build cube
da = build_datacube(
    raster_files=raster_files,
    bounds=bounds,
    resampling='nearest',
    varname="gdp",
)

#chunk
# Initialize chunks dictionary: any dimension other than time/latitude/longitude
# is kept in a single chunk (-1)
chunks = {dim: -1 for dim in da.dims if dim not in ["time", "latitude", "longitude"]}

# Set chunk sizes for specific dimensions
if "time" in da.dims:
    chunks["time"] = 1
if "latitude" in da.dims:
    chunks["latitude"] = chunk_shape[0]
if "longitude" in da.dims:
    chunks["longitude"] = chunk_shape[1]

# Create encoding dictionary
# to_zarr expects encoding keyed by *variable* name, with "chunks" given as one
# size per dimension of that variable (in dimension order). Keying it by
# dimension name would configure the coordinate variables instead of the data.
# -1 ("whole dimension") is expanded to the actual length, and sizes are capped
# at the dimension length so the on-disk chunks line up with the dask chunks.
encoding = {
    da.name: {
        "chunks": tuple(
            da.sizes[dim] if chunks[dim] == -1 else min(chunks[dim], da.sizes[dim])
            for dim in da.dims
        )
    }
}

# Apply chunking
ds = da.chunk(chunks).to_dataset()

In [4]:
# Display the chunked dataset to check its shape and chunk layout before writing
ds

Unnamed: 0,Array,Chunk
Bytes,2.48 MiB,39.06 kiB
Shape,"(5, 308, 423)","(1, 100, 100)"
Dask graph,100 chunks in 31 graph layers,100 chunks in 31 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 2.48 MiB 39.06 kiB Shape (5, 308, 423) (1, 100, 100) Dask graph 100 chunks in 31 graph layers Data type float32 numpy.ndarray",423  308  5,

Unnamed: 0,Array,Chunk
Bytes,2.48 MiB,39.06 kiB
Shape,"(5, 308, 423)","(1, 100, 100)"
Dask graph,100 chunks in 31 graph layers,100 chunks in 31 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


In [5]:
# Alternative (disabled): write the cube to a local zarr store instead
#outfile = 'test.zarr'
# Write to zarr
#da.to_zarr(outfile, mode="w",encoding=encoding)

# Arraylake write: push the dataset to a repo and commit it
# Instantiate the Arraylake client
client_ar = arraylake.Client()

# Checkout the repo
repo = client_ar.get_repo("the-nature-conservancy/hm-forecasting")
# Alternative scratch repo (disabled). NOTE(review): this line refers to
# `client`, which is not defined in this notebook - it would need `client_ar`.
#repo = client.get_repo("the-nature-conservancy/gmoncrieff-scratch")

# Write the dataset into the "gdp-test" group of the repo's store, reusing the
# `encoding` dict built in the earlier cell
# NOTE(review): the recorded run of this cell failed with an HTTP 500 from the
# Arraylake API during a metadata `_bulk_set` request; the cause cannot be
# determined from the notebook alone.
ds.to_zarr(repo.store, group="gdp-test", zarr_version=3,encoding=encoding)

# Commit the written data to the repo
repo.commit('commit test gdp cube')

ValueError: Error response 500 while requesting URL('https://api.earthmover.io/repos/the-nature-conservancy/hm-forecasting/contents/metadata/_bulk_set?session_id=42946ceb0aa14b029d366fc248f9bffe&base_commit='). <Response [500 Internal Server Error]>: 