In [1]:
# # List all directories and files in the current working directory
# for root, dirs, files in os.walk('.'):
#     print("Root directory:", root)
#     print("Subdirectories:", dirs)
#     print("Files:", files)
#     break  # Stop after the first level to avoid printing too much

In [19]:
import os
os.chdir("../")    #commment after first run


In [20]:
#imports
import dask.distributed
import pystac_client
import planetary_computer
import stackstac 
import numpy as np
import pandas as pd
import rioxarray
import xarray as xr
import geopandas as gpd
from src.utils import gen_chips


In [25]:
print(os.getcwd())
#should be /home/benchuser/code

/home/benchuser/code


In [24]:
#os.chdir("/home/benchuser/code")

In [44]:
#config setup
import yaml
with open("notebooks/config.yml", "r") as file:
    config = yaml.safe_load(file)
print(config)  # Check the structure of the dictionary


{'sentinel_2': {'collection': 'sentinel-2-l2a', 'time_ranges': ['2023-01-01/2023-03-31', '2023-04-01/2023-06-30', '2023-07-01/2023-09-30', '2023-10-01/2023-12-31'], 'cloud_cover': 1, 'bands': ['B2', 'B3', 'B4', 'B8', 'B11', 'B12'], 'resolution': 10}, 'land_cover': {'collection': 'io-lulc-annual-v02', 'year': '2023-01-01/2023-12-31'}, 'chips': {'sample_size': 256, 'chip_size': 128}, 'output': {'directory': 'notebooks/test_output_dump', 'naming_convention': 's2_{season}_{index:05}.tif'}, 'metadata': {'file': 'metadata.csv'}}


In [45]:
# Sentinel-2 settings
s2_collection = config["sentinel_2"]["collection"]
s2_date_ranges = config["sentinel_2"]["time_ranges"]
s2_bands = config["sentinel_2"]["bands"]
s2_resolution = config["sentinel_2"]["resolution"]
cloud_cover_threshold = config["sentinel_2"]["cloud_cover"]  # Max allowed cloud cover

# Land Cover settings
lc_collection = config["land_cover"]["collection"]
lc_year = config["land_cover"]["year"]  # Year of LC dataset

# Chip settings
sample_size = config["chips"]["sample_size"]  # Grid size for homogeneity check
chip_size = config["chips"]["chip_size"]  # Output chip size

# Output settings
output_dir = config["output"]["directory"]
chip_naming_convention = config["output"]["naming_convention"]

# Metadata settings
metadata_file = config["metadata"]["file"]

# Ensure the output directory exists
os.makedirs(output_dir, exist_ok=True)

# Define seasons for indexing
seasons = ["JFM", "AMJ", "JAS", "OND"]

aoi_gdf = gpd.read_file("data/urbans.geojson") # or "data/aois.geojson"


In [46]:
aoi_bounds = aoi.geometry.bounds  # This gives (minx, miny, maxx, maxy)
# Extract coordinates
minx, miny, maxx, maxy = aoi_bounds

# Reduce size by 50% (adjustable)
shrink_factor = 0.1  
center_x = (minx + maxx) / 2
center_y = (miny + maxy) / 2

# Compute new bounds
new_minx = center_x - (maxx - minx) * shrink_factor / 2
new_maxx = center_x + (maxx - minx) * shrink_factor / 2
new_miny = center_y - (maxy - miny) * shrink_factor / 2
new_maxy = center_y + (maxy - miny) * shrink_factor / 2

# Create new AOI dictionary (to pass into bbox-based queries)
aoi_test_bbox = [new_minx, new_miny, new_maxx, new_maxy]

print("Old AOI Bounding Box:", aoi_bounds)

print("New AOI Bounding Box for Testing:", aoi_test_bbox)

Old AOI Bounding Box: (31.20416267326229, 30.02337142235983, 31.280433633102717, 30.060612342021983)
New AOI Bounding Box for Testing: [31.238484605190482, 30.0401298362078, 31.246111701174524, 30.043853928174013]


In [48]:
#dask
from dask.distributed import Client, LocalCluster
cluster = LocalCluster(n_workers=8, threads_per_worker=1)
client = Client(cluster)
print(client.dashboard_link)

Perhaps you already have a cluster running?
Hosting the HTTP server on port 37567 instead


http://127.0.0.1:37567/status


In [49]:

def search_s2_scenes(aoi, date_range):
    print(f"Searching for Sentinel-2 scenes for AOI at {aoi.geometry.bounds} within {date_range}")
    catalog = pystac_client.Client.open(
        "https://planetarycomputer.microsoft.com/api/stac/v1",
        modifier=planetary_computer.sign_inplace,
    )
    s2_search = catalog.search(
        collections=["sentinel-2-l2a"],
        #bbox=aoi.geometry.bounds, #UNCOMMENT FOR FULL AOI
        bbox = aoi_test_bbox,
        datetime=date_range,
        query=["eo:cloud_cover<1"],
        sortby=["+properties.eo:cloud_cover"],
        max_items=1,
    )
    items = s2_search.item_collection()
    print(f"Found {len(items)} Sentinel-2 scenes")
    return items

def search_lc_scene(bbox, lc_date_range):
    print(f"Searching for Land Cover scenes within {lc_date_range} for bbox {bbox}")
    catalog = pystac_client.Client.open(
        "https://planetarycomputer.microsoft.com/api/stac/v1",
        modifier=planetary_computer.sign_inplace,
    )
    lc_search = catalog.search(
        collections=["io-lulc-annual-v02"],
        bbox=bbox,
        datetime=lc_date_range,
    )
    items = lc_search.item_collection()
    print(f"Found {len(items)} Land Cover scenes")
    return items

def stack_s2_data(s2_items, s2_assets):
    if not s2_items:
        print("No Sentinel-2 items found.")
        return None
    
    try:
        print("Stacking Sentinel-2 images...")
        stacked_data = stackstac.stack(
            s2_items,
            assets=s2_assets,
            epsg=s2_items[0].properties["proj:epsg"],
            resolution=10,
            bounds_latlon=s2_items[0].bbox,
        ).median("time", skipna=True).squeeze()

        print("Stacked S2 data shape:", stacked_data.shape)

        if stacked_data is None or not isinstance(stacked_data, xr.DataArray):
            print("Error: Stacking Sentinel-2 data resulted in an invalid output.")
            return None

        return stacked_data

    except Exception as e:
        print(f"Error stacking Sentinel-2 data: {e}")
        return None

def stack_lc_data(lc_items, s2_epsg):
    if len(lc_items) == 0:
        print("No Land Cover data found.")
        return None
    try:
        print("Stacking Land Cover images...")
        stacked_data = stackstac.stack(
            lc_items,
            dtype=np.ubyte,
            fill_value=255,
            sortby_date=False,
            epsg=s2_epsg,
            resolution=10,
            bounds_latlon=lc_items[0].bbox,
        ).squeeze()
        print("Stacked LC data shape:", stacked_data.shape)
        return stacked_data
    except Exception as e:
        print(f"Error stacking Land Cover data: {e}")
        return None

def unique_class(window, axis=None, **kwargs):
    return np.all(window == window[:, :1, :, :1], axis=(1, 3))


def is_homogeneous(lc_stack):
    print("Checking for homogeneous LC regions...")

    # Ensure dimensions are exactly divisible by `sample_size`
    height, width = lc_stack.shape[-2], lc_stack.shape[-1]
    new_height = (height // sample_size) * sample_size
    new_width = (width // sample_size) * sample_size

    if new_height == 0 or new_width == 0:
        print("Error: LC stack is empty after cropping. Skipping this AOI.")
        return None  

    if new_height != height or new_width != width:
        print(f"Cropping LC stack to ({new_height}, {new_width}) to match sample_size.")
        lc_stack = lc_stack.isel(x=slice(0, new_height), y=slice(0, new_width))

    # Drop mismatched coordinate variables
    lc_stack = lc_stack.drop_vars([var for var in lc_stack.coords if "x" in var or "y" in var], errors="ignore")

    # Ensure rechunking aligns with the coarsening window
    lc_stack = lc_stack.chunk({"x": sample_size, "y": sample_size})

    # Check if lc_stack is empty after all modifications
    if lc_stack.size == 0:
        print("Error: LC stack is empty after cropping. Skipping this AOI.")
        return None  

    # Apply coarsening and reduce with unique_class function
    lc_uniqueness = lc_stack.coarsen(x=sample_size, y=sample_size, boundary="trim").reduce(unique_class)

    # Debugging: Print dtype of lc_uniqueness
    print(f"LC uniqueness dtype: {lc_uniqueness.dtype}, shape: {lc_uniqueness.shape}, type: {type(lc_uniqueness)}")

    return lc_uniqueness

    
def has_missing_values(array):
    print("Checking for missing values...")
    return array.isnull().any().compute()

def process_chips(aoi, s2_stack, lc_stack, output_dir, global_index):
    print(f"Processing chips for AOI at {aoi.geometry.bounds}")

    # Compute lc_uniqueness before looping
    lc_uniqueness = is_homogeneous(lc_stack).compute()
    print(f"LC uniqueness dtype after compute: {lc_uniqueness.dtype}, shape: {lc_uniqueness.shape}")

    for i in range(0, lc_stack.shape[1] - chip_size, sample_size):
        for j in range(0, lc_stack.shape[2] - chip_size, sample_size):
            
            # Ensure indices stay within valid bounds
            x_index = min(i // sample_size, lc_uniqueness.sizes["x"] - 1)
            y_index = min(j // sample_size, lc_uniqueness.sizes["y"] - 1)

            # Extract boolean value safely
            value = lc_uniqueness.isel(x=x_index, y=y_index).values
            if isinstance(value, np.ndarray):  
                value = value.item()  # Ensure it's a single boolean value

            if not value:
                print(f"Skipping chip at ({i}, {j}): LC region not homogeneous.")
                continue  

            s2_chip = s2_stack.isel(x=slice(i, i + chip_size), y=slice(j, j + chip_size))
            lc_chip = lc_stack.isel(x=slice(i, i + chip_size), y=slice(j, j + chip_size))

            if has_missing_values(s2_chip):
                continue  

            chip_name = f"s2_{date_range[:3]}_{global_index:05}.tif"
            lc_chip.rio.to_raster(os.path.join(output_dir, f"lc_{chip_name}"))
            s2_chip.rio.to_raster(os.path.join(output_dir, chip_name))

            print(f"Saved chip {chip_name}")
            global_index += 1

    return global_index

def crop_to_aoi(stack, bbox):
    """Crop the Sentinel-2 stack to match the AOI bounding box"""
    minx, miny, maxx, maxy = bbox
    return stack.rio.clip_box(minx, miny, maxx, maxy)



In [52]:
## Main processing loop
chip_dict = {}
global_index = 0
# Main processing loop
for index, aoi in aoi_gdf.iterrows():
    print(f"Processing AOI {index} at {aoi.geometry.bounds}")

    # Search for Sentinel-2 images
    s2_items = [search_s2_scenes(aoi, date_range) for date_range in s2_date_ranges]
    
    if any(len(items) == 0 for items in s2_items):
        print(f"Skipping AOI {index}: Missing Sentinel-2 data for at least one quarter.")
        continue

    # Stack Sentinel-2 images
    s2_stack = stack_s2_data([item[0] for item in s2_items if item], s2_bands)

    # âœ… Crop Sentinel-2 stack to AOI to reduce size
    if s2_stack is not None:
        print(f"Original Sentinel-2 stack shape: {s2_stack.shape}")
        s2_stack = crop_to_aoi(s2_stack, aoi.geometry.bounds)  # Crop it
        print(f"Cropped Sentinel-2 stack shape: {s2_stack.shape}")

    # Ensure valid Sentinel-2 stack before proceeding
    if s2_stack is None:
        print(f"Skipping AOI {index}: No valid Sentinel-2 data found after cropping.")
        continue

    if not hasattr(s2_stack, "rio") or s2_stack.rio.crs is None:
        print(f"Error: s2_stack is missing CRS information. Assigning manually.")
        s2_first_item = next(iter(s2_items), None)  # Get the first actual item
        if s2_first_item is None:
            print("Error: No valid Sentinel-2 item found.")
            continue

        s2_epsg = s2_first_item.properties["proj:epsg"]
        s2_stack.rio.write_crs(f"epsg:{s2_epsg}", inplace=True)

    print(f"Sentinel-2 CRS: {s2_stack.rio.crs}")

    # Search for Land Cover images
    lc_items = search_lc_scene(s2_items[0][0].bbox, lc_year)
    
    if len(lc_items) == 0:
        print(f"Skipping AOI {index}: No valid LC data found.")
        continue

    # Stack and process LC data
    lc_stack = stack_lc_data(lc_items, s2_stack.rio.crs.to_epsg())

    if lc_stack is None:
        print(f"Skipping AOI {index}: No valid LC data found.")
        continue

    # Process chips
    global_index, chip_dict = process_chips(aoi, s2_stack, lc_stack, output_dir, global_index)




Processing AOI 0 at (31.20416267326229, 30.02337142235983, 31.280433633102717, 30.060612342021983)
Searching for Sentinel-2 scenes for AOI at (31.20416267326229, 30.02337142235983, 31.280433633102717, 30.060612342021983) within 2023-01-01/2023-03-31
Found 1 Sentinel-2 scenes
Searching for Sentinel-2 scenes for AOI at (31.20416267326229, 30.02337142235983, 31.280433633102717, 30.060612342021983) within 2023-04-01/2023-06-30


APIError: The request exceeded the maximum allowed time, please try again. If the issue persists, please contact planetarycomputer@microsoft.com.



In [None]:
#optional plotting code
import matplotlib.pyplot as plt

# Select the first stored location
if chip_dict:
    first_location = next(iter(chip_dict.keys()))  # Get the first (i, j) key
    first_four_chips = chip_dict[first_location]  # Get the corresponding chips

    fig, axes = plt.subplots(2, 4, figsize=(15, 6))  # 2 rows, 4 columns

    for idx, (s2_chip, lc_chip) in enumerate(first_four_chips):
        # Sentinel-2 RGB Visualization (assuming B4, B3, B2 are RGB)
        rgb_image = s2_chip.sel(band=[3, 2, 1]).transpose("y", "x", "band")  # Adjust band indices if needed
        axes[0, idx].imshow(rgb_image.compute().clip(0, 3000) / 3000)  # Normalize for better visualization
        axes[0, idx].set_title(f"Sentinel-2 Chip {idx+1}")
        axes[0, idx].axis("off")

        # Land Cover Visualization
        axes[1, idx].imshow(lc_chip.compute(), cmap="tab10")  # Color map for categorical data
        axes[1, idx].set_title(f"Land Cover Chip {idx+1}")
        axes[1, idx].axis("off")

    plt.tight_layout()
    plt.show()

In [None]:
print(chip_dict)