In [1]:
# # List all directories and files in the current working directory
# for root, dirs, files in os.walk('.'):
#     print("Root directory:", root)
#     print("Subdirectories:", dirs)
#     print("Files:", files)
#     break  # Stop after the first level to avoid printing too much

In [2]:
import os
os.chdir("../")    #commment after first run


In [3]:
#imports
import dask.distributed
import pystac_client
import planetary_computer
import stackstac 
import numpy as np
import pandas as pd
import rioxarray
import xarray as xr
import geopandas as gpd
from src.utils import gen_chips

In [4]:
print(os.getcwd())
#should be /home/benchuser/code

/home/benchuser/code


In [5]:
#os.chdir("/home/benchuser/code")

In [6]:
#config setup
import yaml
with open("notebooks/config.yml", "r") as file:
    config = yaml.safe_load(file)
print(config)  # Check the structure of the dictionary


{'sentinel_2': {'collection': 'sentinel-2-l2a', 'time_ranges': ['2023-01-01/2023-03-31', '2023-04-01/2023-06-30', '2023-07-01/2023-09-30', '2023-10-01/2023-12-31'], 'cloud_cover': 1, 'bands': ['B02', 'B03', 'B04', 'B08', 'B11', 'B12'], 'resolution': 10}, 'land_cover': {'collection': 'io-lulc-annual-v02', 'year': '2023-01-02/2023-12-31'}, 'chips': {'sample_size': 100, 'chip_size': 224}, 'output': {'directory': 'notebooks/test_output_dump', 'naming_convention': 's2_{season}_{index:05}.tif'}, 'metadata': {'file': 'metadata.csv'}}


In [7]:
# Sentinel-2 settings
s2_collection = config["sentinel_2"]["collection"]
s2_date_ranges = config["sentinel_2"]["time_ranges"]
s2_bands = config["sentinel_2"]["bands"]
s2_resolution = config["sentinel_2"]["resolution"]
cloud_cover_threshold = config["sentinel_2"]["cloud_cover"]  # Max allowed cloud cover

# Land Cover settings
lc_collection = config["land_cover"]["collection"]
lc_year = config["land_cover"]["year"]  # Year of LC dataset

# Chip settings
sample_size = config["chips"]["sample_size"]  # Grid size for homogeneity check
chip_size = config["chips"]["chip_size"]  # Output chip size

# Output settings
output_dir = config["output"]["directory"]
chip_naming_convention = config["output"]["naming_convention"]

# Metadata settings
metadata_file = config["metadata"]["file"]

# Ensure the output directory exists
os.makedirs(output_dir, exist_ok=True)

# Define seasons for indexing
seasons = ["JFM", "AMJ", "JAS", "OND"]

aoi_gdf = gpd.read_file("data/urbans.geojson") # or "data/aois.geojson"


In [8]:
#dask
from dask.distributed import Client, LocalCluster
cluster = LocalCluster(n_workers=8, threads_per_worker=1)
client = Client(cluster)
print(client.dashboard_link)

http://127.0.0.1:8787/status


Debugging information
---------------------
old task state: released
old run_spec: (<function execute_task at 0x7f99b48659e0>, (('rechunk-merge-rechunk-split-reshape-getitem-f67c1a7ec2b2bc8c42f64be7a8f74565', 0, 0),), {})
new run_spec: (<function execute_task at 0x7f99b48659e0>, (('fetch_raster_window-rechunk-merge-rechunk-split-reshape-getitem-f67c1a7ec2b2bc8c42f64be7a8f74565', 0, 0),), {})
old token: ('tuple', (('913ceb5b5beb463a9010ec0790bc30002ca34164', []), ('tuple', (('tuple', ('rechunk-merge-rechunk-split-reshape-getitem-f67c1a7ec2b2bc8c42f64be7a8f74565', 0, 0)),)), ('dict', ())))
new token: ('tuple', (('913ceb5b5beb463a9010ec0790bc30002ca34164', []), ('tuple', (('tuple', ('fetch_raster_window-rechunk-merge-rechunk-split-reshape-getitem-f67c1a7ec2b2bc8c42f64be7a8f74565', 0, 0)),)), ('dict', ())))
old dependencies: {('rechunk-merge-rechunk-split-reshape-getitem-f67c1a7ec2b2bc8c42f64be7a8f74565', 0, 0)}
new dependencies: {('fetch_raster_window-rechunk-merge-rechunk-split-reshape-g

In [9]:
def search_s2_scenes(aoi, date_range):
    """
    Searches for Sentinel-2 scenes within the AOI and date range.
    Adds debugging info to identify missing data issues.
    """

    #print(f"\n Searching Sentinel-2 for AOI {aoi.geometry.bounds} during {date_range}")

    catalog = pystac_client.Client.open(
        "https://planetarycomputer.microsoft.com/api/stac/v1",
        modifier=planetary_computer.sign_inplace,
    )

    # Print the exact query parameters
    # print(f"Querying collection: 'sentinel-2-l2a'")
    #print(f"Bounding Box: {aoi.geometry.bounds}")
    #print(f"Date Range: {date_range}")
    # print(f"Cloud Cover Filter: < {1}%")
    
    s2_search = catalog.search(
        collections=["sentinel-2-l2a"],
        bbox=aoi.geometry.bounds, 
        datetime=date_range,
        query={"eo:cloud_cover": {"lt": 1}},  # Fix query syntax to dictionary
        sortby=["+properties.eo:cloud_cover"],  # Sort by least cloud cover
        max_items=10,  # Increase items to check for any available data
    )

    # Print found items
    items = s2_search.item_collection()
    #print(f"Found {len(items)} Sentinel-2 scenes")
    
    # Print full STAC item properties for debugging
    # if len(items) > 0:
    #     print(f"First Scene ID: {items[0].id}")
    #     print(f"First Scene EPSG: {items[0].properties.get('proj:epsg', 'Unknown')}")
    #     print(f"Acquisition Date: {items[0].properties.get('datetime', 'Unknown')}")
    #     print(f"Cloud Cover: {items[0].properties.get('eo:cloud_cover', 'Unknown')}")
    # else:
    #     print(f"No Sentinel-2 data found for this AOI and date range!")

    return list(items)  # Convert ItemCollection to a list of Items
def search_lc_scene(bbox, lc_date_range):
    #print(f"Searching for Land Cover scenes within {lc_date_range} for bbox {bbox}")
    catalog = pystac_client.Client.open(
        "https://planetarycomputer.microsoft.com/api/stac/v1",
        modifier=planetary_computer.sign_inplace,
    )
    lc_search = catalog.search(
        collections=["io-lulc-annual-v02"],
        bbox=bbox,
        datetime=lc_date_range,
    )
    items = lc_search.item_collection()
    #print(f"Found {len(items)} Land Cover scenes")
    return list(items)  # Convert ItemCollection to a list of Items

def stack_s2_data(s2_items, s2_bands):
    print("\nChecking available assets in Sentinel-2 items...")
    valid_bands = [band for band in s2_bands if all(band in item.assets for item in s2_items)]
    try:
        s2_stack = stackstac.stack(
            s2_items,
            assets=valid_bands,
            epsg=s2_items[0].properties["proj:epsg"],
            resolution=10,
            fill_value=np.nan,
            dtype="float32"  # Ensure time dimension is preserved
        )
        print(f"Stacked Sentinel-2 bands: {list(s2_stack.coords['band'].values)}")
        print(f"Number of time steps: {len(s2_stack.time)}")
        return s2_stack
    except Exception as e:
        print(f"Error stacking Sentinel-2 data: {e}")
        return None
        
def stack_lc_data(lc_items, s2_epsg):
    if not lc_items:
        print("No Land Cover data found.")
        return None
    try:
        print("Stacking Land Cover images...")
        stacked_data = stackstac.stack(
            lc_items,
            dtype=np.ubyte,
            fill_value=255,
            sortby_date=False,
            epsg=s2_epsg,
            resolution=10,
            bounds_latlon=s2_items[0].bbox,
        ).squeeze()

        stacked_data = stacked_data.chunk(chunks={"x": sample_size, "y": sample_size})
        print("Stacked LC data shape:", stacked_data.shape)
        #print(f"Chunk sizes: {stacked_data.chunks}")   #uncomment for big chunk size 

        return stacked_data

    except Exception as e:
        print(f"Error stacking Land Cover data: {e}")
        return None


def has_missing_values(array):
    """Check if the given array contains NaN values and print only when necessary."""
    has_nan = array.isnull().any().compute()
    if has_nan:
        print("Warning: Missing values detected in the chip!")
    return has_nan

def process_chips(aoi, s2_stack, lc_stack, output_dir, global_index, chip_dict, sample_size):
    print(f"Processing chips for AOI at {aoi.geometry.bounds}")
    print(f"Number of time steps in s2_stack: {len(s2_stack.time)}")

    def unique_class(window, axis=None, **kwargs):
        return np.all(window == window[0, 0], axis=axis)

    new_x = (lc_stack.shape[0] // sample_size) * sample_size
    new_y = (lc_stack.shape[1] // sample_size) * sample_size

    lc_stack_trimmed = lc_stack.isel(
        x=slice(0, new_x), 
        y=slice(0, new_y)
    ).assign_coords(
        x=lc_stack.coords['x'].isel(x=slice(0, new_x)),
        y=lc_stack.coords['y'].isel(y=slice(0, new_y))
    )

    lc_uniqueness = lc_stack_trimmed.coarsen(x=sample_size, y=sample_size, boundary="trim").reduce(unique_class)

    for t in range(len(s2_stack.time)):
        acquisition_date = str(s2_stack.time[t].values)
        chip_count = 0
        print(f"Time step {t}, acquisition date: {acquisition_date}")

        for i in range(0, lc_stack_trimmed.shape[0] - sample_size + 1, sample_size):
            if chip_count >= 2:
                break
            for j in range(0, lc_stack_trimmed.shape[1] - sample_size + 1, sample_size):
                #print(f"i: {i}, j: {j}")
                if chip_count >= 2:
                    print(f"Reached 2 chips for time period {acquisition_date}. Moving to next time period.")
                      # Reset count for the next time period
                    break  # This breaks the innermost loop
                
                if not lc_uniqueness.isel(x=i // sample_size, y=j // sample_size):
                    #print(f"Skipping non-homogeneous chip at ({i}, {j})")
                    continue

                s2_chip = s2_stack.isel(time=t, x=slice(i, i + sample_size), y=slice(j, j + sample_size))
                lc_chip = lc_stack_trimmed.isel(x=slice(i, i + sample_size), y=slice(j, j + sample_size))

                print(f"Checking chip at ({i}, {j}), time step {t}")
                if s2_chip.isnull().all() or lc_chip.isnull().all():
                    print(f"Skipping chip {global_index}: Empty dataset (NoDataInBounds)")
                    continue

                chip_metadata = {
                    "aoi_bounds": aoi.geometry.bounds,
                    "date": acquisition_date,
                    "coords": (i, j)
                }

                chip_dict[(i, j, acquisition_date)] = (s2_chip, lc_chip, chip_metadata)

                chip_name = f"s2_{global_index:05}_{acquisition_date}.tif"
                print(f"Saving chip {chip_name} to {output_dir}")
                lc_chip.rio.to_raster(os.path.join(output_dir, f"lc_{chip_name}"))
                s2_chip.squeeze().drop_vars([dim for dim in s2_chip.dims if dim not in ['band', 'x', 'y']], errors="ignore").rio.to_raster(os.path.join(output_dir, chip_name))

                print(f"Saved chip {chip_name}")

                global_index += 1
                chip_count += 1

                if chip_count >= 8:
                    print(f"Reached 8 chips for time period {acquisition_date}. Moving to next time period.")
                    break

    print("Finished processing chips.")
    return global_index


def plot_chips(chip_dict, chip_indices):
    import matplotlib.pyplot as plt

    selected_chips = [chip_dict[idx] for idx in chip_indices if idx in chip_dict]

    if not selected_chips:
        print("No chips selected for plotting.")
        return

    num_chips = len(selected_chips)
    num_rows = (num_chips + 3) // 4  # Calculate the number of rows for 4 chips per row

    fig, axes = plt.subplots(num_rows, 4, figsize=(20, 6 * num_rows))
    axes = axes.flatten() if num_rows > 1 else [axes]

    for idx, (s2_chip, lc_chip, metadata) in enumerate(selected_chips):
        if s2_chip is None or lc_chip is None:
            print(f"Skipping chip {chip_indices[idx]}: Missing data.")
            continue

        # Check if the 'time' dimension exists, if not, select the first time step
        if 'time' in s2_chip.dims:
            rgb_image = s2_chip.isel(time=0).sel(band=["B04", "B03", "B02"]).transpose("y", "x", "band").compute()
        else:
            rgb_image = s2_chip.sel(band=["B04", "B03", "B02"]).transpose("y", "x", "band").compute()

        axes[idx].imshow(rgb_image.clip(0, 3000) / 3000)
        axes[idx].set_title(f"Chip {idx+1}\nDate: {metadata['date']}\nCoords: {metadata['coords']}")
        axes[idx].axis("off")

    for ax in axes[num_chips:]:
        ax.axis("off")

    plt.tight_layout()
    plt.show()




In [10]:
global_index = 0
chip_dict = {}
for index, aoi in aoi_gdf.iterrows():
    print(f"\nProcessing AOI: {aoi.geometry.bounds}")  # Debug: print AOI bounds

    for date_range in s2_date_ranges:
        print(f"Querying for date range: {date_range}")  # Debug: print date range being processed
       
        # Debugging: Check if Sentinel-2 items are found for the AOI and date range
        s2_items = search_s2_scenes(aoi, date_range)
        if not s2_items:
            print(f"No Sentinel-2 scenes found for AOI {aoi.geometry.bounds} and date range {date_range}")
            continue
       
        expected_bands = ["B02", "B03", "B04", "B08", "B11", "B12"]
        s2_stack = stack_s2_data(s2_items, expected_bands)
       
        # Debugging: Check if Sentinel-2 stack is created correctly
        if s2_stack is None:
            print(f"Failed to stack Sentinel-2 bands for AOI {aoi.geometry.bounds} and date range {date_range}")
            continue

        # Debugging: Check if Land Cover items are found for the AOI and date range
        lc_items = search_lc_scene(s2_items[0].bbox, lc_year)
        if not lc_items:
            print(f"No Land Cover data found for AOI {aoi.geometry.bounds} and date range {date_range}")
            continue
       
        # Debugging: Check Land Cover data dimensions and bounds
        lc_stack = stack_lc_data(lc_items, s2_stack.rio.crs.to_epsg())
        if lc_stack is None:
            print(f"Failed to stack Land Cover data for AOI {aoi.geometry.bounds} and date range {date_range}")
            continue
        print(f"Land Cover stack shape: {lc_stack.shape}")  # Debug: check shape of the LC stack

        # Debugging: Check bounding box of the Land Cover data and Sentinel-2 data
        print(f"Land Cover bounding box: {s2_items[0].bbox}")  # Debug: print bounding box for Sentinel-2 scene
        print(f"Land Cover bounding box for AOI: {aoi.geometry.bounds}")  # Debug: print AOI bounding box

        # Process chips after confirming the data is available
        global_index = process_chips(aoi, s2_stack, lc_stack, output_dir, global_index, chip_dict, sample_size)


Processing AOI: (31.20416267326229, 30.02337142235983, 31.280433633102717, 30.060612342021983)
Querying for date range: 2023-01-01/2023-03-31

Checking available assets in Sentinel-2 items...
Stacked Sentinel-2 bands: ['B02', 'B03', 'B04', 'B08', 'B11', 'B12']
Number of time steps: 3


  times = pd.to_datetime(


Stacking Land Cover images...


  times = pd.to_datetime(


Stacked LC data shape: (11272, 11273)
Land Cover stack shape: (11272, 11273)
Land Cover bounding box: [30.91139663, 29.72604903, 32.06723105, 30.72965025]
Land Cover bounding box for AOI: (31.20416267326229, 30.02337142235983, 31.280433633102717, 30.060612342021983)
Processing chips for AOI at (31.20416267326229, 30.02337142235983, 31.280433633102717, 30.060612342021983)
Number of time steps in s2_stack: 3
Time step 0, acquisition date: 2023-01-18T08:31:59.024000000
Checking chip at (0, 900), time step 0
Saving chip s2_00000_2023-01-18T08:31:59.024000000.tif to notebooks/test_output_dump
Saved chip s2_00000_2023-01-18T08:31:59.024000000.tif
Checking chip at (0, 3000), time step 0
Saving chip s2_00001_2023-01-18T08:31:59.024000000.tif to notebooks/test_output_dump
Saved chip s2_00001_2023-01-18T08:31:59.024000000.tif
Reached 2 chips for time period 2023-01-18T08:31:59.024000000. Moving to next time period.
Time step 1, acquisition date: 2023-03-04T08:28:31.024000000
Checking chip at (0,

  times = pd.to_datetime(


Stacking Land Cover images...


  times = pd.to_datetime(


Stacked LC data shape: (11272, 11273)
Land Cover stack shape: (11272, 11273)
Land Cover bounding box: [30.91139663, 29.72604903, 32.06723105, 30.72965025]
Land Cover bounding box for AOI: (31.20416267326229, 30.02337142235983, 31.280433633102717, 30.060612342021983)
Processing chips for AOI at (31.20416267326229, 30.02337142235983, 31.280433633102717, 30.060612342021983)
Number of time steps in s2_stack: 10
Time step 0, acquisition date: 2023-04-03T08:26:01.024000000
Checking chip at (0, 900), time step 0
Saving chip s2_00006_2023-04-03T08:26:01.024000000.tif to notebooks/test_output_dump
Saved chip s2_00006_2023-04-03T08:26:01.024000000.tif
Checking chip at (0, 3000), time step 0
Saving chip s2_00007_2023-04-03T08:26:01.024000000.tif to notebooks/test_output_dump
Saved chip s2_00007_2023-04-03T08:26:01.024000000.tif
Reached 2 chips for time period 2023-04-03T08:26:01.024000000. Moving to next time period.
Time step 1, acquisition date: 2023-04-18T08:26:09.024000000
Checking chip at (0

  times = pd.to_datetime(


Stacking Land Cover images...
Stacked LC data shape: (11272, 11273)
Land Cover stack shape: (11272, 11273)
Land Cover bounding box: [30.91139663, 29.72604903, 32.06723105, 30.72965025]
Land Cover bounding box for AOI: (31.20416267326229, 30.02337142235983, 31.280433633102717, 30.060612342021983)
Processing chips for AOI at (31.20416267326229, 30.02337142235983, 31.280433633102717, 30.060612342021983)
Number of time steps in s2_stack: 10


  times = pd.to_datetime(


Time step 0, acquisition date: 2023-07-17T08:26:09.024000000
Checking chip at (0, 900), time step 0
Saving chip s2_00026_2023-07-17T08:26:09.024000000.tif to notebooks/test_output_dump
Saved chip s2_00026_2023-07-17T08:26:09.024000000.tif
Checking chip at (0, 3000), time step 0
Saving chip s2_00027_2023-07-17T08:26:09.024000000.tif to notebooks/test_output_dump
Saved chip s2_00027_2023-07-17T08:26:09.024000000.tif
Reached 2 chips for time period 2023-07-17T08:26:09.024000000. Moving to next time period.
Time step 1, acquisition date: 2023-07-22T08:26:11.024000000
Checking chip at (0, 900), time step 1
Saving chip s2_00028_2023-07-22T08:26:11.024000000.tif to notebooks/test_output_dump
Saved chip s2_00028_2023-07-22T08:26:11.024000000.tif
Checking chip at (0, 3000), time step 1
Saving chip s2_00029_2023-07-22T08:26:11.024000000.tif to notebooks/test_output_dump
Saved chip s2_00029_2023-07-22T08:26:11.024000000.tif
Reached 2 chips for time period 2023-07-22T08:26:11.024000000. Moving to 

  times = pd.to_datetime(


Stacking Land Cover images...


  times = pd.to_datetime(


Stacked LC data shape: (11272, 11273)
Land Cover stack shape: (11272, 11273)
Land Cover bounding box: [30.91139663, 29.72604903, 32.06723105, 30.72965025]
Land Cover bounding box for AOI: (31.20416267326229, 30.02337142235983, 31.280433633102717, 30.060612342021983)
Processing chips for AOI at (31.20416267326229, 30.02337142235983, 31.280433633102717, 30.060612342021983)
Number of time steps in s2_stack: 5
Time step 0, acquisition date: 2023-10-20T08:29:41.024000000
Checking chip at (0, 900), time step 0
Saving chip s2_00046_2023-10-20T08:29:41.024000000.tif to notebooks/test_output_dump
Saved chip s2_00046_2023-10-20T08:29:41.024000000.tif
Checking chip at (0, 3000), time step 0
Saving chip s2_00047_2023-10-20T08:29:41.024000000.tif to notebooks/test_output_dump
Saved chip s2_00047_2023-10-20T08:29:41.024000000.tif
Reached 2 chips for time period 2023-10-20T08:29:41.024000000. Moving to next time period.
Time step 1, acquisition date: 2023-10-25T08:29:29.024000000
Checking chip at (0,

  times = pd.to_datetime(


Stacking Land Cover images...
Stacked LC data shape: (11350, 10215)
Land Cover stack shape: (11350, 10215)
Land Cover bounding box: [28.32859925, 40.53618553, 29.51453839, 41.53860487]
Land Cover bounding box for AOI: (28.84773759372314, 41.00724457843606, 28.92438317781108, 41.04136515505064)
Processing chips for AOI at (28.84773759372314, 41.00724457843606, 28.92438317781108, 41.04136515505064)
Number of time steps in s2_stack: 1


  times = pd.to_datetime(


Time step 0, acquisition date: 2023-01-09T08:53:31.024000000
Checking chip at (0, 0), time step 0
Skipping chip 56: Empty dataset (NoDataInBounds)
Checking chip at (0, 100), time step 0
Skipping chip 56: Empty dataset (NoDataInBounds)
Checking chip at (0, 200), time step 0
Skipping chip 56: Empty dataset (NoDataInBounds)
Checking chip at (0, 300), time step 0
Skipping chip 56: Empty dataset (NoDataInBounds)
Checking chip at (0, 400), time step 0
Skipping chip 56: Empty dataset (NoDataInBounds)
Checking chip at (0, 500), time step 0
Skipping chip 56: Empty dataset (NoDataInBounds)
Checking chip at (0, 600), time step 0
Skipping chip 56: Empty dataset (NoDataInBounds)
Checking chip at (0, 700), time step 0
Skipping chip 56: Empty dataset (NoDataInBounds)
Checking chip at (0, 800), time step 0
Skipping chip 56: Empty dataset (NoDataInBounds)
Checking chip at (0, 5600), time step 0
Skipping chip 56: Empty dataset (NoDataInBounds)
Checking chip at (0, 5700), time step 0
Skipping chip 56: Em



KeyboardInterrupt: 

In [None]:
print(chip_dict.keys())

In [None]:
#print(chip_dict)

In [None]:
import matplotlib.pyplot as plt
# List of valid chip indices in chip_dict
valid_chip_indices = list(chip_dict.keys())

# Map your indices to the actual (i, j, acquisition_date) keys
selected_indices = []
for idx in [1, 20, 40, 50]:
    # This assumes that you want to select based on the index of chips, i.e., global index
    if idx < len(valid_chip_indices):
        selected_indices.append(valid_chip_indices[idx])

# Plot the selected chips
plot_chips(chip_dict, selected_indices)


In [None]:
#print(s2_stack)

In [None]:
#print(s2_stack.isel(band=0))

In [None]:
#print("First four chips content:", first_four_chips)

In [None]:
#print(chip_dict)