# GELOS Chip Generation

This notebook runs the chip generation pipeline interactively. It instantiates the `Downloader` class, builds an `AOI_Processor` for a single AOI, and then steps through the generation pipeline for that AOI, for explanation and testing.

In [1]:
# Reload imported modules automatically before each cell executes, so edits to the
# project's source files take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import argparse
import geopandas as gpd
from src.gelos_config import GELOSConfig
from src.downloader import Downloader
from src.aoi_processor import AOI_Processor
from pathlib import Path
import shutil
from src.utils.array import unique_class
import leafmap
import pandas as pd
# Disable pandas' row and column truncation so displayed frames are shown in full.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [3]:
import pdb
import xarray as xr
import rioxarray as rxr
from src.gelos_config import GELOSConfig
from src.chip_generator import ChipGenerator
import pystac
import pandas as pd
import geopandas as gpd

from src.utils.search import search_s2l2a_scenes, search_s1rtc_scenes, search_lc2l2_scenes, search_annual_scene, count_unique_dates, get_lc2l2_wrs_path
from src.utils.stack import stack_data, stack_dem_data, stack_lulc_data, pystac_itemcollection_to_gdf
from functools import reduce

In [4]:
# Local archive path and download URL of the WRS grid file that is read later to
# select footprints by their PATH value.
# NOTE(review): only `lc2l2_wrs_url` is read in this notebook; `lc2l2_wrs_path` is later
# rebound to `processor.lc2l2_wrs_path` — confirm the local path is still needed.
lc2l2_wrs_path = '/home/benchuser/data/WRs2l2a_descending_0.zip'
lc2l2_wrs_url = 'https://d9-wret.s3.us-west-2.amazonaws.com/assets/palladium/production/s3fs-public/atoms/files/WRs2l2a_descending_0.zip'

In [6]:
# Load the pipeline configuration and override the dataset version with 'test',
# so this interactive run uses its own sub-directory of the working directory.
config = "/app//config.yml"
gelosconfig = GELOSConfig.from_yaml(config)
gelosconfig.dataset.version = 'test'
working_directory = Path(gelosconfig.directory.working) / gelosconfig.dataset.version

# Create the versioned working directory if none exists. `parents=True` also creates
# the base working directory, so the cell does not fail when that is missing too.
working_directory.mkdir(parents=True, exist_ok=True)
# Copy the yaml next to the outputs it configures. `shutil.copy` returns the
# destination path, which is what the cell displays.
shutil.copy(config, working_directory / "config.yaml")


PosixPath('/app/data/interim/test/config.yaml')

In [7]:
# Show the attributes of the loaded configuration object as a dict.
vars(gelosconfig)

{'dataset': DatasetConfig(version='test'),
 'aoi': AoiConfig(version='v0.30', include_indices=None, exclude_indices=None),
 'directory': DirectoryConfig(working='/app/data/interim', output='/app/data/processed'),
 'log_errors': True,
 's2l2a': S2L2AConfig(collection='sentinel-2-l2a', resolution=10, native_crs=True, fill_na=False, na_value=-999, dtype=dtype('int16'), time_ranges=['2023-01-01/2023-03-31', '2023-04-01/2023-06-30', '2023-07-01/2023-09-30', '2023-10-01/2023-12-31'], nodata_pixel_percentage=5, cloud_cover=30, cloud_band='SCL', bands=['B01', 'B02', 'B03', 'B04', 'B05', 'B06', 'B07', 'B08', 'B8A', 'B09', 'B11', 'B12', 'SCL']),
 's1rtc': S1RTCConfig(collection='sentinel-1-rtc', resolution=10, native_crs=False, fill_na=False, na_value=-999, dtype=dtype('float32'), nodata_pixel_percentage=5, delta_days=12, bands=['vv', 'vh']),
 'LC2L2': lc2l2Config(collection='LC2L2-c2-l2', resolution=30, native_crs=False, fill_na=False, na_value=-999, dtype=dtype('float32'), platforms=['LC2L2-8'

In [9]:
# Build the downloader from the loaded configuration; its `aoi_gdf`, `chip_index`,
# `working_directory` and `catalog` attributes are passed to the AOI processor below.
downloader = Downloader(gelosconfig)

In [8]:
# Index of the AOI to process. Defined once so that the `aoi_index` label and the row
# selected from `downloader.aoi_gdf` cannot drift apart when the value is changed.
AOI_INDEX = 133

# Build the processor for that single AOI from the downloader's attributes.
processor = AOI_Processor(
    aoi_index=AOI_INDEX,
    aoi=downloader.aoi_gdf.iloc[AOI_INDEX],
    chip_index=downloader.chip_index,
    working_directory=downloader.working_directory,
    catalog=downloader.catalog,
    config=gelosconfig,
)

In [9]:
# Show the attributes of the freshly constructed processor as a dict.
vars(processor)

{'config': GELOSConfig(dataset=DatasetConfig(version='test'), aoi=AoiConfig(version='v0.30', include_indices=None, exclude_indices=None), directory=DirectoryConfig(working='/home/benchuser/data', output='/home/benchuser/final_data'), log_errors=True, s2l2a=S2L2AConfig(collection='sentinel-2-l2a', resolution=10, native_crs=True, fill_na=False, na_value=-999, dtype=dtype('int16'), time_ranges=['2023-01-01/2023-03-31', '2023-04-01/2023-06-30', '2023-07-01/2023-09-30', '2023-10-01/2023-12-31'], nodata_pixel_percentage=5, cloud_cover=50, cloud_band='SCL', bands=['B01', 'B02', 'B03', 'B04', 'B05', 'B06', 'B07', 'B08', 'B8A', 'B09', 'B11', 'B12', 'SCL']), s1rtc=S1RTCConfig(collection='sentinel-1-rtc', resolution=10, native_crs=False, fill_na=False, na_value=-999, dtype=dtype('float32'), nodata_pixel_percentage=5, delta_days=12, bands=['vv', 'vh']), lc2l2=lc2l2Config(collection='LC2L2-c2-l2', resolution=30, native_crs=False, fill_na=False, na_value=-999, dtype=dtype('float32'), platforms=['LC2

## Run process_aoi step-by-step

In [10]:
# Collect Sentinel-2 items across every configured time range, starting from an
# empty item collection.
s2l2a_items = pystac.item_collection.ItemCollection([])
for date_range in processor.config.s2l2a.time_ranges:
    print(f"Searching Sentinel-2 scenes for {date_range}")
    # The search returns the items for this time range plus a scene id. The scene id is
    # stored on the processor and passed back in on the next iteration.
    # NOTE(review): presumably this keeps later time ranges on the same scene/tile —
    # confirm in search_s2l2a_scenes.
    s2l2a_items_season, processor.s2l2a_scene_id = search_s2l2a_scenes(
        processor.aoi.geometry,
        date_range,
        processor.catalog,
        processor.config.s2l2a.collection,
        processor.config.s2l2a.nodata_pixel_percentage,
        processor.config.s2l2a.cloud_cover,
        processor.s2l2a_scene_id,
    )
    # Abort at the first time range that yields nothing.
    if not s2l2a_items_season:
        raise ValueError("s2 scenes missing")
    s2l2a_items += s2l2a_items_season

Searching Sentinel-2 scenes for 2023-01-01/2023-03-31
Searching Sentinel-2 scenes for 2023-04-01/2023-06-30
Searching Sentinel-2 scenes for 2023-07-01/2023-09-30
Searching Sentinel-2 scenes for 2023-10-01/2023-12-31


In [11]:
# Display the collected Sentinel-2 items.
s2l2a_items

In [12]:
# Take the EPSG code from the first Sentinel-2 item. When the integer "proj:epsg"
# property is absent, parse it from the "proj:code" string (the part after the last ":").
# Only the missing-key case triggers the fallback: a bare `except` would also swallow
# unrelated failures such as KeyboardInterrupt.
try:
    processor.epsg = s2l2a_items[0].properties["proj:epsg"]
except KeyError:
    processor.epsg = int(s2l2a_items[0].properties["proj:code"].split(":")[-1])

# The first item's footprint geometry is used as the search area for the other sources.
processor.s2l2a_bbox = s2l2a_items[0].geometry

In [13]:
# Display the footprint geometry taken from the first Sentinel-2 item.
processor.s2l2a_bbox

{'type': 'Polygon',
 'coordinates': [[[-112.19945, 41.5455888],
   [-110.882965, 41.5517851],
   [-110.884705, 40.5626924],
   [-112.181625, 40.5567069],
   [-112.19945, 41.5455888]]]}

### Search lc2l2 scenes

In [14]:
# Derive the WRS path from the Sentinel-2 footprint and store it on the processor.
processor.lc2l2_wrs_path = get_lc2l2_wrs_path(processor.s2l2a_bbox)

In [15]:
# Display the WRS path stored on the processor.
processor.lc2l2_wrs_path

38

In [16]:
# Read the WRS grid archive directly from its URL and reproject it to EPSG:3857.
lc2l2_wrs_gdf = gpd.read_file(lc2l2_wrs_url).to_crs(3857)

In [17]:
# Keep only the grid rows whose PATH equals the WRS path stored on the processor.
lc2l2_wrs_path_scenes = lc2l2_wrs_gdf[lc2l2_wrs_gdf['PATH'] == processor.lc2l2_wrs_path]

In [18]:
# Show the selected grid rows on an interactive map.
lc2l2_wrs_path_scenes.explore()

In [19]:
# Unpack the arguments of a single lc2l2 search into plain variables, so the search
# can be reproduced step by step in the following cells.
lc2l2_items = pystac.item_collection.ItemCollection([])

# Use the first Sentinel-2 item and the first configured time range as the example.
s2l2a_item = s2l2a_items[0]
date_range = processor.config.s2l2a.time_ranges[0]

center_datetime = s2l2a_item.datetime

aoi = processor.s2l2a_bbox
overall_date_range = date_range
delta_days = processor.config.lc2l2.delta_days
catalog = processor.catalog
collection = processor.config.lc2l2.collection
platforms = processor.config.lc2l2.platforms
cloud_cover = processor.config.lc2l2.cloud_cover
# NOTE: this rebinds `lc2l2_wrs_path`, which an earlier cell set to a local file path.
lc2l2_wrs_path = processor.lc2l2_wrs_path

# Echo every search argument for inspection.
print(f'{aoi=}')
print(f'{center_datetime=}')
print(f'{overall_date_range=}')
print(f'{delta_days=}')
print(f'{catalog=}')
print(f'{collection=}')
print(f'{platforms=}')
print(f'{cloud_cover=}')
print(f'{lc2l2_wrs_path=}')


aoi={'type': 'Polygon', 'coordinates': [[[-112.19945, 41.5455888], [-110.882965, 41.5517851], [-110.884705, 40.5626924], [-112.181625, 40.5567069], [-112.19945, 41.5455888]]]}
center_datetime=datetime.datetime(2023, 2, 16, 18, 14, 21, 24000, tzinfo=tzutc())
overall_date_range='2023-01-01/2023-03-31'
delta_days=91
catalog=<Client id=microsoft-pc>
collection='LC2L2-c2-l2'
platforms=['LC2L2-8', 'LC2L2-9']
cloud_cover=100
lc2l2_wrs_path=38


In [20]:
from src.utils.search import get_clipped_datetime_range

In [21]:
# Build the datetime range for the search from the center datetime, the overall date
# range and `delta_days`.
# NOTE(review): presumably center ± delta_days clipped to the overall range — confirm
# in get_clipped_datetime_range.
datetime_range = get_clipped_datetime_range(center_datetime, overall_date_range, delta_days)

In [22]:
# Property filters for the search: platform must be one of `platforms` and the
# cloud cover must be below `cloud_cover`.
query = {
        "platform": {"in": platforms},
        "eo:cloud_cover": {"lt": cloud_cover},
    }
# NOTE(review): the WRS-path filter below is commented out, so this query is not
# restricted to `lc2l2_wrs_path` — confirm whether that is intended.
# if lc2l2_wrs_path:
#     query["lc2l2:wrs_path"] = {"eq": lc2l2_wrs_path}
print(query)

{'platform': {'in': ['LC2L2-8', 'LC2L2-9']}, 'eo:cloud_cover': {'lt': 100}}


In [23]:
# Set up a catalog search for items of `collection` that intersect the AOI within the
# datetime range and satisfy `query`, sorted by ascending cloud cover and capped at
# 50 items.
search = catalog.search(
    collections = collection,
    intersects = aoi,
    datetime = datetime_range,
    query = query,
    sortby = ["+properties.eo:cloud_cover"],
    max_items = 50 
)

In [24]:
# Collect the search results into `items`.
items = search.item_collection()

In [26]:
# Run the lc2l2 search once per Sentinel-2 item / time-range pair and accumulate the results.
lc2l2_items = pystac.item_collection.ItemCollection([])

# NOTE(review): `zip` pairs items and time ranges positionally, so this assumes
# `s2l2a_items` holds exactly one item per time range, in order — confirm.
for s2l2a_item, date_range in zip(s2l2a_items, processor.config.s2l2a.time_ranges):
    center_datetime = s2l2a_item.datetime
 
    lc2l2_item = search_lc2l2_scenes(
        processor.s2l2a_bbox,
        center_datetime,
        date_range,
        processor.config.lc2l2.delta_days,
        processor.catalog,
        processor.config.lc2l2.collection,
        processor.config.lc2l2.platforms,
        processor.config.lc2l2.cloud_cover,
        processor.lc2l2_wrs_path,
    )
    # NOTE(review): unlike the combined search cell further down, an empty result is
    # not checked here before it is appended — confirm this is deliberate.
    lc2l2_items += lc2l2_item



## Run Searching and Stacking Process

In [25]:
# For each Sentinel-2 item / time-range pair, search the Sentinel-1 and lc2l2 scenes
# around the Sentinel-2 acquisition time, and fail fast when either is missing.
lc2l2_items = pystac.item_collection.ItemCollection([])
s1rtc_items = pystac.item_collection.ItemCollection([])
for s2l2a_item, date_range in zip(s2l2a_items, processor.config.s2l2a.time_ranges):
    center_datetime = s2l2a_item.datetime
    print(f"searching s1rtc and lc2l2 scenes close to {center_datetime} within {date_range}")
    # The relative orbit returned by the search is stored on the processor and passed
    # back in on the next iteration.
    s1rtc_item, processor.s1rtc_relative_orbit = search_s1rtc_scenes(
        processor.s2l2a_bbox,
        center_datetime,
        date_range,
        processor.config.s1rtc.delta_days,
        processor.catalog,
        processor.config.s1rtc.collection,
        processor.s1rtc_relative_orbit,
    )
    if not s1rtc_item:
        raise ValueError("s1 scenes missing")
    s1rtc_items += s1rtc_item

    lc2l2_item = search_lc2l2_scenes(
        processor.s2l2a_bbox,
        center_datetime,
        date_range,
        processor.config.lc2l2.delta_days,
        processor.catalog,
        processor.config.lc2l2.collection,
        processor.config.lc2l2.platforms,
        processor.config.lc2l2.cloud_cover,
        processor.lc2l2_wrs_path,
    )
    if not lc2l2_item:
        raise ValueError("lc2l2 scenes missing")
    lc2l2_items += lc2l2_item

# One distinct acquisition date is required per configured time range. Deriving the
# threshold from the configuration keeps the check valid if the number of time
# ranges changes (it equals the previous hard-coded 4 for the current config).
expected_date_count = len(processor.config.s2l2a.time_ranges)

if count_unique_dates(lc2l2_items) < expected_date_count:
    raise ValueError("lc2l2 scenes missing")

if count_unique_dates(s1rtc_items) < expected_date_count:
    raise ValueError("s1 scenes missing")
        
# Land cover: one annual-scene search over the Sentinel-2 footprint for the configured year.
print("searching land cover data...")
lulc_items = search_annual_scene(
    processor.s2l2a_bbox, processor.config.lulc.year, processor.catalog, processor.config.lulc.collection
)
if not lulc_items:
    raise ValueError("lulc data missing")

# Elevation: the same kind of search against the DEM collection.
print("searching dem data...")
dem_items = search_annual_scene(
    processor.s2l2a_bbox, processor.config.dem.year, processor.catalog, processor.config.dem.collection
)
if not dem_items:
    raise ValueError("dem data missing")

    # first, get area of overlap of all item bboxes
processor.itemcollections = {
    "s2l2a": s2l2a_items,
    "s1rtc": s1rtc_items,
    "lc2l2": lc2l2_items,
    "lulc": lulc_items,
    "dem": dem_items
}
bbox_gdf = pd.concat([pystac_itemcollection_to_gdf(items) for items in processor.itemcollections.values()])
bbox_gdf.to_file(processor.working_directory / f"{processor.aoi_index}_stac_items.json", driver="GeoJSON")

# group scenes which share a collection and date
# bbox_gdf['date'] = bbox_gdf.apply(lambda x: x.datetime.date())
bbox_gdf['datetime'] = pd.to_datetime(bbox_gdf['datetime'], format="mixed")
bbox_gdf['date'] = bbox_gdf['datetime'].dt.date
combined_geoms = bbox_gdf.groupby(['collection', 'date'])['geometry'].apply(lambda x: x.unary_union)

# get the intersection of all data sources as the bounding box for stacks
overlap = reduce(lambda x, y: x.intersection(y), combined_geoms)
processor.overlap_bounds = overlap.bounds

# lc2l2 is stacked first, using the overlap bounds as a lat/lon bounding box.
print("stacking lc2l2 data...")
processor.stacks['LC2L2'] = stack_data(
    lc2l2_items,
    "lc2l2",
    processor.config.lc2l2.native_crs,
    processor.config.lc2l2.resolution,
    processor.config.lc2l2.bands,
    processor.config.lc2l2.cloud_band,
    processor.epsg,
    processor.overlap_bounds,
    bbox_is_latlon=True,
)

# Every remaining stack reuses the bounds of the lc2l2 stack, which are passed with
# bbox_is_latlon=False.
overlap_bbox = processor.stacks['LC2L2'].rio.bounds()

print("stacking dem data...")
processor.stacks['dem'] = stack_dem_data(
    dem_items,
    processor.config.dem.native_crs,
    processor.config.dem.resolution,
    processor.epsg,
    overlap_bbox,
    bbox_is_latlon=False,
)

print("stacking land cover data...")
processor.stacks['lulc'] = stack_lulc_data(
    lulc_items,
    processor.config.lulc.native_crs,
    processor.config.lulc.resolution,
    processor.epsg,
    overlap_bbox,
    bbox_is_latlon=False,
)

print("stacking s1rtc data...")
processor.stacks['s1rtc'] = stack_data(
    s1rtc_items,
    "s1rtc",
    processor.config.s1rtc.native_crs,
    processor.config.s1rtc.resolution,
    processor.config.s1rtc.bands,
    None,  # Sentinel-1 has no cloud band
    processor.epsg,
    overlap_bbox,
    bbox_is_latlon=False,
)

print("stacking s2l2a data...")
processor.stacks['s2l2a'] = stack_data(
    s2l2a_items,
    "s2l2a",
    processor.config.s2l2a.native_crs,
    processor.config.s2l2a.resolution,
    processor.config.s2l2a.bands,
    processor.config.s2l2a.cloud_band,
    processor.epsg,
    overlap_bbox,
    bbox_is_latlon=False,
)


searching s1rtc and lc2l2 scenes close to 2023-02-16 18:14:21.024000+00:00 within 2023-01-01/2023-03-31
searching s1rtc and lc2l2 scenes close to 2023-06-21 18:09:19.024000+00:00 within 2023-04-01/2023-06-30
searching s1rtc and lc2l2 scenes close to 2023-09-29 18:10:39.024000+00:00 within 2023-07-01/2023-09-30
searching s1rtc and lc2l2 scenes close to 2023-11-28 18:17:09.024000+00:00 within 2023-10-01/2023-12-31
searching land cover data...
searching dem data...
stacking lc2l2 data...
stacking dem data...
stacking land cover data...
stacking s1rtc data...
stacking s2l2a data...


In [26]:
# Record the id of every selected item, keyed "<item-collection key>_scene_ids".
processor.scene_ids = {
    f"{source_name}_scene_ids": [stac_item.id for stac_item in source_items]
    for source_name, source_items in processor.itemcollections.items()
}

In [27]:
# Display the recorded scene ids per item collection.
processor.scene_ids

{'s2l2a_scene_ids': ['S2A_MSIL2A_20230216T181421_R084_T12TVL_20230217T151342',
  'S2B_MSIL2A_20230621T180919_R084_T12TVL_20240928T000050',
  'S2B_MSIL2A_20230929T181039_R084_T12TVL_20241105T052343',
  'S2B_MSIL2A_20231128T181709_R084_T12TVL_20241110T055759'],
 's1rtc_scene_ids': ['S1A_IW_GRDH_1SDV_20230215T133437_20230215T133502_047247_05AB73_rtc',
  'S1A_IW_GRDH_1SDV_20230215T133412_20230215T133437_047247_05AB73_rtc',
  'S1A_IW_GRDH_1SDV_20230627T133441_20230627T133506_049172_05E9BF_rtc',
  'S1A_IW_GRDH_1SDV_20230627T133416_20230627T133441_049172_05E9BF_rtc',
  'S1A_IW_GRDH_1SDV_20230919T133446_20230919T133511_050397_06118B_rtc',
  'S1A_IW_GRDH_1SDV_20230919T133421_20230919T133446_050397_06118B_rtc',
  'S1A_IW_GRDH_1SDV_20231118T133446_20231118T133511_051272_062F8C_rtc',
  'S1A_IW_GRDH_1SDV_20231118T133421_20231118T133446_051272_062F8C_rtc'],
 'LC2L2_scene_ids': ['LC08_L2SP_038032_20230213_02_T1',
  'LC08_L2SP_038031_20230213_02_T1',
  'LC09_L2SP_038031_20230410_02_T1',
  'LC09_L2SP_0

In [30]:
# Extract a 96x96 window (x and y indices 5000-5095) from the Sentinel-2 stack as a
# NumPy array.
s2l2a_np = processor.stacks['s2l2a'].isel(x=slice(5000, 5096), y=slice(5000,5096)).to_numpy()

In [31]:
from src.utils.output import normalize

In [32]:
# Display the raw values of the extracted Sentinel-2 window.
s2l2a_np

array([[[[ 5166.,  5166.,  4264., ...,  8733.,  8733.,  8733.],
         [ 5166.,  5166.,  4264., ...,  8733.,  8733.,  8733.],
         [ 6611.,  6611.,  5433., ...,  8517.,  8517.,  8517.],
         ...,
         [ 7375.,  7375.,  6313., ...,  6598.,  6598.,  6598.],
         [ 7375.,  7375.,  6313., ...,  6598.,  6598.,  6598.],
         [ 7375.,  7375.,  6313., ...,  6598.,  6598.,  6598.]],

        [[ 3850.,  4082.,  4800., ...,  9648.,  9488., 10376.],
         [ 3846.,  3480.,  3886., ...,  7904.,  7524.,  8352.],
         [ 3810.,  3604.,  4472., ...,  6840.,  7668.,  8640.],
         ...,
         [ 6680.,  5272.,  6792., ..., 10112., 11096., 10440.],
         [ 7276.,  5916.,  6584., ...,  8464.,  8432.,  7748.],
         [ 7180.,  5724.,  6040., ...,  5504.,  4472.,  4820.]],

        [[ 4196.,  4400.,  4920., ...,  9496.,  9544., 10032.],
         [ 3916.,  3724.,  4188., ...,  7808.,  8208.,  7988.],
         [ 3994.,  4082.,  4564., ...,  6868.,  7620.,  8632.],
        

In [None]:
# Select a 32x32 window (x and y indices 1000-1031) of the lc2l2 stack and compute it.
lc2l2_xr = processor.stacks['LC2L2'].isel(x=slice(1000, 1032), y=slice(1000,1032)).compute()

In [37]:
# Plot the computed lc2l2 window. The previous cell binds it to `lc2l2_xr`; the name
# `lc2l2_np` is never defined in this notebook, so the original call only worked
# with leftover kernel state.
lc2l2_xr.plot.imshow()

  return x.astype(astype_dtype, **kwargs)
  return x.astype(astype_dtype, **kwargs)


KeyboardInterrupt: 

Process Dask Worker process (from Nanny):
2025-11-19 00:56:21,017 - distributed.nanny - ERROR - Worker process died unexpectedly
Process Dask Worker process (from Nanny):
2025-11-19 00:56:21,017 - distributed.nanny - ERROR - Worker process died unexpectedly
Traceback (most recent call last):
Traceback (most recent call last):
  File "/opt/conda/envs/gfm_bench/lib/python3.12/asyncio/runners.py", line 118, in run
    return self._loop.run_until_complete(task)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/envs/gfm_bench/lib/python3.12/asyncio/base_events.py", line 691, in run_until_complete
    return future.result()
           ^^^^^^^^^^^^^^^
  File "/opt/conda/envs/gfm_bench/lib/python3.12/site-packages/distributed/nanny.py", line 984, in run
    await worker.finished()
  File "/opt/conda/envs/gfm_bench/lib/python3.12/site-packages/distributed/core.py", line 491, in finished
    await self._event_finished.wait()
  File "/opt/conda/envs/gfm_bench/lib/python3.12/asynci