In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
os.chdir("../")

In [3]:
import pystac_client
import pystac
from requests.adapters import HTTPAdapter
from urllib3 import Retry
from pystac_client.stac_api_io import StacApiIO
import planetary_computer

import dask.distributed
import numpy as np
import rioxarray
import pandas as pd
import geopandas as gpd
from src.utils import search_s2_scenes, search_s1_scenes, search_lc_scene, stack_s2_data, stack_lc_data, unique_class, missing_values, gen_chips
import yaml

In [4]:
import warnings
warnings.filterwarnings("ignore")

In [5]:
with open("config.yml", "r") as file:
    config = yaml.safe_load(file)

In [6]:
aoi_gdf = gpd.read_file("data/map_v0.30.geojson")

In [7]:
# Following AOIs have broken scenes in the STAC catalog and should be removed
aoi_gdf = aoi_gdf[aoi_gdf.index != 12]
aoi_gdf = aoi_gdf[aoi_gdf.index != 25]
aoi_gdf = aoi_gdf[aoi_gdf.index != 46]
aoi_gdf = aoi_gdf[aoi_gdf.index != 60]
aoi_gdf = aoi_gdf[aoi_gdf.index != 81]
aoi_gdf = aoi_gdf[aoi_gdf.index != 153]

In [8]:
from dask.distributed import Client, LocalCluster
cluster = LocalCluster()#(n_workers=8, threads_per_worker=2)
client = Client(cluster)
print(client.dashboard_link)

http://127.0.0.1:8787/status


In [9]:
retry = Retry(
    total=10, backoff_factor=1, status_forcelist=[502, 503, 504], allowed_methods=None
)
stac_api_io = StacApiIO(max_retries=retry)

catalog = pystac_client.Client.open(
    "https://planetarycomputer.microsoft.com/api/stac/v1",
    modifier=planetary_computer.sign_inplace,
    stac_io=stac_api_io
)

In [10]:
def process_chips(s2_stack, lc_stack, epsg, sample_size, chip_size, global_index, metadata_df):
    
    try:
        lc_stack = lc_stack.compute()
    except:
        print("skipping the AOI for no LC data")
        return global_index, metadata_df
    
    lc_uniqueness = lc_stack.coarsen(x = sample_size,
                                     y = sample_size,
                                     boundary = "trim"
                                    ).reduce(unique_class)
    lc_uniqueness[0:2, :] = False
    lc_uniqueness[-2:, :] = False
    lc_uniqueness[:, 0:2] = False
    lc_uniqueness[:, -2:] = False

    ys, xs = np.where(lc_uniqueness)
    print("Loading s2_stack")
    
    try:
        s2_stack = s2_stack.compute()
    except:
        print("skipping the AOI for no S2 data")
        return global_index, metadata_df

    # Following indices are added to limit the number of rangeland, bareground, and water chips per tile
    rangeland_index = 0
    bareground_index = 0
    water_index = 0
    tree_index = 0
    crops_index = 0
    for index in range(0, len(ys)):
        y = ys[index]
        x = xs[index]
    
            
        x_coords = slice((x) * sample_size - int((chip_size - sample_size)/2), (x + 1) * sample_size + int((chip_size - sample_size)/2))
        y_coords = slice((y) * sample_size - int((chip_size - sample_size)/2), (y + 1) * sample_size + int((chip_size - sample_size)/2))    
        
        s2_array = s2_stack.isel(x = x_coords, y = y_coords)
        s2_array.rio.write_crs(f"epsg:{epsg}", inplace=True)
        s2_array = s2_array.where((s2_array.x >= s2_stack.x[(x) * sample_size]) &
                                  (s2_array.x < s2_stack.x[(x + 1) * sample_size]) & 
                                  (s2_array.y <= s2_stack.y[(y) * sample_size]) &
                                  (s2_array.y > s2_stack.y[(y + 1) * sample_size])
                                 )
        
        if missing_values(s2_array, chip_size, sample_size):
            continue        
        
        s2_array = s2_array.fillna(-999)
        s2_array = s2_array.rio.write_nodata(-999)
        s2_array = s2_array.astype(np.dtype(np.int16))
        s2_array = s2_array.rename("s2")
        

                
        lc_array = lc_stack.isel(x = x_coords, y = y_coords)
        lc_array.rio.write_crs(f"epsg:{epsg}", inplace=True)
        lc_array = lc_array.where((lc_array.x >= lc_stack.x[(x) * sample_size]) &
                                  (lc_array.x < lc_stack.x[(x + 1) * sample_size]) & 
                                  (lc_array.y <= lc_stack.y[(y) * sample_size] ) &
                                  (lc_array.y > lc_stack.y[(y + 1) * sample_size])
                                 )
        
        if missing_values(lc_array, chip_size, sample_size):
            continue

        if (np.isin(lc_array, [255, 130, 133])).any():
            raise ValueError('Wrong LC value')
        
        # Skipping Flooded Vegetation
        if (np.isin(lc_array, [4])).any():
            continue
        lc_array = lc_array.fillna(0)
        lc_array = lc_array.rio.write_nodata(0)
        lc_array = lc_array.astype(np.dtype(np.int8))
        lc_array = lc_array.rename("lc")

        
        lc = np.unique(lc_array)
        if lc[1] == 1:
            water_index += 1
            if water_index > 400:
                continue 
        elif lc[1] == 8:
            bareground_index += 1
            if bareground_index > 400:
                continue
        elif lc[1] == 11:
            rangeland_index += 1
            if rangeland_index > 400:
                continue
        elif lc[1] == 2:
            tree_index += 1
            if tree_index > 400:
                continue
        elif lc[1] == 5:
            crops_index += 1
            if crops_index > 400:
                continue

        gen_status, dts = gen_chips(s2_array, lc_array, global_index)
        if gen_status:
            metadata_df = pd.concat([pd.DataFrame([[global_index,
                                                    dts,
                                                    np.unique(lc_array)[1],
                                                    s2_stack.x[(x) * sample_size + int(sample_size / 2)].data,
                                                    s2_stack.y[(y) * sample_size + int(sample_size / 2)].data,
                                                    epsg]
                                                  ],
                                                  columns=metadata_df.columns
                                                 ),
                                     metadata_df],
                                    ignore_index=True
                                   )
            global_index += 1
    
    return global_index, metadata_df

In [11]:
global_index = 0
metadata_df = pd.DataFrame(columns=["chip_id", "dates", "lc", "x_center", "y_center", "epsg"])
# metadata_df = pd.read_csv("../data/metadata_df.csv") # Use this line to continue from a previous iteration if the code stops. 

In [15]:
for index, aoi in aoi_gdf.iterrows():
    print(f"\nProcessing AOI at index {index}")
    
    aoi_bounds = aoi['geometry'].bounds
    s2_items = pystac.item_collection.ItemCollection([])
    for date_range in config["sentinel_2"]["time_ranges"]:        
        s2_items_season = search_s2_scenes(aoi, date_range, catalog, config)
        s2_items += s2_items_season

    if len(s2_items)<4:
        print(f"Missing Sentinel-2 scenes for AOI {aoi_bounds}")
        continue
        

    s2_stack = stack_s2_data(s2_items, config)
    if s2_stack is None:
        print(f"Failed to stack Sentinel-2 bands for AOI {aoi_bounds}")
        continue

    try:
        epsg = s2_items[0].properties["proj:epsg"]
    except:
        epsg = int(s2_items[0].properties["proj:code"].split(":")[-1])


    s1_items = pystac.item_collection.ItemCollection([])

    for s2_item in s2_items:
        s2_datetime = s2_item.datetime
        s1_item = search_s1_scenes(aoi, s2_datetime, catalog, config)
        s1_items += s1_item

    s1_stack = stack_s1_data(s1_items, config)
    break
    if s1_stack is None:
        print(f"Failed to stack Sentinel-2 bands for AOI {aoi_bounds}")
        continue

    lc_items = search_lc_scene(s2_items[0].bbox, catalog, config)
    if not lc_items:
        print(f"No Land Cover data found for AOI {aoi_bounds}")
        continue
    
    lc_stack = stack_lc_data(lc_items, s2_stack.rio.crs.to_epsg(), s2_items[0].bbox, config)
    if lc_stack is None:
        print(f"Failed to stack Land Cover data for AOI {aoi_bounds} and date range {date_range}")
        continue

    global_index, metadata_df = process_chips(s2_stack,
                                              lc_stack,
                                              epsg,
                                              config["chips"]["sample_size"],
                                              config["chips"]["chip_size"],
                                              global_index,
                                              metadata_df)
    metadata_df.to_csv('/home/benchuser/data/metadata_df.csv', index=False)


Processing AOI at index 0


NameError: name 'stack_s1_data' is not defined

In [18]:
for s1_item, s2_item in zip(s1_items, s2_items):
    print(s1_item.datetime, s2_item.datetime)

2023-02-18 04:16:57.544351+00:00 2023-02-18 08:50:21.024000+00:00
2023-04-19 04:16:58.051502+00:00 2023-04-19 08:46:01.024000+00:00
2023-07-12 04:17:03.121254+00:00 2023-07-13 08:46:49.024000+00:00
2023-12-27 04:17:04.257535+00:00 2023-12-30 08:52:59.024000+00:00


In [None]:
index

In [None]:
aoi_gdf

In [15]:
for item in s2_items:
    print(item.datetime)

2023-02-18 08:50:21.024000+00:00
2023-04-19 08:46:01.024000+00:00
2023-07-13 08:46:49.024000+00:00
2023-12-30 08:52:59.024000+00:00
