In [1]:
#import sys
#!{sys.executable} -m pip install pystac-client planetary-computer
from pystac_client import Client
import planetary_computer

catalog = Client.open(
    "https://planetarycomputer.microsoft.com/api/stac/v1",
    modifier=planetary_computer.sign_inplace,
)

min_lon, min_lat, max_lon, max_lat = -123.5, 37.0, -121.5, 38.5
# Buscar colecciones relevantes por texto
for col in catalog.get_all_collections():
    if "Sentinel-2" in col.title or "Landsat" in col.title:
        print(col.id, col.title)


search = catalog.search(
    collections=["sentinel-2-l2a"],
    bbox=[min_lon, min_lat, max_lon, max_lat],
    datetime="2020-01-01/2020-12-31",
    query={"eo:cloud_cover": {"gt": 20}}
)


items = list(search.get_items())

landsat-c2-l2 Landsat Collection 2 Level-2
sentinel-2-l2a Sentinel-2 Level-2A
landsat-c2-l1 Landsat Collection 2 Level-1
hls2-s30 Harmonized Landsat Sentinel-2 (HLS) Version 2.0, Sentinel-2 Data
hls2-l30 Harmonized Landsat Sentinel-2 (HLS) Version 2.0, Landsat Data




In [2]:
import datetime

def buscar_pares_entrenamiento(bioma_nombre, coords, n_pares=50, dias_margen=30):
    buffer = 0.1
    bbox = [coords[0]-buffer, coords[1]-buffer, coords[0]+buffer, coords[1]+buffer]
    
    search_cloudy = catalog.search(
        collections=["sentinel-2-l2a"],
        bbox=bbox,
        datetime="2023-01-01/2023-12-31",
        query={"eo:cloud_cover": {"gt": 20, "lt": 60}}, 
        max_items=n_pares
    )
    cloudy_items = list(search_cloudy.items())

    pares = []
    for c_item in cloudy_items:
        # Extraer fecha de la imagen nublada
        fecha_nublada = c_item.datetime
        fecha_inicio = (fecha_nublada - datetime.timedelta(days=dias_margen)).strftime('%Y-%m-%dT%H:%M:%SZ')
        fecha_fin = (fecha_nublada + datetime.timedelta(days=dias_margen)).strftime('%Y-%m-%dT%H:%M:%SZ')

        # B. Buscamos la imagen limpia en ese rango específico de ±30 días
        search_clear = catalog.search(
            collections=["sentinel-2-l2a"],
            bbox=c_item.bbox,
            datetime=f"{fecha_inicio}/{fecha_fin}",
            query={"eo:cloud_cover": {"lt": 5}}, 
            max_items=1
        )
        clear_items = list(search_clear.items())
        
        if clear_items:
            pares.append({
                "bioma": bioma_nombre,
                "id_nublada": c_item.id,
                "id_limpia": clear_items[0].id,
                "nubes_porcentaje": c_item.properties["eo:cloud_cover"],
                "item_nublado": c_item,
                "item_limpio": clear_items[0]
            })
    return pares



In [3]:
import pandas as pd

biomas = {
    "bosque": [-60.0, -3.0],         
    "ciudad": [2.35, 48.86],          
    "desierto": [-13.0, 23.5],        
    "tundra": [30.0, 69.5],         
    "sabana": [20.0, 9.5],           
    "manglar": [99.0, 8.2],           
    "montaña": [7.0, 46.5],           
    "pradera": [105.0, 49.5],         
    "humedal": [-61.0, -32.0],        
    "agricultura": [-62.5, -31.0],    
    "glaciar": [13.0, 47.0],         
    "volcanico": [-78.6, -0.9],      
    "costa": [-8.9, 41.1],            
    "isla_tropical": [151.7, -16.9],  
    "oceano": [-40.0, 0.0],  
    "bosque_2": [-84.0, 10.3],
    "ciudad_2": [-74.0, 40.7],
    "desierto_2": [25.0, 25.0],
    "tundra_2": [68.0, 70.0],
    "sabana_2": [34.0, -1.3],
    "manglar_2": [-80.0, 0.5],
    "montaña_2": [86.9, 27.9],
    "pradera_2": [-100.0, 44.0],
    "humedal_2": [-58.0, -34.5],
    "agricultura_2": [-3.0, 40.0],
    "glaciar_2": [-72.0, -50.0],
    "volcanico_2": [-155.0, 19.4],
    "costa_2": [-77.0, 24.5],
    "isla_tropical_2": [-60.0, 14.0]         
}


dataset_pares = []

for nombre, coords in biomas.items():
    print(f"Buscando pares para {nombre}...")
    dataset_pares.extend(buscar_pares_entrenamiento(nombre, coords, n_pares=100))

df_pares = pd.DataFrame(dataset_pares)
print(f"\nTotal de pares encontrados: {len(df_pares)}")
display(df_pares.head())


# Ejemplo: Analizar el primer par encontrado
ejemplo = dataset_pares[0]['item_nublado']
props = ejemplo.properties

print(f"Análisis del item: {ejemplo.id}")
print(f"- Nubes totales: {props.get('s2:cloud_shadow_percentage')}%")
print(f"- Nubes Cirrus (finas): {props.get('s2:thin_cirrus_percentage')}%")
print(f"- Sombras: {props.get('s2:cloud_shadow_percentage')}%")

Buscando pares para bosque...


APIError: ('Connection broken: IncompleteRead(11725 bytes read, 3120 more expected)', IncompleteRead(11725 bytes read, 3120 more expected))

In [None]:
#!{sys.executable} -m pip install rioxarray
import torch
from torch.utils.data import Dataset, DataLoader
import rioxarray
import numpy as np
import torchvision.transforms.functional as TF
import random
from rasterio.errors import RasterioIOError

class CloudSegmentationDatasetV2(Dataset):
    def __init__(
        self,
        df_pares,
        patch_size=256,
        train=True,
        min_cloud_pct=5.0,
        max_cloud_pct=80.0,
        max_tries_per_item=10
    ):
        """
        df_pares: DataFrame con columnas:
            - item_nublado
            - item_limpio
        """
        self.df = df_pares.reset_index(drop=True)
        self.patch_size = patch_size
        self.train = train
        self.bands = ["B04", "B03", "B02", "B08"]
        self.min_cloud_pct = min_cloud_pct
        self.max_cloud_pct = max_cloud_pct
        self.max_tries_per_item = max_tries_per_item

    def __len__(self):
        # Podrías multiplicar por un factor si quieres más patches por par
        return len(self.df)

    def _open_band_stack(self, item):
        signed_item = planetary_computer.sign(item)
        band_data = []
        for b in self.bands:
            url = signed_item.assets[b].href
            try:
                da = rioxarray.open_rasterio(url)
                band_arr = da.values.astype(np.float32)
            except RasterioIOError:
                # Error al leer este TIFF → descartamos este par
                return None
            band_arr = np.clip(band_arr / 4000.0, 0, 1)
            band_data.append(band_arr)

        try:
            img = np.concatenate(band_data, axis=0)
        except ValueError:
            # Por si alguna banda vino con shape raro
            return None

        return img  # (C, H, W)


    def _open_scl(self, item):
      signed_item = planetary_computer.sign(item)
      url = signed_item.assets["SCL"].href
      try:
          da = rioxarray.open_rasterio(url)
          scl = da.values.astype(np.int16)
      except RasterioIOError:
          return None
      return scl


    def _get_random_window_coords(self, H, W):
        if H <= self.patch_size or W <= self.patch_size:
            # Si la imagen es más pequeña, simplemente empezamos en 0,0
            return 0, 0
        y0 = np.random.randint(0, H - self.patch_size)
        x0 = np.random.randint(0, W - self.patch_size)
        return y0, x0

    def _crop_patch(self, arr, y0, x0):
        # arr: (C, H, W) o (1, H, W)
        return arr[:, y0:y0 + self.patch_size, x0:x0 + self.patch_size]

    def _cloud_mask_from_scl(self, scl_patch):
        """
        scl_patch: (1, H, W) con clases SCL.
        Usamos 3, 8, 9, 10 como nubes/sombras.
        """
        cloud_classes = np.array([3, 8, 9, 10], dtype=np.int16)
        mask = np.isin(scl_patch, cloud_classes).astype(np.float32)  # (1, H, W)
        return mask

    def _cloud_pct(self, mask):
        """
        mask: (1, H, W) binaria.
        """
        total = mask.size
        n_cloud = mask.sum()
        return float(100.0 * n_cloud / total)

    def _sample_valid_patch(self, item_cloudy, item_clear):
      img_cloudy = self._open_band_stack(item_cloudy)
      img_clear  = self._open_band_stack(item_clear)
      scl        = self._open_scl(item_cloudy)

      if img_cloudy is None or img_clear is None or scl is None:
          return None  

      _, Hc, Wc = img_cloudy.shape
      _, Ht, Wt = img_clear.shape
      _, Hs, Ws = scl.shape
      H_min = min(Hc, Ht, Hs)
      W_min = min(Wc, Wt, Ws)
      if H_min < self.patch_size or W_min < self.patch_size:
          return None

      for _ in range(self.max_tries_per_item):
          y0, x0 = self._get_random_window_coords(H_min, W_min)

          img_cloudy_patch = img_cloudy[:, y0:y0 + self.patch_size, x0:x0 + self.patch_size]
          img_clear_patch = img_clear[:, y0:y0 + self.patch_size, x0:x0 + self.patch_size]
          scl_patch = scl[:, y0:y0 + self.patch_size, x0:x0 + self.patch_size]

          mask = self._cloud_mask_from_scl(scl_patch)   # (1, H, W)
          cloud_pct = self._cloud_pct(mask)

          if self.min_cloud_pct <= cloud_pct <= self.max_cloud_pct:
              return img_cloudy_patch, img_clear_patch, mask, cloud_pct

      return None

    def _augment(self, image, mask, target):
        if not self.train:
            return image, mask, target

        # Flip horizontal
        if random.random() > 0.5:
            image = TF.hflip(image)
            mask = TF.hflip(mask)
            target = TF.hflip(target)

        # Rotaciones 0, 90, 180, 270
        angle = random.choice([0, 90, 180, 270])
        if angle != 0:
            image = TF.rotate(image, angle)
            mask = TF.rotate(mask, angle)
            target = TF.rotate(target, angle)

        return image, mask, target

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        item_cloudy = row["item_nublado"]
        item_clear = row["item_limpio"]

        sampled = self._sample_valid_patch(item_cloudy, item_clear)
        # Si no encuentra patch válido, podemos intentar con otro índice
        tries = 0
        while sampled is None and tries < 5:
            idx = np.random.randint(0, len(self.df))
            row = self.df.iloc[idx]
            item_cloudy = row["item_nublado"]
            item_clear = row["item_limpio"]
            sampled = self._sample_valid_patch(item_cloudy, item_clear)
            tries += 1

        if sampled is None:
            # Fallback: devolver algo simple o lanzar excepción
            raise RuntimeError("No se pudo samplear un patch válido.")

        img_cloudy_patch, img_clear_patch, mask, cloud_pct = sampled

        # A Torch
        img_input = torch.from_numpy(img_cloudy_patch)  # (C, H, W)
        img_target = torch.from_numpy(img_clear_patch)  # (C, H, W)
        mask = torch.from_numpy(mask)                  # (1, H, W)

        # Aumentos consistentes
        img_input, mask, img_target = self._augment(img_input, mask, img_target)

        return {
            "input": img_input,
            "mask": mask,
            "target": img_target,
            "cloud_pct": cloud_pct,
            "pair_idx": idx
        }


train_ds = CloudSegmentationDatasetV2(df_pares, patch_size=256, train=True,
                                     min_cloud_pct=5.0, max_cloud_pct=80.0)
train_loader = DataLoader(train_ds, batch_size=1, shuffle=True)
