<a href="https://colab.research.google.com/github/CheilaBaiao/Pantanal/blob/main/00b_vodca_offline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Show the 20 largest first-level directories under /content and under the
# user's cache (/root/.cache), sorted by size (largest last).
!du -h -d1 /content | sort -h | tail -n 20
!du -h -d1 /root/.cache | sort -h | tail -n 20


140K	/content/.config
55M	/content/sample_data
56G	/content/vod_tmp
112G	/content/drive
168G	/content
36K	/root/.cache/matplotlib
33M	/root/.cache/pip
56M	/root/.cache/node-gyp
88M	/root/.cache


In [None]:
# Delete common caches and temporary data to free local disk space.
# /content/vod_tmp is the local per-window NetCDF cache written by the loader cell.
# NOTE(review): /content/.config is presumably safe to remove on Colab — confirm
# nothing in this session depends on it.
!rm -rf /content/vod_tmp
!rm -rf /content/.config
!rm -rf /root/.cache/pip
!rm -rf /root/.cache/fontconfig
!rm -rf /root/.cache/matplotlib
!pip cache purge -q

# If temporary files were generated under /content outside of Drive
# (only looks two levels deep and only at *.tmp files):
!find /content -maxdepth 2 -type f -name "*.tmp" -delete

# Check free space and directory sizes again:
!df -h /
!du -h -d1 /content | sort -h | tail -n 20


[0mFilesystem      Size  Used Avail Use% Mounted on
overlay         108G   42G   66G  39% /
55M	/content/sample_data
112G	/content
112G	/content/drive


In [None]:
# @title VODCA (offline): instala√ß√£o e configura√ß√£o
%pip -q install xarray rioxarray rasterio netcdf4 h5netcdf scipy geopandas shapely dask tqdm

import json, gc, glob, warnings
from pathlib import Path
import numpy as np
import xarray as xr
import rioxarray as rxr
import rasterio as rio
import geopandas as gpd
from tqdm.notebook import tqdm

# Mount Google Drive (Colab-only API)
from google.colab import drive
drive.mount('/content/drive')

# Project folders on Drive; created if missing
BASE_DIR = Path("/content/drive/MyDrive/Pantanal_TippingPoints/index")
RAW_DIR  = BASE_DIR / "raw"
INT_DIR  = BASE_DIR / "interim"
LOG_DIR  = BASE_DIR / "logs"
for d in [RAW_DIR, INT_DIR, LOG_DIR]:
    d.mkdir(parents=True, exist_ok=True)

# Where the VODCA NetCDFs live (root folder that contains per-year subfolders).
# Two spellings of the Drive mount point are tried; the first that exists wins.
CANDIDATES = [
    Path("/content/drive/MyDrive/VODCA_CXKu/VODCA_CXKu/VODCA_CXKu"),
    Path("/content/drive/My Drive/VODCA_CXKu/VODCA_CXKu/VODCA_CXKu"),
]
VOD_RAW_DIR = next((p for p in CANDIDATES if p.exists()), None)
assert VOD_RAW_DIR is not None, "N√£o achei a pasta VODCA. Confira o caminho."

# Recursive search (.nc files inside subfolders), sorted by path
nc_files = sorted(VOD_RAW_DIR.rglob("*.nc"))
print(f"Arquivos VODCA .nc encontrados: {len(nc_files)}")
assert len(nc_files) > 0, "Nenhum .nc encontrado."

# Pantanal boundary, reprojected to EPSG:4326 (WGS84)
BOUND_PATH = Path("/content/drive/MyDrive/Pantanal_TippingPoints/Pantanal.shp")
assert BOUND_PATH.exists(), f"Limite n√£o encontrado: {BOUND_PATH}"
gdf = gpd.read_file(BOUND_PATH).to_crs(4326)
# Merge multiple features into a single geometry so that features[0] is the whole boundary
if len(gdf) > 1: gdf = gdf.dissolve()
# GeoJSON-style geometry dict of the first (only) feature
PANT_WGS_GEOM = json.loads(gdf.to_json())["features"][0]["geometry"]

# Calendário (bimensal com janela flex DJFM)
def get_time_slices(years, calendar="bimonthly"):
    """
    Build the list of window labels ("<year><MM>") for the given years.

    With calendar == "monthly" every month (01..12) is emitted; any other
    value yields the six odd start months (01, 03, 05, 07, 09, 11).
    Labels are ordered by year, then by month.
    """
    start_months = tuple(range(1, 13)) if calendar == "monthly" else (1, 3, 5, 7, 9, 11)
    labels = []
    for year in years:
        labels += [f"{year}{month:02d}" for month in start_months]
    return labels

# Janela flex: DJFM=4 meses, demais=2
def time_range_for_yyyymm(yyyymm, calendar="bimonthly"):
    """
    Return (start, end) as numpy datetime64 days for a "YYYYMM" label.

    start is the first day of the labelled month. end is the first day of the
    month reached by advancing:
      * 4 months when calendar == "bimonthly" and the month is Dec/Jan/Feb/Mar,
      * 2 months for the other months of the "bimonthly" calendar,
      * 1 month for any other calendar value.
    Year roll-over is handled (e.g. "198711" -> 1988-01-01).
    """
    year, month = int(yyyymm[:4]), int(yyyymm[4:6])

    if calendar != "bimonthly":
        span = 1
    elif month in (12, 1, 2, 3):
        span = 4
    else:
        span = 2

    # Work with a zero-based month index so divmod yields (years carried, month index).
    carry, zero_based = divmod(month + span - 1, 12)

    first_day = np.datetime64(f"{year:04d}-{month:02d}-01")
    end_day = np.datetime64(f"{year + carry:04d}-{zero_based + 1:02d}-01")
    return first_day, end_day

# Years to process; range end is exclusive, so this covers 1987..2021. Adjust as needed.
YEARS = list(range(1987, 2022))
CAL   = "bimonthly"
TIME_SLICES = get_time_slices(YEARS, CAL)
print(f"{len(TIME_SLICES)} bimestres:", TIME_SLICES[:6], "‚Ä¶", TIME_SLICES[-6:])

# Checkpoint: JSON file on Drive recording which windows were already handled.
CKP_VODCA = LOG_DIR / "00b_vodca_checkpoint.json"
if CKP_VODCA.exists():
    # Context manager so the file handle is closed deterministically.
    with open(CKP_VODCA) as _ckp_fh:
        state = json.load(_ckp_fh)
else:
    state = {"done": {}}
# Later cells index state["done"] directly; make sure the key exists even if
# the checkpoint on disk was written without it.
state.setdefault("done", {})

# Silence xarray's FutureWarning about the return type of Dataset.dims
warnings.filterwarnings(
    "ignore",
    message="The return type of `Dataset.dims` will be changed.*",
    category=FutureWarning
)



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Arquivos VODCA .nc encontrados: 12497
210 bimestres: ['198701', '198703', '198705', '198707', '198709', '198711'] ‚Ä¶ ['202101', '202103', '202105', '202107', '202109', '202111']


In [None]:
# @title Detectar vari√°vel/coords (1 arquivo) + index por ano (sem scanner)
import xarray as xr
import re
from collections import defaultdict

# 1) pick one file to probe (the first in sorted order)
assert len(nc_files) > 0, "Sem NetCDFs encontrados (nc_files vazio)."
probe_fp = str(nc_files[0])

# Backends in order of preference; the first one that opens the probe file
# is recorded in SELECTED_ENGINE.
_ENGS = ["netcdf4", "h5netcdf", "scipy"]
_last_err = None
probe = None
for eng in _ENGS:
    try:
        probe = xr.open_dataset(probe_fp, engine=eng)
        SELECTED_ENGINE = eng
        break
    except Exception as e:
        _last_err = e
# No backend worked: re-raise the error from the last backend tried
if probe is None:
    raise _last_err

# 2) identify the VOD variable: first data variable whose name contains "vod"
#    (case-insensitive), otherwise the first data variable in the file
vod_vars = [k for k in probe.data_vars if "vod" in k.lower()]
VAR = vod_vars[0] if vod_vars else list(probe.data_vars)[0]

def _pick(name_opts, ds):
    for n in name_opts:
        if n in ds.dims or n in ds.coords: return n
    for d in ds.dims:
        if name_opts[0] in d.lower(): return d
    raise ValueError(f"Coord n√£o encontrada: {name_opts[0]}")

# Names of the latitude/longitude coordinates as found in the probe file;
# used by later cells to address the spatial dimensions.
LAT = _pick(["lat","latitude"], probe)
LON = _pick(["lon","longitude"], probe)
# The probe dataset is only needed for name detection; release the file handle.
probe.close()

print(f"‚úî Vari√°vel VOD: {VAR} | Coords: {LAT}, {LON} | engine de prova: {SELECTED_ENGINE}")

# 3) index files by YEAR, taken from the name of the folder holding each file
files_by_year = defaultdict(list)
for p in nc_files:
    parent = p.parent.name
    try:
        yr = int(parent)
    except ValueError:
        # Folder name is not a plain integer: fall back to the first 4-digit
        # year (19xx/20xx) in the file name (e.g. daily_images_1997-09-10.nc).
        # Only ValueError is caught so that KeyboardInterrupt and genuine
        # programming errors are not silently swallowed.
        m = re.search(r"(19|20)\d{2}", p.name)
        yr = int(m.group(0)) if m else None
    # Files with no detectable year are grouped under the key None.
    files_by_year[yr].append(str(p))

# Only integer keys are reported; the None bucket (if any) is excluded.
years_indexed = sorted(y for y in files_by_year.keys() if isinstance(y,int))
print(f"Anos indexados: {years_indexed[:10]} ‚Ä¶ {years_indexed[-10:]}")



‚úî Vari√°vel VOD: VODCA_CXKu | Coords: lat, lon | engine de prova: netcdf4
Anos indexados: [1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994, 1995, 1996] ‚Ä¶ [2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021]


In [None]:
# @title Abertura on-demand robusta: l√™ 1-a-1, pula ruins e concatena
import xarray as xr, numpy as np, re, os, shutil, uuid
from pathlib import Path

ENGINES = ["netcdf4", "h5netcdf", "scipy"]   # backends, in the order they are tried
USE_LOCAL_CACHE = True                       # copy each .nc to local disk before opening
LOCAL_CACHE_DIR = Path("/content/vod_tmp")   # root of the local cache
CURRENT_WINDOW_CACHE = None  # set per yyyymm window by the export loop; None disables caching

# Matches a YYYY-MM-DD date (years 19xx/20xx) embedded in a file name
_date_pat = re.compile(r"(19|20)\d{2}-\d{2}-\d{2}")

def _maybe_cache_local(fp: str) -> str:
    """
    Return a path to read `fp` from, preferring a local copy.

    When caching is enabled (USE_LOCAL_CACHE) and a window cache directory is
    active (CURRENT_WINDOW_CACHE), the file is copied there under its base
    name (unless already present) and the copy's path is returned. In every
    other situation, including any error while copying, the original path is
    returned unchanged.
    """
    caching_active = USE_LOCAL_CACHE and CURRENT_WINDOW_CACHE is not None
    if not caching_active:
        return fp

    target = CURRENT_WINDOW_CACHE / os.path.basename(fp)
    try:
        already_cached = target.exists()
        if not already_cached:
            shutil.copyfile(fp, target)
        return str(target)
    except Exception:
        # Best-effort cache: fall back to reading straight from the source.
        return fp

def _open_single_da(fp: str):
    """
    Open ONE NetCDF file and return the VOD variable as a DataArray that has a
    'time' dimension.

    The date embedded in the file name (YYYY-MM-DD) is the source of truth for
    files holding a single time step (or no time dimension): their time
    coordinate is replaced by that date, or by NaT when the name has no date.
    Files with several time steps keep their own time axis, which must decode
    to datetime64.

    Every backend in ENGINES is tried, first with decode_cf=False and then
    with decode_cf=True. Returns None if no combination succeeds.

    The dataset is always closed before returning, including on failure
    (previously the handle leaked whenever an error occurred after opening).

    NOTE(review): mask_and_scale=False means fill values / scale factors are
    not applied — confirm the raw values are what downstream averaging expects.
    NOTE(review): the returned array is handed out after the dataset is
    closed, as in the original code — confirm lazy reads still work with the
    installed xarray version.
    """
    cached = _maybe_cache_local(fp)

    # Date from the file name (e.g. daily_images_1997-09-10.nc); NaT if absent.
    m = _date_pat.search(os.path.basename(fp))
    tval = np.datetime64(m.group(0)) if m else np.datetime64("NaT")

    for eng in ENGINES:
        for dec in (False, True):  # without / with CF decoding
            try:
                # Context manager guarantees the file is closed on every path.
                with xr.open_dataset(cached, engine=eng,
                                     mask_and_scale=False, decode_cf=dec) as ds:
                    # Choose the variable: the probed VAR, else the first
                    # "vod"-like name, else the first data variable.
                    if VAR in ds.data_vars:
                        vname = VAR
                    else:
                        _var = [k for k in ds.data_vars if "vod" in k.lower()]
                        vname = _var[0] if _var else list(ds.data_vars)[0]

                    da = ds[vname]

                    if "time" in da.dims:
                        if da.sizes.get("time", 0) == 1:
                            # Single step: drop the stored time and stamp the
                            # step with the date taken from the file name.
                            da = da.isel(time=0, drop=True).expand_dims({"time": [tval]})
                        elif not np.issubdtype(da["time"].values.dtype, np.datetime64):
                            # Several steps with a non-datetime axis: give up on
                            # this mode so the next (decoding) mode is tried.
                            raise ValueError("time n√£o-datetime64 com >1 passo")
                    else:
                        # No time dimension: inject one step dated from the name.
                        da = da.expand_dims({"time": [tval]})

                    # Last line of defence: force a datetime64 time coordinate.
                    if not np.issubdtype(da["time"].values.dtype, np.datetime64):
                        da = da.isel(time=0, drop=True).expand_dims({"time": [tval]})

                    return da
            except Exception:
                # Any failure (open, variable lookup, time handling, close)
                # moves on to the next decode mode / backend.
                continue

    # Every backend/mode failed.
    return None

def open_for_window(t0, t1):
    """
    Load the VOD data for the half-open time window [t0, t1).

    Parameters
    ----------
    t0, t1 : numpy.datetime64 or anything numpy.datetime64() accepts
        Window start (inclusive) and end (exclusive).

    Returns
    -------
    xarray.DataArray
        Daily arrays concatenated along 'time', restricted to t0 <= time < t1.
        The time dimension has length 0 when no step falls in the window.

    Raises
    ------
    AssertionError
        If no file is indexed for the years touched by the window.
    OSError
        If none of those files can be opened.

    Changes from the previous version:
      * The end bound is exclusive, as documented. Label slicing with
        .sel(time=slice(t0, t1)) also included t1, so the first day of the
        next window leaked into each mean.
      * Files whose name carries a date outside the window are skipped before
        being opened (and before being copied to the local cache). Files
        without a date in their name are still opened.
      * Selection uses a boolean mask, so an unsorted or NaT-containing time
        axis no longer breaks the slice.
    """
    t0 = np.datetime64(t0)
    t1 = np.datetime64(t1)
    y0 = int(str(t0)[:4]); y1 = int(str(t1)[:4])
    yrs = sorted({y0, y1, *range(y0, y1 + 1)})

    # Collect candidate files per year
    fps = []
    for y in yrs:
        if y in files_by_year:
            fps.extend(files_by_year[y])
    assert fps, f"Sem NetCDFs para anos {yrs}"

    def _maybe_in_window(fp):
        # True when the file may contribute to the window, judged by its name.
        m = _date_pat.search(os.path.basename(fp))
        if m is None:
            return True  # no date in the name: cannot rule it out
        try:
            return bool(t0 <= np.datetime64(m.group(0)) < t1)
        except ValueError:
            return True  # date-like text that is not a valid date: let the loader decide

    candidates = [fp for fp in fps if _maybe_in_window(fp)]

    good_das = []
    bad_files = []
    for fp in candidates:
        da = _open_single_da(fp)
        if da is None:
            bad_files.append(fp)
        else:
            good_das.append(da)

    if not good_das:
        # Nothing readable inside the window. Callers expect an array with a
        # zero-length time axis in that case, so use the first readable file
        # of these years as a template for the spatial layout.
        already_failed = set(bad_files)
        for fp in fps:
            if fp in already_failed:
                continue
            template = _open_single_da(fp)
            if template is not None:
                return template.isel(time=slice(0, 0))
            bad_files.append(fp)
        raise OSError(f"Nenhum arquivo leg√≠vel na janela {yrs}. Exemplos ruins: {bad_files[:3]}")

    # Concatenate along time, then keep t0 <= time < t1 (NaT compares False).
    da_all = xr.concat(good_das, dim="time")
    tvals = da_all["time"].values
    keep = (tvals >= t0) & (tvals < t1)
    da_win = da_all.isel(time=np.flatnonzero(keep))
    return da_win





In [None]:
# @title Sanity check do intervalo (sem depender de 'da' global)
# Garante TIME_SLICES/CAL/YEARS se n√£o existirem
def _get_time_slices(years, calendar="bimonthly"):
    ts=[]
    for y in years:
        if calendar=="monthly":
            ts.extend([f"{y}{m:02d}" for m in range(1,13)])
        else:
            ts.extend([f"{y}{m:02d}" for m in (1,3,5,7,9,11)])
    return ts

# Only (re)build TIME_SLICES when the setup cell has not defined it in this kernel.
if 'TIME_SLICES' not in globals():
    if 'YEARS' in globals() and 'CAL' in globals():
        TIME_SLICES = _get_time_slices(YEARS, CAL)
    else:
        # Derive the years from the indexed files when available,
        # otherwise default to 1987..2021 on a bimonthly calendar.
        YEARS = years_indexed if 'years_indexed' in globals() else list(range(1987, 2022))
        CAL = "bimonthly"
        TIME_SLICES = _get_time_slices(YEARS, CAL)

def _time_range_for_yyyymm(yyyymm, calendar="bimonthly"):
    y = int(yyyymm[:4]); m = int(yyyymm[4:6])
    start = np.datetime64(f"{y:04d}-{m:02d}-01")
    adv = 4 if (calendar=="bimonthly" and m in {12,1,2,3}) else (2 if calendar=="bimonthly" else 1)
    mm = m + adv; yy = y + (mm-1)//12; mm = ((mm-1)%12)+1
    end = np.datetime64(f"{yy:04d}-{mm:02d}-01")
    return start, end

# Sanity check on the first configured window: open it, average over time and
# verify that the result can be georeferenced.
test_ym = TIME_SLICES[0]
t0, t1 = _time_range_for_yyyymm(test_ym, CAL)
print(f"Testando janela {test_ym}: {t0} ‚Üí {t1}")

da_win = open_for_window(t0, t1)
_n_steps = int(da_win.sizes.get("time", 0))
print("Time steps no intervalo:", _n_steps)

da_mean = (da_win.mean("time", skipna=True)
           .rio.set_spatial_dims(x_dim=LON, y_dim=LAT, inplace=False)
           .rio.write_crs(4326, inplace=False))

print("Grid (largura x altura):", da_mean.rio.width, "x", da_mean.rio.height)
if _n_steps > 0:
    print("OK para exportar üëç")
else:
    # An empty window still yields a grid (its mean has no data), so the old
    # unconditional "OK" was misleading: say so explicitly.
    print("AVISO: nenhum passo de tempo nesta janela; a media nao contem dados. "
          "Teste outra janela antes de confiar na exportacao.")

Testando janela 198701: 1987-01-01 ‚Üí 1987-05-01
Time steps no intervalo: 0
Grid (largura x altura): 1440 x 720
OK para exportar üëç


In [None]:
# @title 3b ‚Äî Recorte espacial antecipado (BBox) dentro do loader
import numpy as np
import shapely.geometry as sgeom
from shapely.geometry import shape

# 1) Bounding box of the Pantanal, derived from PANT_WGS_GEOM
pant_geom = shape(PANT_WGS_GEOM)  # PANT_WGS_GEOM must already be defined by the setup cell (EPSG:4326 geometry)
minx, miny, maxx, maxy = pant_geom.bounds
BBOX = (minx, miny, maxx, maxy)   # (lon_min, lat_min, lon_max, lat_max)

# 2) função de slice que respeita a ordem da coordenada (lat pode ser decrescente!)
def _slice_by_bbox(da, lat_name, lon_name, bbox):
    lon_min, lat_min, lon_max, lat_max = bbox
    lats = da[lat_name].values
    lons = da[lon_name].values

    # latitude pode vir decrescente em muitos produtos
    lat_asc = (lats[1] - lats[0]) > 0 if lats.size > 1 else True
    lon_asc = (lons[1] - lons[0]) > 0 if lons.size > 1 else True

    lat_lo, lat_hi = (lat_min, lat_max) if lat_asc else (lat_max, lat_min)
    lon_lo, lon_hi = (lon_min, lon_max) if lon_asc else (lon_max, lon_min)

    return da.sel({lat_name: slice(lat_lo, lat_hi),
                   lon_name: slice(lon_lo, lon_hi)})

# 3) ‚ÄúMonkey-patch‚Äù leve: embrulha a fun√ß√£o _open_single_da para recortar por BBox
#    Mant√©m todo o resto igual (engines, fallback, etc.)
# Capture the un-wrapped loader only once. Without this guard, re-running the
# cell would bind _old_open_single_da to the wrapper defined below, which then
# calls itself forever (RecursionError).
if not getattr(_open_single_da, "_bbox_wrapped", False):
    _old_open_single_da = _open_single_da  # keeps the original loader

def _open_single_da(fp: str):
    """
    Wrapper around the original loader that crops each daily array to BBOX.

    Returns None when the original loader returns None. If the crop itself
    fails (e.g. a coordinate is missing), the uncropped array is returned.
    """
    da = _old_open_single_da(fp)
    if da is None:
        return None
    try:
        # Crop while still at the single-file (one day) level.
        da_cut = _slice_by_bbox(da, LAT, LON, BBOX)
        return da_cut
    except Exception:
        # Deliberate best-effort: hand back the array as it came.
        return da

# Marker consulted by the guard above on subsequent runs of this cell.
_open_single_da._bbox_wrapped = True


In [None]:
# @title 4 ‚Äî VODCA ‚Üí GeoTIFF bimensal (BBox antecipado, tiles, cache por janela + checkpoint)
%pip -q install dask

import os, gc, math, shutil
import numpy as np
import rasterio as rio
from rasterio.windows import Window
from tqdm.notebook import tqdm
import dask

# Ask dask to split large chunks produced by slicing instead of creating huge ones
dask.config.set({"array.slicing.split_large_chunks": True})

# Output folder for the GeoTIFFs
OUT_DIR = INT_DIR / "vodca"
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Parameters
BLOCK = 256                     # write tile size in pixels (lower it for a lighter memory footprint)
NODATA_F32 = np.float32(-9999)  # nodata value written to the GeoTIFFs
DO_POLY_CLIP = False             # True: final clip by the Pantanal polygon; False: keep the BBox only (faster)

def _write_tiled_geotiff(da_clip, out_tif, nodata=NODATA_F32, block=BLOCK):
    """
    Write a 2-D DataArray (LAT x LON) to a tiled, LZW-compressed float32
    GeoTIFF, computing and writing one block x block window at a time.

    Parameters
    ----------
    da_clip : xarray.DataArray
        2-D array with dims named LAT/LON and a CRS set through rioxarray.
    out_tif : str or path-like
        Destination file.
    nodata : float
        Value stored wherever the data is not finite (NaN or +/-inf).
    block : int
        Tile edge in pixels, used for both dask chunks and GeoTIFF tiles.

    The raster is written to "<out_tif>.part" and renamed onto `out_tif` only
    after it has been written completely. An interrupted run therefore never
    leaves a truncated file at `out_tif`, which the export loop would
    otherwise treat as finished.
    Non-finite values are replaced in every tile; previously +/-inf was only
    replaced in tiles that also contained a NaN.
    """
    da_w = da_clip.astype("float32").chunk({LON: block, LAT: block})

    height = int(da_w.sizes[LAT])
    width  = int(da_w.sizes[LON])
    transform = da_w.rio.transform()
    crs = da_w.rio.crs

    profile = {
        "driver": "GTiff",
        "height": height,
        "width":  width,
        "count": 1,
        "dtype": "float32",
        "crs": crs,
        "transform": transform,
        "tiled": True,
        "blockxsize": block,
        "blockysize": block,
        "compress": "lzw",
        "BIGTIFF": "YES",
        "nodata": float(nodata),
    }

    tmp_tif = f"{out_tif}.part"
    try:
        # Cap GDAL's internal cache (MB)
        with rio.Env(GDAL_CACHEMAX=256):
            with rio.open(tmp_tif, "w", **profile) as dst:
                for y0 in range(0, height, block):
                    y1 = min(y0 + block, height)
                    for x0 in range(0, width, block):
                        x1 = min(x0 + block, width)
                        # Lazy slice -> compute only this window
                        sub = da_w.isel({LAT: slice(y0, y1), LON: slice(x0, x1)}).data
                        sub_np = np.array(sub.compute(), dtype="float32")
                        # Store nodata wherever the value is not finite
                        sub_np = np.where(np.isfinite(sub_np), sub_np, nodata).astype("float32")
                        w = Window.from_slices((y0, y1), (x0, x1))
                        dst.write(sub_np, 1, window=w)
        # Publish the finished file in one step
        os.replace(tmp_tif, out_tif)
    finally:
        # On failure, remove the partial file; after a successful rename it no longer exists.
        if os.path.exists(tmp_tif):
            os.remove(tmp_tif)
        del da_w
        gc.collect()

def _save_checkpoint():
    """Write `state` to CKP_VODCA, closing the file so the JSON is flushed to disk."""
    with open(CKP_VODCA, "w") as fh:
        json.dump(state, fh)

# Export loop: one GeoTIFF per window label
for yyyymm in tqdm(TIME_SLICES, desc="VODCA ‚Üí GeoTIFF (BBox early, tiles)"):
    out_tif = OUT_DIR / f"vodca_{yyyymm}.tif"
    if out_tif.exists():
        # Output already on disk: record it and move on.
        state["done"][yyyymm] = True
        _save_checkpoint()
        continue

    # Per-window cache directory; the loader copies the .nc files it opens into it.
    CURRENT_WINDOW_CACHE = LOCAL_CACHE_DIR / yyyymm
    CURRENT_WINDOW_CACHE.mkdir(parents=True, exist_ok=True)

    try:
        # Flexible window for this label
        t0, t1 = time_range_for_yyyymm(yyyymm, CAL)

        # Open the daily files for the window (already cropped to the BBox
        # when the BBox wrapper cell has been run).
        da_win = open_for_window(t0, t1)
        if da_win.sizes.get("time", 0) == 0:
            # No data in this window: mark it as handled, write nothing.
            # The finally block below takes care of the cache clean-up.
            state["done"][yyyymm] = True
            _save_checkpoint()
            continue

        # Mean over the period
        da_mean = da_win.mean(dim="time", skipna=True)

        # Declare the spatial dims and the CRS
        da_mean = (da_mean
                   .rio.set_spatial_dims(x_dim=LON, y_dim=LAT, inplace=False)
                   .rio.write_crs(4326, inplace=False))

        # Optional final clip by the polygon (the BBox crop has already reduced the grid)
        if DO_POLY_CLIP:
            try:
                da_out = da_mean.rio.clip([PANT_WGS_GEOM], crs=4326, drop=True)
            except Exception:
                # Deliberate best-effort: keep the BBox-only result if the clip fails.
                da_out = da_mean
        else:
            da_out = da_mean

        # Tiled write (low RAM)
        _write_tiled_geotiff(da_out, out_tif, nodata=NODATA_F32, block=BLOCK)

        # Checkpoint
        state["done"][yyyymm] = True
        _save_checkpoint()

    finally:
        # ALWAYS drop this window's local cache, even on error or `continue`.
        if CURRENT_WINDOW_CACHE is not None:
            shutil.rmtree(CURRENT_WINDOW_CACHE, ignore_errors=True)
        CURRENT_WINDOW_CACHE = None
        # Release the per-window arrays (this code runs at module level, so they are globals).
        for _v in ("da_win", "da_mean", "da_out"):
            globals().pop(_v, None)
        gc.collect()

print("‚úî Finalizado. GeoTIFFs em:", OUT_DIR)








VODCA ‚Üí GeoTIFF (BBox early, tiles):   0%|          | 0/210 [00:00<?, ?it/s]

‚úî Finalizado. GeoTIFFs em: /content/drive/MyDrive/Pantanal_TippingPoints/index/interim/vodca
