# In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
GEONLM com CUDA (CuPy + cuML + cuGraph), compatível com NumPy 2.0 (NEP 50):
- RESET de patches antigos do CuPy (copyto/full) se houver
- Shim de cp.full que NÃO usa cp.copyto (evita recursão)
- cuML NearestNeighbors (GPU) + cuGraph SSSP (GPU)
"""

import warnings, time
import numpy as np
import cupy as cp
from skimage.transform import downscale_local_mean

# ============================================================
# RESET CUPY (remove monkey-patches anteriores, se existirem)
# ============================================================
# Best-effort restore of CuPy's stock `copyto` / `full` in case an earlier run
# in the same interpreter monkey-patched them, and removal of the marker
# attribute left behind by the shim.
# The module paths below are private CuPy internals, so a missing module or
# attribute is tolerated; any other exception is a genuine error and is
# allowed to propagate instead of being silently swallowed.
try:
    from cupy._manipulation import basic as _mbasic
    from cupy._creation import basic as _cbasic
    cp.copyto = _mbasic.copyto
    cp.full   = _cbasic.full
    if hasattr(cp, "_nep50_full_patched"):
        delattr(cp, "_nep50_full_patched")
except (ImportError, AttributeError):
    pass

# ============================================================
# SHIM NEP50: novo cp.full que NÃO chama cp.copyto
# (aloca via cp.ndarray e preenche com broadcasting)
# ============================================================
if not hasattr(cp, "_nep50_full_patched"):
    def _full_nep50_no_copyto(shape, fill_value, dtype=None, order='C'):
        """Replacement for ``cp.full`` that never goes through ``cp.copyto``.

        Args:
            shape: Shape of the array to create.
            fill_value: Scalar or array-like broadcast into the result.
            dtype: Target dtype. When omitted it is taken from ``fill_value``.
            order: Memory layout forwarded to ``cp.ndarray``.

        Returns:
            A new device array of the requested shape filled with
            ``fill_value``.
        """
        if dtype is None:
            if isinstance(fill_value, cp.ndarray):
                # np.array() on a device array would attempt an implicit
                # device-to-host conversion, which CuPy rejects; read the
                # dtype straight from the device array instead.
                dtype = fill_value.dtype
            else:
                dtype = np.array(fill_value).dtype
        # Allocate without initialising ...
        a = cp.ndarray(shape, dtype, order=order)
        # ... then fill through a broadcasting assignment (no copyto call).
        a[...] = cp.asarray(fill_value, dtype=dtype)
        return a
    cp.full = _full_nep50_no_copyto
    # Marker so the shim is installed at most once per reset.
    cp._nep50_full_patched = True
# ============================================================

import skimage.io, skimage.color
from skimage.metrics import peak_signal_noise_ratio, structural_similarity

import cudf
import cugraph
from cuml.neighbors import NearestNeighbors

# Suppress all warnings from here on (applies to every library imported above).
warnings.simplefilter(action='ignore')

# -----------------------------
# Utilidades em GPU
# -----------------------------
def pad_symmetric_gpu(img_cp: cp.ndarray, f: int) -> cp.ndarray:
    """Return ``img_cp`` padded by ``f`` pixels on each side of both axes,
    using ``cp.pad`` in ``'symmetric'`` mode."""
    border = int(f)
    pad_spec = ((border, border),) * 2  # same width for rows and columns
    return cp.pad(img_cp, pad_spec, mode='symmetric')


def extract_window_patches_gpu(img_n: cp.ndarray, i: int, j: int, f: int, t: int, m: int, n: int):
    """
    Collect the flattened patches of the search window around pixel (i, j).

    Args:
        img_n: Padded image; pixel (i, j) of the unpadded image sits at
            (i + f, j + f).
        i, j: Pixel coordinates in the unpadded image.
        f: Patch radius; each patch is (2f + 1) x (2f + 1).
        t: Search-window radius. The window spans [centre - t, centre + t)
            on each axis, clipped to the unpadded image area.
        m, n: Number of rows / columns of the unpadded image.

    Returns:
        dataset_cp: float32 array (num_elem, patch_size), one row per window
            pixel in row-major order.
        source_idx: Row index of the patch centred on (i, j), or -1 when the
            window does not contain it (e.g. ``t == 0``).
        pixels_busca_cp: float32 array (num_elem,) with the centre intensity
            of every patch, in the same order as ``dataset_cp``.
    """
    # Local import: this cell does not import the helper at module level.
    from cupy.lib.stride_tricks import sliding_window_view

    i = int(i); j = int(j); f = int(f); t = int(t); m = int(m); n = int(n)

    # Centre of the query pixel in padded coordinates.
    im = i + f
    jn = j + f

    # Half-open window bounds, clipped to the unpadded image area.
    rmin = max(im - t, f)
    rmax = min(im + t, m + f)
    smin = max(jn - t, f)
    smax = min(jn + t, n + f)

    side = 2 * f + 1
    patch_size = side * side
    n_rows = max(rmax - rmin, 0)
    n_cols = max(smax - smin, 0)
    num_elem = n_rows * n_cols

    dataset_cp = cp.empty((num_elem, patch_size), dtype=cp.float32)
    pixels_busca_cp = cp.empty((num_elem,), dtype=cp.float32)

    if num_elem > 0:
        # windows[a, b] is img_n[a:a+side, b:b+side], i.e. the patch centred
        # at (a + f, b + f).  One strided copy per output array replaces the
        # former per-pixel Python loop.
        windows = sliding_window_view(img_n, (side, side))
        dataset_cp.reshape(n_rows, n_cols, side, side)[...] = \
            windows[rmin - f:rmax - f, smin - f:smax - f]
        pixels_busca_cp.reshape(n_rows, n_cols)[...] = img_n[rmin:rmax, smin:smax]

    # Row-major position of the query pixel inside the window.
    if rmin <= im < rmax and smin <= jn < smax:
        source_idx = (im - rmin) * n_cols + (jn - smin)
    else:
        source_idx = -1

    return dataset_cp, int(source_idx), pixels_busca_cp


def knn_graph_edges_gpu(dataset_cp: cp.ndarray, nn: int) -> cudf.DataFrame:
    """
    Build a k-nearest-neighbour edge list with cuML.

    The model is fitted on ``dataset_cp`` and queried with the same rows.
    The requested neighbour count is clamped to [1, max(1, n_samples - 1)].

    Returns a cudf DataFrame with columns 'src' (int32), 'dst' (int32) and
    'weight' (float32), holding one row per (sample, neighbour) pair.
    """
    total = int(dataset_cp.shape[0])
    upper = max(1, total - 1)
    k = int(min(max(1, nn), upper))  # keep k within the sample count

    model = NearestNeighbors(n_neighbors=k, metric='euclidean', output_type='cupy')
    model.fit(dataset_cp)
    dists, neigh = model.kneighbors(dataset_cp)  # both (total, k)

    columns = {
        'src': cp.repeat(cp.arange(total, dtype=cp.int32), k),
        'dst': neigh.reshape(-1).astype(cp.int32, copy=False),
        'weight': dists.reshape(-1).astype(cp.float32, copy=False),
    }
    edges = cudf.DataFrame(columns)

    # Pin the column dtypes explicitly.
    for col, kind in (('src', 'int32'), ('dst', 'int32'), ('weight', 'float32')):
        edges[col] = edges[col].astype(kind)
    return edges


def sssp_gpu(edges_gdf, source_idx):
    """
    Run cuGraph single-source shortest paths over a weighted edge list.

    ``edges_gdf`` must provide 'src', 'dst' and 'weight' columns; any column
    whose dtype differs from int32 / int32 / float32 is cast in place on the
    frame that was passed in.  Returns the frame produced by ``cugraph.sssp``
    with its 'distance' column cast to float32.
    """
    # Enforce the expected dtypes (mutates the caller's frame when needed).
    for col, kind in (('src', 'int32'), ('dst', 'int32'), ('weight', 'float32')):
        if edges_gdf[col].dtype != kind:
            edges_gdf[col] = edges_gdf[col].astype(kind)

    source = np.int32(source_idx)

    graph = cugraph.Graph(directed=True)
    # The weight column is attached to the graph here, so sssp() below is
    # called without a separate weight argument.
    graph.from_cudf_edgelist(
        edges_gdf,
        source='src',
        destination='dst',
        edge_attr='weight',
        renumber=False,
    )

    result = cugraph.sssp(graph, source=source)
    result['distance'] = result['distance'].astype('float32')
    return result


def geonlm_pixel_gpu(img_n: cp.ndarray, i: int, j: int, f: int, t: int, h: float, nn: int, m: int, n: int) -> float:
    """
    Compute the GEONLM value of pixel (i, j) and return it as a host float.

    Steps: gather the search-window patches, build their kNN graph, run SSSP
    from the query patch, then average the window pixels weighted by
    exp(-d^2 / h^2), where d is the shortest-path distance.
    """
    patches, src_node, window_pixels = extract_window_patches_gpu(img_n, i, j, f, t, m, n)
    sssp_df = sssp_gpu(knn_graph_edges_gpu(patches, nn), src_node)

    node_ids = sssp_df['vertex'].values
    geo = sssp_df['distance'].values.astype(cp.float32, copy=False)

    # Keep only nodes whose distance is finite.
    reachable = cp.isfinite(geo)
    geo = geo[reachable]
    node_ids = node_ids[reachable].astype(cp.int32, copy=False)

    bandwidth = cp.asarray(h, dtype=cp.float32)
    weights = cp.exp(- (geo * geo) / (bandwidth * bandwidth))
    intensities = window_pixels[node_ids].astype(cp.float32, copy=False)

    weighted_sum = (weights * intensities).sum(dtype=cp.float32)
    norm = weights.sum(dtype=cp.float32)

    if float(cp.asnumpy(norm)) != 0.0:
        result = weighted_sum / norm
    else:
        # No usable weight: fall back to the noisy pixel itself.
        result = img_n[int(i) + int(f), int(j) + int(f)].astype(cp.float32, copy=False)

    return float(cp.asnumpy(result))


def geonlm_gpu(ruidosa_np: np.ndarray, f=4, t=7, h_geo=150.0, nn=10) -> np.ndarray:
    """
    GEONLM entry point: NumPy image in, float32 NumPy image out.

    The image is uploaded as float32, symmetric-padded by ``f`` and filtered
    one pixel at a time with ``geonlm_pixel_gpu``.
    """
    f = int(f)
    t = int(t)
    nn = int(nn)
    h_geo = float(h_geo)

    noisy_dev = cp.asarray(ruidosa_np, dtype=cp.float32)
    rows, cols = (int(d) for d in noisy_dev.shape)
    padded = pad_symmetric_gpu(noisy_dev, f)

    filtrada = np.empty((rows, cols), dtype=np.float32)
    # Row-major sweep over every pixel of the unpadded image.
    for r, c in np.ndindex(rows, cols):
        filtrada[r, c] = geonlm_pixel_gpu(padded, r, c, f, t, h_geo, nn, rows, cols)

    return filtrada


# -----------------------------
# Demo simples (equivalente ao main)
# -----------------------------
if __name__ == "__main__":
    # Demo: load an image, downscale it, add Gaussian noise, filter it with
    # geonlm_gpu, then report PSNR/SSIM and save the result.
    # Adjust the image path to your dataset.
    img = skimage.io.imread('../0.gif')
    # Keep only index 0 along the first axis when the array has more than two
    # dimensions.
    # NOTE(review): presumably meant to pick the first GIF frame; an (H, W, C)
    # colour image would lose its first row axis here -- confirm.
    img = img[0, :, :] if len(img.shape) > 2 else img
    if img.ndim > 2:
        img = skimage.color.rgb2gray(img) * 255.0
    from skimage.transform import downscale_local_mean
    # Shrink by averaging 16x16 blocks and truncate to uint8.
    img =  downscale_local_mean(img, (16, 16)).astype(np.uint8)
    #img = img.astype(np.float32)
    m, n = img.shape
    print(f'Num. linhas = {m}')
    print(f'Num. colunas = {n}\n')

    # Additive Gaussian noise (unseeded, so every run differs), clipped to
    # the [0, 255] range.
    sigma = float(10.0)
    ruido = np.random.normal(0.0, sigma, (m, n)).astype(np.float32)
    ruidosa = np.clip(img + ruido, 0.0, 255.0).astype(np.float32)

    # Filter parameters: patch radius, search radius, bandwidth, k of the kNN graph.
    f = int(4)
    t = int(7)
    h_geo = float(150.0)
    nn = int(10)

    # Smoke test of the cp.full shim (must not recurse).
    x = cp.full((2,2), 0, dtype=cp.float32)
    print("cp.full smoke test:", x.dtype, float(cp.asnumpy(x.sum())))

    print('***********************************')
    print('*            GEONLM CUDA          *')
    print('***********************************\n')

    # Wall-clock timing of the whole filtering call.
    t0 = time.time()
    filtrada_geo = geonlm_gpu(ruidosa, f=f, t=t, h_geo=h_geo, nn=nn)
    t1 = time.time()

    # Quality metrics are computed on uint8 versions of both images.
    filtrada_u8 = np.clip(filtrada_geo, 0.0, 255.0).astype(np.uint8)
    img_u8 = np.clip(img, 0.0, 255.0).astype(np.uint8)

    psnr_geo = peak_signal_noise_ratio(img_u8, filtrada_u8)
    ssim_geo = structural_similarity(img_u8, filtrada_u8)

    print(f'\nPSNR (GEO NLM): {psnr_geo:.4f}')
    print(f'SSIM (GEO NLM): {ssim_geo:.4f}')
    print(f'Tempo total (GEONLM CUDA): {t1 - t0:.3f} s\n')

    skimage.io.imsave('GEONLM_cuda.png', filtrada_u8)


# In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
GEONLM (Geodesic NLM) em GPU (RAPIDS):
- CuPy + cuML (kNN) + cuGraph (SSSP)
- Compatível com NumPy 2.0 (NEP 50) via shim em cp.full (sem patch em copyto)
- Processamento tileado: 1 grafo por bloco (tile)
"""

import warnings, time
import numpy as np
import cupy as cp

# ========= RESET patches antigos (se houve) =========
# Best-effort restore of CuPy's stock `copyto` / `full` in case an earlier run
# in the same interpreter monkey-patched them, and removal of the shim marker.
# The module paths are private CuPy internals, so a missing module or
# attribute is tolerated; any other exception is allowed to propagate rather
# than being silently swallowed.
try:
    from cupy._manipulation import basic as _mbasic
    from cupy._creation import basic as _cbasic
    cp.copyto = _mbasic.copyto
    cp.full   = _cbasic.full
    if hasattr(cp, "_nep50_full_patched"):
        delattr(cp, "_nep50_full_patched")
except (ImportError, AttributeError):
    pass

# ========= SHIM NEP50: cp.full sem usar copyto =========
if not hasattr(cp, "_nep50_full_patched"):
    def _full_nep50_no_copyto(shape, fill_value, dtype=None, order='C'):
        """Replacement for ``cp.full`` that never goes through ``cp.copyto``.

        ``dtype`` defaults to the dtype of ``fill_value``.  The result is
        allocated uninitialised and then filled by a broadcasting assignment.
        """
        if dtype is None:
            if isinstance(fill_value, cp.ndarray):
                # np.array() on a device array would attempt an implicit
                # device-to-host conversion, which CuPy rejects.
                dtype = fill_value.dtype
            else:
                dtype = np.array(fill_value).dtype
        a = cp.ndarray(shape, dtype, order=order)
        a[...] = cp.asarray(fill_value, dtype=dtype)  # broadcast/elementwise
        return a
    cp.full = _full_nep50_no_copyto
    # Marker so the shim is installed at most once per reset.
    cp._nep50_full_patched = True

import rmm
from rmm.allocators.cupy import rmm_cupy_allocator
import cudf, cugraph
from cuml.neighbors import NearestNeighbors
import skimage.io, skimage.color
from skimage.metrics import peak_signal_noise_ratio, structural_similarity
from cupy.lib.stride_tricks import sliding_window_view as cp_sw

# Suppress all warnings from here on (applies to every library imported above).
warnings.simplefilter(action='ignore')

# ========= RMM pool (reduz overhead de alocação) =========
def enable_rmm_pool(initial_pool_size_bytes=2_000_000_000):
    """Route CuPy allocations through an RMM memory pool.

    Args:
        initial_pool_size_bytes: Initial pool size, used only when a pool has
            to be created by this call.

    The pool is created at most once: re-initialising RMM while arrays from a
    previous initialisation are still alive leaves them pointing at released
    memory, which happens easily when this is called again in the same
    session.  If the current device resource is already a pool, it is reused
    as-is and ``initial_pool_size_bytes`` is ignored.
    """
    current = rmm.mr.get_current_device_resource()
    if not isinstance(current, rmm.mr.PoolMemoryResource):
        rmm.reinitialize(pool_allocator=True, initial_pool_size=initial_pool_size_bytes)
    cp.cuda.set_allocator(rmm_cupy_allocator)

# ========= Utilidades =========
def build_knn_graph_cudf(patches_cp: cp.ndarray, nn: int) -> cudf.DataFrame:
    """
    Build the kNN edge list of a set of flattened patches.

    patches_cp: (n_samples, patch_dim) array; the cuML model is fitted on it
    and queried with the same rows.  The neighbour count is clamped to
    [1, max(1, n_samples - 1)].

    Returns a cudf DataFrame with columns 'src' (int32), 'dst' (int32) and
    'weight' (float32).
    """
    count = int(patches_cp.shape[0])
    k = int(min(max(1, nn), max(1, count - 1)))

    finder = NearestNeighbors(n_neighbors=k, metric='euclidean', output_type='cupy')
    finder.fit(patches_cp)
    dist_mat, idx_mat = finder.kneighbors(patches_cp)  # both (count, k)

    origins = cp.repeat(cp.arange(count, dtype=cp.int32), k)
    targets = idx_mat.reshape(-1).astype(cp.int32, copy=False)
    weights = dist_mat.reshape(-1).astype(cp.float32, copy=False)

    frame = cudf.DataFrame({'src': origins, 'dst': targets, 'weight': weights})
    # Pin the column dtypes explicitly.
    for col, kind in (('src', 'int32'), ('dst', 'int32'), ('weight', 'float32')):
        frame[col] = frame[col].astype(kind)
    return frame

def sssp_on_graph(G: cugraph.Graph, source_idx: int) -> cudf.DataFrame:
    """
    Run ``cugraph.sssp`` on ``G`` from ``source_idx`` (passed as np.int32) and
    return its result frame with 'distance' cast to float32.

    No weight argument is passed to sssp(); per the original author's note
    for cuGraph 24.06, the weights come from the graph's edge attribute.
    """
    source = np.int32(source_idx)
    result = cugraph.sssp(G, source=source)
    result['distance'] = result['distance'].astype('float32')
    return result

# ========= Núcleo por bloco (tile) =========
def geonlm_block(img_pad: cp.ndarray, top: int, left: int,
                 B: "int | tuple[int, int]", f: int, t: int, h: float, nn: int) -> cp.ndarray:
    """
    Filter one tile whose top-left corner is (top, left) in the unpadded image.

    Args:
        img_pad: Image padded by (f + t) pixels on every side.
        top, left: Tile origin in unpadded coordinates.
        B: Tile size.  An int means a square B x B tile (original behaviour);
            a ``(rows, cols)`` pair describes a rectangular tile, which is
            what the image border produces when the image size is not a
            multiple of the tile size.
        f: Patch radius (patch is (2f + 1) x (2f + 1)).
        t: Search radius added around the tile.
        h: Bandwidth of the exp(-d^2 / h^2) weights.
        nn: Number of neighbours of the kNN graph.

    Returns:
        float32 device array of shape (rows, cols) with the filtered tile.

    Raises:
        ValueError: If ``img_pad`` is too small to provide the full
            (rows + 2(t+f)) x (cols + 2(t+f)) macro window.
    """
    f = int(f); t = int(t)
    top = int(top); left = int(left)
    if isinstance(B, (tuple, list)):
        rows, cols = (int(v) for v in B)
    else:
        rows = cols = int(B)
    h_cp = cp.asarray(float(h), dtype=cp.float32)
    h_sq = h_cp * h_cp  # loop-invariant

    # In padded coordinates the tile starts at (top + margin, left + margin),
    # so the macro window (tile plus margin on every side) starts at (top, left).
    margin = f + t
    macro = img_pad[top:top + rows + 2 * margin,
                    left:left + cols + 2 * margin]
    expected = (rows + 2 * margin, cols + 2 * margin)
    if tuple(macro.shape) != expected:
        raise ValueError(
            f"macro window has shape {tuple(macro.shape)}, expected {expected}; "
            "img_pad must be padded by f + t on every side"
        )

    ph = 2 * f + 1          # patch side
    H = rows + 2 * t        # graph nodes along the rows
    W = cols + 2 * t        # graph nodes along the columns

    # Every patch of the macro window: (H, W, ph, ph) -> (H*W, ph*ph).
    sw = cp_sw(macro, (ph, ph))
    patches = sw.reshape(H * W, ph * ph).astype(cp.float32, copy=False)

    # One kNN graph per tile, shared by all SSSP runs below.
    edges_gdf = build_knn_graph_cudf(patches, nn)
    G = cugraph.Graph(directed=True)
    G.from_cudf_edgelist(edges_gdf, source='src', destination='dst',
                         edge_attr='weight', renumber=False)

    # Centre intensity of every patch (one per graph node, row-major).
    centers = macro[f:f + H, f:f + W].reshape(H * W)

    out_block = cp.empty((rows, cols), dtype=cp.float32)

    for bi in range(rows):
        for bj in range(cols):
            # Tile pixel (bi, bj) is graph node (bi + t, bj + t).
            src = (bi + t) * W + (bj + t)
            df_sssp = sssp_on_graph(G, src)

            dist = df_sssp['distance'].values
            valid = cp.isfinite(dist)
            d = dist[valid]
            v = df_sssp['vertex'].values[valid].astype(cp.int32, copy=False)

            sims = cp.exp(- (d * d) / h_sq)
            pix = centers[v].astype(cp.float32, copy=False)

            Z = sims.sum(dtype=cp.float32)
            if float(cp.asnumpy(Z)) == 0.0:
                # No usable weight: keep the noisy pixel.
                out_block[bi, bj] = macro[f + t + bi, f + t + bj].astype(cp.float32, copy=False)
            else:
                NL = (sims * pix).sum(dtype=cp.float32)
                out_block[bi, bj] = NL / Z

    return out_block

# ========= Full pipeline (tiled) =========
def geonlm_gpu_tiled(ruidosa_np: np.ndarray, f=4, t=7, h_geo=150.0, nn=10, B=16,
                     rmm_pool_bytes=2_000_000_000) -> np.ndarray:
    """
    Filter a noisy 2-D image tile by tile on the GPU.

    Args:
        ruidosa_np: Noisy image (NumPy); it is uploaded as float32.
        f, t, h_geo, nn: Patch radius, search radius, bandwidth and kNN size,
            forwarded to ``geonlm_block``.
        B: Nominal tile side; tiles on the bottom/right border are cropped to
            the image and may therefore be rectangular.
        rmm_pool_bytes: Forwarded to ``enable_rmm_pool``.

    Returns:
        Filtered image as a float32 NumPy array with the input's shape.

    Raises:
        ValueError: If ``B`` is smaller than 1.
    """
    f = int(f); t = int(t); B = int(B)
    if B < 1:
        raise ValueError("B (tile size) must be >= 1")

    enable_rmm_pool(rmm_pool_bytes)

    ruidosa_cp = cp.asarray(ruidosa_np, dtype=cp.float32)
    m, n = map(int, ruidosa_cp.shape)

    # Padding must be (f + t) so every tile has a full macro window.
    pad_total = f + t
    img_pad = cp.pad(ruidosa_cp, ((pad_total, pad_total), (pad_total, pad_total)), mode='symmetric')

    out_cp = cp.empty_like(ruidosa_cp, dtype=cp.float32)

    for i in range(0, m, B):
        for j in range(0, n, B):
            bi = min(B, m - i)
            bj = min(B, n - j)
            # Pass both extents: border tiles are not necessarily square.
            block_cp = geonlm_block(img_pad, i, j, (bi, bj), f, t, h_geo, nn)
            out_cp[i:i+bi, j:j+bj] = block_cp

    return cp.asnumpy(out_cp).astype(np.float32)

# ========= Demo =========
if __name__ == "__main__":
    # Demo: load an image, downscale it, add Gaussian noise, run the tiled
    # filter, then report PSNR/SSIM and save the result.
    IMG_PATH = "../0.gif"  # adjust to your image

    img = skimage.io.imread(IMG_PATH)
    # Keep only index 0 along the first axis when the array has more than two
    # dimensions.
    # NOTE(review): presumably meant to pick the first GIF frame; an (H, W, C)
    # colour image would lose its first row axis here -- confirm.
    img = img[0, :, :] if img.ndim > 2 else img
    if img.ndim > 2:
        img = skimage.color.rgb2gray(img) * 255.0
    img = img.astype(np.float32)
    from skimage.transform import downscale_local_mean
    # Shrink by averaging 16x16 blocks and truncate to uint8.
    img =  downscale_local_mean(img, (16, 16)).astype(np.uint8)
    m, n = img.shape
    print(f"Imagem: {m}x{n}")

    # Additive Gaussian noise (unseeded), clipped to [0, 255].
    sigma = 10.0
    ruido = np.random.normal(0.0, sigma, (m, n)).astype(np.float32)
    ruidosa = np.clip(img + ruido, 0.0, 255.0).astype(np.float32)

    # Filter parameters: patch radius, search radius, bandwidth, kNN size, tile side.
    f = 4
    t = 7
    h_geo = 150.0
    nn = 10
    B = 16

    # Smoke test of the cp.full shim (must not raise).
    x = cp.full((2,2), 0, dtype=cp.float32)
    print("cp.full ok:", x.dtype, float(cp.asnumpy(x.sum())))

    print("\n*** GEONLM (GPU, tileado) ***")
    # Wall-clock timing of the whole filtering call.
    t0 = time.time()
    filtrada_geo = geonlm_gpu_tiled(ruidosa, f=f, t=t, h_geo=h_geo, nn=nn, B=B,
                                    rmm_pool_bytes=2_000_000_000)
    t1 = time.time()

    # Quality metrics are computed on uint8 versions of both images.
    filtrada_u8 = np.clip(filtrada_geo, 0.0, 255.0).astype(np.uint8)
    img_u8 = np.clip(img, 0.0, 255.0).astype(np.uint8)

    psnr_geo = peak_signal_noise_ratio(img_u8, filtrada_u8)
    ssim_geo = structural_similarity(img_u8, filtrada_u8)

    print(f"PSNR (GEO NLM): {psnr_geo:.4f}")
    print(f"SSIM (GEO NLM): {ssim_geo:.4f}")
    print(f"Tempo total: {t1 - t0:.3f} s")

    skimage.io.imsave('GEONLM_cuda_tiled.png', filtrada_u8)


# In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
GEONLM (Geodesic NLM) em GPU (RAPIDS):
- CuPy + cuML (kNN) + cuGraph (SSSP)
- Compatível com NumPy 2.0 (NEP 50) via shim em cp.full (sem patch em copyto)
- Processamento tileado: 1 grafo por bloco, reaproveitando estruturas
- Métricas de GPU (tempo, uso, memória, potência) + headroom (quanto dá pra aumentar)
"""

import warnings, time, statistics
import numpy as np
import cupy as cp
# Suppress all warnings before the RAPIDS / scikit-image imports below run.
warnings.simplefilter(action='ignore')

# ========= RESET patches antigos (se você testou shims antes) =========
# Best-effort restore of CuPy's stock `copyto` / `full` in case an earlier run
# in the same interpreter monkey-patched them, and removal of the shim marker.
# The module paths are private CuPy internals, so a missing module or
# attribute is tolerated; any other exception is allowed to propagate rather
# than being silently swallowed.
try:
    from cupy._manipulation import basic as _mbasic
    from cupy._creation import basic as _cbasic
    cp.copyto = _mbasic.copyto
    cp.full   = _cbasic.full
    if hasattr(cp, "_nep50_full_patched"):
        delattr(cp, "_nep50_full_patched")
except (ImportError, AttributeError):
    pass

# ========= SHIM NEP50: cp.full sem usar copyto (evita recursão) =========
if not hasattr(cp, "_nep50_full_patched"):
    def _full_nep50_no_copyto(shape, fill_value, dtype=None, order='C'):
        """Replacement for ``cp.full`` that never goes through ``cp.copyto``.

        ``dtype`` defaults to the dtype of ``fill_value``.  The result is
        allocated uninitialised and then filled by a broadcasting assignment.
        """
        if dtype is None:
            if isinstance(fill_value, cp.ndarray):
                # np.array() on a device array would attempt an implicit
                # device-to-host conversion, which CuPy rejects.
                dtype = fill_value.dtype
            else:
                dtype = np.array(fill_value).dtype
        a = cp.ndarray(shape, dtype, order=order)
        a[...] = cp.asarray(fill_value, dtype=dtype)  # broadcast/elementwise
        return a
    cp.full = _full_nep50_no_copyto
    # Marker so the shim is installed at most once per reset.
    cp._nep50_full_patched = True

# ========= RMM pool (reduz overhead de alocação) =========
import rmm
try:
    from rmm.allocators.cupy import rmm_cupy_allocator
except ImportError:
    # Allocator not available at this path; enable_rmm_pool() reports it.
    rmm_cupy_allocator = None

def enable_rmm_pool(initial_pool_size_bytes=5_000_000_000):
    """Route CuPy allocations through an RMM memory pool.

    Args:
        initial_pool_size_bytes: Initial pool size, used only when a pool has
            to be created by this call.

    Raises:
        RuntimeError: If the RMM allocator for CuPy could not be imported.
            This is checked first, so RMM is left untouched on failure.

    The pool is created at most once: re-initialising RMM while arrays from a
    previous initialisation are still alive leaves them pointing at released
    memory.  If the current device resource is already a pool, it is reused
    as-is and ``initial_pool_size_bytes`` is ignored.
    """
    if rmm_cupy_allocator is None:
        raise RuntimeError("RMM: não encontrei o allocator do CuPy nesta versão.")
    current = rmm.mr.get_current_device_resource()
    if not isinstance(current, rmm.mr.PoolMemoryResource):
        rmm.reinitialize(pool_allocator=True, initial_pool_size=initial_pool_size_bytes)
    cp.cuda.set_allocator(rmm_cupy_allocator)

# ========= RAPIDS / imagem =========
import cudf, cugraph
from cuml.neighbors import NearestNeighbors
import skimage.io, skimage.color
from skimage.metrics import peak_signal_noise_ratio, structural_similarity
from cupy.lib.stride_tricks import sliding_window_view as cp_sw

# ========= Cronômetro GPU (CUDA Events) =========
def time_gpu(func, *args, **kwargs):
    """Call ``func(*args, **kwargs)`` between two CUDA events.

    Returns ``(result, seconds)``, where seconds is the elapsed time between
    the two events as reported by ``cp.cuda.get_elapsed_time`` (milliseconds)
    divided by 1000.
    """
    begin, finish = cp.cuda.Event(), cp.cuda.Event()
    begin.record()
    result = func(*args, **kwargs)
    finish.record()
    finish.synchronize()  # wait until the closing event has completed
    elapsed_ms = cp.cuda.get_elapsed_time(begin, finish)
    return result, elapsed_ms / 1000.0

# ========= Medidor NVML (uso/mem/potência) =========
try:
    import pynvml as nv
except ImportError:
    # Fallback without NVML: only wall-clock duration is measured.
    class GPUMeter:
        """Stand-in used when ``pynvml`` cannot be imported."""

        def __init__(self, *a, **k):
            pass

        def __enter__(self):
            self.t0 = time.perf_counter()
            return self

        def __exit__(self, *a):
            self.t1 = time.perf_counter()

        def summary(self):
            """Return the measured duration; safe to call at any time."""
            now = time.perf_counter()
            dur = getattr(self, "t1", now) - getattr(self, "t0", now)
            return {"duration_s": round(dur, 3), "note": "NVML indisponível"}
else:
    class GPUMeter:
        """Context manager that samples GPU utilisation, used memory and power
        draw through NVML on a background thread.

        Args:
            device_index: NVML device index to monitor.
            interval: Seconds between two samples.

        Attributes:
            samples: List of ``(gpu_util_percent, mem_used_MiB, power_W)``
                tuples; ``power_W`` is ``None`` when the power query failed.
            mem_total_MiB: Total device memory, filled in by ``__enter__``.
        """

        def __init__(self, device_index=0, interval=0.25):
            self.idx = device_index
            self.interval = float(interval)
            self.samples = []
            self.mem_total_MiB = None

        def __enter__(self):
            import threading
            nv.nvmlInit()
            try:
                self.handle = nv.nvmlDeviceGetHandleByIndex(self.idx)
                mem = nv.nvmlDeviceGetMemoryInfo(self.handle)
            except Exception:
                # Do not leave NVML initialised when set-up fails.
                nv.nvmlShutdown()
                raise
            self.mem_total_MiB = mem.total/(1024**2)
            self._running = True
            # An Event lets __exit__ wake the sampler immediately instead of
            # waiting for a full sleep interval.
            self._stop = threading.Event()
            self.t0 = time.perf_counter()
            self.th = threading.Thread(target=self._loop, daemon=True)
            self.th.start()
            return self

        def _loop(self):
            """Sampling loop run by the background thread."""
            while not self._stop.is_set():
                try:
                    util = nv.nvmlDeviceGetUtilizationRates(self.handle)
                    mem  = nv.nvmlDeviceGetMemoryInfo(self.handle)
                except nv.NVMLError:
                    pass  # skip this sample, try again next tick
                else:
                    # A failed power query must not discard the utilisation
                    # and memory readings of the same tick.
                    try:
                        pwr_w = nv.nvmlDeviceGetPowerUsage(self.handle) / 1000.0
                    except nv.NVMLError:
                        pwr_w = None
                    self.samples.append((util.gpu, mem.used/(1024**2), pwr_w))
                self._stop.wait(self.interval)

        def __exit__(self, exc_type, exc, tb):
            self._running = False
            self._stop.set()
            try:
                self.th.join()
            finally:
                self.t1 = time.perf_counter()
                nv.nvmlShutdown()

        def summary(self):
            """Return duration plus average/maximum of the collected samples.

            Power entries are ``None`` when no power reading was obtained.
            """
            now = time.perf_counter()
            dur = getattr(self, "t1", now) - getattr(self, "t0", now)
            if not self.samples:
                return {"duration_s": round(dur, 3), "note": "sem amostras"}
            gpu = [s[0] for s in self.samples]
            mem = [s[1] for s in self.samples]
            pwr = [s[2] for s in self.samples if s[2] is not None]
            return {
                "duration_s": round(dur, 3),
                "gpu_util_avg_%": round(statistics.mean(gpu), 1),
                "gpu_util_max_%": max(gpu),
                "mem_used_avg_MiB": round(statistics.mean(mem), 1),
                "mem_used_max_MiB": round(max(mem), 1),
                "mem_total_MiB": round(self.mem_total_MiB, 1),
                "power_avg_W": round(statistics.mean(pwr), 1) if pwr else None,
                "power_max_W": round(max(pwr), 1) if pwr else None,
                "samples": len(self.samples),
                "interval_s": self.interval,
            }

# ========= Estimador de memória por tile (headroom) =========
def estimate_tile_bytes(f, t, B, nn):
    """Rough per-tile memory estimate in bytes.

    Counts ``(B + 2*(t + f))**2`` nodes, each with a float32 patch of
    ``(2*f + 1)**2`` values, ``nn`` edges of 12 bytes per node (scaled by a
    1.5 overhead factor) and two float32 values per node for distances and
    centres.
    """
    side = B + 2*(t + f)
    n_nodes = side**2
    patch_dim = (2*f + 1)**2
    graph_overhead = 1.5                        # allowance for graph structures

    patch_part = n_nodes * patch_dim * 4        # float32 patches
    edge_part = n_nodes * nn * 12               # int32 src + int32 dst + float32 weight
    extra_part = 2 * n_nodes * 4                # distances + centres
    return int(patch_part + int(graph_overhead * edge_part) + extra_part)

# ========= Construção do grafo kNN (cudf) =========
def build_knn_graph_cudf(patches_cp: cp.ndarray, nn: int) -> cudf.DataFrame:
    """Build the kNN edge list of a set of flattened patches.

    The cuML model is fitted on ``patches_cp`` and queried with the same
    rows; the neighbour count is clamped to [1, max(1, n_samples - 1)].
    Returns a cudf DataFrame with columns 'src' (int32), 'dst' (int32) and
    'weight' (float32).
    """
    count = int(patches_cp.shape[0])
    k = int(min(max(1, nn), max(1, count - 1)))

    finder = NearestNeighbors(n_neighbors=k, metric='euclidean', output_type='cupy')
    finder.fit(patches_cp)
    dist_mat, idx_mat = finder.kneighbors(patches_cp)

    frame = cudf.DataFrame({
        'src': cp.repeat(cp.arange(count, dtype=cp.int32), k),
        'dst': idx_mat.reshape(-1).astype(cp.int32, copy=False),
        'weight': dist_mat.reshape(-1).astype(cp.float32, copy=False),
    })
    # Pin the column dtypes explicitly.
    for col, kind in (('src', 'int32'), ('dst', 'int32'), ('weight', 'float32')):
        frame[col] = frame[col].astype(kind)
    return frame

def sssp_on_graph(G: cugraph.Graph, source_idx: int) -> cudf.DataFrame:
    """Run ``cugraph.sssp`` on ``G`` from ``source_idx`` (passed as np.int32)
    and return its result frame with 'distance' cast to float32.

    No weight argument is passed to sssp(), following the original author's
    note for cuGraph 24.06.
    """
    source = np.int32(source_idx)
    result = cugraph.sssp(G, source=source)
    result['distance'] = result['distance'].astype('float32')
    return result

# ========= Núcleo por bloco (tile) =========
def geonlm_block(img_pad: cp.ndarray, top: int, left: int,
                 B: "int | tuple[int, int]", f: int, t: int, h: float, nn: int) -> cp.ndarray:
    """
    Filter one tile whose top-left corner is (top, left) in the unpadded image.

    Args:
        img_pad: Image padded by (f + t) pixels on every side.
        top, left: Tile origin in unpadded coordinates.
        B: Tile size.  An int means a square B x B tile (original behaviour);
            a ``(rows, cols)`` pair describes a rectangular tile, which is
            what the image border produces when the image size is not a
            multiple of the tile size.
        f: Patch radius (patch is (2f + 1) x (2f + 1)).
        t: Search radius added around the tile.
        h: Bandwidth of the exp(-d^2 / h^2) weights.
        nn: Number of neighbours of the kNN graph.

    Returns:
        float32 device array of shape (rows, cols) with the filtered tile.

    Raises:
        ValueError: If ``img_pad`` is too small to provide the full
            (rows + 2(t+f)) x (cols + 2(t+f)) macro window.
    """
    f = int(f); t = int(t)
    top = int(top); left = int(left)
    if isinstance(B, (tuple, list)):
        rows, cols = (int(v) for v in B)
    else:
        rows = cols = int(B)

    # In padded coordinates the tile starts at (top + margin, left + margin),
    # so the macro window (tile plus margin on every side) starts at (top, left).
    margin = f + t
    macro = img_pad[top:top + rows + 2 * margin,
                    left:left + cols + 2 * margin]
    expected = (rows + 2 * margin, cols + 2 * margin)
    if tuple(macro.shape) != expected:
        raise ValueError(
            f"macro window has shape {tuple(macro.shape)}, expected {expected}; "
            "img_pad must be padded by f + t on every side"
        )

    ph = 2 * f + 1          # patch side
    H = rows + 2 * t        # graph nodes along the rows
    W = cols + 2 * t        # graph nodes along the columns

    # Every patch of the macro window: (H, W, ph, ph) -> (H*W, ph*ph).
    sw = cp_sw(macro, (ph, ph))
    patches = sw.reshape(H * W, ph * ph).astype(cp.float32, copy=False)

    # One kNN graph per tile, shared by all SSSP runs below.
    edges_gdf = build_knn_graph_cudf(patches, nn)
    G = cugraph.Graph(directed=True)
    G.from_cudf_edgelist(edges_gdf, source='src', destination='dst',
                         edge_attr='weight', renumber=False)

    # Centre intensity of every patch (one per graph node, row-major).
    centers = macro[f:f + H, f:f + W].reshape(H * W)
    out_block = cp.empty((rows, cols), dtype=cp.float32)
    h_cp = cp.asarray(float(h), dtype=cp.float32)
    h_sq = h_cp * h_cp  # loop-invariant

    for bi in range(rows):
        for bj in range(cols):
            # Tile pixel (bi, bj) is graph node (bi + t, bj + t).
            src = (bi + t) * W + (bj + t)
            df_sssp = sssp_on_graph(G, src)
            dist = df_sssp['distance'].values
            valid = cp.isfinite(dist)
            d = dist[valid]
            v = df_sssp['vertex'].values[valid].astype(cp.int32, copy=False)
            sims = cp.exp(- (d * d) / h_sq)
            pix = centers[v].astype(cp.float32, copy=False)
            Z = sims.sum(dtype=cp.float32)
            if float(cp.asnumpy(Z)) == 0.0:
                # No usable weight: keep the noisy pixel.
                out_block[bi, bj] = macro[f + t + bi, f + t + bj].astype(cp.float32, copy=False)
            else:
                NL = (sims * pix).sum(dtype=cp.float32)
                out_block[bi, bj] = NL / Z
    return out_block

# ========= Full pipeline (tiled) =========
def geonlm_gpu_tiled(ruidosa_np: np.ndarray, f=4, t=7, h_geo=150.0, nn=10, B=16,
                     rmm_pool_bytes=5_000_000_000) -> np.ndarray:
    """
    Filter a noisy 2-D image tile by tile on the GPU.

    Args:
        ruidosa_np: Noisy image (NumPy); it is uploaded as float32.
        f, t, h_geo, nn: Patch radius, search radius, bandwidth and kNN size,
            forwarded to ``geonlm_block``.
        B: Nominal tile side; tiles on the bottom/right border are cropped to
            the image and may therefore be rectangular.
        rmm_pool_bytes: Forwarded to ``enable_rmm_pool``.

    Returns:
        Filtered image as a float32 NumPy array with the input's shape.

    Raises:
        ValueError: If ``B`` is smaller than 1.
    """
    f = int(f); t = int(t); B = int(B)
    if B < 1:
        raise ValueError("B (tile size) must be >= 1")

    enable_rmm_pool(rmm_pool_bytes)
    ruidosa_cp = cp.asarray(ruidosa_np, dtype=cp.float32)
    m, n = map(int, ruidosa_cp.shape)
    # Padding must be (f + t) so every tile has a full macro window.
    pad_total = f + t
    img_pad = cp.pad(ruidosa_cp, ((pad_total, pad_total), (pad_total, pad_total)), mode='symmetric')
    out_cp = cp.empty_like(ruidosa_cp, dtype=cp.float32)
    for i in range(0, m, B):
        for j in range(0, n, B):
            bi = min(B, m - i)
            bj = min(B, n - j)
            # Pass both extents: border tiles are not necessarily square.
            block_cp = geonlm_block(img_pad, i, j, (bi, bj), f, t, h_geo, nn)
            out_cp[i:i+bi, j:j+bj] = block_cp
    return cp.asnumpy(out_cp).astype(np.float32)

# ========= Demo (troque IMG_PATH) =========
if __name__ == "__main__":
    # Demo: load an image, downscale it, add Gaussian noise, run the tiled
    # filter under GPU monitoring, then print timing, NVML statistics, a
    # memory-headroom estimate and PSNR/SSIM, and save the result.
    IMG_PATH = "../0.gif"  # adjust to your image
    img = skimage.io.imread(IMG_PATH)
    # Keep only index 0 along the first axis when the array has more than two
    # dimensions.
    # NOTE(review): presumably meant to pick the first GIF frame; an (H, W, C)
    # colour image would lose its first row axis here -- confirm.
    img = img[0, :, :] if img.ndim > 2 else img
    if img.ndim > 2:
        img = skimage.color.rgb2gray(img) * 255.0
    img = img.astype(np.float32)
    from skimage.transform import downscale_local_mean
    # Shrink by averaging 8x8 blocks and truncate to uint8.
    img =  downscale_local_mean(img, (8, 8)).astype(np.uint8)
    m, n = img.shape
    print(f"Imagem: {m}x{n}")

    # Additive Gaussian noise (unseeded), clipped to [0, 255].
    sigma = 10.0
    ruido = np.random.normal(0.0, sigma, (m, n)).astype(np.float32)
    ruidosa = np.clip(img + ruido, 0.0, 255.0).astype(np.float32)

    # Parameters: patch radius, search radius, bandwidth, kNN size, tile side.
    f = 4
    t = 15
    h_geo = 150.0
    nn = 30
    B = 96

    # # Smoke test do shim
    # x = cp.full((2,2), 0, dtype=cp.float32)
    # print("cp.full ok:", x.dtype, float(cp.asnumpy(x.sum())))

    # ===== GPU metrics + headroom =====
    # pretty_mib formats a byte count as MiB.
    def pretty_mib(x_bytes): return f"{x_bytes/(1024**2):.1f} MiB"
    # NOTE(review): this re-defines the module-level estimate_tile_bytes with
    # the same formula and rebinds the global name.
    def estimate_tile_bytes(f, t, B, nn):
        N = (B + 2*(t + f))**2
        P = (2*f + 1)**2
        patches_bytes = N * P * 4
        edges_bytes   = N * nn * 12
        extras_bytes  = 2 * N * 4
        fudge_graph   = 1.5
        return int(patches_bytes + int(fudge_graph * edges_bytes) + extras_bytes)

    # Drain pending GPU work so it is not attributed to the measured call.
    cp.cuda.Device().synchronize()
    with GPUMeter(device_index=0, interval=0.25) as gm:
        # The timed region covers the whole geonlm_gpu_tiled call, including
        # its enable_rmm_pool step.
        filtrada_geo, secs = time_gpu(geonlm_gpu_tiled, ruidosa, f=f, t=t, h_geo=h_geo, nn=nn, B=B)

    cp.cuda.Device().synchronize()
    print(f"\nTempo GPU (s): {secs:.3f}")
    stats = gm.summary()
    print("\n== GPU STATS ==")
    for k, v in stats.items(): print(f"{k}: {v}")

    # Memory headroom: total device memory minus the observed peak, expressed
    # in multiples of the estimated per-tile footprint.
    mem_total = stats.get("mem_total_MiB", None)
    mem_peak  = stats.get("mem_used_max_MiB", None)
    if mem_total and mem_peak:
        headroom_MiB = max(0.0, mem_total - mem_peak)
        tile_MiB = estimate_tile_bytes(f, t, B, nn) / (1024**2)
        extra_tiles = int(headroom_MiB // tile_MiB) if tile_MiB > 0 else 0
        print("\n== HEADROOM (memória) ==")
        print(f"Consumo estimado por tile: {tile_MiB:.1f} MiB")
        print(f"Memória de pico observada: {mem_peak:.1f} / {mem_total:.1f} MiB")
        print(f"Headroom estimado: {headroom_MiB:.1f} MiB  → tiles extras possíveis (paralelo): {extra_tiles}")
        # Tuning hint based on an average-utilisation threshold of 70 %.
        if stats.get("gpu_util_avg_%", 0) < 70:
            print("→ GPU ociosa: dá pra subir B (tile) ~+25–50% ou aumentar nn um pouco.")
        else:
            print("→ GPU já ocupada; aumente B com cautela ou reduza nn/t.")
    else:
        print("\n(NVML indisponível para headroom; apenas tempo de GPU medido.)")

    # Quality metrics, computed on uint8 versions of both images.
    filtrada_u8 = np.clip(filtrada_geo, 0.0, 255.0).astype(np.uint8)
    img_u8 = np.clip(img, 0.0, 255.0).astype(np.uint8)
    psnr_geo = peak_signal_noise_ratio(img_u8, filtrada_u8)
    ssim_geo = structural_similarity(img_u8, filtrada_u8)
    print(f"\nPSNR (GEO NLM): {psnr_geo:.4f}")
    print(f"SSIM (GEO NLM): {ssim_geo:.4f}")

    skimage.io.imsave('GEONLM_cuda_tiled.png', filtrada_u8)


# Output: Imagem: 64x64
