In [15]:
import skimage.io
import skimage.color
import cupy as cp
import time

# Função para replicar bordas (mirror padding)
#@njit
def mirror(A, f):
    n, m = A.shape
    nlin = n + 2 * f
    ncol = m + 2 * f
    B = cp.zeros((nlin, ncol), dtype=A.dtype)  # Criando B como CuPy
    A = cp.array(A)  # Garantindo que A também seja CuPy (se A for NumPy, converta)

    B[f:nlin-f, f:ncol-f] = A
    B[0:f, 0:f] = cp.flip(A[0:f, 0:f])
    B[0:f, ncol-f:ncol] = cp.flip(A[0:f, m-f:m])
    B[nlin-f:nlin, 0:f] = cp.flip(A[n-f:n, 0:f])
    B[nlin-f:nlin, ncol-f:ncol] = cp.flip(A[n-f:n, m-f:m])
    B[0:f, f:ncol-f] = cp.flipud(A[0:f, :])
    B[nlin-f:nlin, f:ncol-f] = cp.flipud(A[n-f:n, :])
    B[f:nlin-f, 0:f] = cp.fliplr(A[:, 0:f])
    B[f:nlin-f, ncol-f:ncol] = cp.fliplr(A[:, m-f:m])
    return B


nlm_kernel_shared_code = r'''
extern "C" __global__
void nlm_kernel_shared(
    const float* img_n, float* output,
    int m, int n, int f, int t, float h, int padded_width
) {
    int i = blockIdx.y * blockDim.y + threadIdx.y;
    int j = blockIdx.x * blockDim.x + threadIdx.x;

    int Bx = blockDim.x;
    int By = blockDim.y;

    int pad = f + t;

    int sh_width = Bx + 2 * pad;
    int sh_height = By + 2 * pad;

    extern __shared__ float sh_img[];

    int base_i = blockIdx.y * blockDim.y + f - pad;
    int base_j = blockIdx.x * blockDim.x + f - pad;

    // Carrega patch expandido na shared memory
    for (int y = threadIdx.y; y < sh_height; y += By) {
        for (int x = threadIdx.x; x < sh_width; x += Bx) {
            int img_i = base_i + y;
            int img_j = base_j + x;

            // Replicação de borda
            int ii = img_i < 0 ? 0 : (img_i >= m + 2*f ? m + 2*f - 1 : img_i);
            int jj = img_j < 0 ? 0 : (img_j >= n + 2*f ? n + 2*f - 1 : img_j);

            sh_img[y * sh_width + x] = img_n[ii * padded_width + jj];
        }
    }
    __syncthreads();

    if (i >= m || j >= n) return;

    int local_i = threadIdx.y + pad;
    int local_j = threadIdx.x + pad;

    float NL = 0.0f;
    float Z = 0.0f;

    int rmin = max(local_i - t, pad);
    int rmax = min(local_i + t, By + pad - 1);
    int smin = max(local_j - t, pad);
    int smax = min(local_j + t, Bx + pad - 1);

    for (int r = rmin; r <= rmax; ++r) {
        for (int s = smin; s <= smax; ++s) {
            float d2 = 0.0f;
            for (int u = -f; u <= f; ++u) {
                for (int v = -f; v <= f; ++v) {
                    float diff = sh_img[(local_i + u) * sh_width + (local_j + v)] -
                                 sh_img[(r + u) * sh_width + (s + v)];
                    d2 += diff * diff;
                }
            }
            float sij = __expf(-d2 / (h * h));
            Z += sij;
            NL += sij * sh_img[r * sh_width + s];
        }
    }
    output[i * n + j] = NL / Z;
}
'''

def NLM_fast_cuda_shared(img, h, f, t):
    img = img.astype(cp.float32)
    m, n = img.shape
    padded = mirror(img, f)

    kernel_code = nlm_kernel_shared_code.encode('ascii', 'ignore').decode('ascii')
    module = cp.RawModule(code=kernel_code, options=('-std=c++11',))
    #module = cp.RawModule(code=nlm_kernel_shared_code, options=('-std=c++11',))
    kernel = module.get_function("nlm_kernel_shared")

    output = cp.zeros((m, n), dtype=cp.float32)

    threads_per_block = (16, 16)
    block_x = (n + threads_per_block[0] - 1) // threads_per_block[0]
    block_y = (m + threads_per_block[1] - 1) // threads_per_block[1]
    grid = (block_x, block_y)

    sh_width = threads_per_block[0] + 2 * (f + t)
    sh_height = threads_per_block[1] + 2 * (f + t)
    shared_mem_size = sh_width * sh_height * 4  # float32 = 4 bytes

    kernel(
        grid, threads_per_block,
        (
            padded.ravel(), output.ravel(),
            cp.int32(m).get(), cp.int32(n).get(), cp.int32(f).get(), cp.int32(t).get(),
            cp.float32(h).get(), cp.int32(padded.shape[1]).get()
        ),
        shared_mem=shared_mem_size
    )

    return output

# ------------------ TESTE --------------------

# img = skimage.io.imread('extras/images/ct2.png')
# if len(img.shape) > 2:
#     img = skimage.color.rgb2gray(img)
#     img = 255 * img
# img = cp.array(img).astype(cp.float32)

# h = 10.0
# f = 3
# t = 10

# start = time.time()
# result = NLM_fast_cuda_shared(img, h, f, t)
# cp.cuda.Stream.null.synchronize()  # espera terminar GPU
# print("Tempo GPU:", time.time()-start)


In [5]:
import os
import math
import numpy as np
import cupy as cp

def read_directories(directory, img=None, exclude_json=None):
    # Get a list of filenames in the specified directory
    filenames = []
    for filename in os.listdir(directory):
        if img is not None:
            # If 'img' is provided, filter filenames containing it
            if img in filename:   
                filenames.append(filename)
        elif exclude_json is not None:
            filenames.append(filename.replace('.json',''))     
        else:
            filenames.append(filename)    
    return filenames


def add_poisson_noise(img):
    """
    Aplica ruído de Poisson corretamente sem overflow, utilizando CuPy (GPU).

    Parâmetros:
        img (cp.ndarray): Imagem com valores em [0,255] ou [0,1].

    Retorna:
        cp.ndarray: imagem ruidosa, clipada para [0, 255], dtype uint8.
    """
    # Se estiver em [0, 1], escala para 0-255
    if cp.max(img) <= 1.0:
        img = (img * 255).astype(cp.float32)
    else:
        img = img.astype(cp.float32)

    # Garante que os valores Poisson não causem overflow
    poisson_img = cp.random.poisson(img).astype(cp.float32)
    poisson_img = cp.clip(poisson_img, 0, 255)

    return poisson_img.astype(cp.uint8)


In [6]:
def compute_adaptive_q(sigma_est):
    q_nlm = 0.8 + 0.5 * cp.tanh(0.3 * (sigma_est - 1))
    q_geo = 1.0 + 0.7 * cp.tanh(0.25 * (sigma_est - 1.5))

    q_nlm = cp.clip(q_nlm, 0.7, 2.2) * 10
    q_geo = cp.clip(q_geo, 0.9, 2.7) * 10

    return q_nlm, q_geo

In [16]:
import skimage.io
import skimage.color
import numpy as np
import cupy as cp
import time
from numba import njit
from skimage.transform import downscale_local_mean
from skimage.restoration import estimate_sigma


# Função CUDA já dada no seu código, chamada NLM_fast_cuda_shared
# (deve estar definida no mesmo script)

# Carrega e prepara imagem
img_path = '../images/1.gif'
img = skimage.io.imread(img_path)
img = img[0, :, :] if len(img.shape) > 2 else img

if len(img.shape) > 2:
    img = skimage.color.rgb2gray(img)
    img = 255 * img


img_downscale = downscale_local_mean(img, (2, 2))


img_cpu = np.array(img_downscale, dtype=np.float32)
img_gpu = cp.array(img_cpu)

m, n = img_gpu.shape

sigma = 10
ruido = cp.random.normal(0, sigma, (m, n))

# Cria imagem ruidosa
ruidosa = img_gpu + ruido

# Clipa imagem para intervalo [0, 255]
ruidosa[cp.where(ruidosa > 255)] = 255
ruidosa[cp.where(ruidosa < 0)] = 0

#noised_poisson = add_poisson_noise(img_gpu)

#print(type(noised_poisson))

#noised_poisson_cp = cp.array(noised_poisson).get()

ruidosa = cp.array(ruidosa).get()


sigma_est = estimate_sigma(ruidosa)
print(f"Sigma estimado: {sigma_est}")

h_nlm, h_geo = compute_adaptive_q(sigma_est)
print(f"h_nlm: {h_nlm}")

h = 105
f = 4
t = 10

# Rodar CPU
start_cpu = time.time()


# Rodar GPU
start_gpu = time.time()
result_gpu = NLM_fast_cuda_shared(ruidosa, h_nlm, f, t)
result_gpu_cp = cp.asnumpy(result_gpu)
cp.cuda.Stream.null.synchronize()  # Espera GPU terminar
print("Tempo GPU:", time.time() - start_gpu)



# Comparação das duas imagens filtradas


# Opcional: salvar imagens para visualização
import matplotlib.pyplot as plt
plt.plot()

plt.title("GPU NLM FAST")
plt.imshow(result_gpu_cp, cmap='gray')
plt.show()

img_gpu = cp.asnumpy(img_gpu).astype(np.uint8)
result_gpu_cp = cp.asnumpy(result_gpu_cp).astype(np.uint8)

from skimage.metrics import peak_signal_noise_ratio, structural_similarity
import numpy as np


# Calcula PSNR
psnr = peak_signal_noise_ratio(img_gpu, result_gpu_cp)
print('PSNR (NLM CUDA): %f' % psnr)

# Calcula SSIM
ssim = structural_similarity(img_gpu, result_gpu_cp)
print('SSIM (NLM CUDA): %f' % ssim)


Sigma estimado: 12.463414387802912
h_nlm: 12.989709078310172


AttributeError: 'numpy.int32' object has no attribute 'get'