## Reading + Loading Benchmark

### Reading w/ skimage

In [None]:
# TODO: These tests are not usable by anyone else and need to be updated

from skimage.io import imread
import glob
import time

# Benchmark inputs; per the original note these are 10 4k tif files.
image_paths = glob.glob('/nobackup/kp276129/test/1*/slice_010*')

# Time only the reading of every matched file with skimage's imread.
t1 = time.perf_counter()
imgs = list(map(imread, image_paths))
t2 = time.perf_counter()

print(f"Time taken to read {len(image_paths)} images: {t2 - t1} seconds")

### Reading w/ nvimagecodec

In [None]:
# WAITING FOR NVIDIA TO FIX THE ISSUE https://github.com/NVIDIA/nvImageCodec/issues/5
# from nvidia import nvimgcodec
# import time
# import glob

# params = nvimgcodec.DecodeParams(color_spec=nvimgcodec.ColorSpec.UNCHANGED, allow_any_depth=True)
# dec = nvimgcodec.Decoder()

# t1 = time.perf_counter()
# imgs=dec.read(glob.glob('/nobackup/kp276129/test/1*/slice_010*'), params)
# t2 = time.perf_counter()
# print(f"Time taken to read images: {t2 - t1} seconds")

### Loading into GPU (CuPy)

In [None]:
import cupy as cp

# Copy every image read earlier to the GPU as a CuPy array and drop
# singleton axes with squeeze().
t3 = time.perf_counter()
imgs_cp = [cp.asarray(img).squeeze() for img in imgs]
# Device work can still be pending when the CuPy calls return; wait for the
# default stream so that t4 - t3 covers the complete transfer.
cp.cuda.Stream.null.synchronize()
t4 = time.perf_counter()

print(f"Time taken to convert {len(imgs)} images to CuPy arrays: {t4 - t3} seconds")

### Processing vsnr2d

In [None]:
import sys
sys.path.append('..')
from src.pyvsnr import vsnr2d

import cupy as cp  # needed for the device synchronisation below

# A single Dirac filter with noise level 0.35.
filters = [{'name': 'Dirac', 'noise_level': 0.35}]

# Apply vsnr2d to the GPU-resident images one by one.
t5 = time.perf_counter()
for img in imgs_cp:
    vsnr2d(img, filters)
# GPU work may run asynchronously with respect to the host; wait for the
# default stream so that t6 - t5 measures completed work, not just launches.
cp.cuda.Stream.null.synchronize()
t6 = time.perf_counter()

print(f"Time taken to apply vsnr2d to {len(imgs)} images: {t6 - t5} seconds")

### Time Distribution

In [None]:
import matplotlib.pyplot as plt

# Pair each benchmark stage with the duration measured for it in the cells
# above, then split the pairs into the label list and the value list.
activities, time_taken = map(list, zip(
    ('Reading', t2-t1),
    ('Loading', t4-t3),
    ('Processing', t6-t5),
))

# Pie chart of the share of total time spent in each stage.
plt.pie(
    time_taken,
    labels=activities,
    autopct='%1.1f%%',
    startangle=140,
)

# Title and legend.
plt.title('Time Distribution')
plt.legend(activities, loc="best")

# Render the figure.
plt.show()

## Profiling

### Pyvsnr VRAM Usage

In [None]:
import sys
import cupy as cp
sys.path.append('..')
from src.pyvsnr import vsnr2d

# A single Dirac filter with noise level 0.35.
filters=[{'name':'Dirac', 'noise_level':0.35}]
# Number of random images to run through vsnr2d.
nb_img = 1

# Run vsnr2d on freshly generated random 4224x4224 CuPy images.
# NOTE(review): presumably GPU memory is watched with an external tool while
# this cell runs -- nothing here records it; confirm the intended workflow.
for _ in range(nb_img):
    img = cp.random.rand(4224, 4224)
    vsnr2d(img, filters)

### Pyvsnr VRAM Profiling

In [None]:
# Load the memory_profiler IPython extension used by the %mprun magic below.
%load_ext memory_profiler
import sys
sys.path.append('..')
from src.pyvsnr import vsnr2d
import cupy as cp

# Profile vsnr2d (-f vsnr2d) while it processes one random 4224x4224 CuPy
# image with a single Dirac filter.
# NOTE(review): memory_profiler presumably reports host process memory rather
# than GPU memory -- confirm this matches the "VRAM" intent of this section.
%mprun -f vsnr2d vsnr2d(cp.random.rand(4224, 4224), [{'name':'Dirac', 'noise_level':0.35}])

### VRAM Usage for Batch Processing FFT

In [None]:
import cupy as cp
import matplotlib.pyplot as plt 

def image_generator(nb_img, batch_size):
    """Yield batches of random float32 images, ``nb_img`` images in total.

    Args:
        nb_img: Total number of images to generate across all batches.
        batch_size: Maximum number of images per yielded batch.

    Yields:
        CuPy arrays created with ``cp.random.rand(n, 4224, 4224)`` where
        ``n`` equals ``batch_size``, except for the last batch, which is
        shortened when ``nb_img`` is not a multiple of ``batch_size`` so
        that no extra images are generated.
    """
    for start in range(0, nb_img, batch_size):
        # Cap the final batch at the number of images still owed.
        current_size = min(batch_size, nb_img - start)
        yield cp.random.rand(current_size, 4224, 4224, dtype=cp.float32)

# Images per generated batch and total number of images to generate.
batch_size = 2
nb_img = 100

# Run a plain 2-D FFT on every generated batch. Only the latest result stays
# bound to imgs_corr; earlier results are dropped on reassignment.
for batch in image_generator(nb_img, batch_size):
    imgs_corr = cp.fft.fft2(batch)

# VRAM usage figure for each batch size (the plot below labels them in MB).
# NOTE(review): presumably noted by hand while running the FFT loop above.
batch_sizes = [1, 2, 10, 20, 50]
vram_usage = [1215, 1963, 7947, 15437, 37899]

# Marginal VRAM cost per extra image between consecutive measurements.
measurements = list(zip(batch_sizes, vram_usage))
for (prev_size, prev_usage), (size, usage) in zip(measurements, measurements[1:]):
    vram_per_image = (usage - prev_usage) / (size - prev_size)
    print(f"VRAM usage per image for batch size {size} is: {vram_per_image}")

# Line plot of the recorded VRAM usage against batch size.
plt.xlabel('Batch Size')
plt.ylabel('VRAM Usage (MB)')
plt.plot(batch_sizes, vram_usage)

### Pyvsnr Profiling

In [None]:
import sys
import pstats
import cProfile
import io

sys.path.append('../')
import src.pyvsnr as pyvsnr
import cupy as cp

# Random 4200x4200 CuPy image and a single Dirac filter to run through vsnr2d.
img = cp.random.rand(4200, 4200)
filters = [{'name': 'Dirac', 'noise_level': 0.35}]

# Profile exactly one vsnr2d call.
pr = cProfile.Profile()
pr.enable()
pyvsnr.vsnr2d(img, filters)
pr.disable()

# Write the statistics, sorted with the 'time' key, into an in-memory text
# buffer and then print that buffer.
s = io.StringIO()
ps = pstats.Stats(pr, stream=s)
ps.sort_stats('time')
ps.print_stats()

print(s.getvalue())

## Others

### Batch vs Individual Processing

#### Pyvsnr Average Time

In [None]:
import sys
sys.path.append('..')
from src.pyvsnr import vsnr2d
import cupy as cp
import time

# Single Dirac filter, one random 2048x2048 float32 image, and the number of
# repetitions to average over.
filters = [{'name': 'Dirac', 'noise_level': 0.35}]
img = cp.random.rand(2048, 2048).astype(cp.float32)
nit = 200

# Time `nit` consecutive vsnr2d calls on the same image.
t1 = time.perf_counter()
for _ in range(nit):
    vsnr2d(img, filters)
# GPU work may run asynchronously with respect to the host; wait for the
# default stream so the elapsed time covers completed work.
cp.cuda.Stream.null.synchronize()
t2 = time.perf_counter()

# Report the mean time per call.
print(f"Average time to apply vsnr2d: {(t2-t1)/nit} seconds")

#### Pyvsnr Batch Average Time

In [None]:
import sys
sys.path.append('..')
from pyvsnr.vsnr2d import vsnr2d
import cupy as cp
import time

def batch_generator(images, batch_size):
    """Yield consecutive slices of ``images`` of at most ``batch_size`` items.

    The last slice is shorter when ``len(images)`` is not a multiple of
    ``batch_size``.
    """
    offsets = range(0, len(images), batch_size)
    yield from (images[lo:lo + batch_size] for lo in offsets)


nb_img = 200
batch_size = 10
filters = [{'name': 'Dirac', 'noise_level': 0.35}]
# Generate float32 values directly instead of creating a float64 array and
# converting it, which avoids a temporary twice the size of the final data.
imgs = cp.random.rand(nb_img, 2048, 2048, dtype=cp.float32)

# Time vsnr2d (algo='cupy') over all images, `batch_size` images per call.
t1 = time.perf_counter()
for batch in batch_generator(imgs, batch_size):
    vsnr2d(batch, filters, algo='cupy')
# GPU work may run asynchronously with respect to the host; wait for the
# default stream so the elapsed time covers completed work.
cp.cuda.Stream.null.synchronize()
t2 = time.perf_counter()

print(f"Average time to apply vsnr2d_batch: {(t2-t1)/nb_img} seconds")


### Testing different batch sizes for Pyvsnr

In [None]:
import sys
sys.path.append('..')

from pyvsnr.vsnr2d import vsnr2d
import matplotlib.pyplot as plt   
import cupy as cp
import time

def batch_generator(images, batch_size):
    """Yield groups of at most ``batch_size`` images, each passed through ``cp.stack``.

    ``images`` is sliced in consecutive chunks; the last chunk is shorter
    when ``len(images)`` is not a multiple of ``batch_size``.
    """
    for start in range(0, len(images), batch_size):
        chunk = images[start:start + batch_size]
        yield cp.stack(chunk)

filters = [{'name': 'Dirac', 'noise_level': 0.35}]
img = cp.random.rand(2048, 2048).astype(cp.float32)

# Number of images processed for every batch size; also the divisor used for
# the per-image average, so both stay in sync.
nb_img = 200
# The same image repeated nb_img times, built once instead of once per sweep step.
images = [img] * nb_img

batch_sizes = [5, 10, 15, 20, 25]
times = []

for batch_size in batch_sizes:
    t1 = time.perf_counter()
    for batch in batch_generator(images, batch_size):
        vsnr2d(batch, filters)
    # GPU work may run asynchronously with respect to the host; wait for the
    # default stream so the elapsed time covers completed work.
    cp.cuda.Stream.null.synchronize()
    t2 = time.perf_counter()
    times.append((t2 - t1) / nb_img)

    # Clear the FFT plan cache and release pooled device and pinned memory
    # before moving on to the next batch size.
    cp.fft.config.get_plan_cache().clear()
    cp.get_default_memory_pool().free_all_blocks()
    cp.get_default_pinned_memory_pool().free_all_blocks()



# Line plot of the average per-image time against batch size.
plt.xlabel('Batch Size')
plt.ylabel('Time (s)')
plt.plot(batch_sizes, times)

### Multiprocessing on pyvsnr

In [None]:
# Actually slower than processing in batch
import sys
from multiprocessing import Pool
sys.path.append('..')
from src.pyvsnr import vsnr2d
import numpy as np
import time

# Single Dirac filter, one random 2048x2048 NumPy image, and the number of
# vsnr2d calls to distribute over the pool.
filters=[{'name':'Dirac', 'noise_level':0.35}]
img = np.random.rand(2048, 2048) # CuPy usage leads to CUDA initialization error
nit=200

def task(i):
    """Run vsnr2d with algo="cuda" on the module-level image.

    The argument ``i`` is the index supplied by ``pool.map``; it is not used.
    The vsnr2d result is discarded, so the function returns ``None``.
    """
    vsnr2d(img, filters,algo="cuda")

# Time `nit` vsnr2d calls distributed over a pool of 10 worker processes.
t1 = time.perf_counter()
with Pool(10) as pool:
    pool.map(task, range(nit))
t2 = time.perf_counter()

# Report the elapsed wall-clock time divided by the number of calls.
print(f"Average time to apply vsnr2d: {(t2-t1)/nit} seconds")


### imread vs imread_collection : Same

In [None]:
from skimage.io import imread, imread_collection
import glob
import time

image_paths = glob.glob('/nobackup/kp276129/test/1*/*')

# NOTE(review): both methods read the same files back to back, so the second
# read may benefit from OS file caching -- confirm before comparing timings.

# Method 1: imread_collection, materialised into a list inside the timed region.
t1 = time.perf_counter()
collection = imread_collection(image_paths)
imgs1 = list(collection)
t2 = time.perf_counter()
print(f"Time taken to read images with imread_collection: {t2 - t1} seconds")

# Method 2: one imread call per path.
t3 = time.perf_counter()
imgs2 = list(map(imread, image_paths))
t4 = time.perf_counter()
print(f"Time taken to read images with imread: {t4 - t3} seconds")


### Use streams to overlap data transfer and computation

In [None]:
import cupy as cp
from pyvsnr.vsnr2d import vsnr2d
from tifffile import imread
import glob
import time
from concurrent.futures import ThreadPoolExecutor, as_completed


# Paths of the input images (hard-coded location on the author's machine).
image_paths = glob.glob('/nobackup/kp276129/test/1*/*')

# Number of images to process and number of images per batch. Batches are
# taken by slicing image_paths, so if fewer than num_img paths are found the
# trailing batches are short or empty.
num_img = 100
batch_size = 10

def process_image(image):
    """Return the result of vsnr2d on ``image`` with one Dirac filter (noise level 0.35)."""
    dirac_filter = {'name': 'Dirac', 'noise_level': 0.35}
    return vsnr2d(image, [dirac_filter])

def load_batch(paths):
    """Read every file in ``paths`` with imread and return them as a list of CuPy arrays."""
    loaded = []
    for path in paths:
        host_image = imread(path)
        loaded.append(cp.array(host_image))
    return loaded

def process_batch(batch):
    """Apply ``process_image`` to each image of ``batch``, returning the results in order."""
    return list(map(process_image, batch))

# Load batches in worker threads while the main thread processes whichever
# batch finishes loading first. The context manager shuts the pool down when
# the block exits, so no worker threads are left behind.
with ThreadPoolExecutor(max_workers=4) as executor:
    t1 = time.perf_counter()

    # Submit each batch exactly once; `futures` maps a load job to the offset
    # of its batch in image_paths. as_completed() only iterates the futures
    # it is given at call time, so resubmitting batches inside the loop would
    # only trigger duplicate loads whose results are never consumed.
    batch_starts = range(0, num_img, batch_size)
    futures = {
        executor.submit(load_batch, image_paths[i:i + batch_size]): i
        for i in batch_starts
    }

    # Store each result at its batch position rather than in completion
    # order, so results[k] always corresponds to the k-th batch.
    results = [None] * len(batch_starts)

    for future in as_completed(futures):
        i = futures[future]
        batch = future.result()
        results[i // batch_size] = process_batch(batch)

    # GPU work may run asynchronously with respect to the host; wait for the
    # device so the elapsed time covers completed work.
    cp.cuda.Device().synchronize()
    t2 = time.perf_counter()

print(f"Time taken with threads: {t2 - t1} seconds")

# Sequential baseline: load and process each batch on the main thread.
# Only loading and processing are timed; the comparison against `results`
# is excluded so the baseline is not penalised for verification work.
sequential_time = 0.0
t3 = time.perf_counter()  # wall-clock bracket, kept for any cell that reads t3/t4

for i in range(0, num_img, batch_size):
    t_start = time.perf_counter()
    batch = load_batch(image_paths[i:i+batch_size])
    result = process_batch(batch)
    # GPU work may run asynchronously with respect to the host; wait for the
    # device so the measured time covers completed work.
    cp.cuda.Device().synchronize()
    t_end = time.perf_counter()
    sequential_time += t_end - t_start

    # Untimed sanity check against the earlier run.
    # NOTE(review): assumes results[k] holds the output of the k-th batch --
    # verify how `results` was filled.
    # Images are compared one by one, so they need not share a single shape.
    expected = results[i // batch_size]
    same_length = len(expected) == len(result)
    if not same_length or not all(
        bool(cp.allclose(ref, out)) for ref, out in zip(expected, result)
    ):
        raise AssertionError(
            f"Sequential result differs from stored result for batch {i // batch_size}"
        )

t4 = time.perf_counter()  # includes verification time, unlike sequential_time
print(f"Time taken without threads: {sequential_time} seconds")
