### Check CUDA installation

In [1]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Fri_Jan__6_16:45:21_PST_2023
Cuda compilation tools, release 12.0, V12.0.140
Build cuda_12.0.r12.0/compiler.32267302_0


In [2]:
!nvidia-smi

Sun Apr 28 20:42:57 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce GTX 1060        Off |   00000000:01:00.0  On |                  N/A |
| N/A   69C    P2             23W /   78W |     147MiB /   6144MiB |      2%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

### Load Data sets

In [3]:
#!pip install git+https://github.com/nottombrown/imagenet_stubs

In [4]:
import os
import _pickle as cPickle
import numpy as np
import imagenet_stubs
from imagenet_stubs.imagenet_2012_labels import label_to_name as IMAGENET_CLASS_NAMES

PATH = "../data/cifar-10-batches-py/"
IMAGENET_PATH = "../data/Imagenet32_train/"

#### Load Cifar-10

In [5]:
def load_batch(batchFile):
    f = open(PATH+batchFile, 'rb')
    
    dict_ = cPickle.load(f, encoding='latin1')
    images = dict_['data']
    labels = dict_['labels']
    imagearray = np.array(images)   #   (10000, 3072)
    labelarray = np.array(labels)   #   (10000,)
    
    return imagearray, labelarray

def load_classes():
    file = 'batches.meta'
    
    f = open(PATH+file, 'rb')
    dict = cPickle.load(f)
    return dict['label_names']

In [6]:
CIFAR_IMAGES, CIFAR_LABELS = load_batch("data_batch_1")

In [7]:
CIFAR_CLASS_NAMES = load_classes()

In [8]:
CIFAR_IMAGES.shape

(10000, 3072)

#### Load ImageNet

In [9]:
def unpickle(file):
    with open(file, 'rb') as fo:
        dict = cPickle.load(fo)
    return dict

def load_databatch(data_folder, idx, img_size=32):
    data_file = os.path.join(data_folder, 'train_data_batch_')

    dict_ = unpickle(data_file + str(idx))

    images = dict_['data']
    labels = dict_['labels']

    return images, labels

In [10]:
IMAGENET_IMAGES, IMAGENET_LABELS = load_databatch(IMAGENET_PATH, 1)

In [11]:
IMAGENET_CLASS_NAMES(0)

'tench, Tinca tinca'

In [12]:
IMAGENET_IMAGES.shape

(128116, 3072)

### Prepare Kernels - Python POC

In [13]:
def scale_image_down_and_gray_1d(image, new_rows, new_cols):

    pixels = 1024
    newPixels = new_rows * new_cols;
    factor = pixels / newPixels;

    new_image = np.empty(newPixels, dtype=numpy.uint8)

    for idx in range(newPixels):
        pixelToTake = int(idx*factor)
        
        pixel = 0.2989 * image[pixelToTake] \
              + 0.5870 * image[1024+pixelToTake] \
              + 0.1140 * image[2048+pixelToTake]
        
        new_image[idx] = pixel
        
    return new_image

In [14]:
def scale_image_down_and_gray(np_image, new_rows, new_columns):
    
    columns = np_image.shape[0]
    rows = np_image.shape[1]

    factor_columns = columns / new_columns
    factor_rows = rows / new_rows

    new_image = np.empty([new_columns, new_rows, 1], dtype=numpy.uint8)

    print(new_image.shape)
    
    for row in range(new_rows):
        for col in range(new_columns):
            new_row = int(row * factor_rows)
            new_col = int(col * factor_columns)

            pixel = 0.2989 * np_image[new_col][new_row][0] \
                  + 0.5870 * np_image[new_col][new_row][1] \
                  + 0.1140 * np_image[new_col][new_row][2]
            
            new_image[col][row][0] = pixel
    return new_image

In [15]:
def bilinear_interpolation(x, y, points):
    '''Interpolate (x,y) from values associated with four points.

    The four points are a list of four triplets:  (x, y, value).
    The four points can be in any order.  They should form a rectangle.

        >>> bilinear_interpolation(12, 5.5,
        ...                        [(10, 4, 100),
        ...                         (20, 4, 200),
        ...                         (10, 6, 150),
        ...                         (20, 6, 300)])
        165.0

    '''
    # See formula at:  http://en.wikipedia.org/wiki/Bilinear_interpolation

    points = sorted(points)               # order points by x, then by y
    (x1, y1, q11), (_x1, y2, q12), (x2, _y1, q21), (_x2, _y2, q22) = points

    if x1 != _x1 or x2 != _x2 or y1 != _y1 or y2 != _y2:
        raise ValueError('points do not form a rectangle')
    if not x1 <= x <= x2 or not y1 <= y <= y2:
        raise ValueError('(x, y) not within the rectangle')

    return (q11 * (x2 - x) * (y2 - y) +
            q21 * (x - x1) * (y2 - y) +
            q12 * (x2 - x) * (y - y1) +
            q22 * (x - x1) * (y - y1)
           ) / ((x2 - x1) * (y2 - y1) + 0.0)

In [16]:
def comp_difference(image_2d):
    sequence = ""

    rows = image_2d.shape[0]
    cols = image_2d.shape[1]

    print( rows, cols )
    for row in range(rows):
        for col in range(1, cols):
            if image_2d[row][col-1] < image_2d[row][col]:
                sequence += '1'
            else:
                sequence += '0'

    print(sequence)

### CUDA kernes

#### Import PyCuda

In [17]:
import pycuda.autoinit
import pycuda.driver as drv
import numpy

from pycuda.compiler import SourceModule

#### Convert 1000 images at once

In [62]:
mod = SourceModule("""

__global__ void ResizeAndGray1000Images(float* image, float* hash, const int pixels, const int new_rows, 
                                        const int new_cols, const int amountOfImages, const int hashSize)
{

    int idx           = threadIdx.x + blockIdx.x * blockDim.x;
    float newPixels   = new_rows * new_cols;
    int* pixelsValues = new int [new_rows * new_cols];
    
    if (idx > amountOfImages) return;

    float factor      = pixels / newPixels;
    int imageStartIdx = pixels * 3 * idx;
    
    for( int i=0; i<newPixels; i++ )
    {
        const auto pixelToTake = (int)(imageStartIdx+i*factor);
        pixelsValues[i] = 0.2989 * image[pixelToTake] + 0.5870 * image[1024+pixelToTake] + 0.1140 * image[2048+pixelToTake];
    }
    for( int i=1; i<newPixels; i++ )
    {
        pixelsValues[i-1] < pixelsValues[i] ? hash[hashSize*idx+i-1] = 1 : hash[hashSize*idx+i-1] = 0;
    }

    delete[] pixelsValues;
}

""")

ResizeAndGray1000Images = mod.get_function("ResizeAndGray1000Images")

#### Convert 1 image at once

In [63]:
mod = SourceModule("""

__global__ void ResizeAndGrayOneImage(float* image, float* hash, int pixels, int new_rows, int new_cols)
{
    extern __shared__ int pixelsValues[72];

    int idx         = threadIdx.x + blockIdx.x * blockDim.x;
    float newPixels = new_rows * new_cols;
    
    if (idx >= newPixels) return;

    float factor           = pixels / newPixels;
    const auto pixelToTake = (int)(idx*factor);


    auto pixel = 0.2989 * image[pixelToTake] + 0.5870 * image[1024+pixelToTake] + 0.1140 * image[2048+pixelToTake];

    pixelsValues[idx] = pixel;

    __syncthreads();
    
    if(idx == 0) return;

    pixelsValues[idx-1] < pixelsValues[idx] ? hash[idx-1] = 1 : hash[idx-1] = 0;    
}

""")

ResizeAndGrayOneImage = mod.get_function("ResizeAndGrayOneImage")

#### Convert CUDA output to human readable HEX Hash

In [64]:
def floatArrayToHexString(imageHashArray):
    hash_ = ""

    tmp_imageHashArray = imageHashArray.astype(numpy.uint8)

    
    for i in range(0, imageHashArray.shape[0], 4):
        tmp = str(tmp_imageHashArray[i]) + str(tmp_imageHashArray[i+1]) + str(tmp_imageHashArray[i+2]) + str(tmp_imageHashArray[i+3])
        
        hash_ += hex(int(tmp, 2))[2:]

    return hash_

#### Tests Kernels

In [65]:
HASH_SIZE = 64;

##### One Input Config

In [66]:
HASH_SIZE = 64;
THREADS_X = 64;
THREADS_Y = 1;
THREADS_Z = 1;

PIXELS = 1024
NEW_ROWS = 8
NEW_COLS = 9

INPUT_IMAGE = CIFAR_IMAGES[0].astype(numpy.float32)
OUT_HASH    = numpy.zeros([HASH_SIZE]).astype(numpy.float32)

In [67]:
ResizeAndGrayOneImage(drv.In(INPUT_IMAGE), drv.Out(OUT_HASH), np.int32(PIXELS) , np.int32(NEW_ROWS) , np.int32(NEW_COLS), 
            block=(THREADS_X, THREADS_Y, THREADS_Z))

In [68]:
floatArrayToHexString(OUT_HASH) # 63E3683A50325929

'dd599776ab5126b2'

##### 1000 Input Config

In [69]:
HASH_SIZE = 64;
THREADS_X_1000 = 1000;
THREADS_Y = 1;
THREADS_Z = 1;

PIXELS = 1024
NEW_ROWS = 8
NEW_COLS = 9

INPUT_IMAGE_1000 = CIFAR_IMAGES[:THREADS_X_1000].astype(numpy.float32).reshape(THREADS_X_1000 * PIXELS * 3)
OUT_HASH_1000    = numpy.zeros([THREADS_X_1000 * HASH_SIZE]).astype(numpy.float32)

In [70]:
ResizeAndGray1000Images(drv.In(INPUT_IMAGE_1000), drv.Out(OUT_HASH_1000), np.int32(PIXELS), 
                        np.int32(NEW_ROWS), np.int32(NEW_COLS), np.int32(THREADS_X_1000), np.int32(HASH_SIZE),
                        block=(THREADS_X_1000,THREADS_Y,THREADS_Z))

In [71]:
print(floatArrayToHexString(OUT_HASH_1000[0:64]))
print(floatArrayToHexString(OUT_HASH_1000[640:704]))
print(floatArrayToHexString(OUT_HASH_1000[6336:6400]))

dd599776ab5126b2
af52ad462a297695
9aadaa5ca956ac5a


#### Kernel Performance Tests

In [72]:
import time

In [75]:
start = time.time()

ResizeAndGray1000Images(drv.In(INPUT_IMAGE_1000), drv.Out(OUT_HASH_1000), np.int32(PIXELS), 
                        np.int32(NEW_ROWS), np.int32(NEW_COLS), np.int32(THREADS_X_1000), np.int32(HASH_SIZE),
                        block=(THREADS_X_1000,THREADS_Y,THREADS_Z))

end = time.time()
print( "time: ", end - start )

time:  0.0224609375


In [74]:
start = time.time()

for i in range(1000):
    ResizeAndGrayOneImage(drv.In(INPUT_IMAGE), drv.Out(OUT_HASH), np.int32(PIXELS) , np.int32(NEW_ROWS) , np.int32(NEW_COLS), 
            block=(THREADS_X, THREADS_Y, THREADS_Z))
end = time.time()
print( "time: ", end - start )

time:  0.1704249382019043


### CuLSH

In [43]:
from typing import List, Dict, Tuple

class LSH:
    def __init__(self, size, hash_size):
        self.buckets = []
        self.bandsNumber = size
        self.rows = int(hash_size**2 / self.bandsNumber)
        self.hash_buckets_list: List[Dict[int, List[str]]] = [dict() for _ in range(self.bandsNumber)]
        self.signatures = dict()
        self.hash_size = hash_size

        for i in range(size):
            self.buckets.append({})
            
    def fill_buckets(self, signature, idx):
        for i in range(self.bandsNumber):
            signature_band = signature[i*self.rows:(i+1)*self.rows]
            signature_band_bytes = signature_band.tobytes()
            if signature_band_bytes not in self.hash_buckets_list[i]:
                self.hash_buckets_list[i][signature_band_bytes] = list()
            self.hash_buckets_list[i][signature_band_bytes].append(idx)
    
    def compute_hash(self, images):
        idx = 0
        signature_1000 = numpy.zeros([1000 * 64]).astype(numpy.float32)
        for i in range(0, 10):
            INPUT_IMAGES = images[1000*i:1000*i+1000].astype(numpy.float32).reshape(1000 * 1024 * 3)
            ResizeAndGray1000Images(drv.In(INPUT_IMAGES), drv.Out(signature_1000), np.int32(1024), 
                        np.int32(8), np.int32(9), np.int32(1000), np.int32(64),
                        block=(1000,1,1))
            
            signature_ = signature_1000.astype(numpy.bool_)

            for j in range(0, 1000):
                one_signature = signature_[64*j:64*j+64]
                self.signatures[idx] = np.packbits(one_signature)
                self.fill_buckets(one_signature, idx)
                idx += 1

    def compute_candidate(self):
        candidate_pairs = set()
        
        for hash_buckets in self.hash_buckets_list:
            for hash_bucket in hash_buckets.values():
                if len(hash_bucket) > 1:
                    hash_bucket = sorted(hash_bucket) # Why needed ?
                    for i in range(len(hash_bucket)):
                        for j in range(i+1, len(hash_bucket)):
                            candidate_pairs.add(
                                tuple([hash_bucket[i],hash_bucket[j]])
                            )
        return candidate_pairs

    def check_candidate(self, candidate_pairs, threshold=0.9):
        
        near_duplicates = list()
        for cpa, cpb in candidate_pairs:
            hd = sum(np.bitwise_xor(
                    np.unpackbits(self.signatures[cpa]), 
                    np.unpackbits(self.signatures[cpb])
            ))
            similarity = (self.hash_size**2 - hd) / self.hash_size**2
            if similarity > threshold:
                near_duplicates.append((cpa, cpb, similarity))
                
        # Sort near-duplicates by descending similarity and return
        near_duplicates.sort(key=lambda x:x[2], reverse=True)
        return near_duplicates

In [44]:
start = time.time()

lsh = LSH(8, 8)
lsh.compute_hash(CIFAR_IMAGES)
#pairs = lsh.compute_candidate()
#duplicates = lsh.check_candidate(pairs,0.9)

end = time.time()
print( "time: ", end - start )

time:  0.13852357864379883


#### Experiments

In [56]:
#https://stackoverflow.com/questions/39445786/how-to-define-cuda-device-constant-like-a-c-const-constexpr
# Whats about Torch resize kernel ???? - comparision
# whats about numpy on gpu ????? - comparision
# Zgrać zdjecia i porownac jak radzi sobie w sytuacji zycia codziennego !!
# Zdjecia z poza setu jak sobie radzi - dHash i NeuralNetwork
#https://stackoverflow.com/questions/30261859/using-dictionary-in-pycuda

In [77]:
from typing import List, Dict, Tuple

class LSH:
    def __init__(self, size, hash_size):
        self.buckets = []
        self.bandsNumber = size
        self.rows = int(hash_size**2 / self.bandsNumber)
        self.hash_buckets_list: List[Dict[int, List[str]]] = [dict() for _ in range(self.bandsNumber)]
        self.signatures = dict()
        self.hash_size = hash_size

        for i in range(size):
            self.buckets.append({})
            
    def fill_buckets(self, signature, idx):
        for i in range(self.bandsNumber):
            signature_band = signature[i*self.rows:(i+1)*self.rows]
            signature_band_bytes = signature_band.tobytes()
            if signature_band_bytes not in self.hash_buckets_list[i]:
                self.hash_buckets_list[i][signature_band_bytes] = list()
            self.hash_buckets_list[i][signature_band_bytes].append(idx)
    
    def compute_hash(self, images):
        idx = 0
        signature_1000 = numpy.zeros([1000 * 64]).astype(numpy.float32)
        for i in range(0, 10):
            INPUT_IMAGES = images[1000*i:1000*i+1000].astype(numpy.float32).reshape(1000 * 1024 * 3)
            ResizeAndGray1000Images(drv.In(INPUT_IMAGES), drv.Out(signature_1000), np.int32(1024), 
                        np.int32(8), np.int32(9), np.int32(1000), np.int32(64),
                        block=(1000,1,1))
            
            signature_ = signature_1000.astype(numpy.bool_)

            for j in range(0, 1000):
                one_signature = signature_[64*j:64*j+64]
                self.signatures[idx] = np.packbits(one_signature)
                self.fill_buckets(one_signature, idx)
                idx += 1

    def compute_candidate(self):
        candidate_pairs = set()

        #print(self.hash_buckets_list)
        for hash_buckets in self.hash_buckets_list:
           #print(hash_buckets)
            for hash_bucket in hash_buckets.values():
                #print(hash_bucket)
                if len(hash_bucket) > 1:
                    hash_bucket = sorted(hash_bucket) # Why needed ?
                    #print(hash_bucket)
                    for i in range(len(hash_bucket)):
                        for j in range(i+1, len(hash_bucket)):
                            candidate_pairs.add(
                                tuple([hash_bucket[i],hash_bucket[j]])
                            )
        return candidate_pairs

    def check_candidate(self, candidate_pairs, threshold=0.9):
        
        near_duplicates = list()
        for cpa, cpb in candidate_pairs:
            hd = sum(np.bitwise_xor(
                    np.unpackbits(self.signatures[cpa]), 
                    np.unpackbits(self.signatures[cpb])
            ))
            similarity = (self.hash_size**2 - hd) / self.hash_size**2
            if similarity > threshold:
                near_duplicates.append((cpa, cpb, similarity))
                
        # Sort near-duplicates by descending similarity and return
        near_duplicates.sort(key=lambda x:x[2], reverse=True)
        return near_duplicates

In [78]:
start = time.time()

lsh = LSH(8, 8)
lsh.compute_hash(CIFAR_IMAGES)
pairs = lsh.compute_candidate()
#duplicates = lsh.check_candidate(pairs,0.9)

end = time.time()
print( "time: ", end - start )

time:  3.5973927974700928
