In [2]:
import numpy as np
from numba import cuda, jit
import time
from PIL import Image
import os

In [4]:
# Simulating loading synthetic word dataset images
def load_synthetic_word_dataset(folder_path):
    images = []
    for filename in os.listdir(folder_path):
        if filename.endswith('.png') or filename.endswith('.jpg'):
            img_path = os.path.join(folder_path, filename)
            img = Image.open(img_path).convert('L')  # convert to grayscale
            img = img.resize((128, 32))  # resizing to a fixed size
            img_array = np.array(img).flatten() / 255.0  # normalize pixel values
            images.append(img_array)
    return np.array(images)

The algorithm in the provided code implements a Dense (or fully connected) layer with a forward_seq method to compute the output based on the input.

**Purpose**: To compute the output of a fully connected layer based on the input, weights, and biases.
**Method**: Use the dot product to compute the output before applying the ReLU activation function to discard negative values.


In [5]:
# Dense layer implementation
class Dense:
    def __init__(self, input_size, output_size):
        self.input_size = input_size
        self.output_size = output_size
        self.weight = np.random.rand(input_size, output_size) - 0.5
        self.bias = np.random.rand(output_size) - 0.5

    def forward_seq(self, input):
        output = np.zeros((input.shape[0], self.output_size))
        for i in range(input.shape[0]):
            for j in range(self.output_size):
                output[i, j] = np.dot(input[i], self.weight[:, j]) + self.bias[j]
                output[i, j] = max(0, output[i, j])  # ReLU activation
        return output

The provided code implements a function forward_jit using Numba's @jit decorator to optimize the computation.

**@jit(nopython=True)**:
  * This decorator from Numba is used to compile the function to machine code for better performance.
  * The nopython=True option ensures that the function runs in "nopython" mode, which means that the code will be compiled to run without relying on the Python interpreter, resulting in significant speedup.
  * Due to JIT compilation, forward_jit is expected to perform significantly faster, especially for large input sizes. Numba optimizes the loops and mathematical operations at a lower level.


In [6]:
@jit(nopython=True)
def forward_jit(input, weight, bias):
    output = np.zeros((input.shape[0], weight.shape[1]))
    for i in range(input.shape[0]):
        for j in range(weight.shape[1]):
            output[i, j] = np.dot(input[i], weight[:, j]) + bias[j]
            output[i, j] = max(0, output[i, j])  # ReLU activation
    return output

* The function uses a 2D grid and block configuration to determine the threads.
This means the work is divided into smaller blocks, each containing multiple threads.

* Each thread computes the dot product of one row of the input matrix input with one column of the weight matrix weight. The result of this dot product is then added to the corresponding bias bias[j].


In [7]:
@cuda.jit
def forward_cuda(input, weight, bias, output):
    i, j = cuda.grid(2)
    if i < input.shape[0] and j < weight.shape[1]:
        val = 0
        for k in range(weight.shape[0]):
            val += input[i, k] * weight[k, j]
        output[i, j] = max(0, val + bias[j])  # ReLU activation

The function forward_cuda_3d optimizes the computation of the forward pass of a dense layer using CUDA with 3D grid and block configuration.

In [8]:
@cuda.jit
def forward_cuda_3d(input, weight, bias, output):
    x, y, z = cuda.grid(3)
    if x < input.shape[0] and y < weight.shape[1] and z < weight.shape[0]:
        cuda.atomic.add(output, (x, y), input[x, z] * weight[z, y])
        if z == weight.shape[0] - 1:
            output[x, y] += bias[y]
            output[x, y] = max(0, output[x, y])  # ReLU activation

In [9]:
# Path to synthetic word dataset folder
folder_path = 'Synthetic_Word_Dataset'

# Load synthetic word dataset
input_data = load_synthetic_word_dataset(folder_path)

In [10]:

# Creating Dense layer
input_size = input_data.shape[1]
output_size = 63  # based on the final dense layer output units
dense_layer = Dense(input_size, output_size)
# Generating synthetic data
batch_size = 256
input_data = np.random.rand(batch_size, input_size) - 0.5


In [11]:
# Sequential Execution
seq_start = time.time()
output_seq = dense_layer.forward_seq(input_data)
seq_end = time.time()


In [12]:
# JIT Execution
jit_start = time.time()
output_jit = forward_jit(input_data, dense_layer.weight, dense_layer.bias)
jit_end = time.time()

  output[i, j] = np.dot(input[i], weight[:, j]) + bias[j]


In [15]:
# CUDA Execution
output_cuda = np.zeros((input_data.shape[0], output_size))
threadsperblock = (16, 16)
blockspergrid_x = int(np.ceil(input_data.shape[0] / threadsperblock[0]))
blockspergrid_y = int(np.ceil(output_size / threadsperblock[1]))
blockspergrid = (blockspergrid_x, blockspergrid_y)

cuda_start = time.time()
forward_cuda[blockspergrid, threadsperblock](input_data, dense_layer.weight, dense_layer.bias, output_cuda)
cuda_end = time.time()

CudaDriverError: Driver missing function: cuDeviceGetUuid

In [None]:
# CUDA 3D Execution

output_cuda_3d = np.zeros((batch_size, output_size))
threadsperblock = (8, 8, 8)
blockspergrid_x = int(np.ceil(batch_size / threadsperblock[0]))
blockspergrid_y = int(np.ceil(input_size / threadsperblock[1]))
blockspergrid_z = int(np.ceil(output_size / threadsperblock[2]))
blockspergrid = (blockspergrid_x, blockspergrid_y, blockspergrid_z)

cuda_start_3d = time.time()
forward_cuda_3d[blockspergrid, threadsperblock](input_data, dense_layer.weight, dense_layer.bias, output_cuda)
cuda_end_3d = time.time()

In [None]:
# Timing and Error Analysis
print("Dense Layer Execution Times")
print(f"Time Sequential: {seq_end - seq_start}")
print(f"Time JIT: {jit_end - jit_start}")
print(f"Time CUDA: {cuda_end - cuda_start}")
print(f"Time CUDA 3D: {cuda_end_3d - cuda_start_3d}")

print(f"Error between Sequential and JIT: {np.sum(np.abs(output_seq - output_jit))}")
print(f"Error between Sequential and CUDA: {np.sum(np.abs(output_seq - output_cuda))}")
print(f"Error between Sequential and CUDA 3D: {np.sum(np.abs(output_seq - output_cuda_3d))}")