<a href="https://colab.research.google.com/github/Divyadharshini1924/ASSIGNMENT-02/blob/main/Report.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
# @title Convolution Layer

import numpy as np
from numba import cuda
import time

#------------Convolution kernel--------------#
class Convolution(Layer):
    """Naive 2-D convolution layer (sequential version 1, explicit Python loops).

    Tensors are indexed channel-first: ``input_shape`` is read as
    ``(channels, height, width)`` and batches as
    ``(batch, channels, height, width)``. No padding is applied, and
    ``self.bias`` is allocated but never used by forward/backward.

    NOTE(review): the default ``input_shape=(28, 28, 1)`` is therefore read as
    28 channels of size 28x1; pass a channel-first shape explicitly.
    """

    def __init__(self, n_filters=32, filter_size=3, stride=1, activation=None, input_shape=(28, 28, 1)):
        self.input_shape = input_shape
        self.n_filters = n_filters
        self.filter_size = filter_size
        self.stride = stride
        self.activation = activation
        self.use_device = False
        self.bias = np.zeros((n_filters, 1))
        self.init_weight()

    def get_out_shape(self):
        """Return the per-sample output shape ``(n_filters, out_height, out_width)``."""
        output_width = (self.input_shape[2] -
                        self.filter_size) // self.stride + 1
        output_height = (
            self.input_shape[1] - self.filter_size) // self.stride + 1

        return (self.n_filters, output_height, output_width)

    def init_weight(self):
        """Draw random-normal filters of shape (n_filters, channels, k, k), scaled by 1/k**2."""
        self.weights = np.random.randn(
            self.n_filters, self.input_shape[0], self.filter_size, self.filter_size)/(self.filter_size**2)

    def forward(self, inputs):
        """Convolve ``inputs`` with every filter and return the activations.

        ``inputs`` is cached on ``self.inputs`` for use by :meth:`backward`.
        Raises ``AssertionError`` if ``inputs.shape[1:]`` differs from
        ``self.input_shape``.
        """
        self.inputs = inputs
        n_batchs, n_chanels, in_height, in_width = inputs.shape
        assert self.input_shape == inputs.shape[1:], "Input shape incorrect"
        output_height, output_width = self.get_out_shape()[1:]
        ## ===========USING CPU===========##
        outputs = np.zeros((n_batchs, self.n_filters, output_height, output_width))
        for i_idx in range(n_batchs):
            for row in range(output_height):
                row_start = row * self.stride
                row_end = row_start + self.filter_size
                for col in range(output_width):
                    col_start = col * self.stride
                    col_end = col_start + self.filter_size
                    for f_idx in range(self.n_filters):
                        outputs[i_idx, f_idx, row, col] = np.sum(
                            self.weights[f_idx] * inputs[i_idx, :, row_start:row_end, col_start:col_end])

        if self.activation == "relu":
            outputs = np.maximum(0, outputs)
        return outputs

    def backward(self, output_gradient, learning_rate):
        """Update the filters in place and return the gradient w.r.t. the inputs.

        The returned array has the shape of ``self.inputs`` so that the
        preceding layer can continue back-propagation. (The previous
        implementation returned the filter gradient, whose shape does not
        match what an upstream layer expects.)
        """
        n_batchs, input_channels, input_height, input_width = self.inputs.shape
        _, n_filters, output_height, output_width = output_gradient.shape
        ## ===========USING CPU===========##
        filter_gradient = np.zeros(self.weights.shape)
        input_gradient = np.zeros(self.inputs.shape)
        for i_batch in range(n_batchs):
            for row in range(output_height):
                row_start = row * self.stride
                row_end = row_start + self.filter_size
                for col in range(output_width):
                    col_start = col * self.stride
                    col_end = col_start + self.filter_size
                    for fillterIdx in range(n_filters):
                        out_grad_val = output_gradient[i_batch, fillterIdx, row, col]
                        filter_gradient[fillterIdx] += self.inputs[i_batch, :, row_start:row_end, col_start:col_end] * out_grad_val
                        input_gradient[i_batch, :, row_start:row_end, col_start:col_end] += self.weights[fillterIdx] * out_grad_val
        if self.activation == "relu":
            # Mask is taken from this layer's inputs, as in the other versions in this file.
            input_gradient[self.inputs <= 0] = 0

        # Average the accumulated filter gradient over the batch before the SGD step.
        self.weights -= learning_rate * filter_gradient/n_batchs

        return input_gradient


NameError: name 'Layer' is not defined

In [12]:
# @title Build CNN model
import numpy as np
from numba import cuda
import time
# from CNNModel import CNNModel
# from layers import Convolution, Flatten, MaxPool2D, Dense
class Layer():
    """Common interface for network layers; every hook here is a no-op returning None."""

    def forward(self, inputs):
        """Compute the layer output for ``inputs`` (overridden by subclasses)."""
        return None

    def backward(self, output_gradient, learning_rate):
        """Back-propagate ``output_gradient`` (overridden by subclasses)."""
        return None

    def get_out_shape(self):
        """Report the per-sample output shape (overridden by subclasses)."""
        return None

    def init_weight(self):
        """(Re)initialise trainable parameters (overridden by subclasses)."""
        return None
# from layers_v1 import Layer
# import numpy as np


class CNNModel:
    """Sequential container that chains layers for forward/backward passes and training."""

    def __init__(self, layers: "list[Layer] | None" = None):
        """Wire the layers together.

        The first layer keeps the input shape it was constructed with. Every
        later layer has ``input_shape`` overwritten with the previous layer's
        ``get_out_shape()`` and its weights re-initialised for that shape.
        ``layers`` may be omitted or empty, giving a model with no layers.
        """
        # A None default avoids sharing one list object between all models.
        layers = [] if layers is None else layers
        if layers:
            pre_layer = layers[0]
            pre_layer.init_weight()
            for layer in layers[1:]:
                layer.input_shape = pre_layer.get_out_shape()
                layer.init_weight()
                pre_layer = layer
        self.layers: "list[Layer]" = layers

    def forward(self, X):
        """Feed ``X`` through every layer in order and return the final output."""
        output = X
        for layer in self.layers:
            output = layer.forward(output)
        return output

    def backward(self, out_grad, learning_rate):
        """Propagate ``out_grad`` through the layers in reverse; return the last gradient."""
        for layer in reversed(self.layers):
            out_grad = layer.backward(out_grad, learning_rate)
        return out_grad

    def fit(self, X_train, Y_train, epochs=1, batch_size=32, learning_rate=0.001):
        """Train with mini-batch gradient descent, printing a progress bar.

        Every batch is processed, including the final (possibly shorter) one,
        so the per-epoch averages cover the same samples they are divided by.
        The gradient fed to ``backward`` is ``2 * (predictions - targets)``.
        """
        n_samples = len(X_train)
        num_batch = (n_samples - 1) // batch_size + 1
        for i_epoch in range(epochs):
            print(f"\nEpoch {i_epoch+1}/{epochs}:")
            train_loss = 0
            acc = 0
            progress = '.'*30
            for i in range(num_batch):
                batch_start = i * batch_size
                batch_end = batch_start + batch_size
                batch_X = X_train[batch_start: batch_end]
                batch_Y = Y_train[batch_start: batch_end]
                predictions = self.forward(batch_X)
                out_grad = 2.0 * (predictions - batch_Y)
                self.backward(out_grad, learning_rate)

                # print result
                acc_batch = np.mean(
                    np.argmax(predictions, axis=1) == np.argmax(batch_Y, axis=1))
                acc += acc_batch
                loss = np.sum((predictions - batch_Y) ** 2)
                train_loss += loss
                # Show '>' at the current position while printing, then settle it to '='.
                i_str = int(i/num_batch*30)
                progress = progress[:i_str] + ">" + progress[i_str+1:]
                print(
                    f"\r {i}/{num_batch} [{progress}] accuaray: {acc_batch:.5f}, train loss = {loss/len(batch_Y):.5f}", end='')
                progress = progress[:i_str] + "=" + progress[i_str+1:]

            # max(..., 1) keeps an empty training set from dividing by zero.
            train_loss /= max(n_samples, 1)

            print(
                f"\r {num_batch}/{num_batch} [{progress}] accuaray: {acc/max(num_batch, 1):.5f}, train loss = {train_loss:.5f}", end='')

    def predict(self, X):
        """Return the network output for ``X`` (a plain forward pass)."""
        return self.forward(X)

    def use_device(self, value):
        """Set the ``use_device`` flag on every layer to ``value``."""
        for layer in self.layers:
            layer.use_device = value


class Flatten(Layer):
    """Reshape each sample of a batch into a flat vector; has no trainable weights."""

    def __init__(self, input_shape=(28, 28, 1)):
        self.input_shape = input_shape

    def get_out_shape(self):
        """Return the flattened length: the product of all ``input_shape`` entries."""
        n_features = 1
        for extent in self.input_shape:
            n_features = n_features * extent
        return n_features

    def forward(self, inputs):
        """Cache ``inputs`` and return them reshaped to ``(batch, -1)``."""
        self.inputs = inputs
        assert self.input_shape == inputs.shape[1:], "Input shape incorrect"
        n_samples = inputs.shape[0]
        return inputs.reshape(n_samples, -1)

    def backward(self, output_gradient, learning_rate):
        """Return ``output_gradient`` reshaped back to the cached input's shape."""
        return output_gradient.reshape(self.inputs.shape)

    def init_weight(self):
        """Nothing to initialise."""
        return None

#II. SEQUENTIAL VERSION V2

Analysis:

- Using loops in python is very slow for calculations with great complexity.
- Sequential version 2 replaces the per-sample Python loop with NumPy operations over the whole batch, which makes the calculations faster.
- The steps to design the Conv, pool, and Dense classes are similar to the sequential version v1

In [13]:
from keras.datasets import mnist


In [14]:
'''import pandas as pd
import keras

from keras.datasets import mnist

(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Assuming you want to load the kidneyData.csv
#mnist = "/content/mnist.csv"
#data = pd.read_csv(mnist)

# If you want to split it into train and test sets (assuming train_X, train_y, etc. exist)
from google.colab import files
uploaded = files.upload()  # This will prompt you to upload the file


# Replace X, y with the actual features and labels in your data
X = data.drop(columns=['target'])  # Assuming 'target' is the label column
y = data['target']

train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2, random_state=42)

# Now you have train and test datasets
print(train_X.shape, test_X.shape)
'''
from keras.datasets import mnist
(train_X, train_y), (test_X, test_y) = mnist.load_data()

# One-hot encode the digit labels into 10 columns: row k gets a 1 in column label[k].
y_train = np.zeros((len(train_y), 10))
y_train[np.arange(len(train_y)), train_y] = 1
y_test = np.zeros((len(test_y), 10))
y_test[np.arange(len(test_y)), test_y] = 1

# Insert a singleton channel axis, (N, H, W) -> (N, 1, H, W), then divide the pixels by 255.
x_train = train_X.reshape(train_X.shape[0], 1, train_X.shape[1], train_X.shape[2]) / 255
x_test = test_X.reshape(test_X.shape[0], 1, test_X.shape[1], test_X.shape[2]) / 255

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz
[1m11490434/11490434[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 0us/step


In [15]:
# @title Convolution Layer
class Convolution(Layer):
    """2-D convolution layer, sequential version 2: vectorised over the batch axis.

    Tensors are channel-first: ``input_shape`` is ``(channels, height, width)``
    and batches are ``(batch, channels, height, width)``. No padding is applied
    and ``self.bias`` is allocated but not used by forward/backward.
    """

    def __init__(self, n_filters=32, filter_size=3, stride=1, activation=None, input_shape=(1, 28, 28)):
        self.n_filters = n_filters
        self.filter_size = filter_size
        self.stride = stride
        self.activation = activation
        self.input_shape = input_shape
        self.use_device = False
        self.bias = np.zeros((n_filters, 1))
        self.init_weight()

    def get_out_shape(self):
        """Return the per-sample output shape ``(n_filters, out_height, out_width)``."""
        kernel = self.filter_size
        step = self.stride
        out_h = (self.input_shape[1] - kernel) // step + 1
        out_w = (self.input_shape[2] - kernel) // step + 1
        return (self.n_filters, out_h, out_w)

    def init_weight(self):
        """Reseed NumPy's global RNG with 10 and draw filters scaled by 1/k**2."""
        np.random.seed(10)
        kernel = self.filter_size
        weight_shape = (self.n_filters, self.input_shape[0], kernel, kernel)
        self.weights = np.random.randn(*weight_shape) / (kernel ** 2)

    def forward(self, inputs):
        """Convolve the whole batch at once for each (row, col, filter) position.

        ``inputs`` is cached on ``self.inputs`` for :meth:`backward`.
        """
        self.inputs = inputs
        n_batchs, _, _, _ = inputs.shape
        assert self.input_shape == inputs.shape[1:], "Input shape incorrect"
        out_h, out_w = self.get_out_shape()[1:]
        outputs = np.zeros((n_batchs, self.n_filters, out_h, out_w))
        kernel, step = self.filter_size, self.stride
        for r in range(out_h):
            rows = slice(r * step, r * step + kernel)
            for c in range(out_w):
                cols = slice(c * step, c * step + kernel)
                # The receptive field is the same for every filter at this position.
                patch = inputs[:, :, rows, cols]
                for f in range(self.n_filters):
                    outputs[:, f, r, c] = np.sum(
                        self.weights[f] * patch, axis=(1, 2, 3))

        if self.activation == "relu":
            outputs = np.maximum(0, outputs)

        return outputs

    def backward(self, output_gradient, learning_rate):
        """Apply one SGD step to the filters and return the gradient w.r.t. the inputs."""
        n_batchs, _, _, _ = self.inputs.shape
        _, n_filters, out_h, out_w = output_gradient.shape

        filter_gradient = np.zeros(self.weights.shape)
        input_gradient = np.zeros(self.inputs.shape)
        kernel, step = self.filter_size, self.stride
        for r in range(out_h):
            rows = slice(r * step, r * step + kernel)
            for c in range(out_w):
                cols = slice(c * step, c * step + kernel)
                patch = self.inputs[:, :, rows, cols]
                for f in range(n_filters):
                    # Shape (batch, 1, 1, 1) so it broadcasts against (batch, C, k, k).
                    grad = output_gradient[:, f, r, c, np.newaxis, np.newaxis, np.newaxis]
                    filter_gradient[f] += np.sum(patch * grad, axis=0)
                    input_gradient[:, :, rows, cols] += self.weights[f] * grad

        if self.activation == "relu":
            # NOTE(review): the mask comes from this layer's inputs, not from its own
            # pre-activation output — confirm this is the intended ReLU derivative.
            input_gradient[self.inputs <= 0] = 0

        # Filter gradient is averaged over the batch before the update.
        self.weights -= learning_rate * filter_gradient / n_batchs
        return input_gradient



* Forward Test

In [16]:
# Benchmark input: a batch of 32 random samples with 16 channels of 100x100,
# integer values 0..254 divided by 255.
input_shape=(16,100,100)
inputs = np.random.randint(0,255,(32,*input_shape))/255
conv = Convolution(32,3,1,input_shape=input_shape)
# Time a single forward pass (IPython %time magic).
%time out_host=conv.forward(inputs)

CPU times: user 12.9 s, sys: 0 ns, total: 12.9 s
Wall time: 13 s


* Backward Test

In [17]:
# Time a single backward pass; the forward output is reused as the output gradient, learning rate 1e-4.
%time in_grad_host=conv.backward(out_host,0.0001)

CPU times: user 29.2 s, sys: 0 ns, total: 29.2 s
Wall time: 29.3 s


In [18]:
# @title Maxpooling Layer

class MaxPool2D(Layer):
    """Max-pooling layer over channel-first batches ``(batch, channels, height, width)``."""

    def __init__(self, pool_size=2, stride=2, input_shape=(1,28, 28)):
        self.input_shape = input_shape
        self.pool_size = pool_size
        self.stride = stride
        self.use_device = False
        self.inputs = None
        self.inputs_device = None

    def get_out_shape(self):
        """Return the per-sample output shape ``(channels, out_height, out_width)``."""
        window, step = self.pool_size, self.stride
        out_h = (self.input_shape[1] - window) // step + 1
        out_w = (self.input_shape[2] - window) // step + 1
        return (self.input_shape[0], out_h, out_w)

    def forward(self, inputs):
        """Return the maximum of every pooling window; caches ``inputs`` for backward."""
        batch_size, num_channels, _, _ = inputs.shape
        assert self.input_shape == inputs.shape[1:], "Input shape incorrect"
        self.inputs = inputs
        _, out_h, out_w = self.get_out_shape()

        outputs = np.zeros((batch_size, num_channels, out_h, out_w))
        window, step = self.pool_size, self.stride
        for h in range(out_h):
            rows = slice(h * step, h * step + window)
            for w in range(out_w):
                cols = slice(w * step, w * step + window)
                outputs[:, :, h, w] = inputs[:, :, rows, cols].max(axis=(2, 3))

        return outputs

    def backward(self, output_gradient, learning_rate):
        """Route each output gradient to the position(s) holding the window maximum.

        When several entries of a window tie for the maximum, each of them
        receives the full gradient. ``learning_rate`` is unused.
        """
        _, _, out_h, out_w = output_gradient.shape
        input_gradient = np.zeros(self.inputs.shape)
        window, step = self.pool_size, self.stride
        for h in range(out_h):
            rows = slice(h * step, h * step + window)
            for w in range(out_w):
                cols = slice(w * step, w * step + window)
                region = self.inputs[:, :, rows, cols]
                is_max = region == region.max(axis=(2, 3), keepdims=True)
                input_gradient[:, :, rows, cols] += (
                    is_max * output_gradient[:, :, h, w, np.newaxis, np.newaxis])
        return input_gradient

    def init_weight(self):
        """Nothing to initialise."""
        return None



* Forward Test

In [19]:
# Benchmark input: a batch of 64 random samples with 32 channels of 200x200,
# integer values 0..254 divided by 255.
input_shape=(32,200,200)
inputs = np.random.randint(0,255,(64,*input_shape))/255
maxp = MaxPool2D(2,2,input_shape=input_shape)
# Time a single forward pass (IPython %time magic).
%time out_host=maxp.forward(inputs)

CPU times: user 2.44 s, sys: 91.7 ms, total: 2.53 s
Wall time: 2.56 s


* Backward Test

In [20]:
# Time a single backward pass; the forward output is reused as the output gradient.
%time in_grad_host=maxp.backward(out_host,0.0001)

CPU times: user 6.52 s, sys: 638 ms, total: 7.16 s
Wall time: 7.24 s


In [21]:
# @title Dense Layer
class Dense(Layer):
    """Fully connected layer ``outputs = inputs . weights + biases`` with optional softmax."""

    def __init__(self, num_outputs, activation=None, input_shape=100):
        self.num_outputs = num_outputs
        self.input_shape = input_shape
        self.activation = activation
        self.use_device = False
        self.inputs = None
        self.biases = np.zeros((1, num_outputs))
        self.init_weight()

    def init_weight(self):
        """Draw random-normal weights of shape (input_shape, num_outputs), scaled by 1/num_outputs."""
        n_in, n_out = self.input_shape, self.num_outputs
        self.weights = np.random.randn(n_in, n_out) / n_out

    def get_out_shape(self):
        """Return the number of output units."""
        return self.num_outputs

    def forward(self, inputs):
        """Cache ``inputs`` and return the affine output (softmax-ed when configured)."""
        self.inputs = inputs
        assert self.input_shape == inputs.shape[-1], "Input shape incorrect"
        outputs = np.dot(inputs, self.weights) + self.biases
        if self.activation != "softmax":
            return outputs
        return self.softmax(outputs)

    def softmax(self, x):
        """Row-wise softmax; the row maximum is subtracted before exponentiating."""
        shifted = x - np.max(x, axis=1, keepdims=True)
        exps = np.exp(shifted)
        return exps / exps.sum(axis=1, keepdims=True)

    def backward(self, output_gradient, learning_rate):
        """Update weights and biases in place; return the gradient w.r.t. the inputs."""
        # Must be computed with the weights as they were before this step's update.
        input_grad = np.dot(output_gradient, self.weights.T)
        self.weights -= learning_rate * np.dot(self.inputs.T, output_gradient)
        self.biases -= learning_rate * np.sum(output_gradient, axis=0, keepdims=True)
        return input_grad


class Flatten(Layer):
    """Collapse every sample of a batch to one dimension; carries no parameters."""

    def __init__(self, input_shape=(28, 28, 1)):
        self.input_shape = input_shape

    def get_out_shape(self):
        """Return the number of values per sample (product of ``input_shape``)."""
        flat_size = 1
        for axis_len in self.input_shape:
            flat_size = flat_size * axis_len
        return flat_size

    def forward(self, inputs):
        """Remember ``inputs`` and return a ``(batch, -1)`` reshape of it."""
        self.inputs = inputs
        assert self.input_shape == inputs.shape[1:], "Input shape incorrect"
        batch = inputs.shape[0]
        return inputs.reshape(batch, -1)

    def backward(self, output_gradient, learning_rate):
        """Give the gradient the shape of the remembered input."""
        return output_gradient.reshape(self.inputs.shape)

    def init_weight(self):
        """No weights to set up."""
        return None

In [22]:
# Benchmark input: 256 rows of 10000 features, integer values 1..254 divided by 255.
inputs=np.random.randint(1,255, (256,10000))/255
dense=Dense(1024, input_shape= 10000)
# Time a single forward pass (IPython %time magic).
%time out_host=dense.forward(inputs)

CPU times: user 304 ms, sys: 10.8 ms, total: 315 ms
Wall time: 166 ms


## Train and Test

In [23]:
# Assemble the network. Only the first layer is given an explicit input shape:
# CNNModel overwrites `input_shape` on every later layer with the previous
# layer's get_out_shape() and re-initialises that layer's weights.
modelII=CNNModel([
    Convolution(n_filters=16, filter_size=3, stride=1,activation='relu',input_shape=(1,28,28)),
    MaxPool2D(pool_size=2),
    Convolution(n_filters=32, filter_size=3, stride=1,activation='relu'),
    Flatten(),
    Dense(128),
    Dense(10, activation='softmax')
])

In [None]:
%%time
# Train for 3 epochs with mini-batches of 128 (learning rate left at fit()'s default).
modelII.fit(x_train,y_train, epochs=3, batch_size=128)


Epoch 1/3:
 30/469 [=>............................] accuaray: 0.85156, train loss = 0.26191

In [None]:
# Time one forward pass over the whole test set.
%time y_predict =modelII.predict(x_test)

#III. PARALLEL VERSION V1





Analysis:

-The first parallel version simply maps the sequential version onto a CUDA grid of threads instead of loops.
It does not yet use optimization techniques such as streams, shared memory (SMEM), reducing warp divergence, or tuning the block size.

-There is a connection between the layers (the output of one layer is the input of the next layer), so we cannot parallelize the calculations of the layers at the same time, but must separate the tasks to parallelize them.

-The time-consuming computational work mainly falls into calculating forward and backward, so we just need to parallelize these functions on each layer.


In [None]:
# @title Convolution Layer
import numpy as np
from numba import cuda

#------------Convolution kernel--------------#
@cuda.jit
def conv_forward_kernel(inputs, weights, stride, outputs, activation):
    """CUDA kernel: one thread per (batch, row, col) output position.

    Each thread loops over all filters and writes
    ``outputs[batch, filter, row, col]``. ``activation == 1`` clamps negative
    sums to zero (ReLU).
    """
    i_batch, row, col = cuda.grid(3)
    n_batch, n_filters, output_height, output_width = outputs.shape
    # Threads that fall outside the output tensor have nothing to do.
    if i_batch >= n_batch or row >= output_height or col >= output_width:
        return

    n_chanels = inputs.shape[1]
    filter_size = weights.shape[-1]
    # Top-left corner of this thread's receptive field.
    base_row = row * stride
    base_col = col * stride

    for f in range(n_filters):
        acc = 0
        for ch in range(n_chanels):
            for dr in range(filter_size):
                for dc in range(filter_size):
                    acc += inputs[i_batch, ch, base_row + dr, base_col + dc] * weights[f, ch, dr, dc]
        if activation == 1 and acc < 0:
            acc = 0
        outputs[i_batch, f, row, col] = acc


@cuda.jit
def conv_backward_kernel(input, weights, stride, input_gradient, output_gradient, filter_gradient, activation):
    """CUDA kernel: one thread per (batch, row, col) position of ``output_gradient``.

    Each thread loops over all filters, kernel offsets and channels and adds
    its contribution to ``filter_gradient`` and ``input_gradient`` with atomic
    adds. Both buffers are only ever added to, so the caller must supply them
    zero-filled. With ``activation == 1`` the input-gradient contribution is
    skipped wherever the corresponding input value is <= 0.
    """
    # weights is indexed (filter, channel, kernel_row, kernel_col); shape[1:-1] is (channels, kernel_rows).
    n_chanels,filter_size = weights.shape[1:-1]
    n_batch,n_filters, output_height, output_width  = output_gradient.shape
    i_batch, row, col = cuda.grid(3)
    # Threads outside the output-gradient tensor do nothing.
    if(row >= output_height or col >= output_width or i_batch >= n_batch):
        return

    for fillterIdx in range(n_filters):
        for fillterRow in range(filter_size):
            for fillterCol in range(filter_size):
                out_value = output_gradient[i_batch, fillterIdx,row, col]
                for i_chanel in range(n_chanels):
                    # Input coordinates covered by this kernel offset.
                    iR = row*stride + fillterRow
                    iC = col*stride + fillterCol
                    in_val = input[i_batch, i_chanel,iR, iC]
                    # Many threads hit the same filter element, hence the atomic add.
                    cuda.atomic.add(
                        filter_gradient, (fillterIdx, i_chanel,fillterRow, fillterCol), input[i_batch, i_chanel,iR, iC] * out_value)
                    # ReLU mask is evaluated on the layer input value.
                    if(not (in_val <= 0 and activation == 1)):
                      cuda.atomic.add(input_gradient, (i_batch, i_chanel,iR, iC),
                                          weights[fillterIdx,i_chanel, fillterRow, fillterCol] * out_value)




class Convolution(Layer):
    """2-D convolution layer whose forward/backward passes run as CUDA kernels.

    Tensors are channel-first: ``input_shape`` is read as
    ``(channels, height, width)`` and batches as
    ``(batch, channels, height, width)``. ``self.bias`` is allocated but not
    used by the kernels.

    NOTE(review): the default ``input_shape=(28, 28, 1)`` is therefore read as
    28 channels of size 28x1; pass a channel-first shape explicitly.
    """

    def __init__(self, n_filters=32, filter_size=3, stride=1, activation=None, input_shape=(28, 28, 1)):
        self.input_shape = input_shape
        self.n_filters = n_filters
        self.filter_size = filter_size
        self.stride = stride
        self.activation = activation
        self.use_device = False
        self.bias = np.zeros((n_filters, 1))
        self.init_weight()

    def get_out_shape(self):
        """Return the per-sample output shape ``(n_filters, out_height, out_width)``."""
        output_width = (self.input_shape[2] -
                        self.filter_size) // self.stride + 1
        output_height = (
            self.input_shape[1] - self.filter_size) // self.stride + 1

        return (self.n_filters, output_height, output_width)

    def init_weight(self):
        """Draw random-normal filters of shape (n_filters, channels, k, k), scaled by 1/k**2."""
        self.weights = np.random.randn(
            self.n_filters, self.input_shape[0], self.filter_size, self.filter_size)/(self.filter_size**2)

    def forward(self, inputs):
        """Run ``conv_forward_kernel`` on the device and return the result as a host array.

        The device copies of the inputs and weights are kept on ``self`` so
        that :meth:`backward` can reuse them.
        """
        self.inputs = inputs
        n_batchs = inputs.shape[0]
        assert self.input_shape == inputs.shape[1:], "Input shape incorrect"
        output_height, output_width = self.get_out_shape()[1:]
        block_size = (8, 8, 8)
        # Ceil-divide each launch dimension by the block extent.
        grid_size = ((n_batchs-1)//block_size[0]+1, (output_height-1) //
                     block_size[1]+1, (output_width-1)//block_size[2]+1)
        # Every output element is written by the kernel, so an uninitialised buffer is fine here.
        d_outputs = cuda.device_array((n_batchs, self.n_filters, output_height, output_width))
        self.d_weights = cuda.to_device(self.weights)
        self.d_inputs = cuda.to_device(self.inputs)
        # Pass the configured stride: the output shape above was derived from it.
        conv_forward_kernel[grid_size, block_size](
            self.d_inputs, self.d_weights, self.stride, d_outputs, int(self.activation == "relu"))
        outputs = d_outputs.copy_to_host()

        return outputs

    def backward(self, output_gradient, learning_rate):
        """Run ``conv_backward_kernel``, update the filters, return the input gradient.

        Relies on ``self.d_inputs`` / ``self.d_weights`` left on the device by
        the preceding :meth:`forward` call.
        """
        n_batchs = self.inputs.shape[0]
        _, n_filters, output_height, output_width = output_gradient.shape
        block_size = (4, 4, 4)
        grid_size = ((n_batchs-1)//block_size[0]+1, (output_height-1) //
                     block_size[1]+1, (output_width-1)//block_size[2]+1)
        # The kernel only accumulates (atomic add) into these buffers, so they
        # must start at zero; cuda.device_array would hand back uninitialised memory.
        d_filter_grad = cuda.to_device(np.zeros(self.weights.shape))
        d_input_grad = cuda.to_device(np.zeros(self.inputs.shape))
        d_output_grad = cuda.to_device(output_gradient)
        # call kernel
        conv_backward_kernel[grid_size, block_size](
            self.d_inputs, self.d_weights, self.stride, d_input_grad, d_output_grad, d_filter_grad, int(self.activation == "relu"))
        cuda.synchronize()
        input_gradient = d_input_grad.copy_to_host()
        filter_gradient = d_filter_grad.copy_to_host()
        ## ===========END USING DEVICE===========##
        # Filter gradient is averaged over the batch before the SGD step.
        self.weights -= learning_rate * filter_gradient/n_batchs

        return input_gradient


Design - Convolution

**+Forward:**
**Input:** Input, weights, stride, activation
-How the kernel function is implemented:
-The output dimension will be (n_batch, n_filter, out_height, out_width)
-Each thread is responsible for one (n_batch, out_height, out_width) position and loops over all filters at that position.
-Using 3d block and 3d grid, choose x dimension corresponding to n_batch, y corresponding to out_height, z corresponding to out_width
**Output:** Result matrix “output”

**+Backward:**
**Input:** Input, output_grad, weight
How the kernel function is implemented:
-The dimension of output_grad is (n_batch, n_filter, out_height, out_width)
-Each thread will be responsible for 1 element in output_grad including n_batch, out_height, out_width.
-Using 3d block and 3d grid, choose x dimension corresponding to n_batch, y corresponding to out_height, z corresponding to out_width
-In addition, several threads accumulate into the same elements of Input_grad and weight_grad at the same time, so the accumulation must use an atomic add (`cuda.atomic.add`).

**Output:** Input_grad matrix, weight_grad

* Forward Test

In [None]:
# Benchmark input: a batch of 32 random samples with 16 channels of 100x100,
# integer values 0..254 divided by 255.
input_shape=(16,100,100)
inputs = np.random.randint(0,255,(32,*input_shape))/255
conv = Convolution(32,3,1,input_shape=input_shape)
# Time a single forward pass (IPython %time magic).
%time out_host=conv.forward(inputs)

* Backward Test

In [None]:
# Time a single backward pass; the forward output is reused as the output gradient, learning rate 1e-4.
%time in_grad_host=conv.backward(out_host,0.0001)

In [None]:
# @title Maxpooling Layer
#------------MaxPool2D kernel--------------#
@cuda.jit
def maxPool2D_forward_kernel(inputs, outputs, stride, pool_size):
    """CUDA kernel: one thread per (batch, out_h, out_w) position.

    Each thread loops over all channels and stores the maximum of the
    ``pool_size`` x ``pool_size`` window in ``outputs``.
    """
    ibatch, out_h, out_w = cuda.grid(3)
    n_batchs, n_chanels, output_height, output_width = outputs.shape
    # Threads outside the output tensor do nothing.
    if ibatch >= n_batchs or out_h >= output_height or out_w >= output_width:
        return

    # Top-left corner of this thread's pooling window.
    top = out_h * stride
    left = out_w * stride
    for ch in range(n_chanels):
        best = -np.inf
        for dh in range(pool_size):
            for dw in range(pool_size):
                best = max(best, inputs[ibatch, ch, top + dh, left + dw])
        outputs[ibatch, ch, out_h, out_w] = best


@cuda.jit
def maxPool2D_backward_kernel(inputs, inputs_grad, outputs_grad, stride, pool_size):
    """CUDA kernel: one thread per (batch, out_h, out_w) position of ``outputs_grad``.

    For every channel the thread recomputes the window maximum from ``inputs``
    and adds the output gradient to each window position equal to that
    maximum. Positions that are not a maximum are never written, so the caller
    must supply ``inputs_grad`` zero-filled.
    """
    n_batchs,n_chanels, in_height, in_width = inputs.shape
    n_batchs,n_chanels, output_height, output_width,  = outputs_grad.shape
    ibatch, out_h, out_w = cuda.grid(3)
    # Max pool over input
    if(ibatch >= n_batchs or out_h >= output_height or out_w >= output_width):
        return
    for i_chanel in range(n_chanels):
        # First pass: find the maximum of this window.
        max_value = -np.inf
        for h_pool in range(pool_size):
            for w_pool in range(pool_size):
                max_value = max(
                    max_value, inputs[ibatch, i_chanel,out_h*stride+h_pool, w_pool+out_w*stride])

        # Second pass: send the gradient to every position that equals the maximum.
        # NOTE(review): this += is not atomic; windows overlap when stride < pool_size,
        # so different threads could then update the same element — confirm callers
        # always use stride >= pool_size.
        for h_pool in range(pool_size):
            for w_pool in range(pool_size):
                if(inputs[ibatch,i_chanel ,out_h*stride+h_pool, w_pool+out_w*stride] == max_value):
                    inputs_grad[ibatch, i_chanel,out_h*stride+h_pool, w_pool+out_w*stride] += outputs_grad[ibatch,i_chanel,  out_h, out_w]


class MaxPool2D(Layer):
    """Max-pooling layer whose forward/backward passes run as CUDA kernels.

    Tensors are channel-first: ``input_shape`` is read as
    ``(channels, height, width)`` and batches as
    ``(batch, channels, height, width)``.

    NOTE(review): the default ``input_shape=(28, 28, 1)`` is therefore read as
    28 channels of size 28x1; pass a channel-first shape explicitly.
    """

    def __init__(self, pool_size=2, stride=2, input_shape=(28, 28, 1)):
        self.pool_size = pool_size
        self.stride = stride
        self.use_device = False
        self.inputs = None
        self.inputs_device = None
        self.input_shape = input_shape

    def get_out_shape(self):
        """Return the per-sample output shape ``(channels, out_height, out_width)``."""
        output_height = (self.input_shape[1] - self.pool_size) // self.stride + 1
        output_width = (self.input_shape[2] - self.pool_size) // self.stride + 1
        return (self.input_shape[0], output_height, output_width)

    def forward(self, inputs):
        """Run ``maxPool2D_forward_kernel`` and return the pooled batch as a host array.

        The device copy of ``inputs`` is kept on ``self.d_inputs`` for
        :meth:`backward`.
        """
        # Save input
        batch_size, num_channels, input_height, input_width = inputs.shape
        assert self.input_shape == inputs.shape[1:], "Input shape incorrect"
        self.inputs = inputs
        (_, output_height, output_width) = self.get_out_shape()
        # Every output element is written by the kernel, so no zero-fill is needed here.
        d_outputs = cuda.device_array(
            (batch_size, num_channels, output_height, output_width))
        block_size = (8, 4, 4)
        # Ceil-divide each launch dimension by the block extent.
        grid_size = ((batch_size-1)//block_size[0]+1, (output_height-1) //
                     block_size[1]+1, (output_width-1)//block_size[2]+1)
        self.d_inputs = cuda.to_device(inputs)
        maxPool2D_forward_kernel[grid_size, block_size](
            self.d_inputs, d_outputs, self.stride, self.pool_size)
        outputs = d_outputs.copy_to_host()

        return outputs

    def backward(self, output_gradient, learning_rate):
        """Run ``maxPool2D_backward_kernel`` and return the input gradient as a host array.

        Relies on ``self.d_inputs`` left on the device by :meth:`forward`.
        ``learning_rate`` is unused because the layer has no parameters.
        """
        batch_size, num_channels, output_height, output_width = output_gradient.shape
        # The kernel only touches the max position(s) of each window and uses +=,
        # so the buffer must start at zero; cuda.device_array would be uninitialised.
        d_input_grad = cuda.to_device(np.zeros(self.inputs.shape))
        d_output_grad = cuda.to_device(output_gradient)
        block_size = (8, 4, 4)
        grid_size = ((batch_size-1)//block_size[0]+1, (output_height-1) //
                     block_size[1]+1, (output_width-1)//block_size[2]+1)
        maxPool2D_backward_kernel[grid_size, block_size](
            self.d_inputs, d_input_grad, d_output_grad, self.stride, self.pool_size)
        input_gradient = d_input_grad.copy_to_host()
        return input_gradient

    def init_weight(self):
        """Nothing to initialise."""
        pass


Design - Maxpooling

**+Forward:**
**Input:** Input, stride, pool size
-How the kernel function is implemented:
-The output dimension will be (n_batch, n_channel, out_height, out_width)
-Each thread will be responsible for 1 output element including n_batch, out_height, out_width.
-Using 3d block and 3d grid, choose x dimension corresponding to n_batch, y corresponding to out_height, z corresponding to out_width
**Output:** Result matrix “output”

**+Backward:**
**Input:** Input, output_grad, stride, pool_size
-How the kernel function is implemented:
-The output_grad dimension is (n_batch, n_channel, out_height, out_width)
-Each thread will be responsible for 1 element in output_grad including n_batch, out_height, out_width.
-Using 3d block and 3d grid, choose x dimension corresponding to n_batch, y corresponding to out_height, z corresponding to out_width
**Output:** Input_grad

* Forward Test

In [None]:
# Benchmark input: a batch of 64 random samples with 32 channels of 200x200,
# integer values 0..254 divided by 255.
input_shape=(32,200,200)
inputs = np.random.randint(0,255,(64,*input_shape))/255
maxp = MaxPool2D(2,2,input_shape=input_shape)
# Time a single forward pass (IPython %time magic).
%time out_host=maxp.forward(inputs)

* Backward Test

In [None]:
# Time a single backward pass; the forward output is reused as the output gradient.
%time in_grad_host=maxp.backward(out_host,0.0001)

In [None]:
# @title Dense Layer

@cuda.jit
def dense_forward_kernel(inputs, weights, bias, outputs):
    """CUDA kernel: one thread per output element.

    Computes ``outputs[row, col] = dot(inputs[row, :], weights[:, col]) + bias[0, col]``.
    """
    row, col = cuda.grid(2)
    # Threads outside the output matrix do nothing.
    if row >= outputs.shape[0] or col >= outputs.shape[1]:
        return
    n_features = inputs.shape[1]
    acc = 0
    for k in range(n_features):
        acc += inputs[row, k] * weights[k, col]
    outputs[row, col] = acc + bias[0, col]



class Dense(Layer):
    """Fully connected layer whose forward pass runs on the GPU.

    The forward pass launches ``dense_forward_kernel``; the backward pass and
    the parameter update are computed on the host with NumPy.
    """

    def __init__(self, num_outputs, activation=None, input_shape=100):
        """Store the configuration and create the initial parameters.

        Args:
            num_outputs: Number of output units.
            activation: ``"softmax"`` applies a row-wise softmax to the output;
                any other value leaves the output unchanged.
            input_shape: Number of input features (an int).
        """
        self.num_outputs = num_outputs
        self.biases = np.zeros((1, num_outputs))
        self.activation = activation
        self.use_device = False
        self.inputs = None
        self.input_shape = input_shape
        self.init_weight()

    def init_weight(self):
        """Draw weights from a standard normal and scale them by 1/num_outputs."""
        self.weights = np.random.randn(
            self.input_shape, self.num_outputs) / self.num_outputs

    def get_out_shape(self):
        """Return the number of output units."""
        return self.num_outputs

    def forward(self, inputs):
        """Return ``inputs @ weights + biases``, with softmax if configured.

        Args:
            inputs: 2-D array whose last dimension equals ``input_shape``.

        Returns:
            Host array of shape (inputs.shape[0], num_outputs).
        """
        self.inputs = inputs
        assert self.input_shape == inputs.shape[-1], "Input shape incorrect"
        block_size = (8, 4)
        # Ceil-divide so the grid covers every (row, col) of the output.
        grid_size = ((inputs.shape[0]-1)//block_size[0]+1,
                     (self.num_outputs-1)//block_size[1]+1)
        # Parameters are re-uploaded on every call because backward() updates
        # the host copies.
        self.d_weights = cuda.to_device(self.weights)
        self.d_biases = cuda.to_device(self.biases)
        d_outputs = cuda.device_array((inputs.shape[0], self.num_outputs))
        self.d_inputs = cuda.to_device(inputs)
        dense_forward_kernel[grid_size, block_size](
            self.d_inputs, self.d_weights, self.d_biases, d_outputs)
        outputs = d_outputs.copy_to_host()

        if self.activation == "softmax":
            outputs = self.softmax(outputs)
        return outputs

    def softmax(self, x):
        """Return the row-wise softmax of ``x``."""
        # Subtracting the row maximum keeps exp() from overflowing.
        e_x = np.exp(x-np.max(x, axis=1, keepdims=True))
        return e_x/e_x.sum(axis=1, keepdims=True)

    def backward(self, output_gradient, learning_rate):
        """Update weights and biases in place and return the input gradient.

        Args:
            output_gradient: Array of shape (batch, num_outputs).
            learning_rate: Step size of the gradient-descent update.

        Returns:
            Array of shape (batch, input_shape), computed with the weights as
            they were before this update.
        """
        input_grad = np.dot(output_gradient, self.weights.T)
        weights_gradient = np.dot(self.inputs.T, output_gradient)
        biases_gradient = np.sum(output_gradient, axis=0, keepdims=True)

        # Update weights and biases
        self.weights -= learning_rate * weights_gradient
        self.biases -= learning_rate * biases_gradient

        return input_grad

Design - Dense

**+Forward:**
**Input:** Input, weight, bias
- How the kernel function is implemented:
  The output dimensions are (n_batch, n_out).
  Each thread is responsible for one output element, identified by (n_batch, n_out).
  A 2D block and a 2D grid are used: the x dimension corresponds to n_batch and y to n_out.
**Output:** Result matrix "output"

In [None]:
# Forward benchmark: 256 samples with 10000 features each, scaled into (0, 1).
inputs=np.random.randint(1,255, (256,10000))/255
dense=Dense(1024, input_shape= 10000)
# Time a single forward pass.
%time out_host=dense.forward(inputs)

## Train and Test

In [None]:
# Network: conv(16, relu) -> maxpool(2) -> conv(32, relu) -> flatten -> dense(128) -> dense(10, softmax).
# NOTE(review): only the first layer is given an input_shape; presumably CNNModel
# sets the shapes of the remaining layers - confirm.
modelIII=CNNModel([
    Convolution(n_filters=16, filter_size=3, stride=1,activation='relu',input_shape=(1,28,28)),
    MaxPool2D(pool_size=2),
    Convolution(n_filters=32, filter_size=3, stride=1,activation='relu'),
    Flatten(),
    Dense(128),
    Dense(10, activation='softmax')
])

In [None]:
%%time
# Train for 3 epochs with a batch size of 128 (the %%time magic must stay on the first line).
modelIII.fit(x_train,y_train, epochs=3, batch_size=128)

In [None]:
# Time inference over the test inputs.
%time y_predict =modelIII.predict(x_test)

# IV. PARALLEL VERSION V2


Analysis:

- The first parallel version simply maps the sequential version onto the GPU, replacing the loops with a grid of threads.
- In this improved version we optimize memory access: global memory (GMEM) traffic is reduced by keeping reused values in registers (RMEM) and shared memory (SMEM).

In [None]:
# @title Convolution Layer
import numpy as np
from numba import cuda, types as numba_types

#------------Convolution kernel--------------#
@cuda.jit
def conv_forward_kernel(d_inputs, weights, stride, outputs, activation):
    """Compute a valid 2-D convolution, optionally followed by ReLU.

    Each thread produces ``outputs[i_batch, f, row, col]`` for every filter
    ``f``. For each (filter, channel) pair the block first stages a 6x6 input
    tile per batch slot in shared memory and then reads the filter window from
    that tile.

    Launch constraints implied by the indexing below: the block shape must be
    (8, 4, 4), and the 6x6 tile only covers a 4x4 patch of outputs when the
    stride is 1 and ``filter_size <= 3``.

    Args:
        d_inputs: 4-D array indexed as (batch, channel, row, col).
        weights: 4-D array indexed as (filter, channel, filter_row, filter_col).
        stride: Not used by this kernel; the tile addressing assumes stride 1.
        outputs: 4-D array indexed as (batch, filter, row, col); written in place.
        activation: 1 clamps negative sums to 0 (ReLU); any other value stores
            the raw sum.
    """
    shared_input = cuda.shared.array((8, 6, 6), numba_types.float32)
    i_batch, row, col = cuda.grid(3)
    n_batch, n_filters, output_height, output_width = outputs.shape
    filter_size = weights.shape[-1]
    n_chanels = d_inputs.shape[1]
    in_height = d_inputs.shape[2]
    in_width = d_inputs.shape[3]
    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y
    tz = cuda.threadIdx.z
    # Top-left input coordinate of the tile handled by this block.
    by = cuda.blockIdx.y * cuda.blockDim.y
    bz = cuda.blockIdx.z * cuda.blockDim.z
    # Surplus threads stay alive instead of returning early so that every
    # thread of the block reaches each cuda.syncthreads() below.
    in_batch = i_batch < n_batch
    in_output = in_batch and row < output_height and col < output_width
    for fillterIdx in range(n_filters):
        sum_val = 0.0
        for chanel_idx in range(n_chanels):
            # The 16 (ty, tz) threads of a batch slot fill the 36 tile cells
            # in three passes.
            for i in range(3):
                idx = (ty * 4 + tz) + i * 16
                if idx < 36:
                    y = idx // 6
                    z = idx % 6
                    # Edge blocks reach past the input; pad with zeros rather
                    # than reading out of bounds. Padded cells only feed
                    # outputs that are never stored.
                    if in_batch and by + y < in_height and bz + z < in_width:
                        shared_input[tx, y, z] = d_inputs[i_batch, chanel_idx, by + y, bz + z]
                    else:
                        shared_input[tx, y, z] = 0.0
            # The tile must be complete before any thread reads from it.
            cuda.syncthreads()
            for fillterRow in range(filter_size):
                for fillterCol in range(filter_size):
                    sum_val += shared_input[tx, fillterRow + ty, fillterCol + tz] * weights[fillterIdx, chanel_idx, fillterRow, fillterCol]
            # All reads must finish before the next pass overwrites the tile.
            cuda.syncthreads()

        if in_output:
            if activation == 1 and sum_val < 0:
                sum_val = 0.0
            outputs[i_batch, fillterIdx, row, col] = sum_val


@cuda.jit
def conv_backward_kernel(d_inputs, weights, stride, input_gradient, output_gradient, filter_gradient, activation):
    """Accumulate the filter and input gradients of a valid 2-D convolution.

    Each thread handles one (batch, row, col) position of ``output_gradient``
    and loops over filters, channels and the filter window. Both gradient
    arrays are accumulated with atomic adds, so the caller must pass them
    zero-initialised.

    The previous revision also staged an input tile in shared memory, but the
    tile was never read (inputs were fetched from ``d_inputs`` directly) and
    its loads ran past the input on edge blocks. The staging and its barriers
    are removed, which lets surplus threads return early.

    Args:
        d_inputs: 4-D array indexed as (batch, channel, row, col).
        weights: 4-D array indexed as (filter, channel, filter_row, filter_col).
        stride: Step between neighbouring filter windows.
        input_gradient: Same layout as ``d_inputs``; accumulated in place.
        output_gradient: 4-D array indexed as (batch, filter, row, col).
        filter_gradient: Same layout as ``weights``; accumulated in place.
        activation: When 1, the input-gradient contribution is skipped wherever
            the input value is <= 0.
    """
    n_chanels = weights.shape[1]
    filter_size = weights.shape[2]
    n_batch, n_filters, output_height, output_width = output_gradient.shape
    i_batch, row, col = cuda.grid(3)
    # Exit before touching output_gradient: the grid is rounded up to whole
    # blocks, so surplus threads would otherwise read out of bounds.
    if i_batch >= n_batch or row >= output_height or col >= output_width:
        return
    for fillterIdx in range(n_filters):
        # Kept in a local so it is read from global memory once per filter.
        out_value = output_gradient[i_batch, fillterIdx, row, col]
        for i_chanel in range(n_chanels):
            for fillterRow in range(filter_size):
                for fillterCol in range(filter_size):
                    iR = row * stride + fillterRow
                    iC = col * stride + fillterCol
                    in_val = d_inputs[i_batch, i_chanel, iR, iC]
                    cuda.atomic.add(
                        filter_gradient,
                        (fillterIdx, i_chanel, fillterRow, fillterCol),
                        in_val * out_value)
                    # NOTE(review): the ReLU mask is taken from the input value,
                    # not from this layer's output - confirm this is intended.
                    if not (in_val <= 0 and activation == 1):
                        cuda.atomic.add(
                            input_gradient,
                            (i_batch, i_chanel, iR, iC),
                            weights[fillterIdx, i_chanel, fillterRow, fillterCol] * out_value)


class Convolution(Layer):
    """2-D convolution layer (channel-first) that runs on the GPU.

    ``forward`` launches ``conv_forward_kernel`` and ``backward`` launches
    ``conv_backward_kernel``. Both launches pass a stride of 1 to the kernel
    regardless of ``self.stride``; ``self.stride`` is only used by
    ``get_out_shape``. ``self.bias`` is allocated but not used by either pass.
    """

    def __init__(self, n_filters=32, filter_size=3, stride=1, activation=None, input_shape=(28, 28, 1)):
        """Store the configuration and create the initial weights.

        Args:
            n_filters: Number of output channels.
            filter_size: Side length of the square filter.
            stride: Step used when computing the output shape.
            activation: ``"relu"`` enables ReLU inside the kernels.
            input_shape: Read as (channels, height, width) by ``init_weight``
                and ``get_out_shape``.
        """
        self.input_shape = input_shape
        self.n_filters = n_filters
        self.filter_size = filter_size
        self.stride = stride
        self.activation = activation
        self.use_device = False
        self.bias = np.zeros((n_filters, 1))
        self.init_weight()

    def get_out_shape(self):
        """Return the output shape as (n_filters, out_height, out_width)."""
        output_width = (self.input_shape[2] -
                        self.filter_size) // self.stride + 1
        output_height = (
            self.input_shape[1] - self.filter_size) // self.stride + 1

        return (self.n_filters, output_height, output_width)

    def init_weight(self):
        """Draw weights from a standard normal and scale them by 1/filter_size**2."""
        self.weights = np.random.randn(
            self.n_filters, self.input_shape[0], self.filter_size, self.filter_size)/(self.filter_size**2)

    def forward(self, inputs):
        """Convolve ``inputs`` with the current weights on the GPU.

        Args:
            inputs: Array of shape (batch, *input_shape).

        Returns:
            Host array of shape (batch, n_filters, out_height, out_width).
        """
        self.inputs = inputs
        n_batchs = inputs.shape[0]
        assert self.input_shape == inputs.shape[1:], "Input shape incorrect"
        output_height, output_width = self.get_out_shape()[1:]
        block_size = (8, 4, 4)
        # Ceil-divide so the grid covers every (batch, row, col) of the output.
        grid_size = ((n_batchs-1)//block_size[0]+1, (output_height-1) //
                      block_size[1]+1, (output_width-1)//block_size[2]+1)
        # Every valid output element is written by the kernel, so an
        # uninitialised buffer is sufficient here.
        d_outputs = cuda.device_array((n_batchs, self.n_filters, output_height, output_width))
        # Device copies are kept on self because backward() reuses them.
        self.d_weights = cuda.to_device(self.weights)
        self.d_inputs = cuda.to_device(self.inputs)
        conv_forward_kernel[grid_size, block_size](
            self.d_inputs, self.d_weights, 1, d_outputs, int(self.activation == "relu"))
        outputs = d_outputs.copy_to_host()

        return outputs

    def backward(self, output_gradient, learning_rate):
        """Update the weights in place and return the input gradient.

        Must be called after ``forward``, whose device copies of the inputs and
        weights are reused here.

        Args:
            output_gradient: Array of shape (batch, n_filters, out_height, out_width).
            learning_rate: Step size; the filter gradient is also divided by
                the batch size before it is applied.

        Returns:
            Host array with the same shape as the inputs of the last forward call.
        """
        n_batchs = self.inputs.shape[0]
        _, _, output_height, output_width = output_gradient.shape
        block_size = (8, 4, 4)
        grid_size = ((n_batchs-1)//block_size[0]+1, (output_height-1) //
                      block_size[1]+1, (output_width-1)//block_size[2]+1)
        # The kernel accumulates with atomic adds, so both buffers must start
        # at zero; cuda.device_array would hand back uninitialised memory.
        d_filter_grad = cuda.to_device(np.zeros(self.weights.shape))
        d_input_grad = cuda.to_device(np.zeros(self.inputs.shape))
        d_output_grad = cuda.to_device(output_gradient)
        # call kernel
        conv_backward_kernel[grid_size, block_size](
            self.d_inputs, self.d_weights, 1, d_input_grad, d_output_grad, d_filter_grad, int(self.activation == "relu"))
        cuda.synchronize()
        input_gradient = d_input_grad.copy_to_host()
        filter_gradient = d_filter_grad.copy_to_host()
        self.weights -= learning_rate * filter_gradient/n_batchs

        return input_gradient


Design - Convolution

The design steps are the same as in parallel version v1.
Here, however, we use SMEM to stage the input needed by a block so that it can be reused many times.
In addition, the backward kernel keeps each output_grad value in a register (RMEM), which avoids repeated reads from GMEM.

* Forward Test

In [None]:
# Forward benchmark: 32 random samples of shape (16, 100, 100), scaled into [0, 1).
input_shape=(16,100,100)
inputs = np.random.randint(0,255,(32,*input_shape))/255
# 32 filters of size 3 with stride 1; no activation is set.
conv = Convolution(32,3,1,input_shape=input_shape)
# Time a single forward pass and keep its result for the backward test.
%time out_host=conv.forward(inputs)

* Backward Test

In [None]:
# Backward benchmark: the forward output is passed in as the incoming gradient.
%time in_grad_host=conv.backward(out_host,0.0001)

In [None]:
# @title Maxpooling Layer
@cuda.jit
def maxPool2D_forward_kernel(d_inputs, outputs, stride, pool_size):
    """Max-pool every channel of ``d_inputs`` into ``outputs``.

    Each thread produces ``outputs[i_batch, c, out_h, out_w]`` for every
    channel ``c``. For each channel the block stages an 8x8 input tile per
    batch slot in shared memory and takes the window maximum from that tile.

    Launch constraints implied by the indexing below: the block shape must be
    (8, 4, 4), and the pooling windows of a 4x4 patch of outputs must fit in
    the 8x8 tile, i.e. ``3 * stride + pool_size <= 8``.

    Args:
        d_inputs: 4-D array indexed as (batch, channel, row, col).
        outputs: 4-D array indexed as (batch, channel, out_row, out_col);
            written in place.
        stride: Step between neighbouring pooling windows.
        pool_size: Side length of the square pooling window.
    """
    share_size = 8
    shared_input = cuda.shared.array((8, share_size, share_size), numba_types.float32)
    in_height = d_inputs.shape[2]
    in_width = d_inputs.shape[3]
    n_batchs, n_chanels, output_height, output_width = outputs.shape
    i_batch, out_h, out_w = cuda.grid(3)
    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y
    tz = cuda.threadIdx.z
    # Top-left input coordinate of the tile handled by this block.
    by = cuda.blockIdx.y * cuda.blockDim.y * stride
    bz = cuda.blockIdx.z * cuda.blockDim.z * stride
    # Surplus threads stay alive instead of returning early so that every
    # thread of the block reaches each cuda.syncthreads() below.
    in_batch = i_batch < n_batchs
    in_output = in_batch and out_h < output_height and out_w < output_width

    for i_chanel in range(n_chanels):
        # The 16 (ty, tz) threads of a batch slot fill the 64 tile cells in
        # four passes.
        for i in range(4):
            idx = (ty * 4 + tz) + i * 16
            y = idx // share_size
            z = idx % share_size
            # Edge blocks reach past the input; pad rather than read out of
            # bounds. Padded cells are never part of a stored output's window.
            if in_batch and by + y < in_height and bz + z < in_width:
                shared_input[tx, y, z] = d_inputs[i_batch, i_chanel, by + y, bz + z]
            else:
                shared_input[tx, y, z] = 0.0
        # The tile must be complete before any thread reads from it.
        cuda.syncthreads()
        if in_output:
            max_value = -np.inf
            for h_pool in range(pool_size):
                for w_pool in range(pool_size):
                    max_value = max(
                        max_value, shared_input[tx, ty * stride + h_pool, w_pool + tz * stride])
            outputs[i_batch, i_chanel, out_h, out_w] = max_value
        # All reads must finish before the next channel overwrites the tile.
        cuda.syncthreads()

@cuda.jit
def maxPool2D_backward_kernel(d_inputs, inputs_grad, outputs_grad, stride, pool_size):
    """Route each output gradient to the input cell(s) holding the window maximum.

    Each thread handles one (batch, out_h, out_w) position for every channel.
    The window maximum is recomputed from an 8x8 shared-memory tile, and the
    gradient is added to every window cell equal to that maximum. Gradients are
    accumulated, so the caller must pass ``inputs_grad`` zero-initialised.

    Launch constraints implied by the indexing below: the block shape must be
    (8, 4, 4) and ``3 * stride + pool_size <= 8``.

    Args:
        d_inputs: 4-D array indexed as (batch, channel, row, col).
        inputs_grad: Same layout as ``d_inputs``; accumulated in place.
        outputs_grad: 4-D array indexed as (batch, channel, out_row, out_col).
        stride: Step between neighbouring pooling windows.
        pool_size: Side length of the square pooling window.
    """
    share_size = 8
    shared_input = cuda.shared.array((8, share_size, share_size), numba_types.float32)
    in_height = d_inputs.shape[2]
    in_width = d_inputs.shape[3]
    n_batchs, n_chanels, output_height, output_width = outputs_grad.shape
    i_batch, out_h, out_w = cuda.grid(3)
    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y
    tz = cuda.threadIdx.z
    # Top-left input coordinate of the tile handled by this block.
    by = cuda.blockIdx.y * cuda.blockDim.y * stride
    bz = cuda.blockIdx.z * cuda.blockDim.z * stride
    # Surplus threads stay alive instead of returning early so that every
    # thread of the block reaches each cuda.syncthreads() below.
    in_batch = i_batch < n_batchs
    in_output = in_batch and out_h < output_height and out_w < output_width
    for i_chanel in range(n_chanels):
        for i in range(4):
            idx = (ty * 4 + tz) + i * 16
            y = idx // share_size
            z = idx % share_size
            # Edge blocks reach past the input; pad rather than read out of
            # bounds. Padded cells are never part of a valid window.
            if in_batch and by + y < in_height and bz + z < in_width:
                shared_input[tx, y, z] = d_inputs[i_batch, i_chanel, by + y, bz + z]
            else:
                shared_input[tx, y, z] = 0.0
        # The tile must be complete before any thread reads from it.
        cuda.syncthreads()

        if in_output:
            max_value = -np.inf
            for h_pool in range(pool_size):
                for w_pool in range(pool_size):
                    max_value = max(max_value, shared_input[tx, ty * stride + h_pool, w_pool + tz * stride])

            grad = outputs_grad[i_batch, i_chanel, out_h, out_w]
            for h_pool in range(pool_size):
                for w_pool in range(pool_size):
                    if shared_input[tx, ty * stride + h_pool, w_pool + tz * stride] == max_value:
                        # Atomic so that overlapping windows (stride < pool_size)
                        # cannot lose updates to the same input cell.
                        cuda.atomic.add(
                            inputs_grad,
                            (i_batch, i_chanel, out_h * stride + h_pool, w_pool + out_w * stride),
                            grad)
        # All reads must finish before the next channel overwrites the tile.
        cuda.syncthreads()
#------------MaxPool2D layer--------------#

class MaxPool2D(Layer):
    """2-D max-pooling layer (channel-first) that runs on the GPU.

    ``forward`` launches ``maxPool2D_forward_kernel`` and ``backward`` launches
    ``maxPool2D_backward_kernel``. The layer has no trainable parameters.
    """

    def __init__(self, pool_size=2, stride=2, input_shape=(28, 28, 1)):
        """Store the pooling configuration.

        Args:
            pool_size: Side length of the square pooling window.
            stride: Step between neighbouring pooling windows.
            input_shape: Read as (channels, height, width) by ``get_out_shape``.
        """
        self.pool_size = pool_size
        self.stride = stride
        self.use_device = False
        self.inputs = None
        self.inputs_device = None
        self.input_shape = input_shape

    def get_out_shape(self):
        """Return the output shape as (channels, out_height, out_width)."""
        output_height = (self.input_shape[1] - self.pool_size) // self.stride + 1
        output_width = (self.input_shape[2] - self.pool_size) // self.stride + 1
        return (self.input_shape[0], output_height, output_width)

    def forward(self, inputs):
        """Max-pool ``inputs`` on the GPU.

        Args:
            inputs: Array of shape (batch, *input_shape).

        Returns:
            Host array of shape (batch, channels, out_height, out_width).
        """
        batch_size, num_channels = inputs.shape[0], inputs.shape[1]
        assert self.input_shape == inputs.shape[1:], "Input shape incorrect"
        # Saved because backward() needs the input shape.
        self.inputs = inputs
        (_, output_height, output_width) = self.get_out_shape()
        # Every valid output element is written by the kernel, so an
        # uninitialised buffer is sufficient here.
        d_outputs = cuda.device_array(
            (batch_size, num_channels, output_height, output_width))
        block_size = (8, 4, 4)
        # Ceil-divide so the grid covers every (batch, row, col) of the output.
        grid_size = ((batch_size-1)//block_size[0]+1, (output_height-1) //
                      block_size[1]+1, (output_width-1)//block_size[2]+1)
        # The device copy is kept on self because backward() reuses it.
        self.d_inputs = cuda.to_device(inputs)
        maxPool2D_forward_kernel[grid_size, block_size](
            self.d_inputs, d_outputs, self.stride, self.pool_size)
        outputs = d_outputs.copy_to_host()

        return outputs

    def backward(self, output_gradient, learning_rate):
        """Return the gradient with respect to the inputs of the last forward call.

        Must be called after ``forward``, whose device copy of the inputs is
        reused here.

        Args:
            output_gradient: Array of shape (batch, channels, out_height, out_width).
            learning_rate: Unused; accepted so all layers share one signature.

        Returns:
            Host array with the same shape as the saved inputs.
        """
        batch_size, _, output_height, output_width = output_gradient.shape
        # The kernel accumulates into this buffer, so it must start at zero;
        # cuda.device_array would hand back uninitialised memory.
        d_input_grad = cuda.to_device(np.zeros(self.inputs.shape))
        d_output_grad = cuda.to_device(output_gradient)
        block_size = (8, 4, 4)
        grid_size = ((batch_size-1)//block_size[0]+1, (output_height-1) //
                      block_size[1]+1, (output_width-1)//block_size[2]+1)
        maxPool2D_backward_kernel[grid_size, block_size](
            self.d_inputs, d_input_grad, d_output_grad, self.stride, self.pool_size)
        input_gradient = d_input_grad.copy_to_host()
        return input_gradient

    def init_weight(self):
        """Do nothing; this layer creates no weights."""
        pass


Design - MaxPooling

The MaxPooling algorithm works much like the Convolution layer,
so its use of SMEM and RMEM follows the same design as the Convolution class.

* Forward Test

In [None]:
# Forward benchmark: 64 random samples of shape (32, 200, 200), scaled into [0, 1).
input_shape=(32,200,200)
inputs = np.random.randint(0,255,(64,*input_shape))/255
# 2x2 pooling window with stride 2.
maxp = MaxPool2D(2,2,input_shape=input_shape)
# Time a single forward pass and keep its result for the backward test.
%time out_host=maxp.forward(inputs)

* Backward Test

In [None]:
# Backward benchmark: the forward output is passed in as the incoming gradient.
%time in_grad_host=maxp.backward(out_host,0.0001)

In [None]:
# @title Dense Layer

@cuda.jit
def dense_forward_kernel(inputs, weights, bias, outputs):
    """Compute ``outputs = inputs @ weights + bias`` with one thread per output element.

    Args:
        inputs: 2-D array indexed as (row, feature).
        weights: 2-D array indexed as (feature, col).
        bias: 2-D array; only row 0 is read, indexed by output column.
        outputs: 2-D array indexed as (row, col); written in place.
    """
    row, col = cuda.grid(2)
    # The grid is rounded up to whole blocks, so surplus threads must exit.
    if row >= outputs.shape[0] or col >= outputs.shape[1]:
        return
    # Accumulate in a local variable and write the result to `outputs` once.
    acc = 0.0
    for i in range(inputs.shape[1]):
        acc += inputs[row, i] * weights[i, col]
    outputs[row, col] = acc + bias[0, col]



class Dense(Layer):
    """Fully connected layer whose forward pass runs on the GPU.

    The forward pass launches ``dense_forward_kernel``; the backward pass and
    the parameter update are computed on the host with NumPy.
    """

    def __init__(self, num_outputs, activation=None, input_shape=100):
        """Store the configuration and create the initial parameters.

        Args:
            num_outputs: Number of output units.
            activation: ``"softmax"`` applies a row-wise softmax to the output;
                any other value leaves the output unchanged.
            input_shape: Number of input features (an int).
        """
        self.num_outputs = num_outputs
        self.biases = np.zeros((1, num_outputs))
        self.activation = activation
        self.use_device = False
        self.inputs = None
        self.input_shape = input_shape
        self.init_weight()

    def init_weight(self):
        """Draw weights from a standard normal and scale them by 1/num_outputs."""
        self.weights = np.random.randn(
            self.input_shape, self.num_outputs) / self.num_outputs

    def get_out_shape(self):
        """Return the number of output units."""
        return self.num_outputs

    def forward(self, inputs):
        """Return ``inputs @ weights + biases``, with softmax if configured.

        Args:
            inputs: 2-D array whose last dimension equals ``input_shape``.

        Returns:
            Host array of shape (inputs.shape[0], num_outputs).
        """
        self.inputs = inputs
        assert self.input_shape == inputs.shape[-1], "Input shape incorrect"
        block_size = (8, 4)
        # Ceil-divide so the grid covers every (row, col) of the output.
        grid_size = ((inputs.shape[0]-1)//block_size[0]+1,
                     (self.num_outputs-1)//block_size[1]+1)
        # Parameters are re-uploaded on every call because backward() updates
        # the host copies.
        self.d_weights = cuda.to_device(self.weights)
        self.d_biases = cuda.to_device(self.biases)
        d_outputs = cuda.device_array((inputs.shape[0], self.num_outputs))
        self.d_inputs = cuda.to_device(inputs)
        dense_forward_kernel[grid_size, block_size](
            self.d_inputs, self.d_weights, self.d_biases, d_outputs)
        outputs = d_outputs.copy_to_host()

        if self.activation == "softmax":
            outputs = self.softmax(outputs)
        return outputs

    def softmax(self, x):
        """Return the row-wise softmax of ``x``."""
        # Subtracting the row maximum keeps exp() from overflowing.
        e_x = np.exp(x-np.max(x, axis=1, keepdims=True))
        return e_x/e_x.sum(axis=1, keepdims=True)

    def backward(self, output_gradient, learning_rate):
        """Update weights and biases in place and return the input gradient.

        Args:
            output_gradient: Array of shape (batch, num_outputs).
            learning_rate: Step size of the gradient-descent update.

        Returns:
            Array of shape (batch, input_shape), computed with the weights as
            they were before this update.
        """
        input_grad = np.dot(output_gradient, self.weights.T)
        weights_gradient = np.dot(self.inputs.T, output_gradient)
        biases_gradient = np.sum(output_gradient, axis=0, keepdims=True)

        # Update weights and biases
        self.weights -= learning_rate * weights_gradient
        self.biases -= learning_rate * biases_gradient

        return input_grad

In [None]:
# Forward benchmark: 256 samples with 10000 features each, scaled into (0, 1).
inputs=np.random.randint(1,255, (256,10000))/255
dense=Dense(1024, input_shape= 10000)
# Time a single forward pass.
%time out_host=dense.forward(inputs)

## Train and Test

In [None]:
# Network: conv(16, relu) -> maxpool(2) -> conv(32, relu) -> flatten -> dense(128) -> dense(10, softmax).
# NOTE(review): only the first layer is given an input_shape; presumably CNNModel
# sets the shapes of the remaining layers - confirm.
modelIV=CNNModel([
    Convolution(n_filters=16, filter_size=3, stride=1,activation='relu',input_shape=(1,28,28)),
    MaxPool2D(pool_size=2),
    Convolution(n_filters=32, filter_size=3, stride=1,activation='relu'),
    Flatten(),
    Dense(128),
    Dense(10, activation='softmax')
])

In [None]:
%%time
# Train for 3 epochs with a batch size of 128 (the %%time magic must stay on the first line).
modelIV.fit(x_train,y_train, epochs=3, batch_size=128)

In [None]:

# Time inference over the test inputs.
%time y_predict =modelIV.predict(x_test)

# V. EVALUATE

Evaluation script
- To test the CNN model on both host and device, we use the MNIST dataset, which contains 60,000 training samples and 10,000 test samples.
- Training runs for 3 epochs with a batch size of 128.
- The model is built as follows:
    Convolution(n_filters=16, filter_size=3, stride=1,activation='relu',input_shape=(1,28,28)),
    MaxPool2D(pool_size=2),
    Convolution(n_filters=32, filter_size=3, stride=1, activation='relu'),
    Flatten(),
    Dense(128),
    Dense(10, activation='softmax')
- Training results show that the model's accuracy is relatively good (>95% after 3 epochs).

Result

Runtime comparison between the sequential and parallel implementations

|           |CPU V1 | CPU V2 |  GPU V1 | GPU V2  |
|-----------|:-----:|:-----: |:-------:|:-------:|
|Convolution|3m46   | 11.7 s |   974ms |  472ms  |         
|MaxPooling |19s    | 2.5s   |   551ms |  517ms  |         
|Dense      |292ms  | 292ms  |  289ms  |  289ms  |         
|Training   | inf   |  166ms |   2m2   |  1m36   |        
|Testing    |17m34  |1m32    | 1.46s   |  1.28s  |   


Comment
```
- The sequential version written in pure Python is very slow, especially in the backward step (sequential v1).
- The sequential version becomes faster when the computation is done with the NumPy library (sequential v2).
- The parallel version implemented with CUDA is much faster than both sequential versions (parallel v1).
- Making effective use of memory such as SMEM and RMEM further shortens memory access time (parallel v2).
- In terms of running time, Convolution is the most expensive of the three layers, followed by MaxPooling and finally Dense (whose time is almost negligible).
=> For CNN image recognition, using GPUs for parallel processing is essential: it makes training and research much faster and easier.
```