In [1]:
from datasets import load_dataset
import numpy as np

In [2]:
# Load the dataset registered under the name "mnist" through `datasets.load_dataset`.
dataset = load_dataset("mnist")

In [3]:
# Pick the "train" and "test" splits out of the loaded dataset by key.
train_dataset = dataset["train"]
test_dataset = dataset["test"]

In [4]:
# Sanity check: shape of the first training image once converted to a NumPy array.
np.array(train_dataset[0]['image']).shape

(28, 28)

In [41]:
# Implementing LeNet
# Initial Layer of convolution with feature map/ filter count of 6
# Kernel size 5, output size will be (28 - 5) + 1 = 24
# Let's build convolution Layer, so that I could generalize any architecture implementation
class Conv_Layer:
    """Square 2-D convolution layer operating on (batch, channel, height, width) arrays.

    Inputs are assumed to be square (height == width == ``input_size``).

    Parameters
    ----------
    input_channel : number of channels in the incoming feature map.
    input_size : height/width of the (unpadded) incoming feature map.
    stride : step between neighbouring filter positions.
    filter_shape : height/width of the square filter.
    output_chanel_or_filter_num : number of filters, i.e. output channels.
    batch_size : stored for reference only; the real batch size is read from
        the array handed to ``forward`` so a differently sized batch works.
    padding : number of zero rows/columns added on every side before convolving.

    Attributes set by ``forward``: ``inputs``, ``output``.
    Attributes set by ``backward``: ``dbiases``, ``dweights``, ``dinputs``.
    """

    def __init__(self, input_channel, input_size, stride, filter_shape, output_chanel_or_filter_num, batch_size,padding=0):
        self.input_channel = input_channel
        self.input_size = input_size
        self.stride = stride
        self.filter_shape = filter_shape
        self.output_chanel = output_chanel_or_filter_num
        self.batch_size = batch_size
        self.padding = padding
        self.initialize_filer_weights()
        self.initialize_filter_biases()

    def initialize_filer_weights(self):
        """Draw small random filters of shape (out_channel, in_channel, f, f)."""
        self.filter_weights = 0.01 * np.random.randn(
            self.output_chanel, self.input_channel, self.filter_shape, self.filter_shape
        )

    def initialize_filter_biases(self):
        """Start every filter with a zero bias, stored as a column (out_channel, 1)."""
        self.filter_biases = np.zeros((self.output_chanel, 1))

    def _output_size(self):
        """Return the height/width of the feature map produced by one filter."""
        padded_size = self.input_size + 2 * self.padding
        return (padded_size - self.filter_shape) // self.stride + 1

    def _pad_inputs(self, inputs):
        """Zero-pad the two spatial axes by ``self.padding`` (no-op when it is 0)."""
        if self.padding == 0:
            return inputs
        border = self.padding
        return np.pad(inputs, ((0, 0), (0, 0), (border, border), (border, border)), mode="constant")

    def forward(self, input):
        """Convolve ``input`` with every filter and store the result in ``self.output``.

        ``input`` has shape (batch, input_channel, input_size, input_size); the
        output has shape (batch, output_chanel, out_size, out_size).
        """
        self.inputs = input
        padded = self._pad_inputs(np.asarray(input))
        out_size = self._output_size()
        size = self.filter_shape

        self.output = np.zeros((padded.shape[0], self.output_chanel, out_size, out_size))
        # Row vector so the bias broadcasts over the batch axis of a (batch, out_channel) slab.
        biases = np.reshape(self.filter_biases, (1, -1))

        # Iterate over OUTPUT positions; the matching input window starts at
        # position * stride. Indexing the output with the input coordinate is
        # only correct for stride 1.
        for out_row in range(out_size):
            top = out_row * self.stride
            for out_col in range(out_size):
                left = out_col * self.stride
                patch = padded[:, :, top:top + size, left:left + size]
                # Contract (in_channel, f, f) against all filters at once -> (batch, out_channel).
                self.output[:, :, out_row, out_col] = (
                    np.tensordot(patch, self.filter_weights, axes=([1, 2, 3], [1, 2, 3])) + biases
                )

    def backward(self,dvalues):
        """Back-propagate ``dvalues`` (same shape as ``self.output``).

        Sets ``dbiases`` (out_channel,), ``dweights`` (same shape as
        ``filter_weights``) and ``dinputs`` (same shape as the forward input).
        """
        self.dbiases = np.sum(dvalues, axis=(0, 2, 3))
        self.dweights = np.zeros_like(self.filter_weights)

        padded = self._pad_inputs(np.asarray(self.inputs))
        padded_dinputs = np.zeros(padded.shape, dtype=np.float64)
        size = self.filter_shape

        for out_row in range(dvalues.shape[2]):
            top = out_row * self.stride
            for out_col in range(dvalues.shape[3]):
                left = out_col * self.stride
                upstream = dvalues[:, :, out_row, out_col]  # (batch, out_channel)
                patch = padded[:, :, top:top + size, left:left + size]

                # Each output value touched the whole window through its filter.
                padded_dinputs[:, :, top:top + size, left:left + size] += np.tensordot(
                    upstream, self.filter_weights, axes=([1], [0])
                )
                # Each filter weight saw the window of every sample in the batch.
                self.dweights += np.tensordot(upstream, patch, axes=([0], [0]))

        if self.padding == 0:
            self.dinputs = padded_dinputs
        else:
            # Gradients that landed on the zero border do not belong to any real input.
            border = self.padding
            self.dinputs = padded_dinputs[:, :, border:-border, border:-border]

                     
                
        

In [None]:
# There is no learning parameters in Pooling
class Pooling:
    """Common configuration holder for the pooling layers.

    Stores the window size, channel count, input size and batch size. A
    ``stride`` of 0 is the sentinel for "use the window size as the stride".
    ``pooling`` and ``backward`` are placeholders for subclasses to override.
    """

    def __init__(self,pooling_frame_size, input_channel, input_size,batch_size,stride=0):
        self.pooling_frame_size = pooling_frame_size
        self.input_channel = input_channel
        self.input_size = input_size
        self.batch_size = batch_size
        # 0 means "not given": fall back to non-overlapping windows.
        self.stride = self.pooling_frame_size if stride == 0 else stride

    def pooling(self,inputs):
        """Forward pass placeholder; does nothing in the base class."""
        return None

    def backward(self,dvalues):
        """Backward pass placeholder; does nothing in the base class."""
        return None

class Average_Pooling(Pooling):
    """Average pooling over square windows of (batch, channel, height, width) input.

    The batch size is taken from the array passed to ``pooling`` rather than
    from the constructor argument, so a batch of a different size still works.
    Only the first ``input_channel`` channels are pooled.
    """

    def __init__(self,pooling_frame_size, input_channel, input_size,batch_size,stride=0):
        super().__init__(pooling_frame_size, input_channel, input_size,batch_size,stride)

    def pooling(self,inputs):
        """Store the per-window mean of ``inputs`` in ``self.output``.

        ``self.output`` has shape (batch, input_channel, out_size, out_size)
        with ``out_size = (input_size - pooling_frame_size) // stride + 1``.
        """
        self.inputs = inputs
        data = np.asarray(inputs)
        frame = self.pooling_frame_size
        stride = self.stride
        channels = self.input_channel
        out_size = (self.input_size - frame) // stride + 1

        self.output = np.zeros((data.shape[0], channels, out_size, out_size))
        for out_row in range(out_size):
            top = out_row * stride
            for out_col in range(out_size):
                left = out_col * stride
                window = data[:, :channels, top:top + frame, left:left + frame]
                # Average only over the two spatial axes, keeping batch and channel.
                self.output[:, :, out_row, out_col] = window.mean(axis=(2, 3))

    def backward(self,dvalues):
        """Spread each upstream gradient evenly over the window it was averaged from.

        ``dvalues`` has the shape of ``self.output``; the result is stored in
        ``self.dinputs`` with the shape of the forward input.
        """
        frame = self.pooling_frame_size
        stride = self.stride
        channels = self.input_channel

        # Must start from zero: the gradient is accumulated with +=, so any
        # other starting value would be added to every input gradient.
        self.dinputs = np.zeros(np.shape(self.inputs), dtype=np.float64)
        share = 1.0 / (frame * frame)  # every element contributes equally to the mean

        for out_row in range(dvalues.shape[2]):
            # Windows are placed stride apart, exactly as in the forward pass.
            top = out_row * stride
            for out_col in range(dvalues.shape[3]):
                left = out_col * stride
                spread = share * dvalues[:, :channels, out_row, out_col]
                self.dinputs[:, :channels, top:top + frame, left:left + frame] += spread[:, :, None, None]


In [7]:
class Max_Pooling(Pooling):
    """Max pooling over square windows of (batch, channel, height, width) input.

    The batch size is read from the array passed to ``pooling``. Only the
    first ``input_channel`` channels are pooled.
    """

    def __init__(self,pooling_frame_size, input_channel, input_size, stride=0):
        # The base class takes batch_size before stride; pass both by keyword so
        # the stride is not silently stored as the batch size.
        super().__init__(pooling_frame_size, input_channel, input_size, batch_size=None, stride=stride)

    def pooling(self,inputs):
        """Store the per-window maximum of ``inputs`` in ``self.output``.

        Also records, for the backward pass:
        ``self.argmax_index`` - flat position of the winner inside each window;
        ``self.dinputs_max_filter`` - mask over the input with 1 at every
        position that was selected as a maximum by at least one window.
        """
        self.inputs = inputs
        data = np.asarray(inputs)
        frame = self.pooling_frame_size
        stride = self.stride
        channels = self.input_channel
        batch = data.shape[0]
        out_size = (self.input_size - frame) // stride + 1

        self.output = np.zeros((batch, channels, out_size, out_size), dtype=np.float64)
        self.argmax_index = np.zeros(self.output.shape, dtype=np.intp)
        self.dinputs_max_filter = np.zeros(data.shape, dtype=np.float64)
        # Index grids pairing every (sample, channel) with its own winner below.
        batch_idx, channel_idx = np.indices((batch, channels))

        for out_row in range(out_size):
            top = out_row * stride
            for out_col in range(out_size):
                left = out_col * stride
                window = data[:, :channels, top:top + frame, left:left + frame]
                flat_window = window.reshape(batch, channels, frame * frame)
                winners = flat_window.argmax(axis=2)

                self.output[:, :, out_row, out_col] = flat_window.max(axis=2)
                self.argmax_index[:, :, out_row, out_col] = winners
                # Flat index -> (row, column) inside the window, then shift to input coordinates.
                self.dinputs_max_filter[
                    batch_idx, channel_idx, top + winners // frame, left + winners % frame
                ] = 1

    def backward(self,dvalues):
        """Route each upstream gradient to the input element that won its window.

        ``dvalues`` has the shape of ``self.output``; the result is stored in
        ``self.dinputs`` with the shape of the forward input. Gradients are
        accumulated, so an element that won several overlapping windows
        receives the sum of their gradients.
        """
        frame = self.pooling_frame_size
        stride = self.stride
        batch, channels, out_rows, out_cols = self.argmax_index.shape

        self.dinputs = np.zeros(np.shape(self.inputs), dtype=np.float64)
        batch_idx, channel_idx = np.indices((batch, channels))

        for out_row in range(out_rows):
            top = out_row * stride
            for out_col in range(out_cols):
                left = out_col * stride
                winners = self.argmax_index[:, :, out_row, out_col]
                # Within one window every (sample, channel) pair is unique, so a
                # plain fancy-indexed += cannot drop duplicate updates.
                self.dinputs[
                    batch_idx, channel_idx, top + winners // frame, left + winners % frame
                ] += dvalues[:, :channels, out_row, out_col]

In [8]:
class Layer:
    """Fully connected layer computing ``inputs @ weights + biases``."""

    def __init__(self, n_inputs, n_neurons):
        # Weights are stored as (n_inputs, n_neurons) so a (batch, n_inputs)
        # batch can be multiplied directly, with no transpose in the forward pass.
        weight_shape = (n_inputs, n_neurons)
        self.weights = np.random.randn(*weight_shape) * 0.01

        # One bias per neuron, all starting at zero.
        self.biases = np.zeros((1, n_neurons))

    def feed_forward(self, inputs):
        """Compute the layer output and remember ``inputs`` for the backward pass."""
        self.inputs = inputs
        projected = inputs @ self.weights
        self.output = projected + self.biases

    def backward(self,dvalues):
        """Set ``dweights``, ``dbiases`` and ``dinputs`` from the upstream gradient."""
        transposed_inputs = self.inputs.T
        self.dweights = transposed_inputs @ dvalues
        self.dbiases = dvalues.sum(axis=0)
        self.dinputs = dvalues @ self.weights.T

In [9]:
class Activation_Relu:
    """Rectified linear unit: passes positive values through, zeroes the rest."""

    def feed_forward(self,inputs):
        """Store ``inputs`` for the backward pass and set ``output = max(0, inputs)``."""
        self.inputs = inputs
        self.output = np.maximum(0, inputs)

    def backward(self, dvalues):
        """Copy the upstream gradient and zero it wherever the input was <= 0."""
        gradient = dvalues.copy()  # never modify the caller's array
        switched_off = self.inputs <= 0
        gradient[switched_off] = 0
        self.dinputs = gradient
        

In [10]:
class Softmax:
    """Row-wise softmax activation for (batch, classes) scores."""

    def feed_forward(self,inputs):
        """Turn each row of ``inputs`` into probabilities stored in ``self.output``."""
        # Subtracting the row maximum keeps the exponentials from overflowing.
        shifted = inputs - np.max(inputs, axis=1, keepdims=True)
        exponentials = np.exp(shifted)
        self.output = exponentials / exponentials.sum(axis=1, keepdims=True)

    def backward(self,dvalues):
        """Multiply each sample's upstream gradient by that sample's softmax Jacobian."""
        self.dinputs = np.empty_like(dvalues)

        for row, (probabilities, upstream) in enumerate(zip(self.output, dvalues)):
            # Jacobian of softmax for one sample: diag(p) - p p^T
            jacobian = np.diagflat(probabilities) - np.outer(probabilities, probabilities)
            self.dinputs[row] = jacobian @ upstream
        

In [11]:
class Loss:
    """Base class for losses; subclasses supply ``feed_forward``."""

    def calculate(self,output_val,true_class):
        """Return the mean of the per-sample losses for one batch.

        ``output_val`` holds the predictions and ``true_class`` the targets;
        both are passed straight to ``self.feed_forward``.
        """
        per_sample_losses = self.feed_forward(output_val,true_class)
        return np.mean(per_sample_losses)
        

class Loss_CrossCategorical(Loss):
    """Categorical cross-entropy for integer class labels such as [0, 1, 0, 2]."""

    def feed_forward(self,output_val,true_class):
        """Return the negative log-likelihood of the true class for every sample.

        ``output_val`` is indexed as (sample, class); ``true_class`` holds one
        class index per sample.
        """
        sample_count = len(output_val)

        # Keep predictions inside [1e-7, 1 - 1e-7] so the log never sees 0.
        safe_predictions = np.clip(output_val, 1e-7, 1- 1e-7)

        # For each sample pick the probability assigned to its true class.
        confidence_in_truth = safe_predictions[np.arange(sample_count), true_class]

        return -np.log(confidence_in_truth)

    def backward(self, dvalues, true_class):
        """Store the gradient of the loss w.r.t. the predictions in ``self.dinputs``.

        Here ``dvalues`` is the prediction of the final layer (the softmax output).
        """
        sample_count = len(dvalues)
        class_count = len(dvalues[0])

        # Clip first so the division below never divides by (almost) zero.
        safe_predictions = np.clip(dvalues, 1e-7, 1- 1e-7)

        # One-hot rows: shape (sample_count, class_count), matching the predictions.
        one_hot_truth = np.eye(class_count)[true_class]

        raw_gradient = - one_hot_truth / safe_predictions

        # Divide by the batch size so the gradient matches the averaged loss.
        self.dinputs = raw_gradient / sample_count


# The combined softmax + categorical cross-entropy backward pass (one simplified partial derivative) is left out on purpose.
# I will add it later.

In [12]:
class Accuracy:
    """Fraction of samples whose highest-scoring class equals the true class."""

    def calculate(self,predictiction_prop, true_class):
        """Return the mean of ``argmax(prediction row) == true_class`` over the batch."""
        # The predicted class of each row is the index of its largest entry.
        hits = np.argmax(predictiction_prop, axis=1) == true_class
        return np.mean(hits)

In [13]:
class Optimizer_SGD:
    """Plain stochastic gradient descent: ``param -= learning_rate * gradient``."""

    def __init__(self,lr = 1.0):
        self.learning_rate = lr

    def update_params(self,layer):
        """Update a dense layer in place from its ``dweights`` and ``dbiases``."""
        layer.weights -= self.learning_rate * layer.dweights
        layer.biases -= self.learning_rate * layer.dbiases

    def update_params_CNN_layer(self,layer):
        """Update a convolution layer in place from its ``dweights`` and ``dbiases``.

        The step must use the gradients; subtracting the parameters themselves
        only shrinks the filters towards zero and the layer never learns.
        """
        layer.filter_weights -= self.learning_rate * layer.dweights
        # The bias gradient may be flat (out_channel,) while the biases are a
        # column (out_channel, 1); align the shapes so the in-place update is valid.
        bias_gradient = np.reshape(layer.dbiases, np.shape(layer.filter_biases))
        layer.filter_biases -= self.learning_rate * bias_gradient

In [62]:

# Implementation of architecture almost same as LeNet

batch_size = 128

# Stage 1: 1x28x28 image -> 6 feature maps of 24x24 -> average pooled with a 2x2 window
conv_layer1 = Conv_Layer(
    input_channel=1,
    input_size=28,
    stride=1,
    filter_shape=5,
    output_chanel_or_filter_num=6,
    batch_size=batch_size,
)
activation_1 = Activation_Relu()
average_pooling_layer_1 = Average_Pooling(
    pooling_frame_size=2,
    input_channel=6,
    input_size=24,
    batch_size=batch_size,
)

# Stage 2: 6 maps of 12x12 -> 16 maps -> average pooled with a 2x2 window
conv_layer2 = Conv_Layer(
    input_channel=6,
    input_size=12,
    stride=1,
    filter_shape=5,
    output_chanel_or_filter_num=16,
    batch_size=batch_size,
)
activation_2 = Activation_Relu()
average_pooling_layer_2 = Average_Pooling(
    pooling_frame_size=2,
    input_channel=16,
    input_size=8,
    batch_size=batch_size,
)

# Stage 3: a 4x4 filter over a 4x4 map leaves one value per filter (120 in total)
conv_layer3 = Conv_Layer(
    input_channel=16,
    input_size=4,
    stride=1,
    filter_shape=4,
    output_chanel_or_filter_num=120,
    batch_size=batch_size,
)
activation_3 = Activation_Relu()

# Classifier head: 120 -> 84 -> 10
fully_conn_layer1 = Layer(n_inputs=120, n_neurons=84)
activation_4 = Activation_Relu()
fully_conn_layer2 = Layer(n_inputs=84, n_neurons=10)
softmax_activation = Softmax()

loss_function = Loss_CrossCategorical()
accuracy_cls = Accuracy()




loss:: 2.302567035713741
accuracy:: 0.109375
loss:: 2.3025618126616614
accuracy:: 0.0859375


KeyboardInterrupt: 

In [None]:
# Plain SGD keeps no state between steps, so one optimizer serves the whole run.
optimizer_general = Optimizer_SGD(lr=0.01)

# The reshape below needs exactly batch_size samples, so the range stops before
# a trailing partial batch instead of crashing on it.
for data_range in range(0, len(train_dataset) - batch_size + 1, batch_size):

    # Slice the dataset once and reuse it for both labels and images.
    batch = train_dataset[data_range: data_range + batch_size]
    batch_labels = np.array(batch['label'], dtype='int')
    batch_images = np.array(batch['image'], dtype='float64').reshape(batch_size, 1, 28, 28)

    # ---- Forward pass: (conv -> relu -> pool) x 2 -> conv -> relu -> dense -> relu -> dense -> softmax
    conv_layer1.forward(batch_images)  # Here we enter the input image
    activation_1.feed_forward(conv_layer1.output)
    # Pool the ACTIVATED maps; the backward pass below goes through activation_1,
    # so the forward pass has to go through it as well.
    average_pooling_layer_1.pooling(activation_1.output)

    conv_layer2.forward(average_pooling_layer_1.output)
    activation_2.feed_forward(conv_layer2.output)
    average_pooling_layer_2.pooling(activation_2.output)

    conv_layer3.forward(average_pooling_layer_2.output)
    activation_3.feed_forward(conv_layer3.output)

    # Flatten the activated output (not the raw convolution) for the dense layers,
    # matching the backward pass which routes the gradient through activation_3.
    flatten_output = activation_3.output.reshape(batch_size,-1)

    fully_conn_layer1.feed_forward(flatten_output)
    activation_4.feed_forward(fully_conn_layer1.output)
    fully_conn_layer2.feed_forward(activation_4.output)
    softmax_activation.feed_forward(fully_conn_layer2.output)

    # Report progress every 11th batch.
    if(data_range % (batch_size * 11) == 0):
        print(f'loss:: {loss_function.calculate(softmax_activation.output,batch_labels)}')
        print(f'accuracy:: {accuracy_cls.calculate(softmax_activation.output,batch_labels)}')

    # ---- Backward pass: same layers in reverse order
    loss_function.backward(softmax_activation.output,batch_labels)
    softmax_activation.backward(loss_function.dinputs)

    fully_conn_layer2.backward(softmax_activation.dinputs)
    activation_4.backward(fully_conn_layer2.dinputs)
    fully_conn_layer1.backward(activation_4.dinputs)

    # Undo the flatten so the gradient has the shape of the conv feature maps again.
    activation_3.backward(fully_conn_layer1.dinputs.reshape(activation_3.output.shape))
    conv_layer3.backward(activation_3.dinputs)

    average_pooling_layer_2.backward(conv_layer3.dinputs)
    activation_2.backward(average_pooling_layer_2.dinputs)
    conv_layer2.backward(activation_2.dinputs)

    average_pooling_layer_1.backward(conv_layer2.dinputs)
    activation_1.backward(average_pooling_layer_1.dinputs)
    conv_layer1.backward(activation_1.dinputs)

    # ---- Parameter update
    optimizer_general.update_params(fully_conn_layer1)
    optimizer_general.update_params(fully_conn_layer2)

    optimizer_general.update_params_CNN_layer(conv_layer1)
    optimizer_general.update_params_CNN_layer(conv_layer2)
    optimizer_general.update_params_CNN_layer(conv_layer3)