In [None]:
import numpy as np

In [None]:
# Synthetic mini-batch in (images, channels, height, width) layout:
# 10 single-channel (grayscale) images of 28 x 28 standard-normal pixels.
# NOTE: the name `input` shadows Python's builtin input() from here on.
input = np.random.standard_normal((10, 1, 28, 28))

In [None]:
# One-hot encode the demo labels.

label = np.array([1, 3, 1, 2, 2, 3, 1, 1, 3 , 1])
# Row k of the identity matrix is the one-hot vector of class k, so indexing
# the identity with the labels yields one encoded row per sample.
one_hot = np.eye(label.max() + 1)[label]
one_hot_transpose = one_hot.T   # (classes, samples) layout, shown below
one_hot_transpose

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [1., 0., 1., 0., 0., 0., 1., 1., 0., 1.],
       [0., 0., 0., 1., 1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 1., 0., 0., 1., 0.]])

# Let's make the convolutional part

In [None]:
# Unpack the batch layout: number of images, channels, height and width.
m, n_C, n_H_prev, n_W_prev = input.shape
print(*input.shape)

10 1 28 28


The parameters

In [None]:
# Hyper-parameters of the convolution layer.
filter_amount, channel_amount = 6, 1   # filters to learn, channels of the input
filter_size = 3                        # each filter is filter_size x filter_size
stride, padding = 1, 0                 # window step; padding term of the output-size formula

The output dimension

In [None]:
# Output layout of the convolution: one channel per filter, and each spatial
# extent follows (size + 2 * padding - filter_size) / stride + 1.
new_number_channel = filter_amount
new_height, new_weight = (
  int((extent + 2 * padding - filter_size) / stride) + 1
  for extent in (n_H_prev, n_W_prev)
)

# Pre-allocate the result; `new_weight` holds the output width.
out_conv = np.zeros((m, new_number_channel, new_height, new_weight))

In [None]:
# Sanity check: the pre-allocated convolution output is (images, filters, height, width).
print(out_conv.shape)

(10, 6, 26, 26)


forward convolution

In [None]:
import math

# Uniform initialisation in [-bound, bound] with bound = 1 / sqrt(filter_size^2).
conv_weight_shape = (filter_amount, channel_amount, filter_size, filter_size)
bound = 1/math.sqrt(filter_size ** 2)
W = np.random.uniform(low=-bound, high=bound, size=conv_weight_shape)
W_output = np.zeros(conv_weight_shape)

# One bias per filter, drawn from the same range.
bias = np.random.uniform(low=-bound, high=bound, size=filter_amount)
bias_output = np.zeros(filter_amount)

In [None]:
# Slide every filter over every image and write one output pixel per window position.
for img, flt, row, col in np.ndindex(m, new_number_channel, new_height, new_weight):
  top, left = row * stride, col * stride
  # Window across all input channels, matching the shape of one filter.
  patch = input[img, :, top:top + filter_size, left:left + filter_size]
  out_conv[img, flt, row, col] = (patch * W[flt]).sum() + bias[flt]


In [None]:
# The forward convolution only reads `input`, so its shape is unchanged.
print(input.shape)

(10, 1, 28, 28)


In [None]:
# Shape of the filled convolution output.
out_conv.shape

(10, 6, 26, 26)

# THE MAIN GOAL: get the correct dimensions

Activation for convolutional

In [None]:
# Apply the activation. This overwrites out_conv with tanh(out_conv), so
# re-running this cell alone would apply tanh a second time.
out_conv = np.tanh(out_conv)

forward average pooling

In [None]:
# Average pooling reuses the convolution's window size and stride.
m, n_C, n_H_prev, n_W_prev = out_conv.shape

new_number_channel = filter_amount
new_height, new_weight = (
  int((extent + 2 * padding - filter_size) / stride) + 1
  for extent in (n_H_prev, n_W_prev)
)

out_pool = np.zeros((m, new_number_channel, new_height, new_weight))

# Each output pixel is the mean of one filter_size x filter_size window of a single channel.
for img, ch, row, col in np.ndindex(m, new_number_channel, new_height, new_weight):
  top, left = row * stride, col * stride
  window = out_conv[img, ch, top:top + filter_size, left:left + filter_size]
  out_pool[img, ch, row, col] = window.mean()

In [None]:
# tanh is element-wise and pooling only reads out_conv, so its shape is unchanged.
out_conv.shape

(10, 6, 26, 26)

In [None]:
# With stride 1, each spatial dimension shrinks by filter_size - 1.
print(out_pool.shape)

(10, 6, 24, 24)


Forward Fully Connected layer

In [None]:
# Flatten the pooled feature maps into one feature vector per image and feed
# them to a single fully connected layer.
i, c, pooled_h, pooled_w = out_pool.shape   # (images, channels, height, width)
width = c * pooled_h * pooled_w             # features per image after flattening (6 * 24 * 24 for the demo batch)
height = 4                                  # number of output neurons

# Uniform initialisation in [-1/sqrt(fan_in), 1/sqrt(fan_in)].
bound = 1/np.sqrt(width)
W_fc = np.random.uniform(low=-bound, high=bound, size=(height, width))
b_fc = np.random.uniform(low=-bound, high=bound, size=(1, height))

# Take the batch size from the data instead of hard-coding 10, so the cell
# keeps working when the number of images changes.
out_pool_flat = out_pool.reshape(i, -1)
A_fc = np.dot(out_pool_flat, W_fc.T) + b_fc   # (images, output neurons)

In [None]:
# One row of scores per image, one column per output neuron.
A_fc.shape

(10, 4)

probability distribution

In [None]:
# Softmax must normalise each image's scores separately (axis=1); summing over
# the whole array would make the entire batch share one distribution.
# Subtracting the row maximum leaves the result unchanged and avoids overflow in exp.
exp_scores = np.exp(A_fc - A_fc.max(axis=1, keepdims=True))
softmax = exp_scores / exp_scores.sum(axis=1, keepdims=True)

In [None]:
# Same layout as A_fc: one row per image, one column per output neuron.
softmax.shape

(10, 4)

calculate the loss / error

In [None]:
# Rows of `softmax` are images and columns are classes, so the batch size is
# shape[0]; shape[1] would divide the loss by the number of classes instead.
batch_size = softmax.shape[0]
deltaL = softmax - one_hot        # gradient of the cross-entropy loss w.r.t. the scores (for a row-wise softmax)
# Mean negative log-likelihood; the small constant guards against log(0).
loss = -np.sum(one_hot * np.log(softmax + 1e-8)) / batch_size

In [None]:
# Display the scalar loss computed above.
loss

9.326551912935091

# Backprop CNN

Fully connected (FC) layer backprop

In [None]:

# Gradients of the fully connected layer, averaged over the m images.
inv_m = 1/m
g_W_fcc = inv_m * (deltaL.T @ out_pool_flat)
g_b_fcc = inv_m * deltaL.sum(axis=0)

# Error signal handed back to the flattened pooling output.
new_deltaL = deltaL @ W_fc

In [None]:
# One gradient row per image, one column per flattened pooling feature.
new_deltaL.shape

(10, 3456)

average_pool backprop

In [None]:
# Shape the pooling backprop has to produce (the pooling layer's input).
out_conv.shape

(10, 6, 26, 26)

In [None]:
# Shape of the pooling layer's output, which the incoming gradient is reshaped to.
out_pool.shape

(10, 6, 24, 24)

In [None]:
# Un-flatten the incoming gradient so it lines up with the pooling layer's output.
dout = new_deltaL.reshape(out_pool.shape)

In [None]:
# Dimensions of the un-flattened gradient: images, channels, pooled height, pooled width.
m, n_C, n_H, n_W = dout.shape

In [None]:
# Back-propagate through average pooling: every cell of a pooling window
# contributed equally to its output, so each receives an equal share of that
# output's gradient. Overlapping windows accumulate.
dX = np.zeros(out_conv.shape)
window_area = filter_size * filter_size

for img, ch, row, col in np.ndindex(m, n_C, n_H, n_W):
  top, left = row * stride, col * stride
  share = dout[img, ch, row, col] / window_area
  dX[img, ch, top:top + filter_size, left:left + filter_size] += share

In [None]:
# dX was allocated with out_conv's shape, i.e. the input of the pooling layer.
dX.shape

(10, 6, 26, 26)

In [None]:
# dout was reshaped to out_pool's shape, i.e. the output of the pooling layer.
dout.shape

(10, 6, 24, 24)

TanH derivative

In [None]:
# out_conv was overwritten with tanh(z) in the activation cell, so the
# derivative tanh'(z) = 1 - tanh(z)^2 is 1 - out_conv^2. Calling np.tanh
# again here would evaluate tanh(tanh(z)) instead.
back_tanh = 1 - out_conv ** 2

Convolutional backprop

In [None]:
m, n_C, n_H, n_W = input.shape   # (10, 1, 28, 28)

# The gradient entering this layer must have the convolution's output shape
# (10, 6, 26, 26). That is the pooling-backprop result currently held in dX,
# not dout, which has the pooled shape (10, 6, 24, 24).
if dX.shape != out_conv.shape:
  raise ValueError("dX must hold the average-pool backprop result; re-run that cell first")

# Chain through the activation: out_conv already stores tanh(z), so
# tanh'(z) = 1 - out_conv**2.
dout_conv = dX * (1 - out_conv ** 2)
m, n_C_dout, n_H_dout, n_W_dout = dout_conv.shape

dX = np.zeros(input.shape)   # from here on: gradient w.r.t. the layer input
W_grad = np.zeros((filter_amount, channel_amount, filter_size, filter_size))

# Compute dW and dX: each output pixel sends its gradient back to the input
# window and the filter that produced it.
for image in range(m):
  for channel in range(n_C_dout):
    for height in range(n_H_dout):
      height_start = height * stride
      height_end = height_start + filter_size
      for width in range(n_W_dout):
        width_start = width * stride
        width_end = width_start + filter_size

        upstream = dout_conv[image, channel, height, width]
        W_grad[channel, ...] += upstream * input[image, :, height_start:height_end, width_start:width_end]

        # gradient for dX
        dX[image, :, height_start:height_end, width_start:width_end] += upstream * W[channel, ...]

# Compute db: the bias is added to every output pixel of its filter, so its
# gradient is the sum over images and spatial positions.
b_grad = dout_conv.sum(axis=(0, 2, 3))


In [None]:
# One bias gradient per filter.
b_grad.shape

(6,)

In [None]:
# Same shape as the filter bank W: (filters, channels, filter_size, filter_size).
W_grad.shape

(6, 1, 3, 3)

In [None]:
# dX was re-allocated with input.shape in the convolution backprop cell.
dX.shape

(10, 1, 28, 28)

dX == original input dimension shape

Let's put everything together

# MNIST implementation of a CNN from scratch

Why implement a CNN on the MNIST dataset:



1.   To understand the intuition behind CNN(Convolutional Neural Network)
2.   There's going to be a big problem at the end of this particular program (the MNIST implementation of the CNN)



In [None]:
# Import some dependencies

import numpy as np
import math
import pandas as pd

data = pd.read_csv("/content/drive/MyDrive/mnist_train.csv")
data = np.array(data)
batch_size = 32
num_classes = 10   # MNIST digits 0-9
img, n = data.shape
m = 1200
np.random.shuffle(data)

# img = number of rows (images) in the CSV
# n   = number of columns per row: 1 label column followed by the pixel columns
# m   = number of shuffled rows actually used (rows 0-999 dev, rows 1000 to m-1 train)

# After the transpose, row 0 holds the labels and rows 1..n-1 hold the pixels.
data_dev = data[0:1000].T
Y_dev = data_dev[0]
X_dev = data_dev[1:n]   # slice the transposed dev split, not the full `data` array
X_dev = X_dev / 255  # normalize the data (presumably 0-255 grey levels)

data_train = data[1000:m].T
Y_train = data_train[0]
X_train = data_train[1:n]
X_train = X_train / 255


_, m_train = X_train.shape


# One hot encoding for the label
Y_train = Y_train[:batch_size]   # size of a batch
# Use a fixed width of num_classes: sizing by Y_train.max() + 1 would yield
# fewer than 10 columns whenever the batch happens to contain no 9.
one_hot_Y = np.zeros((Y_train.size, num_classes))
one_hot_Y[np.arange(Y_train.size), Y_train] = 1
one_hot = one_hot_Y.T            # (classes, batch)

# Back to one image per row, then to (images, channel, height, width).
X_train_reshape = X_train.T.reshape(-1, 1, 28, 28)

# split the reshape data into batches of size 32
batches = [X_train_reshape[i:i+batch_size] for i in range(0, X_train_reshape.shape[0], batch_size)]

# 32, 1, 28, 28 --> batch size(amount of data inside 1 batch), dimension of the image, w, h
print(batches[0].shape)

(32, 1, 28, 28)


# Simple CNN

In [None]:
# ---------------------------------------------------------------------------
# Hyper-parameters
# ---------------------------------------------------------------------------
filter_amount = 6        # number of convolution filters
channel_amount = 1       # channels of the input images
filter_size = 3          # side of the conv kernel, reused as the pooling window
stride = 1
padding = 0
epoch = 10
learning_rate = 0.01
hidden_neurons = 32      # width of the first fully connected layer
output_neurons = 10      # one score per class

if padding != 0:
  # The arrays are never actually padded below, so a non-zero value would only
  # desynchronise the size formulas from the data.
  raise NotImplementedError("padding other than 0 is not supported")

input = batches[0]       # training uses the first mini-batch only
m = input.shape[0]       # number of images in the batch

# Targets as (images, classes). `one_hot` is stored as (classes, images) and
# may have fewer than output_neurons rows, so copy it into a zero matrix of
# the full width (missing classes simply never occur in the batch).
targets = np.zeros((m, output_neurons))
targets[:, :one_hot.shape[0]] = one_hot.T


# ---------------------------------------------------------------------------
# Layer helpers (no padding; windows are vectorised over the batch)
# ---------------------------------------------------------------------------
def conv_forward(x, W, bias, stride):
  """Cross-correlate a batch with a filter bank.

  x:    (images, channels, height, width)
  W:    (filters, channels, f, f)
  bias: (filters,)
  Returns the pre-activation output (images, filters, out_height, out_width).
  """
  n_images, _, in_h, in_w = x.shape
  n_filters, _, f, _ = W.shape
  rows = range(0, in_h - f + 1, stride)   # top edge of every window
  cols = range(0, in_w - f + 1, stride)   # left edge of every window
  out = np.zeros((n_images, n_filters, len(rows), len(cols)))
  for r, top in enumerate(rows):
    for c, left in enumerate(cols):
      patch = x[:, :, top:top + f, left:left + f]                 # (images, channels, f, f)
      # Sum of patch * filter over channels and window, for every image/filter pair.
      out[:, :, r, c] = np.einsum('nchw,fchw->nf', patch, W) + bias
  return out


def conv_backward(d_out, x, W, stride):
  """Gradients of the convolution parameters, summed over the batch.

  d_out: gradient w.r.t. the convolution's pre-activation output.
  Returns (W_grad, b_grad) shaped like W and bias. The gradient w.r.t. x is
  not computed because the convolution is the first layer.
  """
  f = W.shape[2]
  W_grad = np.zeros(W.shape)
  _, _, out_h, out_w = d_out.shape
  for r in range(out_h):
    top = r * stride
    for c in range(out_w):
      left = c * stride
      patch = x[:, :, top:top + f, left:left + f]                 # (images, channels, f, f)
      # Each output pixel contributes its gradient times the window it saw.
      W_grad += np.einsum('nf,nchw->fchw', d_out[:, :, r, c], patch)
  # The bias is added to every output pixel of its filter.
  b_grad = d_out.sum(axis=(0, 2, 3))
  return W_grad, b_grad


def avg_pool_forward(x, window, stride):
  """Average-pool every channel of x (images, channels, height, width)."""
  n_images, n_channels, in_h, in_w = x.shape
  rows = range(0, in_h - window + 1, stride)
  cols = range(0, in_w - window + 1, stride)
  out = np.zeros((n_images, n_channels, len(rows), len(cols)))
  for r, top in enumerate(rows):
    for c, left in enumerate(cols):
      out[:, :, r, c] = x[:, :, top:top + window, left:left + window].mean(axis=(2, 3))
  return out


def avg_pool_backward(d_out, x_shape, window, stride):
  """Spread each pooled gradient evenly over the window that produced it.

  d_out:   gradient w.r.t. the pooling output.
  x_shape: shape of the pooling input.
  Returns the gradient w.r.t. the pooling input; overlapping windows accumulate.
  """
  d_x = np.zeros(x_shape)
  share = d_out / (window * window)
  _, _, out_h, out_w = d_out.shape
  for r in range(out_h):
    top = r * stride
    for c in range(out_w):
      left = c * stride
      d_x[:, :, top:top + window, left:left + window] += share[:, :, r, c, None, None]
  return d_x


# ---------------------------------------------------------------------------
# Learnable parameters
# ---------------------------------------------------------------------------
# conv
bound = 1/math.sqrt(filter_size * filter_size)
W = np.random.uniform(-bound, bound, size=(filter_amount, channel_amount, filter_size, filter_size))
bias = np.random.uniform(-bound, bound, size=(filter_amount))

# Size of the flattened pooling output, derived from the batch itself rather
# than from an `out_pool` left over by an earlier cell.
def _window_count(size):
  return (size - filter_size) // stride + 1

_, _, in_height, in_width = input.shape
fc_input_size = (filter_amount
                 * _window_count(_window_count(in_height))
                 * _window_count(_window_count(in_width)))

# First fully connected layer (tanh)
bound_fc1 = 1/np.sqrt(fc_input_size)
W_fc1 = np.random.uniform(low=-bound_fc1, high=bound_fc1, size=(hidden_neurons, fc_input_size))
b_fc1 = np.random.uniform(low=-bound_fc1, high=bound_fc1, size=(1, hidden_neurons))

# Second fully connected layer (one neuron per class)
bound_fc2 = 1/np.sqrt(hidden_neurons)
W_fc2 = np.random.uniform(low=-bound_fc2, high=bound_fc2, size=(output_neurons, hidden_neurons))
b_fc2 = np.random.uniform(low=-bound_fc2, high=bound_fc2, size=(1, output_neurons))


# ---------------------------------------------------------------------------
# Training
# ---------------------------------------------------------------------------
for i in range(epoch):
  # ---- forward pass ----
  out_conv = conv_forward(input, W, bias, stride)
  out_conv_tanh = np.tanh(out_conv)
  # Pool the activated feature maps, so the tanh is actually part of the network.
  out_pool = avg_pool_forward(out_conv_tanh, filter_size, stride)

  out_pool_flat = out_pool.reshape(m, -1)
  A_fc1 = np.dot(out_pool_flat, W_fc1.T) + b_fc1
  Z_fc1 = np.tanh(A_fc1)

  A_fc2 = np.dot(Z_fc1, W_fc2.T) + b_fc2

  # Row-wise softmax; subtracting the row maximum avoids overflow in exp.
  exp_scores = np.exp(A_fc2 - A_fc2.max(axis=1, keepdims=True))
  softmax = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)

  # ---- loss ----
  batch_size = softmax.shape[0]
  deltaL = softmax - targets   # gradient of the cross-entropy w.r.t. the scores
  loss = -np.sum(targets * np.log(softmax + 1e-8)) / batch_size  # small constant avoids log(0)
  print(f"Loss: {loss}")

  # ---- backward pass: fully connected layers ----
  g_W_fc2 = 1/m * np.dot(deltaL.T, Z_fc1)
  g_b_fc2 = 1/m * np.sum(deltaL, axis=0)

  new_deltaL = np.dot(deltaL, W_fc2) * (1 - Z_fc1 ** 2)   # through the tanh of FC1

  g_W_fc1 = 1/m * np.dot(new_deltaL.T, out_pool_flat)
  g_b_fc1 = 1/m * np.sum(new_deltaL, axis=0)

  # Gradient w.r.t. the flattened pooling output: (images, hidden) @ (hidden, features).
  # Multiplying W_fc1.T from the left would contract the batch axis instead.
  delta_flat = np.dot(new_deltaL, W_fc1)
  delta_out = delta_flat.reshape(out_pool.shape)

  # ---- backward pass: pooling, activation, convolution ----
  d_conv_tanh = avg_pool_backward(delta_out, out_conv.shape, filter_size, stride)
  d_conv = d_conv_tanh * (1 - out_conv_tanh ** 2)         # tanh derivative

  # Gradients are recomputed from scratch every epoch and averaged over the
  # batch, matching the 1/m scaling of the fully connected gradients.
  W_grad, b_grad = conv_backward(d_conv, input, W, stride)
  W_grad /= m
  b_grad /= m

  # ---- update parameters ----
  W_fc2 -= learning_rate * g_W_fc2
  b_fc2 -= learning_rate * g_b_fc2
  W_fc1 -= learning_rate * g_W_fc1
  b_fc1 -= learning_rate * g_b_fc1
  W -= learning_rate * W_grad
  bias -= learning_rate * b_grad

Loss: 2.319146329829259
Loss: 2.278899902328964
Loss: 2.2405174853523837
Loss: 2.2032321097388277
Loss: 2.16631548171382
Loss: 2.129038791926226
Loss: 2.090632563337759
Loss: 2.0502368604852474
Loss: 2.0068583061320155
Loss: 1.9593736045685013


When we reduce the amount of data with convolution and pooling, we also add extra computation to our neural network.


The problem: how can we speed up training while still using the same technique (CNN)?

If the technique we add is itself extra computation, we are basically not reducing the training time.

the more calculation = more time

# PYTORCH