In [None]:
# Mount Google Drive at /content/drive so the dataset CSV can be read (Google Colab only).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# import dependencies
import numpy as np
import pandas as pd

In [None]:
# Load the MNIST training CSV from the mounted Drive into a DataFrame.
data = pd.read_csv("/content/drive/MyDrive/mnist_train.csv")

In [None]:
# Preview the first five rows (label column followed by the pixel columns).
data[:5]

Unnamed: 0,label,1x1,1x2,1x3,1x4,1x5,1x6,1x7,1x8,1x9,...,28x19,28x20,28x21,28x22,28x23,28x24,28x25,28x26,28x27,28x28
0,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,9,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Convert the DataFrame to a plain NumPy array: one row per image, one column per CSV column.
# NOTE: this rebinds `data`, so from here on `data` is an ndarray, not a DataFrame.
data = np.array(data)
data.shape

(60000, 785)

In [None]:
# m = number of rows (images), n = number of columns (label column + pixel columns)
m, n = data.shape

In [None]:
# Largest value anywhere in the array (used below as the pixel scaling factor).
data.max()

255

## Train data

In [None]:
# Split into a small dev set and a training set.
# Each split is transposed so that row 0 holds the labels and rows 1..n-1 hold
# the pixel values, i.e. one column per image.
data_dev = data[0:1000].T  # the first 1000 images
Y_dev = data_dev[0]  # labels
X_dev = data_dev[1:n]  # pixels of the dev split (slice data_dev, not the full dataset)
X_dev = X_dev / 255  # scale pixel values from 0-255 to 0-1

data_train = data[1000:m].T  # images 1000 .. m-1
Y_train = data_train[0]  # labels
X_train = data_train[1:n]  # pixels
X_train = X_train / 255  # same scaling as the dev split

In [None]:
# Sanity check: (pixels per image, number of training images)
X_train.shape

(784, 59000)

## One-hot Label data

In [None]:
# One-hot encode the training labels by picking rows of an identity matrix,
# then transpose so there is one row per class and one column per example.
one_hot_label = np.eye(Y_train.max() + 1)[Y_train].T

# Neural net

In [None]:
#Initialize the parameters

# 1st hidden layer  --> 10 neurons
W1 = np.random.randn(10, n - 1)  # shape (neurons, inputs); n - 1 is the number of pixel columns because n counts pixels + label
b1 = np.random.randn(10, 1)
# 2nd layer (output)  --> 10 neurons (because we have 10 label categories)
W2 = np.random.randn(10, 10) # shape (neurons, outputs of the previous layer)
b2 = np.random.randn(10, 1)

# np.random.randn() -- samples from a standard normal distribution (mean 0, variance 1); the values are NOT limited to a fixed range

In [None]:
# Activation function

def Sigmoid(Z):
  """Logistic sigmoid applied element-wise: 1 / (1 + e^(-Z))."""
  denominator = 1 + np.exp(-Z)
  return 1 / denominator

def tanh(Z):
  """Hyperbolic tangent applied element-wise (thin wrapper around np.tanh)."""
  return np.tanh(Z)

def ReLu(Z):
  """Rectified linear unit: element-wise maximum of 0 and Z."""
  return np.maximum(0, Z)

def Softmax(Z):
  """Softmax over axis 0, so every column becomes a probability distribution.

  The forward pass feeds this a matrix with one column of class scores per
  example, so the normalisation must be done per column; dividing by the sum
  of the whole matrix would make the entire matrix (not each column) sum to 1.

  The per-column maximum is subtracted before exponentiating. This does not
  change the result mathematically but keeps np.exp from overflowing on
  large scores.

  Z: array of scores with the classes along axis 0 (1-D or 2-D).
  Returns an array of the same shape whose entries along axis 0 sum to 1.
  """
  shifted = Z - np.max(Z, axis=0, keepdims=True)
  exp_scores = np.exp(shifted)
  return exp_scores / np.sum(exp_scores, axis=0, keepdims=True)

# Forward propagation

In [None]:
# forward pass

# Layer 1: affine transform of the inputs, then ReLu
Z1 = W1.dot(X_train) + b1
A1 = ReLu(Z1)

# Layer 2 (output): affine transform of the hidden activations, then Softmax
Z2 = W2.dot(A1) + b2
A2 = Softmax(Z2)  # predicted class probabilities

In [None]:
A2.shape # the prediction: (number of classes, number of training examples)

(10, 59000)

In [None]:
# y^ (y-hat) denotes the prediction; y denotes the true label

In [None]:
# Loss value: cross-entropy averaged over the training examples.
# A2 holds one column per training example, so the average divides by its
# column count (m is the row count of the full dataset, dev split included).
# The small constant keeps np.log away from log(0) if a probability underflows.
log_loss = -np.sum(one_hot_label * np.log(A2 + 1e-12)) / A2.shape[1]

## Backpropagation

In [None]:
dZ2 = A2 - one_hot_label # derivative of the loss with respect to Z2

# How much Z2 affects the loss: for a softmax output combined with the cross-entropy loss this is simply the difference between the prediction (A2, probability distribution) and the true label (one_hot_label, built from Y_train)

In [None]:
# Output-layer gradients, averaged over the training examples (columns of dZ2).
dW2 = 1 / dZ2.shape[1] * np.dot(dZ2, A1.T)  # how much W2 affects the loss
# One bias gradient per output neuron: sum over the examples only (axis=1) and
# keep the column shape so it matches b2.
db2 = 1 / dZ2.shape[1] * np.sum(dZ2, axis=1, keepdims=True)  # how much b2 affects the loss

In [None]:
# Derivative of the ReLu activation (A1) with respect to its input (Z1)

def deriv_Relu(Z):
  """Return a boolean mask that is True where Z > 0.

  Multiplying by the mask treats True as 1 and False as 0, which is the
  slope of ReLu on either side of zero.
  """
  is_positive = Z > 0
  return is_positive

In [None]:
# How much Z1 affects the loss: push dZ2 back through the output layer with the
# TRANSPOSE of W2 (Z2 = W2 . A1, so dL/dA1 = W2^T . dZ2), then gate it by the
# ReLu derivative.
dZ1 = np.dot(W2.T, dZ2) * deriv_Relu(Z1)

In [None]:
# Hidden-layer gradients, averaged over the training examples (columns of dZ1).
dW1 = 1 / dZ1.shape[1] * np.dot(dZ1, X_train.T)  # how much W1 affects the loss
# One bias gradient per hidden neuron: sum over the examples only (axis=1) and
# keep the column shape so it matches b1.
db1 = 1 / dZ1.shape[1] * np.sum(dZ1, axis=1, keepdims=True)  # how much b1 affects the loss

Now we know how much each of the learnable parameters (W1, b1, W2, b2) affects the loss value.

## Update the learnable parameters

In [None]:
learning_rate = 0.1  # step size: a larger value moves faster per update but can overshoot the minimum

W1 = W1 - learning_rate * dW1
b1 = b1 - learning_rate * db1
W2 = W2 - learning_rate * dW2
b2 = b2 - learning_rate * db2

# new parameters = previous parameters minus learning_rate times the gradient of the loss with respect to those parameters

# Demo of Gradient descent

In [None]:
# Start over from fresh random weights and biases (layer 1, then layer 2)
W1, b1 = np.random.randn(10, n - 1), np.random.randn(10, 1)
W2, b2 = np.random.randn(10, 10), np.random.randn(10, 1)

In [None]:
## Make it clean
# forward prop
Z1 = np.dot(W1, X_train) + b1
A1 = np.maximum(Z1, 0)  # ReLu
Z2 = np.dot(W2, A1) + b2

# probability distribution: column-wise softmax.
# Subtracting each column's maximum does not change the result mathematically
# but keeps np.exp from overflowing; np.sum(axis=0) normalises every column
# (one column per example) in a single vectorised call.
exp_Z2 = np.exp(Z2 - np.max(Z2, axis=0, keepdims=True))
A2 = exp_Z2 / np.sum(exp_Z2, axis=0, keepdims=True)

In [None]:
# Squared error between the one-hot labels and the predicted probabilities,
# averaged over the training examples (the columns of A2; m would also count
# the rows held out for the dev split).
# NOTE(review): dZ2 below is the gradient of softmax + cross-entropy, so this
# printed value is a progress indicator rather than the objective being minimised.
loss = np.sum(np.square(one_hot_label - A2)) / A2.shape[1]
dZ2 = A2 - one_hot_label
print(loss)

1.656755573496776


In [None]:
# Backprop. Gradients are averaged over the training examples (columns of dZ2).
n_train = dZ2.shape[1]
dW2 = 1 / n_train * np.dot(dZ2, A1.T)
db2 = 1 / n_train * np.sum(dZ2, axis=1, keepdims=True)  # one value per output neuron, same shape as b2
relu_deriv = Z1 > 0
dZ1 = np.dot(W2.T, dZ2) * relu_deriv  # back through layer 2 uses the transpose of W2
dW1 = 1 / n_train * np.dot(dZ1, X_train.T)
db1 = 1 / n_train * np.sum(dZ1, axis=1, keepdims=True)  # one value per hidden neuron, same shape as b1

In [None]:
# One gradient-descent step; 0.1 is the learning rate (step size).
W1 = W1 - 0.1 * dW1
b1 = b1 - 0.1 * db1
W2 = W2 - 0.1 * dW2
b2 = b2 - 0.1 * db2

# Neural Net

In [None]:
#let's put it all together: plain (full-batch) gradient descent

W1 = np.random.randn(10, n - 1)
b1 = np.random.randn(10, 1)
W2 = np.random.randn(10, 10)
b2 = np.random.randn(10, 1)

learning_rate = 0.1  # step size used by the parameter updates below
n_train = X_train.shape[1]  # number of training examples (columns of X_train)

iteration = 30
for i in range(iteration):
  # forward prop
  Z1 = np.dot(W1, X_train) + b1
  A1 = np.maximum(Z1, 0)
  Z2 = np.dot(W2, A1) + b2
  # column-wise softmax; the max shift only prevents overflow in np.exp
  exp_Z2 = np.exp(Z2 - np.max(Z2, axis=0, keepdims=True))
  A2 = exp_Z2 / np.sum(exp_Z2, axis=0, keepdims=True)

  # loss: squared error averaged over the training examples.
  # NOTE(review): dZ2 is the softmax + cross-entropy gradient, so this value
  # is a progress indicator rather than the objective being minimised.
  loss = np.sum(np.square(one_hot_label -  A2)) / n_train
  print(loss)

  # backprop
  dZ2 = A2 - one_hot_label
  dW2 = 1 / n_train * np.dot(dZ2, A1.T)
  dB2 = 1 / n_train * np.sum(dZ2, axis=1, keepdims=True)  # per-neuron gradient, same shape as b2
  relu_deriv = Z1 > 0
  dZ1 = np.dot(W2.T, dZ2) * relu_deriv  # back through layer 2 uses the transpose of W2
  dW1 = 1 / n_train * np.dot(dZ1, X_train.T)
  dB1 = 1 / n_train * np.sum(dZ1, axis=1, keepdims=True)  # per-neuron gradient, same shape as b1

  # update the parameters
  W1 = W1 - learning_rate * dW1
  b1 = b1 - learning_rate * dB1
  W2 = W2 - learning_rate * dW2
  b2 = b2 - learning_rate * dB2

1.6825702429929252
1.6030984221730855
1.5622828298774343
1.5228648854801161
1.4900220816599483
1.4715950465935013
1.4504727662675845
1.4347647334388531
1.4142833677594002
1.3891667499535074
1.364808381646143
1.3422687900601007
1.3214251798070695
1.3016946517556527
1.2828581328904087
1.2645729954192644
1.2466432834764687
1.2290523922329533
1.2117817306724195
1.1946945390697847
1.1776338834384126
1.1605126419235234
1.143298703553244
1.1260655613613517
1.1089495500895963
1.0920968876402637
1.075698231450847
1.0599016277165794
1.044873367191528
1.0306795573398184


# Optimization

Hyperparameters

In [None]:
learning_rate = 0.01  # step size for the optimizer updates below
beta1 = 0.9          # decay rate of the running average of gradients (Momentum)
beta2 = 0.999         # decay rate of the running average of squared gradients (RMS prop)
epsilon = 1e-8       # Small value to prevent division by 0

Momentum

In [None]:
# Velocity (running average of gradients) for every parameter, starting at
# zero with the same shape as the parameter it belongs to
vW1 = np.zeros_like(W1)
vb1 = np.zeros_like(b1)
vW2 = np.zeros_like(W2)
vb2 = np.zeros_like(b2)

In [None]:
# Momentum: exponentially weighted running average of the gradients.
# Use dB1/dB2, the bias gradients produced by the training loop together with
# dW1/dW2, so that all four velocities come from the same backward pass.
vW1 = beta1 * vW1 + (1 - beta1) * dW1
vb1 = beta1 * vb1 + (1 - beta1) * dB1
vW2 = beta1 * vW2 + (1 - beta1) * dW2
vb2 = beta1 * vb2 + (1 - beta1) * dB2

In [None]:
# parameters update for momentum: each parameter steps along its own velocity
W1 -= learning_rate * vW1
b1 -= learning_rate * vb1
W2 -= learning_rate * vW2
b2 -= learning_rate * vb2  # vb2 belongs to b2

Gradient descent with momentum

In [None]:
#let's put it all together: gradient descent with momentum

W1 = np.random.randn(10, n - 1)
b1 = np.random.randn(10, 1)
W2 = np.random.randn(10, 10)
b2 = np.random.randn(10, 1)

learning_rate = 0.1  # step size used by the parameter updates below
n_train = X_train.shape[1]  # number of training examples (columns of X_train)

# velocities start at zero for this run; the names must match the ones
# updated inside the loop so no state from an earlier cell leaks in
vW1, vb1 = np.zeros_like(W1), np.zeros_like(b1)
vW2, vb2 = np.zeros_like(W2), np.zeros_like(b2)

iteration = 30
for i in range(iteration):
  # forward prop
  Z1 = np.dot(W1, X_train) + b1
  A1 = np.maximum(Z1, 0)
  Z2 = np.dot(W2, A1) + b2
  # column-wise softmax; the max shift only prevents overflow in np.exp
  exp_Z2 = np.exp(Z2 - np.max(Z2, axis=0, keepdims=True))
  A2 = exp_Z2 / np.sum(exp_Z2, axis=0, keepdims=True)

  # loss: squared error averaged over the training examples.
  # NOTE(review): dZ2 is the softmax + cross-entropy gradient, so this value
  # is a progress indicator rather than the objective being minimised.
  loss = np.sum(np.square(one_hot_label -  A2)) / n_train
  print(loss)

  # backprop
  dZ2 = A2 - one_hot_label
  dW2 = 1 / n_train * np.dot(dZ2, A1.T)
  dB2 = 1 / n_train * np.sum(dZ2, axis=1, keepdims=True)  # per-neuron gradient, same shape as b2
  relu_deriv = Z1 > 0
  dZ1 = np.dot(W2.T, dZ2) * relu_deriv  # back through layer 2 uses the transpose of W2
  dW1 = 1 / n_train * np.dot(dZ1, X_train.T)
  dB1 = 1 / n_train * np.sum(dZ1, axis=1, keepdims=True)  # per-neuron gradient, same shape as b1

  # momentum: running average of the gradients
  vW1 = beta1 * vW1 + (1 - beta1) * dW1
  vb1 = beta1 * vb1 + (1 - beta1) * dB1
  vW2 = beta1 * vW2 + (1 - beta1) * dW2
  vb2 = beta1 * vb2 + (1 - beta1) * dB2

  # parameters update for momentum
  W1 = W1 - learning_rate * vW1
  b1 = b1 - learning_rate * vb1
  W2 = W2 - learning_rate * vW2
  b2 = b2 - learning_rate * vb2

1.6249437070981676
1.624455755321246
1.6235581648621698
1.622275105691062
1.6206147901994699
1.6185808634265917
1.6161799517711557
1.613425015525712
1.6103525713628073
1.6070227612615275
1.6035121856638745
1.5998554053148684
1.596049259083847
1.5921015397427252
1.5880818646993309
1.5840587977594767
1.5800610141040323
1.5760758551415208
1.5720983177597052
1.5681556839705164
1.5642795907292804
1.5605540503997934
1.5570133340307069
1.5536592926075135
1.5504881857069779
1.5475267701710098
1.5447702569990371
1.5422211678165563
1.5398709438627107
1.5377287011210015


RMS Prop

In [None]:
# RMS prop state: running average of squared gradients for every parameter,
# starting at zero with the same shape as the parameter
sW1 = np.zeros_like(W1)
sb1 = np.zeros_like(b1)
sW2 = np.zeros_like(W2)
sb2 = np.zeros_like(b2)

In [None]:
# RMS prop: exponentially weighted running average of the element-wise
# squared gradients
sW1 = beta2 * sW1 + (1 - beta2) * (dW1 * dW1)
sb1 = beta2 * sb1 + (1 - beta2) * (dB1 * dB1)
sW2 = beta2 * sW2 + (1 - beta2) * (dW2 * dW2)
sb2 = beta2 * sb2 + (1 - beta2) * (dB2 * dB2)

In [None]:
# parameters update for rms prop:
# each gradient is divided element-wise by the square root of its running
# squared average; epsilon keeps the denominator away from zero

W1 = W1 - learning_rate * dW1 / (np.sqrt(sW1) + epsilon)
b1 = b1 - learning_rate * dB1 / (np.sqrt(sb1) + epsilon)
W2 = W2 - learning_rate * dW2 / (np.sqrt(sW2) + epsilon)
b2 = b2 - learning_rate * dB2 / (np.sqrt(sb2) + epsilon)

Gradient descent with RMS prop

In [None]:
#let's put it all together: gradient descent with RMS prop

W1 = np.random.randn(10, n - 1)
b1 = np.random.randn(10, 1)
W2 = np.random.randn(10, 10)
b2 = np.random.randn(10, 1)

# Step size. RMS prop divides every gradient by its running root-mean-square,
# and without bias correction the first steps are already several times
# learning_rate per weight, so a small value is used here.
learning_rate = 0.01
n_train = X_train.shape[1]  # number of training examples (columns of X_train)

# parameters rms prop
sW1, sb1 = np.zeros_like(W1), np.zeros_like(b1)
sW2, sb2 = np.zeros_like(W2), np.zeros_like(b2)

iteration = 30
for i in range(iteration):
  # forward prop
  Z1 = np.dot(W1, X_train) + b1
  A1 = np.maximum(Z1, 0)
  Z2 = np.dot(W2, A1) + b2
  # column-wise softmax; the max shift only prevents overflow in np.exp
  exp_Z2 = np.exp(Z2 - np.max(Z2, axis=0, keepdims=True))
  A2 = exp_Z2 / np.sum(exp_Z2, axis=0, keepdims=True)

  # loss: squared error averaged over the training examples.
  # NOTE(review): dZ2 is the softmax + cross-entropy gradient, so this value
  # is a progress indicator rather than the objective being minimised.
  loss = np.sum(np.square(one_hot_label -  A2)) / n_train
  print(loss)

  # backprop
  dZ2 = A2 - one_hot_label
  dW2 = 1 / n_train * np.dot(dZ2, A1.T)
  dB2 = 1 / n_train * np.sum(dZ2, axis=1, keepdims=True)  # per-neuron gradient, same shape as b2
  relu_deriv = Z1 > 0
  dZ1 = np.dot(W2.T, dZ2) * relu_deriv  # back through layer 2 uses the transpose of W2
  dW1 = 1 / n_train * np.dot(dZ1, X_train.T)
  dB1 = 1 / n_train * np.sum(dZ1, axis=1, keepdims=True)  # per-neuron gradient, same shape as b1

  # formula for rms prop: running average of the squared gradients
  sW1 = beta2 * sW1 + (1 - beta2) * np.square(dW1)
  sb1 = beta2 * sb1 + (1 - beta2) * np.square(dB1)
  sW2 = beta2 * sW2 + (1 - beta2) * np.square(dW2)
  sb2 = beta2 * sb2 + (1 - beta2) * np.square(dB2)

  # parameters update for rms prop

  W1 = W1 - learning_rate * dW1 / (np.sqrt(sW1) + epsilon)
  b1 = b1 - learning_rate * dB1 / (np.sqrt(sb1) + epsilon)
  W2 = W2 - learning_rate * dW2 / (np.sqrt(sW2) + epsilon)
  b2 = b2 - learning_rate * dB2 / (np.sqrt(sb2) + epsilon)

1.7303557507222156
1.769654407286932
1.7303694275503325
1.6023195845654734
1.733144015498148
1.5576480988351373
1.6785249659711232
1.5790128243511319
1.7326970913829427
1.5664783754187124
1.4840891828139322
1.3195651263111787
1.3350243339359864
1.187217366402409
1.1541464771117906
1.0128291753526255
1.0551584403534924
0.9859622585144238
1.088641948871076
0.9589980385185378
1.1543223357364965
0.87263353051983
0.8521352872675015
0.7798754511989375
0.7967755041806012
0.7793711844068165
0.7436928295605723
0.7346134544076686
0.7382092376422855
0.7495330717434602


Adam

In [None]:
# Adam keeps two running averages per parameter, both starting at zero with
# the parameter's shape

# first moment (momentum)
vW1 = np.zeros_like(W1)
vb1 = np.zeros_like(b1)
vW2 = np.zeros_like(W2)
vb2 = np.zeros_like(b2)

# second moment (rms prop)
sW1 = np.zeros_like(W1)
sb1 = np.zeros_like(b1)
sW2 = np.zeros_like(W2)
sb2 = np.zeros_like(b2)

# number of Adam steps taken so far
t = 0

In [None]:
# formula for Adam
# t counts the Adam steps taken so far; note that every run of this cell advances it
t += 1

# Momentum (first moment estimate): running average of the gradients
vW1 = beta1 * vW1 + (1 - beta1) * dW1
vb1 = beta1 * vb1 + (1 - beta1) * dB1
vW2 = beta1 * vW2 + (1 - beta1) * dW2
vb2 = beta1 * vb2 + (1 - beta1) * dB2

# rms prop (second moment estimate): running average of the squared gradients
sW1 = beta2 * sW1 + (1 - beta2) * np.square(dW1)
sb1 = beta2 * sb1 + (1 - beta2) * np.square(dB1)
sW2 = beta2 * sW2 + (1 - beta2) * np.square(dW2)
sb2 = beta2 * sb2 + (1 - beta2) * np.square(dB2)

# bias correction: both averages start at zero, so early values are too small;
# dividing by (1 - beta**t) compensates, and the factor approaches 1 as t grows
vW1_corrected = vW1 / (1 - beta1**t)
vb1_corrected = vb1 / (1 - beta1**t)
vW2_corrected = vW2 / (1 - beta1**t)
vb2_corrected = vb2 / (1 - beta1**t)

sW1_corrected = sW1 / (1 - beta2**t)
sb1_corrected = sb1 / (1 - beta2**t)
sW2_corrected = sW2 / (1 - beta2**t)
sb2_corrected = sb2 / (1 - beta2**t)

In [None]:
# parameters update for Adam:
# step along the bias-corrected first moment, scaled element-wise by the
# square root of the bias-corrected second moment (epsilon avoids division by 0)

W1 = W1 - learning_rate * vW1_corrected / (np.sqrt(sW1_corrected) + epsilon)
b1 = b1 - learning_rate * vb1_corrected / (np.sqrt(sb1_corrected) + epsilon)
W2 = W2 - learning_rate * vW2_corrected / (np.sqrt(sW2_corrected) + epsilon)
b2 = b2 - learning_rate * vb2_corrected / (np.sqrt(sb2_corrected) + epsilon)

Gradient descent with Adam Optimization

In [None]:
#let's put it all together: gradient descent with Adam

W1 = np.random.randn(10, n - 1)
b1 = np.random.randn(10, 1)
W2 = np.random.randn(10, 10)
b2 = np.random.randn(10, 1)

# Step size. With bias correction the first Adam steps move every weight by
# roughly learning_rate regardless of the gradient's magnitude, so a small
# value is used here.
learning_rate = 0.01
n_train = X_train.shape[1]  # number of training examples (columns of X_train)

# Initialize momentum and rms prop value

# momentum params (first moment)
vW1, vb1 = np.zeros_like(W1), np.zeros_like(b1)
vW2, vb2 = np.zeros_like(W2), np.zeros_like(b2)

# rms prop (second moment)
sW1, sb1 = np.zeros_like(W1), np.zeros_like(b1)
sW2, sb2 = np.zeros_like(W2), np.zeros_like(b2)

t = 0  # number of Adam steps taken so far

iteration = 30
for i in range(iteration):
  # forward prop
  Z1 = np.dot(W1, X_train) + b1
  A1 = np.maximum(Z1, 0)
  Z2 = np.dot(W2, A1) + b2
  # column-wise softmax; the max shift only prevents overflow in np.exp
  exp_Z2 = np.exp(Z2 - np.max(Z2, axis=0, keepdims=True))
  A2 = exp_Z2 / np.sum(exp_Z2, axis=0, keepdims=True)

  # loss: squared error averaged over the training examples.
  # NOTE(review): dZ2 is the softmax + cross-entropy gradient, so this value
  # is a progress indicator rather than the objective being minimised.
  loss = np.sum(np.square(one_hot_label -  A2)) / n_train
  print(loss)

  # backprop
  dZ2 = A2 - one_hot_label
  dW2 = 1 / n_train * np.dot(dZ2, A1.T)
  dB2 = 1 / n_train * np.sum(dZ2, axis=1, keepdims=True)  # per-neuron gradient, same shape as b2
  relu_deriv = Z1 > 0
  dZ1 = np.dot(W2.T, dZ2) * relu_deriv  # back through layer 2 uses the transpose of W2
  dW1 = 1 / n_train * np.dot(dZ1, X_train.T)
  dB1 = 1 / n_train * np.sum(dZ1, axis=1, keepdims=True)  # per-neuron gradient, same shape as b1

  # formula for Adam
  t += 1

  # Momentum (first moment estimate)
  vW1 = beta1 * vW1 + (1 - beta1) * dW1
  vb1 = beta1 * vb1 + (1 - beta1) * dB1
  vW2 = beta1 * vW2 + (1 - beta1) * dW2
  vb2 = beta1 * vb2 + (1 - beta1) * dB2

  # rms prop (second moment estimate)
  sW1 = beta2 * sW1 + (1 - beta2) * np.square(dW1)
  sb1 = beta2 * sb1 + (1 - beta2) * np.square(dB1)
  sW2 = beta2 * sW2 + (1 - beta2) * np.square(dW2)
  sb2 = beta2 * sb2 + (1 - beta2) * np.square(dB2)

  # bias correction: the averages start at zero, so early values are too
  # small; the correction factors are the same for every parameter
  v_correction = 1 - beta1**t
  s_correction = 1 - beta2**t

  vW1_corrected = vW1 / v_correction
  vb1_corrected = vb1 / v_correction
  vW2_corrected = vW2 / v_correction
  vb2_corrected = vb2 / v_correction

  sW1_corrected = sW1 / s_correction
  sb1_corrected = sb1 / s_correction
  sW2_corrected = sW2 / s_correction
  sb2_corrected = sb2 / s_correction

  # parameters update for Adam

  W1 = W1 - learning_rate * vW1_corrected / (np.sqrt(sW1_corrected) + epsilon)
  b1 = b1 - learning_rate * vb1_corrected / (np.sqrt(sb1_corrected) + epsilon)
  W2 = W2 - learning_rate * vW2_corrected / (np.sqrt(sW2_corrected) + epsilon)
  b2 = b2 - learning_rate * vb2_corrected / (np.sqrt(sb2_corrected) + epsilon)

1.7468978743821215
1.7422284894338707
1.7381851308251877
1.7347369223842228
1.7303222777738028
1.7253352866651297
1.71984522822497
1.7141878026961368
1.7077683607690815
1.7000909867846283
1.6922263316293988
1.687113636523261
1.6809811756151485
1.6790433664175264
1.6798426438594858
1.6816267815805024
1.684035707072324
1.6870771799685134
1.6914597180846935
1.696620519218906
1.7019010886895447
1.7068890512662225
1.7117567659688733
1.716866603270737
1.7219939425355961
1.726555674037062
1.730924209646928
1.734356301994775
1.7371781916425348
1.739361200055825
