<a href="https://colab.research.google.com/github/AndrewstheBuilder/ScratchNeuralNetworks/blob/main/VanishingGradient_Micrograd_MNIST.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Setup

In [24]:
pip install micrograd_andrews



In [25]:
# Micrograd imports
from micrograd_andrews.engine import Value
from micrograd_andrews.nn import Neuron, Layer, MLP

In [26]:
# other imports
import math
import random
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import pyplot
%matplotlib inline
from keras.datasets import mnist
import copy

np.random.seed(1337)
random.seed(1337)

In [27]:
# training and test data
(train_X, train_y), (test_X, test_y) = mnist.load_data()

# normalize to have training values for pixels between 0-1.
train_X = train_X.astype('float32') / 255.0
test_X = test_X.astype('float32') / 255.0

# print('X_train: ' + str(train_X.shape))
# print('Y_train: ' + str(train_y.shape))
# print('X_test:  '  + str(test_X.shape))
# print('Y_test:  '  + str(test_y.shape))

# print('train_x[1] raw',train_X[1][5][10:21])
# pyplot.imshow(train_X[0], cmap=pyplot.get_cmap('gray'))
# pyplot.show()

flattened_trainX = train_X.reshape(-1,28*28)
print(np.mean(flattened_trainX[0:100], axis=1))
# def findOne(x):
#   if x == 1:
#     return True
#   else:
#     return False

# results = filter(findOne, train_y)

# for y in results:
#   print(y)

# from matplotlib import pyplot
# for i in range(9):
#   pyplot.subplot(330 + 1 + i)
#   pyplot.imshow(train_X[i], cmap=pyplot.get_cmap('gray'))
#   pyplot.show()

[0.13768007 0.15553722 0.0972539  0.08570928 0.11611645 0.14806421
 0.08826531 0.17940676 0.05439175 0.10956383 0.14279711 0.07127851
 0.17887655 0.1422719  0.05813325 0.12653062 0.1232243  0.13558424
 0.06797219 0.0887255  0.17087333 0.1772359  0.07828632 0.08291817
 0.10988395 0.20398661 0.06792217 0.23065226 0.20842338 0.0787565
 0.12756102 0.16289015 0.08794018 0.10591237 0.18260804 0.08166266
 0.15939876 0.18640456 0.1097489  0.13445379 0.0685024  0.14158164
 0.06320028 0.09005602 0.08600441 0.12083834 0.11595638 0.11229992
 0.08596439 0.16487594 0.09379251 0.22780614 0.1397859  0.08405362
 0.11799721 0.16189976 0.20954381 0.10035515 0.16983293 0.08422369
 0.14733894 0.0907263  0.15162066 0.23131752 0.13140257 0.0829882
 0.13494898 0.07610044 0.12360945 0.21144958 0.11048419 0.10207083
 0.05379151 0.13060725 0.13078733 0.17024308 0.13146758 0.08484894
 0.09869449 0.11668669 0.13094237 0.18483393 0.20099539 0.13401361
 0.09305722 0.13032213 0.10920368 0.13967587 0.16943777 0.125930

In [28]:
yy = copy.deepcopy(train_y)
unique_integers = list(set(yy))
unique_integers.sort()
print(unique_integers)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]


In [29]:
def one_hot_encode(number, num_classes):
    one_hot_vector = [0] * num_classes
    one_hot_vector[number] = 1
    return one_hot_vector
# convert train_y to one hot encoding
num_classes = len(unique_integers)
yy_one = [one_hot_encode(num, num_classes) for num in yy]
print(yy_one[2])
print(yy[2])

[0, 0, 0, 0, 1, 0, 0, 0, 0, 0]
4


### Problematic MLP Code with Vanishing Gradient problem (Look below for solution)

In [None]:
def softmax(scores):
    max_score = max(scores, key=lambda x: x.data)
    norm_scores = [(score-max_score) for score in scores]
    exp_scores = [norm_score.exp() for norm_score in norm_scores]
    sum_exp_scores = sum(exp_scores)
    return [exp_score / sum_exp_scores for exp_score in exp_scores]

# define the MLP model
in_inputs=28*28
output_dim = len(unique_integers)
model = MLP(in_inputs, [70,50,output_dim])

# limit training set to overfit
# on smaller test size.
limit_x=5

# reshape here to flatten the 2D [28,28] into 1D -> 28*28
inputs = train_X[:limit_x].reshape(limit_x,-1)
expected_outputs = yy_one[:limit_x]

parameters_data_log = []
parameters_grad_log = []
# Begin gradient descent iterations
iterations = 25
for iter in range(iterations):
  parameters_data_log.append([])
  parameters_grad_log.append([])
  # forward the model one input at a time to get scores
  correct = 0
  for i_input in range(len(inputs)):
    scores = model(inputs[i_input])
    # print('raw scores',scores)
    probs_predicted = softmax(scores)
    # Loss for a single input
    yi_one = expected_outputs[i_input]
    loss = []
    probs_list = [p.data for p in probs_predicted]

    if(yi_one.index(max(yi_one)) == probs_list.index(max(probs_list))):
      correct += 1

    #  Negative Log Likelihood loss
    print('probs_predicted',probs_predicted)
    for k in range(len(yi_one)):
      loss.append(-1*((yi_one[k]*probs_predicted[k].log())+(1-yi_one[k])*(1-probs_predicted[k]).log()))
    total_loss = sum(loss)/len(loss)

    # print the probabilities at the last iteration
    if iter == iterations-1:
      print('probabilities predicted:',probs_list)
      print('predicted value:',probs_list.index(max(probs_list)))
      print('actual value:',yi_one.index(max(yi_one)))

    # Back propagation
    model.zero_grad()
    total_loss.backward()

    # Update parameters
    for p in model.parameters():
      parameters_data_log[iter].append(p.data)
      parameters_grad_log[iter].append(p.grad)
      p.data -= p.grad
  print('Iteration '+str(iter) +' total loss: '+str(total_loss.data))
  print('Accuracy:'+str(correct/len(inputs)))

### Solution to Vanishing Gradient Problem In MLP solving MNIST
- Fixes
  - Using cross entropy loss instead of binary cross entropy loss because the outputs are mutually exclusive of each other
  - The backwards() for log was incorrect. I was dividing by the output of log(x) and not 1/x itself.
  - I still can not fix it. MNIST is too difficult for me to solve by myself. 09/14/24
  - Okay I fixed it. I adjusted the learning_rate to be 0.05. But I think what did it is making the number of parameters and number of layers smaller.
    - I went from 70,50 neurons in the deep layers to 7,5 then just 7 neurons.
    - I took out a layer.
    - Too many neurons may have contributed to the vanishing gradient problem.

In [None]:
# This will fix the vanishing gradient problem from Micrograd_MNIST Draft 3d.
def softmax(scores):
    max_score = max(scores, key=lambda x: x.data)
    norm_scores = [(score-max_score) for score in scores]
    exp_scores = [norm_score.exp() for norm_score in norm_scores]
    sum_exp_scores = sum(exp_scores)
    return [exp_score / sum_exp_scores for exp_score in exp_scores]

# define the MLP model
in_inputs=28*28
output_dim = len(unique_integers)
model = MLP(in_inputs, [7,output_dim])

# limit training set to overfit
# on smaller test size.
limit_x=5

# reshape here to flatten the 2D [28,28] into 1D -> 28*28
inputs = train_X[:limit_x].reshape(limit_x,-1)
expected_outputs = yy_one[:limit_x]

parameters_data_log = []
parameters_grad_log = []
# Begin gradient descent iterations
iterations = 20
learning_rate = 0.05
for iter in range(iterations):
  parameters_data_log.append([])
  parameters_grad_log.append([])
  # forward the model one input at a time to get scores
  correct = 0
  total_loss = 0.0
  for i_input in range(len(inputs)):
    scores = model(inputs[i_input])
    # print('raw scores',scores)
    probs_predicted = softmax(scores)
    # Loss for a single input
    yi_one = expected_outputs[i_input]
    # loss = []
    probs_list = [p.data for p in probs_predicted]
    correct_index = yi_one.index(max(yi_one))
    if(correct_index == probs_list.index(max(probs_list))):
      correct += 1

    #  Negative Log Likelihood loss???
    # Cross Entropy Loss
    # print('probs_predicted',probs_predicted)
    # for k in range(len(yi_one)):
    # loss.append()
    # print('probs_predicted[correct_index]',probs_predicted[correct_index])
    loss = -1*(probs_predicted[correct_index]).log()
    total_loss += loss.data

    # print the probabilities at the last iteration
    if iter == iterations-1:
      print('probabilities predicted:',probs_list)
      print('predicted value:',probs_list.index(max(probs_list)))
      print('actual value:',yi_one.index(max(yi_one)))

    # Back propagation
    model.zero_grad()
    loss.backward()

    # Update parameters
    for p in model.parameters():
      parameters_data_log[iter].append(p.data)
      parameters_grad_log[iter].append(p.grad)
      p.data -= p.grad * learning_rate
  print('Iteration '+str(iter) +' Average loss across inputs: '+str(total_loss/len(inputs)))
  print('Accuracy:'+str(correct/len(inputs)))

Iteration 0 Average loss across inputs: 7.254621218603589
Accuracy:0.0
Iteration 1 Average loss across inputs: 2.329408916780422
Accuracy:0.4
Iteration 2 Average loss across inputs: 0.6142281797161026
Accuracy:1.0
Iteration 3 Average loss across inputs: 0.5615841353104433
Accuracy:1.0
Iteration 4 Average loss across inputs: 0.5170907583270244
Accuracy:1.0
Iteration 5 Average loss across inputs: 0.4858859229038976
Accuracy:1.0
Iteration 6 Average loss across inputs: 0.4631256085707408
Accuracy:1.0
Iteration 7 Average loss across inputs: 0.4452594345213125
Accuracy:1.0
Iteration 8 Average loss across inputs: 0.4303040461830411
Accuracy:1.0
Iteration 9 Average loss across inputs: 0.41721617920584475
Accuracy:1.0
Iteration 10 Average loss across inputs: 0.40541555797040196
Accuracy:1.0
Iteration 11 Average loss across inputs: 0.39453076191533387
Accuracy:1.0
Iteration 12 Average loss across inputs: 0.3843018989945897
Accuracy:1.0
Iteration 13 Average loss across inputs: 0.3746165764885313


In [None]:
print('mean of gradients:',np.mean(parameters_grad_log, axis=1))
print('mean of parameters:',np.mean(parameters_data_log, axis=1))

mean of gradients: [ 0.01502411  0.00511081 -0.00109118 -0.00093783 -0.00074425 -0.00066191
 -0.00053507 -0.00044514 -0.00037884 -0.0003285  -0.00024089 -0.00025872
 -0.00023297 -0.00021173 -0.00019392 -0.00017879 -0.00016579 -0.00012555
 -0.00014497 -0.0001362 ]
mean of parameters: [-0.00378242 -0.00668929 -0.0071486  -0.00688297 -0.00666677 -0.00649435
 -0.00633825 -0.006211   -0.00610454 -0.00601351 -0.00593428 -0.00587626
 -0.00581351 -0.00575686 -0.00570526 -0.00565791 -0.00561419 -0.00557358
 -0.00554288 -0.0055073 ]


### Run MLP on full MNIST dataset
- TODOs:
  - Implement batched gradient descent
  - Do it efficiently by eliminating excessive for loops. (Pythonify your code)

In [36]:
np.random.permutation(5)
np.array(yy_one).shape

(60000, 10)

In [33]:
def softmax(scores):
    max_score = max(scores, key=lambda x: x.data)
    norm_scores = [(score-max_score) for score in scores]
    exp_scores = [norm_score.exp() for norm_score in norm_scores]
    sum_exp_scores = sum(exp_scores)
    return [exp_score / sum_exp_scores for exp_score in exp_scores]

# define the MLP model
in_inputs=28*28
output_dim = len(unique_integers)
model = MLP(in_inputs, [7,output_dim])

# reshape here to flatten the 2D [28,28] into 1D -> 28*28
inputs = train_X.reshape(-1,in_inputs)
parameters_data_log = []
parameters_grad_log = []
# Begin gradient descent iterations

# Hyperparameters
epochs = 5
learning_rate = 0.05
batch_size=100
n = len(inputs)

for epoch in range(epochs):
  parameters_data_log.append([])
  parameters_grad_log.append([])
  # forward the model with randomly selected batches
  indices = np.random.permutation(n)
  correct = 0
  total_loss = 0.0
  X_shuffled = inputs[indices]
  y_shuffled = yy_one[indices]
  for i in range(0, n, batch_size):
    X_batch = X_shuffled[i:i + batch_size]
    y_batch = y_shuffled[i:i + batch_size]
    # Forward propagation
    logits = model(X_batch)
    probs = [softmax(logits) for logit in logits]
    # probs_list = [p.data for prob in probs for p in prob]
    # correct_index = yi_one.index(max(yi_one))
    # if(correct_index == probs_list.index(max(probs_list))):
    #   correct += 1
  #   # Cross Entropy Loss
  #   loss = -1*(probs[correct_index]).log()
  #   total_loss += loss.data
  #   # Back propagation
  #   model.zero_grad()
  #   loss.backward()
  #   # Update parameters
  #   for p in model.parameters():
  #     parameters_data_log[epoch].append(p.data)
  #     parameters_grad_log[epoch].append(p.grad)
  #     p.data -= p.grad * learning_rate
  #   # print the probabilities at the last iteration
  #   if epoch == epochs-1:
  #     print('probabilities predicted:',probs_list)
  #     print('predicted value:',probs_list.index(max(probs_list)))
  #     print('actual value:',yi_one.index(max(yi_one)))
  # print(f'Epoch {epoch+1}, Loss: {total_loss/len(inputs)}, Accuracy: {correct/len(inputs)}')

TypeError: only integer scalar arrays can be converted to a scalar index