# Imports

In [1]:
from numpy.random import seed
seed(888)
from tensorflow import random
random.set_seed(404)

In [2]:
import os
import numpy as np
import tensorflow as tf
# NOTE(review): this second import rebinds the name `tf`, so every later
# `tf.` reference resolves to the `tensorflow.compat.v1` module rather than
# to the top-level `tensorflow` package imported on the previous line.
import tensorflow.compat.v1 as tf

# Constants

In [3]:
# CSV files holding the MNIST features (x) and labels (y)
X_TRAIN_PATH = 'MNIST/digit_xtrain.csv'
X_TEST_PATH = 'MNIST/digit_xtest.csv'
Y_TRAIN_PATH = 'MNIST/digit_ytrain.csv'
Y_TEST_PATH = 'MNIST/digit_ytest.csv'

# Number of label classes and number of training rows held out for validation
NR_CLASSES = 10
VALIDATION_SIZE = 10000

# Image geometry; TOTAL_INPUTS is the number of values in one flattened image
IMAGE_WIDTH = IMAGE_HEIGHT = 28
CHANNELS = 1
TOTAL_INPUTS = IMAGE_WIDTH * IMAGE_HEIGHT * CHANNELS

# Get the Data

In [5]:
%%time

# Parse the training-label CSV into an integer NumPy array
y_train_all = np.loadtxt(fname=Y_TRAIN_PATH, dtype=int, delimiter=',')

CPU times: total: 15.6 ms
Wall time: 5.99 ms


In [6]:
y_train_all.shape

(60000,)

In [7]:
# Parse the test-label CSV into an integer NumPy array
y_test = np.loadtxt(fname=Y_TEST_PATH, dtype=int, delimiter=',')

In [8]:
y_test.shape

(10000,)

In [9]:
%%time

# Parse the training-feature CSV into an integer NumPy array (one row per image)
x_train_all = np.loadtxt(fname=X_TRAIN_PATH, dtype=int, delimiter=',')

CPU times: total: 3.69 s
Wall time: 4.01 s


In [10]:
%%time

# Parse the test-feature CSV into an integer NumPy array (one row per image)
x_test = np.loadtxt(fname=X_TEST_PATH, dtype=int, delimiter=',')

CPU times: total: 609 ms
Wall time: 669 ms


In [11]:
x_test.shape

(10000, 784)

# Explore

In [12]:
x_train_all.shape

(60000, 784)

In [13]:
x_train_all[0]

array([  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   3,  18,  18,  18,
       126, 136, 175,  26, 166, 255, 247, 127,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,  30,  36,  94, 154, 17

In [14]:
y_train_all[:5] # first five labels; the targets are sparse, i.e. one integer class id per sample rather than a one-hot vector

array([5, 0, 4, 1, 9])

# Data Preprocessing

In [15]:
# Rescale both feature arrays by dividing every value by 255.0
x_train_all = x_train_all / 255.0
x_test = x_test / 255.0

#### Convert target values to one-hot encoding

In [16]:
# Preview of one-hot encoding on the first five labels: the label array is used
# as an index array, so each label k pulls out row k of the 10x10 identity
# matrix, and row k is exactly the one-hot vector for class k.
values = y_train_all[0:5]
np.eye(10)[values, :]

array([[0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 1.]])

In [17]:
np.eye(10) # why 10? because we have got 10 different labels in our dataset 0-9

array([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 1.]])

In [18]:
np.eye(10)[9]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 1.])

In [19]:
values

array([5, 0, 4, 1, 9])

In [20]:
values[2]

4

In [21]:
# One-hot encode the training labels by using them as row indices into an
# identity matrix. The ndim guard keeps this cell safe to re-run: once the
# labels are already 2-D one-hot rows, indexing with them again would raise
# an IndexError because the index array would no longer be of integer type.
if y_train_all.ndim == 1:
    y_train_all = np.eye(NR_CLASSES)[y_train_all]

In [22]:
y_train_all.shape

(60000, 10)

In [23]:
# One-hot encode the test labels the same way as the training labels. The ndim
# guard makes the cell safe to re-run: labels that are already 2-D one-hot
# rows are left untouched instead of being used as (float) indices again.
if y_test.ndim == 1:
    y_test = np.eye(NR_CLASSES)[y_test]
y_test.shape

(10000, 10)

### Create validation dataset from training data

**Challenge:** Split the training dataset into a smaller training dataset and a validation dataset for the features and the labels. Create four arrays: ```x_val```, ```y_val```, ```x_train```, and ```y_train``` from ```x_train_all``` and ```y_train_all```. Use the validation size of 10,000. 

In [24]:
# Hold out the first VALIDATION_SIZE rows of features and labels for validation
x_val, y_val = x_train_all[:VALIDATION_SIZE], y_train_all[:VALIDATION_SIZE]

In [25]:
# Every row after the validation rows is kept for training
x_train, y_train = x_train_all[VALIDATION_SIZE:], y_train_all[VALIDATION_SIZE:]

In [26]:
y_train.shape

(50000, 10)

# Setup Tensorflow Graph

In [27]:
# Symbolic float32 inputs for the features and the labels with a variable batch size.
# keras.Input prepends the (unknown-size) batch dimension itself, so `shape`
# must describe a single sample only; passing (None, n) would yield a rank-3
# (batch, None, n) tensor and make axis=1 refer to the wrong dimension later on.
X = tf.keras.Input(shape=(TOTAL_INPUTS,), dtype=tf.float32)  # features
Y = tf.keras.Input(shape=(NR_CLASSES,), dtype=tf.float32)  # labels

### Neural Network Architecture

#### Hyperparameters

In [28]:
# Training settings
nr_epochs = 5
learning_rate = 0.0001  # same value as 1e-4

# Number of neurons per hidden layer (the output layer gets 10)
n_hidden1, n_hidden2 = 512, 64

## First Hidden Layer

In [29]:
# Initial weights of the first hidden layer: one row per input value and one
# column per hidden neuron, drawn from a truncated normal distribution.
initial_w1 = tf.random.truncated_normal(
    shape=[TOTAL_INPUTS, n_hidden1],
    stddev=0.1,
    seed=42,
)
# Variable holding all the weights of the first hidden layer
w1 = tf.Variable(initial_value=initial_w1)

In [30]:
# Initial biases of the first hidden layer, one per hidden neuron.
# The zero is written as 0.0 so the variable is float32 like the weights:
# a literal 0 creates an int32 variable, and integer variables do not
# receive gradient updates, which would leave the bias permanently at zero.
initial_b1 = tf.constant(value=0.0, shape=[n_hidden1])
b1 = tf.Variable(initial_value=initial_b1)

In [31]:
# Pre-activation of the first hidden layer: features times weights, plus the bias
layer1_in = tf.add(tf.matmul(X, w1), tf.cast(b1, tf.float32))



In [32]:
# Output of the first hidden layer after the ReLU activation
layer1_out = tf.nn.relu(features=layer1_in)

## Second Hidden Layer

In [33]:
# Initial weights of the second hidden layer. Its inputs are the outputs of
# the first hidden layer, so the weight matrix is n_hidden1 x n_hidden2.
initial_w2 = tf.random.truncated_normal(shape=[n_hidden1, n_hidden2], stddev=0.1, seed=42)
w2 = tf.Variable(initial_value=initial_w2)

# Initial biases of the second hidden layer. Written as 0.0 so the variable is
# float32: a literal 0 creates an int32 variable, which cannot be trained and
# had to be cast before it could be added to the float32 matmul result.
initial_b2 = tf.constant(value=0.0, shape=[n_hidden2])
b2 = tf.Variable(initial_value=initial_b2)

# Pre-activation of the second hidden layer
layer2_in = tf.matmul(layer1_out, w2) + b2

# Output of the second hidden layer after the ReLU activation
layer2_out = tf.nn.relu(layer2_in)



## Output Layer

In [34]:
# Initial weights of the output layer: n_hidden2 inputs, one column per class
initial_w3 = tf.random.truncated_normal(shape=[n_hidden2, NR_CLASSES], stddev=0.1, seed=42)
w3 = tf.Variable(initial_value=initial_w3)

# Initial biases of the output layer. Written as 0.0 so the variable is
# float32: a literal 0 creates an int32 variable, which cannot be trained and
# had to be cast before it could be added to the float32 matmul result.
initial_b3 = tf.constant(value=0.0, shape=[NR_CLASSES])
b3 = tf.Variable(initial_value=initial_b3)

# layer3_in holds the raw class scores (logits); output holds the softmax
# probabilities computed from them.
layer3_in = tf.matmul(layer2_out, w3) + b3
output = tf.nn.softmax(layer3_in)



# Loss, Optimisation & Metrics

### Defining Loss function

In [35]:
# Mean cross-entropy over the samples of a batch.
# softmax_cross_entropy_with_logits applies softmax internally, so it must be
# given the raw class scores (layer3_in) rather than the already-softmaxed
# `output`; passing `output` would apply softmax twice and distort the loss.
loss = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(labels=Y, logits=layer3_in)
)

Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See `tf.nn.softmax_cross_entropy_with_logits_v2`.



### Defining Optimizers

In [37]:
# Tell TensorFlow which optimizer to use (Adam) and with which learning rate
optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate)

# The default graph is deliberately NOT reset here: resetting it after the
# layers and the loss have been created makes the later use of those tensors
# (e.g. in the accuracy cell) undefined behaviour.
# TODO: define the operation that minimizes the loss with this optimizer.

### Accuracy Metrics

In [None]:
# Accuracy: compare the predictions with the true labels and check whether they are equal.

# Every case in which they are equal counts as a correct prediction.

# Once we know how many predictions are correct, we can work out the accuracy of the model.

In [39]:
# A prediction is correct when the index of the largest value in the network
# output equals the index of the largest value in the label row Y.
correct_pred = tf.equal(
    tf.argmax(input=output, axis=1),
    tf.argmax(input=Y, axis=1),
)
# Casting the booleans to floats and averaging gives the fraction of correct predictions
accuracy = tf.reduce_mean(tf.cast(correct_pred, dtype=tf.float32))


In [None]:
def setup_layer(input, weight_dim, bias_dim, name):
    """Create one fully connected layer inside its own name scope.

    Args:
        input: Tensor fed into the layer; it is matrix-multiplied by the weights.
        weight_dim: Shape of the weight matrix.
        bias_dim: Shape of the bias vector.
        name: Name scope of the layer. The value 'out' selects a softmax
            activation; any other name selects ReLU.

    Returns:
        The output tensor of the layer after its activation function.
    """
    with tf.name_scope(name):
        # tf.random.truncated_normal is the spelling used by the rest of the
        # notebook and is available from both `tensorflow` and
        # `tensorflow.compat.v1`; the bare tf.truncated_normal alias exists
        # only in the compat.v1 namespace.
        initial_w = tf.random.truncated_normal(shape=weight_dim, stddev=0.1, seed=42)
        w = tf.Variable(initial_value=initial_w, name='W')

        # 0.0 (not 0) so the bias variable is float and can be added to the
        # matmul result without a cast.
        initial_b = tf.constant(value=0.0, shape=bias_dim)
        b = tf.Variable(initial_value=initial_b, name='B')

        layer_in = tf.matmul(input, w) + b

        # The layer named 'out' produces class probabilities; all others use ReLU.
        if name == 'out':
            layer_out = tf.nn.softmax(layer_in)
        else:
            layer_out = tf.nn.relu(layer_in)

        # Record the weight and bias distributions as histogram summaries
        tf.summary.histogram('weights', w)
        tf.summary.histogram('biases', b)

        return layer_out