# 1. MNIST dataset
The dataset contains 60,000 handwritten digits for training and 10,000 handwritten digits for testing. The digits have been size-normalized and centered in a fixed-size image (28x28 pixels) with values from 0 to 1. For simplicity, each image has been flattened and converted to a 1-D numpy array of 784 features (28*28).

<img src="./image/mnist.png" alt="MNIST" width="400"/>

More info [http://yann.lecun.com/exdb/mnist/](http://yann.lecun.com/exdb/mnist/)

# 2. Training a Tensorflow-based network
## 2.1 Main file ```tutorial.py```
- Import necessary libraries and files

```python
from __future__ import print_function
import tensorflow as tf
import numpy as np
import os
from datetime import datetime

# Import MNIST data
from tensorflow.examples.tutorials.mnist import input_data

# Import network files
from dnn_config import DNN_Config   # uncomment for DNN
from dnn import DNN                 # uncomment for DNN
#from cnn_config import CNN_Config  # uncomment for CNN
#from cnn import CNN                # uncomment for CNN
#from rnn_config import RNN_Config  # uncomment for RNN
#from rnn import RNN                # uncomment for RNN
```

- Some miscellaneous parameters

```python
# Command-line flags (tf.app.flags): device placement and output locations.
flags = tf.app.flags
# Let TensorFlow fall back to another device when the requested one is unavailable
flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement")
# Log the device each op is placed on
flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")
# Directory for output files
flags.DEFINE_string("out_dir", "./output/", "Point to output directory")
# Directory for the network's checkpoints (resolved relative to out_dir below)
flags.DEFINE_string("checkpoint_dir", "./checkpoint/", "Point to checkpoint directory")

# Print every flag as NAME=value, sorted by name.
# NOTE(review): `__flags` is a private attribute of the flags object; confirm it
# still maps names to plain values on the TensorFlow version in use.
FLAGS = flags.FLAGS
print("\nParameters:")
for flag_name, flag_value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(flag_name.upper(), flag_value))
print("")

# Resolve both directories to absolute paths and create the missing ones.
# The checkpoint directory lives inside the output directory.
out_path = os.path.abspath(os.path.join(os.path.curdir, FLAGS.out_dir))
checkpoint_path = os.path.abspath(os.path.join(out_path, FLAGS.checkpoint_dir))
for directory in (out_path, checkpoint_path):
    if not os.path.isdir(directory):
        os.makedirs(directory)
```

- Create an object for network configuration and load MNIST data

```python
# Create a network configuration object.
# Keep exactly one of the three lines below active; it has to match the
# network class imported at the top of the file.
config = DNN_Config()   # active: DNN
# config = CNN_Config() # uncomment for CNN
# config = RNN_Config() # uncomment for RNN

# Load the MNIST dataset from "/tmp/data/", asking for one-hot encoded labels
mnist = input_data.read_data_sets("/tmp/data/", one_hot=True)
```

- Some training parameters

```python
# Training parameters
learning_rate = 1e-3        # learning rate passed to the Adam optimizer
num_training_step = 1000    # number of training steps (one mini-batch per step)
batch_size = 128            # number of training samples per mini-batch
display_every = 10          # print training loss/accuracy every this many steps
evaluate_every = 10         # evaluate on the test set and save a checkpoint every this many steps
```

- Training procedure

```python
# Build the graph, train the network and periodically evaluate/checkpoint it.
with tf.Graph().as_default():
    session_conf = tf.ConfigProto(allow_soft_placement=FLAGS.allow_soft_placement,
                                  log_device_placement=FLAGS.log_device_placement)

    # Start a Tensorflow session. Used as a context manager, the session is
    # installed as the default session and is closed when the block exits
    # (`sess.as_default()` alone would leave it open).
    with tf.Session(config=session_conf) as sess:

        # Network construction
        net = DNN(config=config)    # uncomment for DNN
        # net = CNN(config=config)  # uncomment for CNN
        # net = RNN(config=config)  # uncomment for RNN

        # Define training procedure (i.e. optimization procedure)
        global_step = tf.Variable(0, name="global_step", trainable=False)
        optimizer = tf.train.AdamOptimizer(learning_rate)
        grads_and_vars = optimizer.compute_gradients(net.loss)
        train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

        # the saver object saves a trained model (i.e. a checkpoint) to files;
        # max_to_keep=5 is passed so that only the 5 latest checkpoints are kept
        saver = tf.train.Saver(tf.global_variables(), max_to_keep=5)

        # initialize all variables, then report it
        sess.run(tf.global_variables_initializer())
        print("Model initialized")

        # one training step
        def train(x_batch, y_batch):
            """Run one optimization step on a mini-batch.

            Feeds the batch with the configured dropout keep probability and
            returns (global step, loss, accuracy) for that batch.
            """
            # Can you guess why we need this line? (uncomment for RNN)
            #seq_len = np.ones(len(x_batch),dtype=int) * config.timesteps

            feed_dict = {
                net.X: x_batch,
                net.y: y_batch,
                net.dropout_keep_prob: config.dropout_keep_prob,
                #net.seq_len: seq_len # Can you guess why we need this line? (uncomment for RNN)
            }
            _, step, loss, acc = sess.run([train_op, global_step, net.loss, net.accuracy], feed_dict)
            return step, loss, acc

        # one evaluation step
        # NOTE: the name shadows the builtin `eval`; it is kept unchanged so
        # that any existing caller of this name keeps working.
        def eval(x_batch, y_batch):
            """Evaluate the current model on a batch without updating it.

            Dropout is disabled (keep probability 1.0) and `train_op` is not
            run. Returns (loss, accuracy, predicted labels).
            """
            # Can you guess why we need this line? (uncomment for RNN)
            #seq_len = np.ones(len(x_batch),dtype=int) * config.timesteps

            feed_dict = {
                net.X: x_batch,
                net.y: y_batch,
                net.dropout_keep_prob: 1.0,
                #net.seq_len: seq_len  # Can you guess why we need this line? (uncomment for RNN)
            }
            loss, yhat, acc = sess.run(
                [net.loss, net.y_hat, net.accuracy],
                feed_dict)
            return loss, acc, yhat

        # Training procedure
        for step in range(1, num_training_step + 1):
            # generate a mini-batch of data for training
            x_batch, y_batch = mnist.train.next_batch(batch_size)

            # Can you guess why we need this line? (uncomment for CNN)
            # reshape to (batch_size, input_height, input_width, input_channel)
            # x_batch = np.reshape(x_batch, (-1, config.input_height, config.input_width, config.input_channel))

            # Can you guess why we need this line? (uncomment for RNN)
            # reshape to (batch_size, timesteps, num_input)
            # x_batch = np.reshape(x_batch, (-1, config.timesteps, config.num_input))

            # train the network with the generated mini-batch of data
            train_step, train_loss, train_acc = train(x_batch, y_batch)

            # display training progress?
            if step % display_every == 0:
                #time_str = datetime.now().isoformat()
                print("Step {}, loss {:.4f}, accuracy {:.3f}".format(train_step, train_loss, train_acc))

            # model evaluation?
            if step % evaluate_every == 0:
                # the whole test set is evaluated in a single batch
                test_X = mnist.test.images

                # Can you guess why we need this line? (uncomment for CNN)
                # reshape to (num_test_samples, input_height, input_width, input_channel)
                # test_X = np.reshape(test_X, (-1, config.input_height, config.input_width, config.input_channel))

                # Can you guess why we need this line? (uncomment for RNN)
                # reshape to (num_test_samples, timesteps, num_input)
                # test_X = np.reshape(test_X, (-1, config.timesteps, config.num_input))

                test_y = mnist.test.labels
                test_loss, test_acc, test_yhat = eval(test_X, test_y)
                print("Evaluation: loss {:.4f}, accuracy {:.3f}".format(test_loss, test_acc))

                # save the current model (i.e. checkpoint) to files
                checkpoint_name = os.path.join(checkpoint_path, 'model_step' + str(step))
                save_path = saver.save(sess, checkpoint_name)
```

# 3. Deep Neural Network (DNN)

<img src="./image/dnn.jpeg" alt="DNN" width="750"/>

## 3.1 Network configuration ```dnn_config.py```

```python
def __init__(self):
    """Set the hyper-parameters of the two-hidden-layer DNN."""
    # Data dimensions
    self.num_input = 28 * 28    # flattened MNIST image (28*28 = 784 features)
    self.num_classes = 10       # MNIST digits 0-9

    # Both hidden layers have the same number of neurons
    self.n_hidden_1 = self.n_hidden_2 = 1024

    # Regularization
    self.l2_reg_lambda = 1e-4       # weight of the L2 penalty in the loss
    self.dropout_keep_prob = 0.9    # keep probability fed during training
```

## 3.2 DNN class ```dnn.py```

- Input handling

```python
def __init__(self, config):
    """Store the configuration and declare the graph inputs.

    Args:
        config: configuration object; ``num_input`` and ``num_classes``
            are read from it here.
    """
    self.config = config

    n_features = self.config.num_input
    n_classes = self.config.num_classes

    # Inputs: a batch of feature vectors and the matching one-hot labels
    # (the batch dimension is left open).
    self.X = tf.placeholder(tf.float32, shape=[None, n_features], name='X')
    self.y = tf.placeholder(tf.float32, shape=[None, n_classes], name='y')

    # Dropout keep probability, fed at run time.
    self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")
```

- Network construction

```python
def construct(self):
    """Build the DNN graph: two hidden layers, logits, loss and accuracy.

    Sets ``self.w`` and ``self.b`` (parameters), ``self.output`` (logits),
    ``self.y_hat`` (predicted class indices), ``self.output_loss`` (mean
    cross-entropy), ``self.loss`` (cross-entropy plus L2 penalty) and
    ``self.accuracy``.
    """
    cfg = self.config
    keep_prob = self.dropout_keep_prob

    # Weights and biases of the three layers, initialized from a normal
    # distribution.
    self.w = {
        'h1': tf.Variable(tf.random_normal([cfg.num_input, cfg.n_hidden_1])),
        'h2': tf.Variable(tf.random_normal([cfg.n_hidden_1, cfg.n_hidden_2])),
        'out': tf.Variable(tf.random_normal([cfg.n_hidden_2, cfg.num_classes]))
    }
    self.b = {
        'h1': tf.Variable(tf.random_normal([cfg.n_hidden_1])),
        'h2': tf.Variable(tf.random_normal([cfg.n_hidden_2])),
        'out': tf.Variable(tf.random_normal([cfg.num_classes]))
    }

    # Hidden layers: affine transform -> ReLU -> dropout, applied twice.
    #with tf.device('/gpu:0'), tf.variable_scope("fully-connected-layers"):
    with tf.device('/cpu:0'), tf.variable_scope("fully-connected-layers"):
        hidden = self.X
        for layer in ('h1', 'h2'):
            pre_activation = tf.add(tf.matmul(hidden, self.w[layer]), self.b[layer])
            activation = tf.nn.relu(pre_activation)
            hidden = tf.nn.dropout(activation, keep_prob)

    # Output layer: logits and the index of the largest logit per sample.
    with tf.device('/cpu:0'), tf.variable_scope("output"):
        self.output = tf.add(tf.matmul(hidden, self.w['out']), self.b['out'])
        self.y_hat = tf.argmax(self.output, 1, name='y_hat')

    # Losses
    with tf.device('/cpu:0'), tf.name_scope("loss"):
        # cross-entropy per sample, then averaged over the batch
        per_sample_loss = tf.nn.softmax_cross_entropy_with_logits(labels=self.y, logits=self.output)
        self.output_loss = tf.reduce_mean(per_sample_loss)

        # L2 penalty over every trainable variable, scaled by l2_reg_lambda
        penalties = [tf.nn.l2_loss(variable) for variable in tf.trainable_variables()]
        l2_loss = cfg.l2_reg_lambda * sum(penalties)

        # total loss
        self.loss = self.output_loss + l2_loss

    # Accuracy: fraction of samples whose predicted class matches the label.
    with tf.name_scope("accuracy"):
        true_labels = tf.argmax(self.y, 1)
        is_correct = tf.equal(self.y_hat, true_labels)
        self.accuracy = tf.reduce_mean(tf.cast(is_correct, "float"), name="accuracy")
```

# 4. Convolutional Neural Network (CNN)

<img src="./image/cnn.png" alt="CNN" width="750"/>

## 4.1 Network configuration ```cnn_config.py```

```python
def __init__(self):
    """Set the hyper-parameters of the CNN."""
    # MNIST input images: 28 x 28 pixels, 1 channel
    self.input_height, self.input_width, self.input_channel = 28, 28, 1
    # MNIST digits 0-9
    self.num_classes = 10

    # Regularization
    self.l2_reg_lambda = 1e-4       # weight of the L2 penalty in the loss
    self.dropout_keep_prob = 0.9    # keep probability fed during training
```

## 4.2 CNN class ```cnn.py```

- Input handling

```python
def __init__(self, config):
    """Store the configuration and declare the graph inputs.

    Args:
        config: configuration object; ``input_height``, ``input_width``,
            ``input_channel`` and ``num_classes`` are read from it here.
    """
    self.config = config

    # Inputs: a batch of images laid out as (height, width, channel) and the
    # matching one-hot labels (the batch dimension is left open).
    image_shape = [None,
                   self.config.input_height,
                   self.config.input_width,
                   self.config.input_channel]
    label_shape = [None, self.config.num_classes]
    self.X = tf.placeholder(tf.float32, shape=image_shape, name='X')
    self.y = tf.placeholder(tf.float32, shape=label_shape, name='y')

    # Dropout keep probability, fed at run time.
    self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")
```

- Network construction

```python
# Create some wrappers for simplicity
def conv2d(self, X, W, b, stride = 1): # conv2d wrapper
    """Apply a 2-D convolution followed by a bias add and a ReLU.

    Args:
        X: input tensor handed to ``tf.nn.conv2d``.
        W: convolution filter.
        b: bias added to the convolution output.
        stride: step of the filter along both spatial dimensions. It was
            previously accepted but ignored (the strides were fixed to 1);
            the default of 1 keeps existing calls unchanged.

    Returns:
        The activated feature map.
    """
    conv = tf.nn.conv2d(X, W, strides=[1, stride, stride, 1], padding='SAME')
    conv = tf.nn.bias_add(conv, b)
    return tf.nn.relu(conv)

# maxpool2d wrapper
def maxpool2d(self, X, k = 2):
    """Max-pool ``X`` with a k x k window moved in steps of k ('SAME' padding)."""
    window = [1, k, k, 1]
    return tf.nn.max_pool(X, ksize=window, strides=window, padding='SAME')

def construct(self):
    """Build the CNN graph: two conv stages, a dense layer, logits, loss, accuracy.

    Layer shapes are derived from ``self.config`` (input height, width and
    channel count) instead of being hard-coded for 28x28x1 images. With the
    values in ``cnn_config.py`` they evaluate to the same shapes as before:
    5x5x1x32, 5x5x32x64, (7*7*64)x1024 and 1024 x num_classes.

    Sets ``self.w`` and ``self.b`` (parameters), ``self.output`` (logits),
    ``self.y_hat`` (predicted class indices), ``self.output_loss`` (mean
    cross-entropy), ``self.loss`` (cross-entropy plus L2 penalty) and
    ``self.accuracy``.
    """
    cfg = self.config
    kernel_size = 5     # convolution kernels are kernel_size x kernel_size
    n_filters_1 = 32    # feature maps produced by the 1st conv layer
    n_filters_2 = 64    # feature maps produced by the 2nd conv layer
    n_dense = 1024      # neurons of the fully connected layer

    # The convolutions keep the spatial size (stride 1, 'SAME' padding) and
    # each of the two 2x2 'SAME' max-pools halves it, rounding up.
    pooled_height = (cfg.input_height + 1) // 2
    pooled_height = (pooled_height + 1) // 2
    pooled_width = (cfg.input_width + 1) // 2
    pooled_width = (pooled_width + 1) // 2
    flat_dim = pooled_height * pooled_width * n_filters_2

    # layers weight & bias
    self.w = {
        # 5x5 conv, input_channel inputs, 32 outputs
        'wc1': tf.Variable(tf.random_normal([kernel_size, kernel_size, cfg.input_channel, n_filters_1])),
        # 5x5 conv, 32 inputs, 64 outputs
        'wc2': tf.Variable(tf.random_normal([kernel_size, kernel_size, n_filters_1, n_filters_2])),
        # fully connected, flat_dim inputs (7*7*64 for 28x28 images), 1024 outputs
        'wd1': tf.Variable(tf.random_normal([flat_dim, n_dense])),
        # 1024 inputs, num_classes outputs (class prediction)
        'out': tf.Variable(tf.random_normal([n_dense, cfg.num_classes]))
    }

    self.b = {
        'bc1': tf.Variable(tf.random_normal([n_filters_1])),
        'bc2': tf.Variable(tf.random_normal([n_filters_2])),
        'bd1': tf.Variable(tf.random_normal([n_dense])),
        'out': tf.Variable(tf.random_normal([cfg.num_classes]))
    }

    # Convolutional layers
    with tf.device('/cpu:0'), tf.variable_scope("conv-layers"):
        # Conv layer 1
        conv1 = self.conv2d(self.X, self.w['wc1'], self.b['bc1'])
        # Max Pooling (down-sampling)
        conv1_pool = self.maxpool2d(conv1, k = 2)
        conv1_dropout = tf.nn.dropout(conv1_pool, self.dropout_keep_prob)

        # Conv Layer 2
        conv2 = self.conv2d(conv1_dropout, self.w['wc2'], self.b['bc2'])
        # Max Pooling (down-sampling)
        conv2_pool = self.maxpool2d(conv2, k = 2)
        conv2_dropout = tf.nn.dropout(conv2_pool, self.dropout_keep_prob)

    # fully connected layer
    with tf.device('/cpu:0'), tf.variable_scope("fully-connected-layers"):
        # flatten conv feature map to one row per sample
        flattened = tf.reshape(conv2_dropout, [-1, flat_dim])
        fc1 = tf.add(tf.matmul(flattened, self.w['wd1']), self.b['bd1'])
        fc1_relu = tf.nn.relu(fc1)
        fc1_dropout = tf.nn.dropout(fc1_relu, self.dropout_keep_prob)

    # network's output
    with tf.device('/cpu:0'), tf.variable_scope("output"):
        self.output = tf.add(tf.matmul(fc1_dropout, self.w['out']), self.b['out']) # logit
        self.y_hat = tf.argmax(self.output, 1, name='y_hat') # predicted labels

    # network's losses
    with tf.device('/cpu:0'), tf.name_scope("loss"):
        # cross-entropy loss per sample, then averaged over the batch
        per_sample_loss = tf.nn.softmax_cross_entropy_with_logits(labels=self.y, logits=self.output)
        self.output_loss = tf.reduce_mean(per_sample_loss)

        # add on regularization: L2 penalty over every trainable variable
        l2_loss = cfg.l2_reg_lambda * \
                  sum(tf.nn.l2_loss(tf_var) for tf_var in tf.trainable_variables())

        # total loss
        self.loss = self.output_loss + l2_loss

    # calculate accuracy: fraction of samples whose prediction matches the label
    with tf.name_scope("accuracy"):
        correct_predictions = tf.equal(self.y_hat, tf.argmax(self.y, 1))
        self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy")
```

# 5. Recurrent Neural Network (RNN)
<img src="./image/rnn.png" alt="RNN" width="750"/>


#### Gated Recurrent Unit (GRU) cell
<img src="./image/GRU.png" alt="GRU cell" width="350"/>

References:

- Sepp Hochreiter & Jurgen Schmidhuber, _Long Short Term Memory,_ Neural Computation 9(8): 1735-1780, 1997.
- K. Cho, B. van Merrienboer, C. Gulcehre, F. Bougares, H. Schwenk, and Y. Bengio, _Learning phrase representations using RNN encoder-decoder for statistical machine translation,_ in Proc. EMNLP, 2014, pp. 1724–1734.

## 5.1 Network configuration ```rnn_config.py```

```python
def __init__(self):
    """Set the hyper-parameters of the recurrent network."""
    # Each 28*28 MNIST image is fed as a sequence of `timesteps` steps
    # with `num_input` values per step.
    self.timesteps = 28
    self.num_input = 28
    self.num_classes = 10   # MNIST digits 0-9

    # Size passed to the recurrent cell
    self.n_hidden = 256

    # Regularization
    self.l2_reg_lambda = 1e-4       # weight of the L2 penalty in the loss
    self.dropout_keep_prob = 0.9    # keep probability fed during training
```

## 5.2 Network architecture ```rnn.py```
- Input handling

```python
def __init__(self, config):
    """Store the configuration and declare the graph inputs.

    Args:
        config: configuration object; ``timesteps``, ``num_input`` and
            ``num_classes`` are read from it here.
    """
    self.config = config

    # Inputs: a batch of sequences (timesteps x num_input each) and the
    # matching one-hot labels (the batch dimension is left open).
    sequence_shape = [None, self.config.timesteps, self.config.num_input]
    label_shape = [None, self.config.num_classes]
    self.X = tf.placeholder(tf.float32, shape=sequence_shape, name='X')
    self.y = tf.placeholder(tf.float32, shape=label_shape, name='y')

    # Dropout keep probability, fed at run time.
    self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")

    # One sequence length per sample, consumed by the dynamic RNN.
    self.seq_len = tf.placeholder(tf.int32, [None])
```

- Network construction

```python
def construct(self):
    """Build the RNN graph: a GRU layer, logits, loss and accuracy.

    Sets ``self.w`` and ``self.b`` (output-layer parameters),
    ``self.output`` (logits), ``self.y_hat`` (predicted class indices),
    ``self.output_loss`` (mean cross-entropy), ``self.loss``
    (cross-entropy plus L2 penalty) and ``self.accuracy``.
    """
    cfg = self.config

    # Parameters of the output layer that maps the recurrent output to logits.
    self.w = {
        'out': tf.Variable(tf.random_normal([cfg.n_hidden, cfg.num_classes]))
    }
    self.b = {
        'out': tf.Variable(tf.random_normal([cfg.num_classes]))
    }

    with tf.device('/cpu:0'), tf.name_scope("recurrent_layer"):
        # Recurrent cell: a GRU. LSTM alternative:
        #rnn_cell = tf.contrib.rnn.BasicLSTMCell(self.config.n_hidden, forget_bias=1.0)
        gru_cell = tf.contrib.rnn.GRUCell(cfg.n_hidden)

        # Unroll the cell over the input sequence. Static alternative:
        #outputs, states = tf.contrib.rnn.static_rnn(rnn_cell, self.X, dtype=tf.float32)
        step_outputs, _ = tf.nn.dynamic_rnn(gru_cell, self.X, sequence_length=self.seq_len, dtype=tf.float32)

        # Only the output at the last time step feeds the classifier.
        final_output = step_outputs[:, -1]
        final_output = tf.nn.dropout(final_output, self.dropout_keep_prob)

    # Output layer: logits and the index of the largest logit per sample.
    with tf.device('/cpu:0'), tf.variable_scope("output"):
        self.output = tf.add(tf.matmul(final_output, self.w['out']), self.b['out'])
        self.y_hat = tf.argmax(self.output, 1, name='y_hat')

    # Losses
    with tf.device('/cpu:0'), tf.name_scope("loss"):
        # cross-entropy per sample, then averaged over the batch
        per_sample_loss = tf.nn.softmax_cross_entropy_with_logits(labels=self.y, logits=self.output)
        self.output_loss = tf.reduce_mean(per_sample_loss)

        # L2 penalty over every trainable variable, scaled by l2_reg_lambda
        penalties = [tf.nn.l2_loss(variable) for variable in tf.trainable_variables()]
        l2_loss = cfg.l2_reg_lambda * sum(penalties)

        # total loss
        self.loss = self.output_loss + l2_loss

    # Accuracy: fraction of samples whose predicted class matches the label.
    with tf.name_scope("accuracy"):
        true_labels = tf.argmax(self.y, 1)
        is_correct = tf.equal(self.y_hat, true_labels)
        self.accuracy = tf.reduce_mean(tf.cast(is_correct, "float"), name="accuracy")
```