https://www.alpha-i.co/blog/MNIST-for-ML-beginners-The-Bayesian-Way.html

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data
from edward.models import Categorical, Normal
import edward as ed
import pandas as pd
import pickle
ed.set_seed(314159)

In [2]:
# Use the TensorFlow method to download and/or load the data.
mnist = input_data.read_data_sets("MNIST_data/", one_hot=True)

Extracting MNIST_data/train-images-idx3-ubyte.gz
Extracting MNIST_data/train-labels-idx1-ubyte.gz
Extracting MNIST_data/t10k-images-idx3-ubyte.gz
Extracting MNIST_data/t10k-labels-idx1-ubyte.gz


In [3]:
#two-layer neural network.
def neural_network(x, W_0, W_1, b_0, b_1):
    h = tf.matmul(x, W_0) + b_0
    h = tf.tanh(h)
    #h = tf.exp(tf.negative(tf.square(h)))
    h = tf.matmul(h, W_1) + b_1
    return h

def display_draws(loss, train_x, EPOCH_NUM):
    fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(16, 3.5))
    plt.plot(np.arange(EPOCH_NUM), loss / len(train_x), label='Train')
    plt.legend(fontsize=20)
    plt.xlabel('Epoch', fontsize=15)
    plt.ylabel('Negated ELBO', fontsize=15)
    plt.show()

In [4]:
%run optim.py

In [5]:
total = mnist.train.num_examples #number of training examples
N = total/10   # number of images in a minibatch.
D = 784   # number of features.
K = 10    # number of classes.
P = 100   # number of neurons.

In [6]:
m0=100
mcbatch = range(m0,m0+10*100)

In [7]:
# Create a placeholder to hold the data (in minibatches) in a TensorFlow graph.
x = tf.placeholder(tf.float32, [None, D])
# Normal(0,1) priors for the variables. Note that the syntax assumes TensorFlow 1.1.
w0 = Normal(loc=tf.zeros([D, P]), scale=tf.ones([D, P]))
b0 = Normal(loc=tf.zeros(P), scale=tf.ones(P))
w1 = Normal(loc=tf.zeros([P, K]), scale=tf.ones([P, K]))
b1 = Normal(loc=tf.zeros(K), scale=tf.ones(K))
# Categorical likelihood for classication.
y =Categorical(neural_network(x, w0,w1, b0, b1))

In [8]:
# Contruct the q(w) and q(b). in this case we assume Normal distributions.
qw0 = Normal(loc=tf.Variable(tf.random_normal([D, P])),
              scale=tf.nn.softplus(tf.Variable(tf.random_normal([D, P]))))
qb0 = Normal(loc=tf.Variable(tf.random_normal([P])),
              scale=tf.nn.softplus(tf.Variable(tf.random_normal([P]))))
qw1 = Normal(loc=tf.Variable(tf.random_normal([P, K])),
              scale=tf.nn.softplus(tf.Variable(tf.random_normal([P, K]))))
qb1 = Normal(loc=tf.Variable(tf.random_normal([K])),
              scale=tf.nn.softplus(tf.Variable(tf.random_normal([K]))))

In [9]:
epoch_num = 200
train_losssgd = np.zeros(epoch_num)

In [10]:
# We use a placeholder for the labels in anticipation of the traning data.
y_ph = tf.placeholder(tf.int32, [N])
# Define the VI inference technique, ie. minimise the KL divergence between q and p.
inference = elbo_optimizer({w0: qw0, b0: qb0,w1:qw1,b1:qb1}, data={y:y_ph})

In [11]:
learning_rate = 5e-6
optimizer = tf.train.GradientDescentOptimizer(learning_rate)

In [14]:
inference.initialize(optimizer=optimizer,n_print=100)

In [15]:
# We will use an interactive session.
sess = tf.InteractiveSession()
# Initialise all the vairables in the session.
tf.global_variables_initializer().run()

In [16]:
obj = inference.build_loss_and_gradients(tf.trainable_variables())

In [17]:
newloss = obj[0]
newvars = obj[1][0:4]
newgrads = obj[1][4:8]

# SGD

In [15]:
for i in (0,total/N):
    X_batch, Y_batch = mnist.train.next_batch(N)
    # TensorFlow method gives the label data in a one hot vetor format. We convert that into a single label.
    Y_batch = np.argmax(Y_batch,axis=1)
    grads = tf.gradients(newloss, tf.trainable_variables())
    var_updates = []
    var_list = tf.trainable_variables()
    for grad, var in zip(grads, var_list):
        var_updates.append(var.assign_sub(learning_rate * grad))
    train_op = tf.group(*var_updates)
    sess.run(train_op,feed_dict={x: X_batch, y_ph: Y_batch})
    #info_dict = inference.update(feed_dict={x: X_batch, y_ph: Y_batch})
        
train_losssgd[0] = sess.run(newloss,feed_dict={x: X_batch, y_ph: Y_batch})

In [15]:
for epoch in range(1,epoch_num):
    for i in (0,total/N):
        print(epoch)
        X_batch, Y_batch = mnist.train.next_batch(N)
        # TensorFlow method gives the label data in a one hot vetor format. We convert that into a single label.
        Y_batch = np.argmax(Y_batch,axis=1)
        grads = tf.gradients(newloss, tf.trainable_variables())
        var_updates = []
        var_list = tf.trainable_variables()
        for grad, var in zip(grads, var_list):
            var_updates.append(var.assign_sub(learning_rate * grad))
        train_op = tf.group(*var_updates)
        sess.run(train_op,feed_dict={x: X_batch, y_ph: Y_batch})
        #info_dict = inference.update(feed_dict={x: X_batch, y_ph: Y_batch})
        
    train_losssgd[epoch] = sess.run(newloss,feed_dict={x: X_batch, y_ph: Y_batch})

0
0
1
1
2
2
3
3
4
4
5
5
6
6
7
7
8
8
9
9
10
10
11
11
12
12
13
13
14
14
15
15
16
16
17
17
18
18
19
19
20
20
21
21
22
22
23
23
24
24
25
25
26
26
27
27
28
28
29
29


In [16]:
for epoch in range(1,epoch_num):
    for _ in range(0, total, N):
        X_batch, Y_batch = mnist.train.next_batch(N)
        # TensorFlow method gives the label data in a one hot vetor format. We convert that into a single label.
        Y_batch = np.argmax(Y_batch,axis=1)
        info_dict = inference.update(feed_dict={x: X_batch, y_ph: Y_batch})
        
    train_losssgd[epoch] = info_dict['loss']
    inference.print_progress(info_dict)

1900/5000 [ 38%] ███████████                    ETA: 131s | Loss: 91506.188 8

In [19]:
sgd = train_losssgd[0:100]
#save loss
with open('sgdloss', 'wb') as fp:
    pickle.dump(sgd, fp)