## 2 Layer MLP for Dark Knowledge Experiments w/AlignMNIST

75 epochs of RMSProp on AlignMNIST

Bigger network with dropout and basic max-norm weight constraints

The initial L2 norms of the parameters (in the order h1, h2, out weights, then b1, b2, out biases) are

     [54.569935, 62.339447, 6.9963851, 3.448611, 1.0789102, 0.082129896]


see  http://www.r2rt.com/posts/implementations/2016-03-29-implementing-batch-normalization-tensorflow/

https://github.com/tensorflow/tensorflow/blob/b3621c95160a916d4d255f9f44318b9d465701af/tensorflow/contrib/layers/python/layers/layers.py


https://www.reddit.com/r/MachineLearning/comments/2bopxs/question_about_the_maxnorm_constraint_used_with/



https://arxiv.org/pdf/1207.0580.pdf

https://arxiv.org/pdf/1503.02531v1.pdf

http://www.kdnuggets.com/2015/04/preventing-overfitting-neural-networks.html

https://www.reddit.com/r/MachineLearning/comments/2bopxs/question_about_the_maxnorm_constraint_used_with/

and https://github.com/tensorflow/tensorflow/issues/608

In [1]:
import tensorflow as tf
import numpy as np

In [2]:
# Execute augmentmnist.py inside this notebook's namespace.
# Presumably this is what defines AlignMNIST, which is instantiated further down — verify.
%run augmentmnist.py

In [3]:
# Training schedule: number of passes over the data and mini-batch size.
num_epochs, batch_size = 75, 125

# Step size handed to the optimizer.
learning_rate = 1e-3


In [4]:
# Network architecture
n_input = 784       # flattened 28x28 MNIST image
n_hidden_1 = 1200   # width of the first hidden layer
n_hidden_2 = 1200   # width of the second hidden layer
n_classes = 10      # digits 0-9

# One initialisation scale per weight matrix: sqrt(6 / (fan_in + fan_out)).
# NOTE(review): this expression is the Glorot *uniform* limit, but the values
# are passed as `stddev` to tf.random_normal when the weights are created —
# confirm that is intended.
std_0, std_h1, std_h2 = (
    np.sqrt(6.0 / (fan_in + fan_out))
    for fan_in, fan_out in (
        (n_input, n_hidden_1),
        (n_hidden_1, n_hidden_2),
        (n_hidden_2, n_classes),
    )
)

# Run name; used as the sub-directory of ./logs for TensorBoard output.
logfile = "2BwD-layer-dlk-alignmnist-4"

In [5]:
# tf Graph input
# Training inputs: rows of n_input features and rows of n_classes label scores;
# the leading None leaves the batch size open.
x = tf.placeholder("float", [None, n_input])
y = tf.placeholder("float", [None, n_classes])

# Separate placeholders for the held-out test set, so training and test
# accuracy can be evaluated in the same sess.run call.
xt = tf.placeholder("float", [None, n_input])
yt = tf.placeholder("float", [None, n_classes])

In [6]:
def multilayer_perceptron(_X, _weights, _biases, keep_prob_1=0.2, keep_prob_2=0.5):
    """Build a two-hidden-layer ReLU MLP with dropout and return its logits.

    Args:
        _X: input tensor; it is matrix-multiplied with _weights['h1'].
        _weights: dict with keys 'h1', 'h2' and 'out' holding the weight
            matrices of the two hidden layers and the output layer.
        _biases: dict with keys 'b1', 'b2' and 'out' holding the matching
            bias vectors.
        keep_prob_1: value passed to tf.nn.dropout after the first hidden
            layer. Defaults to the previously hard-coded 0.2.
        keep_prob_2: value passed to tf.nn.dropout after the second hidden
            layer. Defaults to the previously hard-coded 0.5.

    Returns:
        The un-normalised output-layer activations (no softmax is applied).

    NOTE(review): in this TensorFlow generation the second argument of
    tf.nn.dropout is the probability of *keeping* a unit, so 0.2 drops 80%
    of the first hidden layer — confirm that is the intended rate.
    """
    hidden_1 = tf.nn.relu(tf.add(tf.matmul(_X, _weights['h1']), _biases['b1']))
    layer_1 = tf.nn.dropout(hidden_1, keep_prob_1)

    hidden_2 = tf.nn.relu(tf.add(tf.matmul(layer_1, _weights['h2']), _biases['b2']))
    layer_2 = tf.nn.dropout(hidden_2, keep_prob_2)

    # Use the parameters that were passed in; the original read the
    # module-level `weights` / `biases` dicts here, silently ignoring the
    # arguments for the output layer.
    return tf.matmul(layer_2, _weights['out']) + _biases['out']

In [7]:
# Store layers weight & bias
# Weight matrices are drawn from tf.random_normal with a per-layer stddev
# (std_0, std_h1, std_h2) computed from the layer's fan-in and fan-out.
weights = {
    'h1': tf.Variable(tf.random_normal([n_input, n_hidden_1], stddev=std_0)),
    'h2': tf.Variable(tf.random_normal([n_hidden_1, n_hidden_2], stddev=std_h1)),
    'out': tf.Variable(tf.random_normal([n_hidden_2, n_classes], stddev=std_h2))
}
# Bias vectors are also drawn from tf.random_normal, with a fixed stddev that
# shrinks by 10x per layer (0.1, 0.01, 0.001).
biases = {
    'b1': tf.Variable(tf.random_normal([n_hidden_1], stddev=0.1)),
    'b2': tf.Variable(tf.random_normal([n_hidden_2], stddev=0.01)),
    'out': tf.Variable(tf.random_normal([n_classes], stddev=0.001))
}

In [8]:
# Two copies of the network that share the same weight/bias variables:
# one fed from the training placeholder, one from the test placeholder.
# NOTE(review): both calls use the same arguments, so the test graph is built
# with the same dropout as the training graph — test accuracy is therefore
# measured with dropout active. Confirm that is intended.
mlp = multilayer_perceptron(x, weights, biases )
mlp_test = multilayer_perceptron(xt, weights, biases  )

In [9]:
# Batch mean of the softmax cross-entropy between the training logits and y.
cost =  tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(mlp, y)) 
# Non-trainable scalar step counter, initialised to 0; passed to optimize_loss as global_step.
gs = tf.get_variable("global_step",[],trainable=False,initializer=tf.constant_initializer(0))
# Learning rate wrapped as a graph constant.
lr = tf.constant(learning_rate) 

In [10]:
# Training step: RMSProp on `cost` via tf.contrib.layers.optimize_loss, with
# clip_gradients=1.0 (see the contrib.layers source linked at the top of the
# notebook for the exact clipping rule — TODO confirm).
train_op = tf.contrib.layers.optimize_loss(cost, global_step=gs, learning_rate=lr,optimizer="RMSProp", clip_gradients=1.0)

In [11]:
#with tf.name_scope("training accuracy"):
# Per-example boolean: does the arg-max of the training logits match the arg-max of y?
pred = tf.equal(tf.argmax(mlp, 1), tf.argmax(y, 1))
# Fraction of matches: cast the booleans to float and average them.
train_acc_op = tf.reduce_mean(tf.cast(pred, "float"))
# Register the value under the "training accuracy" tag for the merged summaries.
tf.scalar_summary("training accuracy", train_acc_op)

<tf.Tensor 'ScalarSummary_2:0' shape=() dtype=string>

In [12]:
# Per-example boolean: does the arg-max of the test logits match the arg-max of yt?
test_pred = tf.equal(tf.argmax(mlp_test, 1), tf.argmax(yt, 1))
# Fraction of matches: cast the booleans to float and average them.
test_acc_op = tf.reduce_mean(tf.cast(test_pred, "float"))
# Register the value under the "test 0 accuracy" tag for the merged summaries.
tf.scalar_summary("test 0 accuracy", test_acc_op)

<tf.Tensor 'ScalarSummary_3:0' shape=() dtype=string>

In [13]:
# Delete any previous log directory for this run name, then list what is left
# under logs/. This is destructive: it removes ./logs/<logfile> recursively.
!rm -rf ./logs/{logfile}
!ls logs

[1m[36m2-layer-dk-alignmnist[m[m           [1m[36m2-layer-modern-mlp-alignmnist[m[m
[1m[36m2-layer-dlk-alignmnist[m[m          [1m[36m2BwD-layer-dlk-alignmnist[m[m
[1m[36m2-layer-mlp-alignmnist-wdropout[m[m [1m[36m5-layer-mlp-alignmnist-wdropout[m[m
[1m[36m2-layer-mlp-mnist-temp[m[m          [1m[36m5-layer-mlp-infimnist[m[m
[1m[36m2-layer-mlp-mnist-watch[m[m


### Original MNIST Data

In [14]:
from tensorflow.examples.tutorials.mnist import input_data

# Read the stock MNIST files from the current directory with one-hot labels.
mnist = input_data.read_data_sets(".", one_hot=True)

# Unaugmented training split.
trX_0 = mnist.train.images
trY_0 = mnist.train.labels

# Unaugmented test split; this is what the test placeholders are fed with.
teX_0 = mnist.test.images
teY_0 = mnist.test.labels

Extracting ./train-images-idx3-ubyte.gz
Extracting ./train-labels-idx1-ubyte.gz
Extracting ./t10k-images-idx3-ubyte.gz
Extracting ./t10k-labels-idx1-ubyte.gz


In [15]:
# AlignMNIST is not defined in this notebook; presumably it comes from
# augmentmnist.py, which is %run near the top — verify. The training loop
# calls its next_epoch() once per epoch to obtain the training arrays.
alignmnist = AlignMNIST()

In [16]:
# final norms...[102.45666, 162.9534, 40.922997, 18.756783, 53.30151, 3.2270765]

In [17]:
from tensorflow.python.framework import ops
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import control_flow_ops
from tensorflow.python.ops import math_ops

from tensorflow.python.ops import variable_scope as vs


In [18]:
def max_norm_constraint(_weights, _biases, initial_norms=None, slack=1.2):
    """Build an op that caps the L2 norm of every weight and bias tensor.

    Each parameter gets a cap of `slack` times its initial norm. Running the
    returned op rescales any parameter whose L2 norm exceeds its cap so that
    the norm equals the cap, and leaves the others unchanged.

    The original version returned the integer 0, rebound the entries of the
    caller's dicts to new tensors instead of updating the variables, and
    divided by the cap instead of rescaling to it; as a result nothing
    runnable was produced and the variables were never modified.

    Args:
        _weights: dict with keys 'h1', 'h2', 'out' holding tf.Variable weights.
        _biases: dict with keys 'b1', 'b2', 'out' holding tf.Variable biases.
        initial_norms: six reference norms in the order
            h1, h2, out (weights), b1, b2, out (biases). Defaults to the
            values recorded for this notebook's initialisation.
        slack: multiplier applied to every reference norm to obtain the cap.

    Returns:
        A single grouped op; pass it to sess.run() to apply all six
        constraints. The input dicts are not modified.

    Raises:
        ValueError: if `initial_norms` does not contain exactly six values.
    """
    if initial_norms is None:
        initial_norms = [54.569935, 62.339447, 6.9963851,
                         3.448611, 1.0789102, 0.082129896]

    caps = np.asarray(initial_norms, dtype=np.float64) * slack

    # Same ordering as the reference norms above.
    params = [
        _weights['h1'], _weights['h2'], _weights['out'],
        _biases['b1'], _biases['b2'], _biases['out'],
    ]
    if len(caps) != len(params):
        raise ValueError(
            "expected %d reference norms, got %d" % (len(params), len(caps)))

    # tf.clip_by_norm returns t * cap / max(||t||, cap): identity when the
    # norm is within the cap, a rescale onto the cap otherwise. Assigning the
    # result back is what actually changes the variable.
    clip_ops = [
        tf.assign(param, tf.clip_by_norm(param, float(cap)))
        for param, cap in zip(params, caps)
    ]
    return tf.group(*clip_ops)
   

In [19]:
# Build the max-norm constraint once, outside the training loop.
# It is not applied at present: the sess.run(max_norm_op) call in the
# training loop is commented out.
max_norm_op = max_norm_constraint(weights, biases)

In [None]:
# Per-epoch history, filled in by the loop below.
test_accuracies = []
train_accuracies = []
norms = []
with tf.Session() as sess:
    # create a log writer. run 'tensorboard --logdir=./logs/{logfile}'
    writer = tf.train.SummaryWriter("./logs/{0}".format(logfile), sess.graph) # for 0.8
    merged = tf.merge_all_summaries()

    tf.initialize_all_variables().run()

    # Parameters whose L2 norms are recorded after every epoch, in the order
    # h1, h2, out (weights), b1, b2, out (biases).
    tracked_params = [weights['h1'], weights['h2'], weights['out'],
                      biases['b1'], biases['b2'], biases['out']]

    try:
        for epoch in range(num_epochs):
            # A fresh set of training arrays for every epoch.
            trX, trY = alignmnist.next_epoch()

            # Visit every mini-batch. The previous zip(range, range) form
            # stopped one batch early and never trained on the tail of the
            # epoch; slicing past the end is safe, so a final short batch is
            # simply smaller.
            for start in range(0, len(trX), batch_size):
                end = start + batch_size
                sess.run(train_op, feed_dict={x: trX[start:end], y: trY[start:end]})

            # Max-norm constraint is currently disabled.
            #sess.run(max_norm_op)

            # Evaluate summaries plus train/test accuracy on the full sets.
            summary, trn_acc, tst_acc = sess.run(
                [merged, train_acc_op, test_acc_op],
                feed_dict={x: trX, y: trY, xt: teX_0, yt: teY_0})
            writer.add_summary(summary, epoch)

            print(epoch, trn_acc, tst_acc)
            train_accuracies.append(trn_acc)
            test_accuracies.append(tst_acc)

            # Fetch all six parameter tensors in one run so the norms come
            # from a single consistent snapshot.
            nrms = [np.linalg.norm(value) for value in sess.run(tracked_params)]
            # Call form of print: identical output to the old statement for a
            # single argument, and also valid on Python 3.
            print(nrms)
            norms.append(nrms)
            writer.flush()
    finally:
        # Release the event file even if training is interrupted.
        writer.close()
        

(0, 0.815, 0.884)
[54.544594, 62.16592, 7.0781412, 3.4363286, 0.99740976, 0.073847778]
(1, 0.86086667, 0.91799998)
[56.009369, 65.056999, 6.656517, 3.472435, 1.6046538, 1.6776466]
(2, 0.88129997, 0.93370003)
[57.315178, 67.659279, 6.3940864, 3.5244188, 2.5057023, 2.649864]
(3, 0.89335001, 0.93550003)
[58.549725, 70.100227, 6.3261886, 3.588649, 3.3910596, 3.6299446]


In [None]:
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
# Render figures inline in the notebook output.
%matplotlib inline  

# Accuracy per epoch: the training curve is drawn first, then the test curve.
plt.plot(train_accuracies)
plt.plot(test_accuracies)

In [None]:
# Best test accuracy over all recorded epochs.
np.max(test_accuracies)

The number of test errors is

In [None]:
# Error count implied by the best test accuracy.
# The factor 10000 assumes the test set has 10,000 examples — TODO confirm against teX_0.
10000*(1.0-np.max(test_accuracies))

In [None]:
# Parameter norms after the first and after the last recorded epoch.
# Written as calls so the cell runs on Python 3 as well; with a single
# argument the output on Python 2 is unchanged.
print(norms[0])
print(norms[-1])

#### What is the best way to implement the cutoff?

By the norm of the weights at each layer?

see also:  http://keras.io/constraints/

Does the growth of the norms just reflect the loss decreasing?

Can we simply bound the norm?

In [None]:
# One curve per tracked parameter (six in total): its norm at every epoch.
for param_idx in range(6):
    norm_history = [epoch_norms[param_idx] for epoch_norms in norms]
    plt.plot(norm_history)