In [35]:
import os
import tensorflow as tf
import numpy as np
import math
import timeit
import matplotlib.pyplot as plt

%matplotlib inline

In [36]:
def load_cifar10(num_training=49000, num_validation=1000, num_test=10000):
    cifar10 = tf.keras.datasets.cifar10.load_data()
    (X_train, y_train), (X_test, y_test) = cifar10
    X_train = np.asarray(X_train, dtype=np.float32)
    y_train = np.asarray(y_train, dtype=np.int32).flatten()
    X_test = np.asarray(X_test, dtype=np.float32)
    y_test = np.asarray(y_test, dtype=np.int32).flatten()

    mask = range(num_training, num_training + num_validation)
    X_val = X_train[mask]
    y_val = y_train[mask]
    mask = range(num_training)
    X_train = X_train[mask]
    y_train = y_train[mask]
    mask = range(num_test)
    X_test = X_test[mask]
    y_test = y_test[mask]
    
    mean_pixel = X_train.mean(axis=(0, 1, 2), keepdims=True)
    std_pixel = X_train.std(axis=(0, 1, 2), keepdims=True)
    X_train = (X_train - mean_pixel) / std_pixel
    X_val = (X_val - mean_pixel) / std_pixel
    X_test = (X_test - mean_pixel) / std_pixel

    return X_train, y_train, X_val, y_val, X_test, y_test

NHW = (0, 1, 2)
X_train, y_train, X_val, y_val, X_test, y_test = load_cifar10()
print('Train data shape: ', X_train.shape)
print('Train labels shape: ', y_train.shape, y_train.dtype)
print('Validation data shape: ', X_val.shape)
print('Validation labels shape: ', y_val.shape)
print('Test data shape: ', X_test.shape)
print('Test labels shape: ', y_test.shape)

Train data shape:  (49000, 32, 32, 3)
Train labels shape:  (49000,) int32
Validation data shape:  (1000, 32, 32, 3)
Validation labels shape:  (1000,)
Test data shape:  (10000, 32, 32, 3)
Test labels shape:  (10000,)


In [37]:
class Dataset(object):
    def __init__(self, x, y, batch_size, shuffle=True):
        assert x.shape[0] == y.shape[0], 'Got different numbers of data and labels'
        self.x = x 
        self.y = y
        self.batch_size = batch_size
        self.shuffle = shuffle
        
    def __iter__(self):
        N = self.x.shape[0]
        B = self.batch_size
        idx = np.arange(N)
        if self.shuffle:
            np.random.shuffle(idx)
        return iter((self.x[i:i+B], self.y[i:i+B]) for i in range(0, N, B))
    
train_dset = Dataset(X_train, y_train, batch_size=64, shuffle=True)
val_dset = Dataset(X_val, y_val, batch_size=64, shuffle=False)
test_dset = Dataset(X_test, y_test, batch_size=64)

In [38]:
for t, (x, y) in enumerate(train_dset):
    print(t, x.shape, y.shape)
    print(y)
    if t > 5: break

0 (64, 32, 32, 3) (64,)
[6 9 9 4 1 1 2 7 8 3 4 7 7 2 9 9 9 3 2 6 4 3 6 6 2 6 3 5 4 0 0 9 1 3 4 0 3
 7 3 3 5 2 2 7 1 1 1 2 2 0 9 5 7 9 2 2 5 2 4 3 1 1 8 2]
1 (64, 32, 32, 3) (64,)
[1 1 4 9 7 8 5 9 6 7 3 1 9 0 3 1 3 5 4 5 7 7 4 7 9 4 2 3 8 0 1 6 1 1 4 1 8
 3 9 6 6 1 8 5 2 9 9 8 1 7 7 0 0 6 9 1 2 2 9 2 6 6 1 9]
2 (64, 32, 32, 3) (64,)
[5 0 4 7 6 7 1 8 1 1 2 8 1 3 3 6 2 4 9 9 5 4 3 6 7 4 6 8 5 5 4 3 1 8 4 7 6
 0 9 5 1 3 8 2 7 5 3 4 1 5 7 0 4 7 5 5 1 0 9 6 9 0 8 7]
3 (64, 32, 32, 3) (64,)
[8 8 2 5 2 3 5 0 6 1 9 3 6 9 1 3 9 6 6 7 1 0 9 5 8 5 2 9 0 8 8 0 6 9 1 1 6
 3 7 6 6 0 6 6 1 7 1 5 8 3 6 6 8 6 8 4 6 6 1 3 8 3 4 1]
4 (64, 32, 32, 3) (64,)
[7 1 3 8 5 1 1 4 0 9 3 7 4 9 9 2 4 9 9 1 0 5 9 0 8 2 1 2 0 5 6 3 2 7 8 8 6
 0 7 9 4 5 6 4 2 1 1 2 1 5 9 9 0 8 4 1 1 6 3 3 9 0 7 9]
5 (64, 32, 32, 3) (64,)
[7 7 9 1 5 1 6 6 8 7 1 3 0 3 3 2 4 5 7 5 9 0 3 4 0 4 4 6 0 0 6 6 0 8 1 6 2
 9 2 5 9 6 7 4 1 8 7 3 6 9 3 0 4 0 5 1 0 3 4 8 5 4 7 2]
6 (64, 32, 32, 3) (64,)
[3 9 7 6 7 1 4 7 0 1 7 3 1 8 4 4 2 0 2 2 0 0 9

In [39]:
USE_GPU = False
print_every = 100

if USE_GPU:
    device = '/device:GPU:0'
else:
    device = '/cpu:0'

print('Using device: ', device)

Using device:  /cpu:0


Barebone TensorFlow
---

In [40]:
def flatten(x):
    N = tf.shape(x)[0]
    return tf.reshape(x, (N, -1))

In [41]:
def test_flatten():
    tf.reset_default_graph()
    with tf.device(device):
        x = tf.placeholder(tf.float32)
        x_flat = flatten(x)
    print('x: ', type(x), x)
    print('x_flat: ', type(x_flat), x_flat)
    print()
    
    with tf.Session() as sess:
        x_np = np.arange(24).reshape((2, 3, 4))
        print('x_np:\n', x_np.shape, '\n')
        x_np_flat = sess.run(x_flat, feed_dict={x: x_np})
        print('x_np_flat:\n', x_np_flat.shape, '\n')
        
        x_np = np.arange(12).reshape((2, 3, 2))
        print('x_np:\n', x_np.shape, '\n')
        x_flat_np = sess.run(x_flat, feed_dict={x: x_np})
        print('x_flat_np:\n', x_flat_np.shape)
test_flatten()

x:  <class 'tensorflow.python.framework.ops.Tensor'> Tensor("Placeholder:0", dtype=float32, device=/device:CPU:0)
x_flat:  <class 'tensorflow.python.framework.ops.Tensor'> Tensor("Reshape:0", shape=(?, ?), dtype=float32, device=/device:CPU:0)

x_np:
 (2, 3, 4) 

x_np_flat:
 (2, 12) 

x_np:
 (2, 3, 2) 

x_flat_np:
 (2, 6)


### Two-Layer Network

In [42]:
def two_layer_fc(x, params):
    w1, w2 = params
    x = flatten(x)
    h = tf.nn.relu(tf.matmul(x, w1))
    scores = tf.matmul(h, w2)
    return scores

In [43]:
def two_layer_fc_test():
    tf.reset_default_graph()
    hidden_layer_size = 42
    
    with tf.device(device):
        x = tf.placeholder(tf.float64)
        w1 = tf.zeros((3*32*32, hidden_layer_size), dtype=tf.float64)
        w2 = tf.zeros((hidden_layer_size, 10), dtype=tf.float64)
        scores = two_layer_fc(x, (w1, w2))
    x_np = np.zeros((64, 32, 32, 3))
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        scores_np = sess.run(scores, feed_dict={x: x_np})
        print(scores_np.shape)
        
two_layer_fc_test()

(64, 10)


### Three-Layer ConvNet

In [44]:
def three_layer_convnet(x, params):
    (w1, b1, w2, b2, w3, b3) = params
    out_1 = tf.nn.relu(tf.nn.conv2d(x, w1, strides=[1, 1, 1, 1], padding='SAME') + b1)
    out_2 = tf.nn.relu(tf.nn.conv2d(out_1, w2, strides=[1, 1, 1, 1], padding='SAME') + b2)
    scores = tf.matmul(flatten(out_2), w3) + b3
    return scores

In [45]:
def three_layer_convnet_test():
    tf.reset_default_graph()
    with tf.device(device):
        x = tf.placeholder(tf.float32)
        w1 = tf.zeros((5, 5, 3, 6))
        b1 = tf.zeros((6, ))
        w2 = tf.zeros((3, 3, 6, 9))
        b2 = tf.zeros((9, ))
        w3 = tf.zeros((9*32*32, 10))
        b3 = tf.zeros((10, ))
        params = (w1, b1, w2, b2, w3, b3)
        scores = three_layer_convnet(x, params)
    
    x_np = np.zeros((64, 32, 32, 3))
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        scores_np = sess.run(scores, feed_dict={x: x_np})
        print('scores_np has shape: ', scores_np.shape)
        
with tf.device('/cpu:0'):
    three_layer_convnet_test()

scores_np has shape:  (64, 10)


### Training Step

In [46]:
def training_step(scores, y, params, learning_rate):
    losses = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=scores)
    loss = tf.reduce_mean(losses)
    
    grad_params = tf.gradients(loss, params)
    new_weights = []
    for w, grad_w in zip(params, grad_params):
        new_w = tf.assign_sub(w, learning_rate * grad_w)
        new_weights.append(new_w)
        
    with tf.control_dependencies(new_weights):
        return tf.identity(loss)

### Training Loop

In [51]:
def train(model_fn, init_fn, learning_rate):
    tf.reset_default_graph()
    is_training = tf.placeholder(tf.bool, name='is_training')
    with tf.device(device):
        x = tf.placeholder(tf.float32, [None, 32, 32, 3])
        y = tf.placeholder(tf.int32, [None])
        params = init_fn()
        scores = model_fn(x, params)
        loss = training_step(scores, y, params, learning_rate)
        
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for t, (x_np, y_np) in enumerate(train_dset):
            feed_dict={x: x_np, y: y_np}
            loss_np = sess.run(loss, feed_dict=feed_dict)
            if t % print_every == 0:
                print('Iteration %d, loss = %.4f' % (t, loss_np))
                check_accuracy(sess, val_dset, x, scores, is_training)            

### Check Accuracy

In [52]:
def check_accuracy(sess, dset, x, scores, is_training=None):
    num_correct, num_samples = 0, 0
    for x_batch, y_batch in dset:
        feed_dict = {x: x_batch, is_training: 0}
        scores_np = sess.run(scores, feed_dict=feed_dict)
        y_pred = scores_np.argmax(axis=1)
        num_samples += x_batch.shape[0]
        num_correct += (y_pred == y_batch).sum()
    acc = float(num_correct) / num_samples
    print('Got %d / %d correct (%.2f%%)' % (num_correct, num_samples, 100 * acc))        

### Initialization

In [53]:
def kaiming_normal(shape):
    if len(shape) == 2:
        fan_in, fan_out = shape[0], shape[1]
    elif len(shape) == 4:
        fan_in, fan_out = np.prod(shape[:3]), shape[3]
    return tf.random_normal(shape) * np.sqrt(2.0 / fan_in)

### Train a Two-Layer Network

In [54]:
def two_layer_fc_init():
    hidden_layer_size = 4000
    w1 = tf.Variable(kaiming_normal((3*32*32, 4000)))
    w2 = tf.Variable(kaiming_normal((4000, 10)))
    return [w1, w2]

learning_rate = 1e-2
train(two_layer_fc, two_layer_fc_init, learning_rate)

Iteration 0, loss = 3.2512
Got 117 / 1000 correct (11.70%)
Iteration 100, loss = 2.0240
Got 382 / 1000 correct (38.20%)
Iteration 200, loss = 1.4809
Got 392 / 1000 correct (39.20%)
Iteration 300, loss = 1.8005
Got 376 / 1000 correct (37.60%)
Iteration 400, loss = 1.8020
Got 420 / 1000 correct (42.00%)
Iteration 500, loss = 1.7362
Got 432 / 1000 correct (43.20%)
Iteration 600, loss = 1.8651
Got 414 / 1000 correct (41.40%)
Iteration 700, loss = 1.9470
Got 437 / 1000 correct (43.70%)


### Train a three-layer ConvNet

In [56]:
def three_layer_convnet_init():
    w1 = tf.Variable(kaiming_normal((5, 5, 3, 32)))
    b1 = tf.Variable(tf.zeros((32, )))
    w2 = tf.Variable(kaiming_normal((3, 3, 32, 16)))
    b2 = tf.Variable(tf.zeros((16, )))
    w3 = tf.Variable(kaiming_normal((16*32*32, 10)))
    b3 = tf.Variable(tf.zeros((10, )))
    params = (w1, b1, w2, b2, w3, b3)
    
    return params

learning_rate = 3e-3
train(three_layer_convnet, three_layer_convnet_init, learning_rate)

Iteration 0, loss = 2.8190
Got 105 / 1000 correct (10.50%)
Iteration 100, loss = 1.8904
Got 362 / 1000 correct (36.20%)
Iteration 200, loss = 1.5929
Got 394 / 1000 correct (39.40%)
Iteration 300, loss = 1.7238
Got 377 / 1000 correct (37.70%)
Iteration 400, loss = 1.6944
Got 416 / 1000 correct (41.60%)
Iteration 500, loss = 1.6929
Got 421 / 1000 correct (42.10%)
Iteration 600, loss = 1.6278
Got 459 / 1000 correct (45.90%)
Iteration 700, loss = 1.6345
Got 450 / 1000 correct (45.00%)


Keras Model API
---

### Two-Layer Network

In [58]:
class TwoLayerFC(tf.keras.Model):
    def __init__(self, hidden_size, num_classes):
        super().__init__()
        initializer = tf.variance_scaling_initializer(scale=2.0)
        self.fc1 = tf.layers.Dense(hidden_size, activation=tf.nn.relu, kernel_initializer=initializer)
        self.fc2 = tf.layers.Dense(num_classes, kernel_initializer=initializer)
    def call(self, x, training=None):
        x = tf.layers.flatten(x)
        x = self.fc1(x)
        x = self.fc2(x)
        return x
    
def test_TwoLayerFC():
    tf.reset_default_graph()
    input_size, hidden_size, num_classes = 50, 42, 10
    model = TwoLayerFC(hidden_size, num_classes)
    with tf.device(device):
        x = tf.zeros((64, input_size))
        scores = model(x)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        scores_np = sess.run(scores)
        print(scores_np.shape)
        
test_TwoLayerFC()

(64, 10)


In [62]:
def two_layer_fc_functional(inputs, hidden_size, num_classes):
    initializer = tf.variance_scaling_initializer(scale=2.0)
    flatten_inputs = tf.layers.flatten(inputs)
    fc1_out = tf.layers.dense(flatten_inputs, hidden_size, activation=tf.nn.relu, kernel_initializer=initializer)
    scores = tf.layers.dense(fc1_out, num_classes, kernel_initializer=initializer)
    return scores

def test_two_layer_fc_functional():
    tf.reset_default_graph()
    input_size, hidden_size, num_classes = 50, 42, 10
    with tf.device(device):
        x = tf.zeros((64, input_size))
        scores = two_layer_fc_functional(x, hidden_size, num_classes)
        
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        scores_np = sess.run(scores)
        print(scores_np.shape)
test_two_layer_fc_functional()

(64, 10)


### Three-Layer ConvNet

In [63]:
class ThreeLayerConvNet(tf.keras.Model):
    def __init__(self, channel_1, channel_2, num_classes):
        super().__init__()
        initializer = tf.variance_scaling_initializer(scale=2.0)
        self.conv_1 = tf.layers.Conv2D(filters=channel_1, kernel_size=(5, 5), strides=(1, 1), \
                        padding='same', activation=tf.nn.relu, kernel_initializer=initializer)
        self.conv_2 = tf.layers.Conv2D(filters=channel_2, kernel_size=(3, 3), strides=(1, 1), \
                        padding='same', activation=tf.nn.relu, kernel_initializer=initializer)
        self.fc = tf.layers.Dense(units=num_classes, kernel_initializer=initializer)
        
    def call(self, x, training=None):
        x = self.conv_1(x)
        x = self.conv_2(x)
        scores = self.fc(tf.layers.flatten(x))
        return scores

In [65]:
def test_ThreeLayerConvNet():
    tf.reset_default_graph()
    channel_1, channel_2, num_classes = 12, 8, 10
    model = ThreeLayerConvNet(channel_1, channel_2, num_classes)
    with tf.device(device):
        x = tf.zeros((64, 3, 32, 32))
        scores = model(x)
        
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        scores_np = sess.run(scores)
        print(scores_np.shape)
        
test_ThreeLayerConvNet()

(64, 10)


### Training Loop

In [69]:
def train2(model_init_fn, optimizer_init_fn, num_epochs=1):
    tf.reset_default_graph()
    with tf.device(device):
        x = tf.placeholder(tf.float32, [None, 32, 32, 3])
        y = tf.placeholder(tf.int32, [None])
        is_training = tf.placeholder(tf.bool, name='is_training')
        scores = model_init_fn(x, is_training)
        loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=scores)
        loss = tf.reduce_mean(loss)
        
        optimizer = optimizer_init_fn()
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            train_op = optimizer.minimize(loss)
            
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        t = 0
        for epoch in range(num_epochs):
            print('Starting epoch %d' % epoch)
            for x_np, y_np in train_dset:
                feed_dict = {x: x_np, y: y_np}
                loss_np, _ = sess.run([loss, train_op], feed_dict=feed_dict)
                if t % print_every == 0:
                    print('Iteration %d, loss = %.4f' % (t, loss_np))
                    check_accuracy(sess, val_dset, x, scores, is_training=is_training)
                    print()
                t += 1

### Train a Two-Layer Network

In [70]:
hidden_size, num_classes = 4000, 10
learning_rate = 1e-2

def model_init_fn(inputs, is_training):
    return TwoLayerFC(hidden_size, num_classes)(inputs)

def optimizer_init_fn():
    return tf.train.GradientDescentOptimizer(learning_rate)

train2(model_init_fn, optimizer_init_fn)

Starting epoch 0
Iteration 0, loss = 2.6712
Got 136 / 1000 correct (13.60%)

Iteration 100, loss = 1.8448
Got 381 / 1000 correct (38.10%)

Iteration 200, loss = 1.3787
Got 407 / 1000 correct (40.70%)

Iteration 300, loss = 1.7410
Got 399 / 1000 correct (39.90%)

Iteration 400, loss = 1.7544
Got 431 / 1000 correct (43.10%)

Iteration 500, loss = 1.8186
Got 454 / 1000 correct (45.40%)

Iteration 600, loss = 1.8170
Got 427 / 1000 correct (42.70%)

Iteration 700, loss = 1.8482
Got 447 / 1000 correct (44.70%)



### Train a Two-Layer Network

In [72]:
hidden_size, num_classes = 4000, 10
learning_rate = 1e-2

def model_init_fn(inputs, is_training):
    return two_layer_fc_functional(inputs, hidden_size, num_classes)

def optimizer_init_fn():
    return tf.train.GradientDescentOptimizer(learning_rate)

train2(model_init_fn, optimizer_init_fn)

Starting epoch 0
Iteration 0, loss = 2.8083
Got 128 / 1000 correct (12.80%)

Iteration 100, loss = 1.9207
Got 384 / 1000 correct (38.40%)

Iteration 200, loss = 1.4632
Got 411 / 1000 correct (41.10%)

Iteration 300, loss = 1.7852
Got 369 / 1000 correct (36.90%)

Iteration 400, loss = 1.7783
Got 425 / 1000 correct (42.50%)

Iteration 500, loss = 1.7788
Got 434 / 1000 correct (43.40%)

Iteration 600, loss = 1.7856
Got 433 / 1000 correct (43.30%)

Iteration 700, loss = 1.8244
Got 434 / 1000 correct (43.40%)



### Train a Three-Layer ConvNet

In [None]:
learning_rate = 3e-3
channel_1, channel_2, num_classes = 32, 16, 10

def model_init_fn(inputs, is_training):
    return ThreeLayerConvNet(channel_1, channel_2, num_classes)(inputs)

def optimizer_init_fn():
    return tf.train.MomentumOptimizer(learning_rate=learning_rate, momentum=0.9, use_nesterov=True)

train2(model_init_fn, optimizer_init_fn)

Starting epoch 0
Iteration 0, loss = 2.6920
Got 90 / 1000 correct (9.00%)

Iteration 100, loss = 1.6495
Got 414 / 1000 correct (41.40%)



Keras Sequential API
---