Copyright (C) 2018 Software Platform Lab, Seoul National University


Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

# KMOOC_HW2: Training a ResNet model

- Create input pipeline using Tensorflow Dataset API
- Define fully connected layer
- Define variable update operation using optimizer

In [0]:
#@title Run me to download the CIFAR-10 dataset!
# https://blog.shichao.io/2012/10/04/progress_speed_indicator_for_urlretrieve_in_python.html

import os, sys, time
import tarfile
import urllib

def reporthook(count, block_size, total_size):
  """Progress callback for urlretrieve; prints a one-line status to stdout.

  Args:
    count: Number of blocks transferred so far. The call with count == 0
      only records the start time and prints nothing.
    block_size: Size of one block in bytes.
    total_size: Total size of the download in bytes. Servers that do not
      report a size yield a non-positive value, in which case only the
      transferred amount is shown.
  """
  global start_time
  if count == 0:
    start_time = time.time()
    return
  duration = time.time() - start_time
  progress_size = int(count * block_size)
  megabytes = progress_size / (1024 * 1024)
  if total_size > 0:
    # The last block is usually partial, so count * block_size can overshoot
    # the real size; clamp the percentage to 100.
    percent = min(100, int(progress_size * 100 / total_size))
    sys.stdout.write('\r...%d%%, %d MB, %d seconds passed' %
                     (percent, megabytes, duration))
  else:
    # Unknown total size: a percentage cannot be computed.
    sys.stdout.write('\r...%d MB, %d seconds passed' % (megabytes, duration))
  sys.stdout.flush()

cifar10url = 'https://www.cs.toronto.edu/~kriz/cifar-10-binary.tar.gz'
# Local archive name: the last path component of the URL.
cifar10 = cifar10url.split('/')[-1]

# urlretrieve lives in urllib.request on Python 3 and in urllib on Python 2.
try:
  from urllib.request import urlretrieve
except ImportError:
  from urllib import urlretrieve

# Skip the download when the archive is already present.
if not os.path.isfile(cifar10):
  urlretrieve(cifar10url, cifar10, reporthook)
print()
print('Download finished!')

cifar10_extracted = 'cifar-10-batches-bin'

# Skip extraction when the target directory already exists.
if not os.path.isdir(cifar10_extracted):
  # The context manager closes the archive handle even if extraction fails.
  with tarfile.open(cifar10, 'r:gz') as archive:
    archive.extractall()
print('Uncompression finished!')

In [0]:
!mkdir train_ckpt

In [0]:
!ls

## Problem 1. CIFAR10 input

In [0]:
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""CIFAR dataset input module.
"""

import tensorflow as tf

def build_input(dataset, data_path, batch_size, mode):
  """Build CIFAR image and labels.

  Args:
    dataset: Name of the dataset; only 'cifar10' is supported.
    data_path: File pattern (glob) matching the binary data files.
    batch_size: Input batch size.
    mode: 'train' applies random crop/flip augmentation; any other value
      (e.g. 'eval') uses the deterministic preprocessing path.
  Returns:
    images: Batches of images. [batch_size, image_size, image_size, 3]
    labels: Batches of one-hot labels. [batch_size, num_classes]
  Raises:
    ValueError: when the specified dataset is not supported.
  """
  image_size = 32
  if dataset == 'cifar10':
    label_bytes = 1
    label_offset = 0
    num_classes = 10
  else:
    # Format the message eagerly; exceptions do not interpolate extra args.
    raise ValueError('Not supported dataset %s' % dataset)

  depth = 3
  image_bytes = image_size * image_size * depth
  record_bytes = label_bytes + label_offset + image_bytes

  def parse_data(value):
    """Decode one fixed-length record into a (float image, int label) pair."""
    # Convert these examples to dense labels and processed images.
    record = tf.reshape(tf.decode_raw(value, tf.uint8), [record_bytes])
    label = tf.cast(tf.slice(record, [label_offset], [label_bytes]), tf.int32)

    # Convert from string to [depth * height * width] to [depth, height, width].
    depth_major = tf.reshape(
        tf.slice(record, [label_offset + label_bytes], [image_bytes]),
        [depth, image_size, image_size])
    # Convert from [depth, height, width] to [height, width, depth].
    image = tf.cast(tf.transpose(depth_major, [1, 2, 0]), tf.float32)

    if mode == 'train':
      # Pad by 4 pixels in total per dimension, then take a random crop back
      # to the original size and randomly mirror horizontally.
      image = tf.image.resize_image_with_crop_or_pad(
          image, image_size + 4, image_size + 4)
      image = tf.random_crop(image, [image_size, image_size, depth])
      image = tf.image.random_flip_left_right(image)
      image = tf.image.per_image_standardization(image)
    else:
      image = tf.image.resize_image_with_crop_or_pad(
          image, image_size, image_size)
      image = tf.image.per_image_standardization(image)

    return image, label

  data_files = tf.gfile.Glob(data_path)
  data_files.sort()

  ##############################################################################
  #### FIXME: Create an input pipeline using tf.data.Dataset and parse_data ####
  ##############################################################################
  # Exercise placeholder: while ds is None the next statement fails.
  ds = None

  iterator = ds.make_one_shot_iterator()
  images, labels = iterator.get_next()

  # Sanity-check the static shapes produced by the pipeline.
  assert images.shape[1] == images.shape[2] == image_size
  assert images.shape[3] == depth
  assert labels.shape[1] == 1

  images = tf.reshape(images, [batch_size, image_size, image_size, depth])
  labels = tf.reshape(labels, [batch_size, 1])
  # Pair each row index with its class id and scatter 1.0 at those positions
  # to obtain one-hot labels.
  indices = tf.reshape(tf.range(0, batch_size, 1), [batch_size, 1])
  labels = tf.sparse_to_dense(
      tf.concat(values=[indices, labels], axis=1),
      [batch_size, num_classes], 1.0, 0.0)
  return images, labels

# Smoke test: build the training input pipeline once in a throwaway graph.
batch_size = 128
train_data_path = './cifar-10-batches-bin/data_batch*'
with tf.Graph().as_default():
  images, labels = build_input(
      dataset='cifar10',
      data_path=train_data_path,
      batch_size=batch_size,
      mode='train')
## Problem 2. Define FC layer

In [0]:
def fully_connected(batch_size, x, out_dim):
    """FullyConnected layer for final output.

    Exercise scaffold: the bias variable and the affine op are left for the
    student to fill in, so this function returns None until both placeholders
    below are replaced.

    Args:
      batch_size: Leading dimension used when flattening x to 2-D.
      x: Input tensor; reshaped to [batch_size, -1].
      out_dim: Number of output units (second dimension of the weight matrix).
    Returns:
      The value bound to xw_plus_b (None in this unfinished template).
    """
    # Flatten every non-batch dimension into a single feature axis.
    x = tf.reshape(x, [batch_size, -1])
    # Weight matrix of shape [num_features, out_dim]. It is named 'DW', the
    # substring ResNet._decay looks for when collecting weight-decay terms.
    w = tf.get_variable(
        'DW', [x.get_shape()[1], out_dim],
        initializer=tf.uniform_unit_scaling_initializer(factor=1.0))
    
    ################################################
    #### FIXME: Create a variable 'b'           ####
    #### HINT: name: 'biases', shape: [out_dim] ####
    ####       use constant_initializer         ####
    ################################################
    b = None
    
    ##################################################
    #### FIXME: Create an xw_plus_b op            ####
    #### HINT: xw+b (xw is matrix multiplication) ####
    ##################################################
    xw_plus_b = None
    return xw_plus_b

## Problem 3. Define optimizer and update operation

In [0]:
def build_train_op(loss, lrn_rate):
    """Build training specific ops for the graph.

    Exercise scaffold: the optimizer update is left for the student to fill
    in, so this function returns None until the placeholder is replaced.

    Args:
      loss: Scalar loss to minimize (ResNet.build_graph passes self.cost).
      lrn_rate: Learning rate (ResNet.build_graph passes a float32 constant).
    Returns:
      The value bound to update_op (None in this unfinished template).
      ResNet.build_graph uses it as a control dependency of the global-step
      increment.
    """

    ####################################################################
    #### FIXME: Create an optimizer using lrn_rate as learning rate ####
    ####        and build the update op that minimizes loss         ####
    ####################################################################
    update_op = None
    
    return update_op

## Define ResNet model

In [0]:
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""ResNet model.

Related papers:
https://arxiv.org/pdf/1603.05027v2.pdf
https://arxiv.org/pdf/1512.03385v1.pdf
https://arxiv.org/pdf/1605.07146v1.pdf
"""
from collections import namedtuple

import numpy as np
import tensorflow as tf
import six

from tensorflow.python.training import moving_averages


# Immutable bundle of the hyperparameters consumed by ResNet and the
# train/eval loops.
HParams = namedtuple(
    'HParams',
    [
        'batch_size',
        'num_classes',
        'min_lrn_rate',
        'lrn_rate',
        'num_residual_units',
        'use_bottleneck',
        'weight_decay_rate',
        'relu_leakiness',
    ])


class ResNet(object):
  """ResNet model."""

  def __init__(self, hps, images, labels, mode):
    """ResNet constructor.

    Args:
      hps: Hyperparameters.
      images: Batches of images. [batch_size, image_size, image_size, 3]
      labels: Batches of labels. [batch_size, num_classes]
      mode: One of 'train' and 'eval'.
    """
    self.hps = hps
    self._images = images
    self.labels = labels
    self.mode = mode

    # Ops that must run alongside the optimizer step (batch-norm moving
    # average updates); filled in by _batch_norm while the graph is built.
    self._extra_train_ops = []

  def build_graph(self):
    """Build a whole graph for the model.

    Always creates global_step, the model (predictions and cost) and the
    merged summaries. In 'train' mode it additionally creates lrn_rate and
    train_op.
    """
    self.global_step = tf.train.get_or_create_global_step()
    self._build_model()
    if self.mode == 'train':
      self.lrn_rate = tf.constant(self.hps.lrn_rate, tf.float32)
      update_op = build_train_op(self.cost, self.lrn_rate)

      # Increment the step counter only after the optimizer update has run.
      with tf.control_dependencies([update_op]):
        apply_op = tf.assign_add(self.global_step, 1)

      train_ops = [apply_op] + self._extra_train_ops
      self.train_op = tf.group(*train_ops)

    self.summaries = tf.summary.merge_all()

  def _stride_arr(self, stride):
    """Map a stride scalar to the stride array for tf.nn.conv2d."""
    return [1, stride, stride, 1]

  def _build_model(self):
    """Build the core model within the graph.

    Sets self.predictions (softmax of the logits) and self.cost
    (mean cross-entropy plus weight decay).
    """
    with tf.variable_scope('init'):
      x = self._images
      x = self._conv('init_conv', x, 3, 3, 16, self._stride_arr(1))

    strides = [1, 2, 2]
    activate_before_residual = [True, False, False]
    if self.hps.use_bottleneck:
      res_func = self._bottleneck_residual
      filters = [16, 64, 128, 256]
    else:
      res_func = self._residual
      filters = [16, 16, 32, 64]
      # Uncomment the following codes to use w28-10 wide residual network.
      # It is more memory efficient than very deep residual network and has
      # comparably good performance.
      # https://arxiv.org/pdf/1605.07146v1.pdf
      # filters = [16, 160, 320, 640]
      # Update hps.num_residual_units to 4

    # Three stages; the first unit of each stage changes the filter count
    # (and, for stages 2 and 3, the stride), the remaining units keep both.
    with tf.variable_scope('unit_1_0'):
      x = res_func(x, filters[0], filters[1], self._stride_arr(strides[0]),
                   activate_before_residual[0])
    for i in six.moves.range(1, self.hps.num_residual_units):
      with tf.variable_scope('unit_1_%d' % i):
        x = res_func(x, filters[1], filters[1], self._stride_arr(1), False)

    with tf.variable_scope('unit_2_0'):
      x = res_func(x, filters[1], filters[2], self._stride_arr(strides[1]),
                   activate_before_residual[1])
    for i in six.moves.range(1, self.hps.num_residual_units):
      with tf.variable_scope('unit_2_%d' % i):
        x = res_func(x, filters[2], filters[2], self._stride_arr(1), False)

    with tf.variable_scope('unit_3_0'):
      x = res_func(x, filters[2], filters[3], self._stride_arr(strides[2]),
                   activate_before_residual[2])
    for i in six.moves.range(1, self.hps.num_residual_units):
      with tf.variable_scope('unit_3_%d' % i):
        x = res_func(x, filters[3], filters[3], self._stride_arr(1), False)

    with tf.variable_scope('unit_last'):
      x = self._batch_norm('final_bn', x)
      x = self._relu(x, self.hps.relu_leakiness)
      x = self._global_avg_pool(x)

    with tf.variable_scope('logit'):
      logits = fully_connected(self.hps.batch_size, x, self.hps.num_classes)
      self.predictions = tf.nn.softmax(logits)

    with tf.variable_scope('costs'):
      xent = tf.nn.softmax_cross_entropy_with_logits(
          logits=logits, labels=self.labels)
      self.cost = tf.reduce_mean(xent, name='xent')
      self.cost += self._decay()

      tf.summary.scalar('cost', self.cost)

  # TODO(xpan): Consider batch_norm in contrib/layers/python/layers/layers.py
  def _batch_norm(self, name, x):
    """Batch normalization.

    In 'train' mode the input is normalized with the moments computed over
    axes [0, 1, 2], and moving-average updates (decay 0.9) are queued in
    self._extra_train_ops. In any other mode the stored moving statistics
    are used instead and histogram summaries of them are emitted.

    Args:
      name: Variable scope name for this layer.
      x: Input tensor; its last dimension sets the parameter shape.
    Returns:
      The normalized tensor, with the same static shape as x.
    """
    with tf.variable_scope(name):
      params_shape = [x.get_shape()[-1]]

      beta = tf.get_variable(
          'beta', params_shape, tf.float32,
          initializer=tf.constant_initializer(0.0, tf.float32))
      gamma = tf.get_variable(
          'gamma', params_shape, tf.float32,
          initializer=tf.constant_initializer(1.0, tf.float32))

      if self.mode == 'train':
        mean, variance = tf.nn.moments(x, [0, 1, 2], name='moments')

        moving_mean = tf.get_variable(
            'moving_mean', params_shape, tf.float32,
            initializer=tf.constant_initializer(0.0, tf.float32),
            trainable=False)
        moving_variance = tf.get_variable(
            'moving_variance', params_shape, tf.float32,
            initializer=tf.constant_initializer(1.0, tf.float32),
            trainable=False)

        self._extra_train_ops.append(moving_averages.assign_moving_average(
            moving_mean, mean, 0.9))
        self._extra_train_ops.append(moving_averages.assign_moving_average(
            moving_variance, variance, 0.9))
      else:
        mean = tf.get_variable(
            'moving_mean', params_shape, tf.float32,
            initializer=tf.constant_initializer(0.0, tf.float32),
            trainable=False)
        variance = tf.get_variable(
            'moving_variance', params_shape, tf.float32,
            initializer=tf.constant_initializer(1.0, tf.float32),
            trainable=False)
        tf.summary.histogram(mean.op.name, mean)
        tf.summary.histogram(variance.op.name, variance)
      # epsilon used to be 1e-5. Maybe 0.001 solves NaN problem in deeper net.
      y = tf.nn.batch_normalization(
          x, mean, variance, beta, gamma, 0.001)
      y.set_shape(x.get_shape())
      return y

  def _residual(self, x, in_filter, out_filter, stride,
                activate_before_residual=False):
    """Residual unit with 2 sub layers.

    Args:
      x: Input tensor.
      in_filter: Number of input channels.
      out_filter: Number of output channels.
      stride: Stride array for the first convolution (see _stride_arr).
      activate_before_residual: If True, the shortcut branch takes the
        BN+ReLU output; otherwise it takes the raw input.
    Returns:
      Output tensor of the unit.
    """
    if activate_before_residual:
      with tf.variable_scope('shared_activation'):
        x = self._batch_norm('init_bn', x)
        x = self._relu(x, self.hps.relu_leakiness)
        orig_x = x
    else:
      with tf.variable_scope('residual_only_activation'):
        orig_x = x
        x = self._batch_norm('init_bn', x)
        x = self._relu(x, self.hps.relu_leakiness)

    with tf.variable_scope('sub1'):
      x = self._conv('conv1', x, 3, in_filter, out_filter, stride)

    with tf.variable_scope('sub2'):
      x = self._batch_norm('bn2', x)
      x = self._relu(x, self.hps.relu_leakiness)
      x = self._conv('conv2', x, 3, out_filter, out_filter, [1, 1, 1, 1])

    with tf.variable_scope('sub_add'):
      if in_filter != out_filter:
        # Parameter-free shortcut: downsample spatially with average pooling
        # and zero-pad the channel axis on both sides.
        orig_x = tf.nn.avg_pool(orig_x, stride, stride, 'VALID')
        orig_x = tf.pad(
            orig_x, [[0, 0], [0, 0], [0, 0],
                     [(out_filter-in_filter)//2, (out_filter-in_filter)//2]])
      x += orig_x

    tf.logging.debug('image after unit %s', x.get_shape())
    return x

  def _bottleneck_residual(self, x, in_filter, out_filter, stride,
                           activate_before_residual=False):
    """Bottleneck residual unit with 3 sub layers.

    Args:
      x: Input tensor.
      in_filter: Number of input channels.
      out_filter: Number of output channels; the two inner convolutions use
        out_filter // 4 channels.
      stride: Stride array for the first convolution and for the projection
        shortcut (see _stride_arr).
      activate_before_residual: If True, the shortcut branch takes the
        BN+ReLU output; otherwise it takes the raw input.
    Returns:
      Output tensor of the unit.
    """
    # Floor division keeps the filter count an int on Python 3 as well;
    # true division would hand a float to the variable shape.
    bottleneck_filter = out_filter // 4

    if activate_before_residual:
      with tf.variable_scope('common_bn_relu'):
        x = self._batch_norm('init_bn', x)
        x = self._relu(x, self.hps.relu_leakiness)
        orig_x = x
    else:
      with tf.variable_scope('residual_bn_relu'):
        orig_x = x
        x = self._batch_norm('init_bn', x)
        x = self._relu(x, self.hps.relu_leakiness)

    with tf.variable_scope('sub1'):
      x = self._conv('conv1', x, 1, in_filter, bottleneck_filter, stride)

    with tf.variable_scope('sub2'):
      x = self._batch_norm('bn2', x)
      x = self._relu(x, self.hps.relu_leakiness)
      x = self._conv('conv2', x, 3, bottleneck_filter, bottleneck_filter,
                     [1, 1, 1, 1])

    with tf.variable_scope('sub3'):
      x = self._batch_norm('bn3', x)
      x = self._relu(x, self.hps.relu_leakiness)
      x = self._conv('conv3', x, 1, bottleneck_filter, out_filter,
                     [1, 1, 1, 1])

    with tf.variable_scope('sub_add'):
      if in_filter != out_filter:
        # 1x1 projection so the shortcut matches the main branch's shape.
        orig_x = self._conv('project', orig_x, 1, in_filter, out_filter, stride)
      x += orig_x

    tf.logging.info('image after unit %s', x.get_shape())
    return x

  def _decay(self):
    """L2 weight decay loss.

    Sums tf.nn.l2_loss over every trainable variable whose op name contains
    'DW' and scales the sum by hps.weight_decay_rate.
    """
    # Substring test rather than find(...) > 0, which would skip a variable
    # whose name starts with 'DW'.
    costs = [tf.nn.l2_loss(var) for var in tf.trainable_variables()
             if 'DW' in var.op.name]

    return tf.multiply(self.hps.weight_decay_rate, tf.add_n(costs))

  def _conv(self, name, x, filter_size, in_filters, out_filters, strides):
    """Convolution.

    Args:
      name: Variable scope name for this layer.
      x: Input tensor.
      filter_size: Side length of the square kernel.
      in_filters: Number of input channels.
      out_filters: Number of output channels.
      strides: Stride array passed to tf.nn.conv2d.
    Returns:
      Result of a 'SAME'-padded convolution with a kernel variable named 'DW'.
    """
    with tf.variable_scope(name):
      # Kernel is drawn from a normal distribution with
      # stddev = sqrt(2 / (filter_size^2 * out_filters)).
      n = filter_size * filter_size * out_filters
      kernel = tf.get_variable(
          'DW', [filter_size, filter_size, in_filters, out_filters],
          tf.float32, initializer=tf.random_normal_initializer(
              stddev=np.sqrt(2.0/n)))
      return tf.nn.conv2d(x, kernel, strides, padding='SAME')

  def _relu(self, x, leakiness=0.0):
    """Relu, with optional leaky support.

    Negative entries are scaled by leakiness; the rest pass through.
    """
    return tf.where(tf.less(x, 0.0), leakiness * x, x, name='leaky_relu')

  def _global_avg_pool(self, x):
    """Average a rank-4 tensor over axes 1 and 2."""
    assert x.get_shape().ndims == 4
    return tf.reduce_mean(x, [1, 2])

## Train ResNet model
Do not be alarmed if the cell ends with the following message; it appears because `tf.app.run()` exits via `sys.exit()` once `main` returns:
`An exception has occurred, use %tb to see the full traceback.  SystemExit`

In [0]:
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""ResNet Train/Eval module.
"""
import time
import six
import sys

import numpy as np
import tensorflow as tf

# Global settings for the training cell.
image_size = 32
train_data_path = './cifar-10-batches-bin/data_batch*'
# Checkpoints are written as <ckpt_dir>/cifar10-train-<step>.
ckpt_dir = './train_ckpt'
ckpt_prefix = '%s/cifar10-train' % ckpt_dir

def train(hps, num_steps=3001):
  """Training loop.

  Builds the training input pipeline and model in a fresh graph, runs
  num_steps training iterations and, whenever the fetched global step is a
  multiple of 100, prints the loss/precision and saves a checkpoint.

  Args:
    hps: HParams used to build the model; hps.batch_size sets the batch size.
    num_steps: Number of training iterations to run.
  """
  # The checkpoint directory must exist before saving; create it here
  # instead of relying on a separate shell cell having been run.
  tf.gfile.MakeDirs(ckpt_dir)

  with tf.Graph().as_default():
    images, labels = build_input(
        'cifar10', train_data_path, hps.batch_size, 'train')
    model = ResNet(hps, images, labels, 'train')
    model.build_graph()

    # Fraction of the batch whose arg-max prediction matches the label.
    truth = tf.argmax(model.labels, axis=1)
    predictions = tf.argmax(model.predictions, axis=1)
    precision = tf.reduce_mean(tf.to_float(tf.equal(predictions, truth)))

    init = tf.global_variables_initializer()

    # Large max_to_keep so that no intermediate checkpoint is deleted.
    saver = tf.train.Saver(max_to_keep=10000)

    with tf.Session() as sess:
      sess.run(init)
      for i in range(num_steps):
        _, global_step, cost, precision_ = sess.run(
            [model.train_op, model.global_step, model.cost, precision])

        if global_step % 100 == 0:
          print('step: %d, loss: %.3f, precision: %.3f' %
                (global_step, cost, precision_))

          # NOTE(review): the checkpoint suffix is the loop index, not the
          # fetched global_step; confirm the two are meant to coincide.
          saver.save(sess, ckpt_prefix, global_step=i)

        
def main(_):
  """Entry point for tf.app.run: assemble hyperparameters and train."""
  hps = HParams(
      batch_size=128,
      num_classes=10,
      min_lrn_rate=0.0001,
      lrn_rate=0.1,
      num_residual_units=5,
      use_bottleneck=False,
      weight_decay_rate=0.0002,
      relu_leakiness=0.1)
  train(hps)


if __name__ == '__main__':
  # Show INFO-level TensorFlow logs, then hand control to main().
  tf.logging.set_verbosity(tf.logging.INFO)
  tf.app.run()

### After training, run the following cell to list the *cifar10-train-0* … *cifar10-train-3000* checkpoint files.

In [0]:
!ls train_ckpt

## Evaluate trained ResNet model

Before you run this code
click Runtime->**restart runtime**

(Click *RESET ALL RUNTIMES* only if you want to erase all local files, including the trained checkpoints; otherwise **DO NOT CLICK** it!)

and then re-run the **Problem 1. CIFAR10 input**, **Problem 2. Define FC layer**, and **Define ResNet model** cells.

Do not be alarmed if the cell ends with the following message; it appears because `tf.app.run()` exits via `sys.exit()` once `main` returns:
`An exception has occurred, use %tb to see the full traceback.  SystemExit`

In [0]:
!rm -rf './tensorboard'
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""ResNet Train/Eval module.
"""
import time
import six
import sys

import numpy as np
import tensorflow as tf

# Settings for the evaluation cell.
ckpt_dir = './train_ckpt'  # where the training cell wrote its checkpoints
eval_data_path = './cifar-10-batches-bin/test_batch.bin'
tensorboard_path = './tensorboard'  # destination for summary event files

def evaluate(hps):
  """Eval loop.

  Restores each checkpoint recorded in ckpt_dir in turn, runs 100 evaluation
  batches per checkpoint and writes 'Precision' / 'Best Precision' summaries
  (plus the model summaries) to tensorboard_path. Returns early, after
  logging, when no checkpoint state can be obtained.

  Args:
    hps: HParams used to build the model; hps.batch_size sets the batch size.
  """
  with tf.Graph().as_default():
    images, labels = build_input(
        'cifar10', eval_data_path, hps.batch_size, 'eval')
    model = ResNet(hps, images, labels, 'eval')
    model.build_graph()

    saver = tf.train.Saver()

    # Bail out instead of falling through: without a checkpoint state there
    # is nothing to iterate over below.
    try:
      ckpt_state = tf.train.get_checkpoint_state(ckpt_dir)
    except tf.errors.OutOfRangeError as e:
      tf.logging.error('Cannot restore checkpoint: %s', e)
      return
    if not ckpt_state:
      tf.logging.info('No model to eval yet at %s', ckpt_dir)
      return

    with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as sess:
      summary_writer = tf.summary.FileWriter(tensorboard_path, sess.graph)
      try:
        best_precision = 0.
        for i, ckpt_path in enumerate(ckpt_state.all_model_checkpoint_paths):
          tf.logging.info('Loading checkpoint %s', ckpt_path)
          saver.restore(sess, ckpt_path)
          total_prediction, correct_prediction = 0, 0

          for _ in six.moves.range(100):
            (summaries, loss, predictions, truth, train_step) = sess.run(
                [model.summaries, model.cost, model.predictions,
                 model.labels, model.global_step])

            # Compare arg-max class ids of the labels and the predictions.
            truth = np.argmax(truth, axis=1)
            predictions = np.argmax(predictions, axis=1)
            correct_prediction += np.sum(truth == predictions)
            total_prediction += predictions.shape[0]

          precision = 1.0 * correct_prediction / total_prediction
          best_precision = max(precision, best_precision)

          precision_summ = tf.Summary()
          precision_summ.value.add(
              tag='Precision', simple_value=precision)
          summary_writer.add_summary(precision_summ, train_step)
          best_precision_summ = tf.Summary()
          best_precision_summ.value.add(
              tag='Best Precision', simple_value=best_precision)
          summary_writer.add_summary(best_precision_summ, train_step)
          summary_writer.add_summary(summaries, train_step)

          # loss and summaries come from the last batch of the inner loop.
          tf.logging.info('loss: %.3f, precision: %.3f, best precision: %.3f' %
                          (loss, precision, best_precision))
          summary_writer.flush()

          # NOTE(review): i * 100 assumes one checkpoint per 100 steps, in
          # order; confirm against how the checkpoints were saved.
          tf.logging.info('step: %d, loss: %.3f, precision: %.3f' %
                          (i * 100, loss, precision))
      finally:
        # Release the event file handle even if evaluation fails midway.
        summary_writer.close()


def main(_):
  """Entry point for tf.app.run: assemble hyperparameters and evaluate."""
  settings = {
      'batch_size': 100,
      'num_classes': 10,
      'min_lrn_rate': 0.0001,
      'lrn_rate': 0.1,
      'num_residual_units': 5,
      'use_bottleneck': False,
      'weight_decay_rate': 0.0002,
      'relu_leakiness': 0.1,
  }
  evaluate(HParams(**settings))


if __name__ == '__main__':
  # Show INFO-level TensorFlow logs, then hand control to main().
  tf.logging.set_verbosity(tf.logging.INFO)
  tf.app.run()

### Display our graph on tensorboard!

In [0]:
!wget https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.zip
!unzip ngrok-stable-linux-amd64.zip

# Start TensorBoard in the background, reading the evaluation summaries
# and listening on port 6006.
LOG_DIR = './tensorboard'
get_ipython().system_raw(
    'tensorboard --logdir %s --host 0.0.0.0 --port 6006 &' % LOG_DIR)

# Start ngrok in the background, tunnelling HTTP traffic to port 6006.
get_ipython().system_raw('./ngrok http 6006 &')

In [0]:
! curl -s http://localhost:4040/api/tunnels | python3 -c \
    "import sys, json; print(json.load(sys.stdin)['tunnels'][0]['public_url'])"