<a href="https://colab.research.google.com/github/Abhinavl3v3l/EVA3/blob/master/Assignment13.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import numpy as np
import time, math
from tqdm import tqdm_notebook as tqdm

import tensorflow as tf
import tensorflow.contrib.eager as tfe
import matplotlib.pyplot as plt
%matplotlib inline

In [0]:
from tensorflow.python.framework import ops
from tensorflow.python.ops import math_ops
from tensorflow.python.eager import context

def cyclic_learning_rate(global_step,
                         learning_rate=0.01,
                         max_lr=0.1,
                         step_size=20.,
                         gamma=0.99994,
                         mode='triangular',
                         name=None):
  """Applies cyclic learning rate (CLR).
     From the paper:
     Smith, Leslie N. "Cyclical learning
     rates for training neural networks." 2017.
     [https://arxiv.org/pdf/1506.01186.pdf]
      This method lets the learning rate cyclically
     vary between reasonable boundary values
     achieving improved classification accuracy and
     often in fewer iterations.
      This code varies the learning rate linearly between the
     minimum (learning_rate) and the maximum (max_lr).
      It returns the cyclic learning rate. It is computed as:
       ```python
       cycle = floor( 1 + global_step /
        ( 2 * step_size ) )
      x = abs( global_step / step_size – 2 * cycle + 1 )
      clr = learning_rate +
        ( max_lr – learning_rate ) * max( 0 , 1 - x )
       ```
      Polices:
        'triangular':
          Default, linearly increasing then linearly decreasing the
          learning rate at each cycle.
         'triangular2':
          The same as the triangular policy except the learning
          rate difference is cut in half at the end of each cycle.
          This means the learning rate difference drops after each cycle.
         'exp_range':
          The learning rate varies between the minimum and maximum
          boundaries and each boundary value declines by an exponential
          factor of: gamma^global_step.
       Example: 'triangular2' mode cyclic learning rate.
        '''python
        ...
        global_step = tf.Variable(0, trainable=False)
        optimizer = tf.train.AdamOptimizer(learning_rate=
          clr.cyclic_learning_rate(global_step=global_step, mode='triangular2'))
        train_op = optimizer.minimize(loss_op, global_step=global_step)
        ...
         with tf.Session() as sess:
            sess.run(init)
            for step in range(1, num_steps+1):
              assign_op = global_step.assign(step)
              sess.run(assign_op)
        ...
         '''
       Args:
        global_step: A scalar `int32` or `int64` `Tensor` or a Python number.
          Global step to use for the cyclic computation.  Must not be negative.
        learning_rate: A scalar `float32` or `float64` `Tensor` or a
        Python number.  The initial learning rate which is the lower bound
          of the cycle (default = 0.1).
        max_lr:  A scalar. The maximum learning rate boundary.
        step_size: A scalar. The number of iterations in half a cycle.
          The paper suggests step_size = 2-8 x training iterations in epoch.
        gamma: constant in 'exp_range' mode:
          gamma**(global_step)
        mode: one of {triangular, triangular2, exp_range}.
            Default 'triangular'.
            Values correspond to policies detailed above.
        name: String.  Optional name of the operation.  Defaults to
          'CyclicLearningRate'.
       Returns:
        A scalar `Tensor` of the same type as `learning_rate`.  The cyclic
        learning rate.
      Raises:
        ValueError: if `global_step` is not supplied.
      @compatibility(eager)
      When eager execution is enabled, this function returns
      a function which in turn returns the decayed learning
      rate Tensor. This can be useful for changing the learning
      rate value across different invocations of optimizer functions.
      @end_compatibility
  """
  if global_step is None:
    raise ValueError("global_step is required for cyclic_learning_rate.")
  with ops.name_scope(name, "CyclicLearningRate",
                      [learning_rate, global_step]) as name:
    learning_rate = ops.convert_to_tensor(learning_rate, name="learning_rate")
    dtype = learning_rate.dtype
    global_step = math_ops.cast(global_step, dtype)
    step_size = math_ops.cast(step_size, dtype)
    def cyclic_lr():
      """Helper to recompute learning rate; most helpful in eager-mode."""
      # computing: cycle = floor( 1 + global_step / ( 2 * step_size ) )
      double_step = math_ops.multiply(2., step_size)
      global_div_double_step = math_ops.divide(global_step, double_step)
      cycle = math_ops.floor(math_ops.add(1., global_div_double_step))
      # computing: x = abs( global_step / step_size – 2 * cycle + 1 )
      double_cycle = math_ops.multiply(2., cycle)
      global_div_step = math_ops.divide(global_step, step_size)
      tmp = math_ops.subtract(global_div_step, double_cycle)
      x = math_ops.abs(math_ops.add(1., tmp))
      # computing: clr = learning_rate + ( max_lr – learning_rate ) * max( 0, 1 - x )
      a1 = math_ops.maximum(0., math_ops.subtract(1., x))
      a2 = math_ops.subtract(max_lr, learning_rate)
      clr = math_ops.multiply(a1, a2)
      if mode == 'triangular2':
        clr = math_ops.divide(clr, math_ops.cast(math_ops.pow(2, math_ops.cast(
            cycle-1, tf.int32)), tf.float32))
      if mode == 'exp_range':
        clr = math_ops.multiply(math_ops.pow(gamma, global_step), clr)
      return math_ops.add(clr, learning_rate, name=name)
    if not context.executing_eagerly():
      cyclic_lr = cyclic_lr()
    return cyclic_lr

In [0]:
# # lr_schedule = lambda t: np.interp([t], [0, (11250+1)//5, 11250], [0, 1, 0])[0]
# lr_schedule = lambda t: np.interp([t], [0, (EPOCHS+1)//5, EPOCHS], [0, LEARNING_RATE, 0])[0]
# lr_schedule(200/400)

# Assignment 13
1. Refer to your Assignment 12.
2. Replace whatever model you have there with the ResNet18 model as shown below.
Your model must look like Conv->B1->B2->B3->B4 and not individually called Convs. 
3. If not already using, then:
  - Use Batch Size 128
  - Use Normalization values of: (0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)
  - Random Crop of 32 with padding of 4px
  - Horizontal Flip (0.5)
  - Optimizer: SGD, Weight-Decay: 5e-4
  - NOT-OneCycleLR
  - Save model (to drive) after every 50 epochs or best model till now
4. Describe your blocks, and the stride strategy you have picked
5. Train for 300 Epochs
6. Assignment Target Accuracy is 90%, so exit gracefully if you reach 90% (you can target more, it can go till ~93%)
7. Assignment has hard deadline and any assignment submitted post deadline will not be accepted. 

In [0]:
tf.enable_eager_execution()

In [0]:
BATCH_SIZE = 128 #@param {type:"integer"}
MOMENTUM = 0.9 #@param {type:"number"}
LEARNING_RATE = 0.8 #@param {type:"number"}
WEIGHT_DECAY = 5e-4 #@param {type:"number"}
EPOCHS = 300 #@param {type:"integer"}
ITERATIONS = 11250 #@param {type:"integer"} 

# (60000/128) * 24 =  11250

https://mc.ai/tutorial-1-cifar10-with-google-colabs-free-gpu%E2%80%8A-%E2%80%8A92-5/

In [0]:
def init_pytorch(shape, dtype=tf.float32, partition_info=None):
  fan = np.prod(shape[:-1])
  bound = 1 / math.sqrt(fan)
  return tf.random.uniform(shape, minval=-bound, maxval=bound, dtype=dtype)

class ConvBN(tf.keras.Model):
  def __init__(self, c_out):
    super().__init__()
    self.conv = tf.keras.layers.Conv2D(filters=c_out, kernel_size=3, padding="SAME", kernel_initializer=init_pytorch, use_bias=False)
    self.bn = tf.keras.layers.BatchNormalization(momentum=0.9, epsilon=1e-5)
    self.drop = tf.keras.layers.Dropout(0.05)

  def call(self, inputs):
    return tf.nn.relu(self.bn(self.drop(self.conv(inputs)))) 

class ResBlk(tf.keras.Model):
  def __init__(self, c_out, pool, res = False):
    super().__init__()
    self.conv_bn = ConvBN(c_out)
    self.pool = pool
    self.res = res
    if self.res:
      self.res1 = ConvBN(c_out)
      self.res2 = ConvBN(c_out)

  def call(self, inputs):
    h = self.pool(self.conv_bn(inputs))
    if self.res:
      h = h + self.res2(self.res1(h))
    return h

class DavidNet(tf.keras.Model):
  def __init__(self, c=64, weight=0.125):
    super().__init__()
    pool = tf.keras.layers.MaxPooling2D()
    self.init_conv_bn = ConvBN(c)
    self.blk1 = ResBlk(c*2, pool, res = True)
    self.blk2 = ResBlk(c*4, pool)
    self.blk3 = ResBlk(c*8, pool, res = True)
    self.pool = tf.keras.layers.GlobalMaxPool2D()
    self.linear = tf.keras.layers.Dense(10, kernel_initializer=init_pytorch, use_bias=False)
    self.weight = weight

  def call(self, x, y):
    h = self.pool(self.blk3(self.blk2(self.blk1(self.init_conv_bn(x)))))
    h = self.linear(h) * self.weight
    ce = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=h, labels=y)
    loss = tf.reduce_sum(ce)
    correct = tf.reduce_sum(tf.cast(tf.math.equal(tf.argmax(h, axis = 1), y), tf.float32))
    return loss, correct

In [0]:
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.cifar10.load_data()
len_train, len_test = len(x_train), len(x_test)
y_train = y_train.astype('int64').reshape(len_train)
y_test = y_test.astype('int64').reshape(len_test)

train_mean = np.mean(x_train, axis=(0,1,2))
train_std = np.std(x_train, axis=(0,1,2))

normalize = lambda x: ((x - train_mean) / train_std).astype('float32') # todo: check here
pad4 = lambda x: np.pad(x, [(0, 0), (4, 4), (4, 4), (0, 0)], mode='reflect')


x_train = normalize(pad4(x_train))
x_test = normalize(x_test)

In [29]:
model = DavidNet()
batches_per_epoch = len_train//BATCH_SIZE + 1

# lr_schedule = lambda t: np.interp([t], [0, (EPOCHS+1)//5, EPOCHS], [0, LEARNING_RATE, 0])[0]

lr_schedule = lambda t: np.interp([t], [0, (BATCH_SIZE+1)//10, BATCH_SIZE], [0, LEARNING_RATE, 0])[0]

# import matplotlib.pyplot as plt
# plt.plot([0, (BATCH_SIZE+1)//2, BATCH_SIZE], [0, LEARNING_RATE, 0], 'o')

rates = []

for i in range(0, 11250):
    x = cyclic_learning_rate(i, mode='triangular', gamma=1)
    rates.append(x())

# plt.xlabel('iterations (epochs)')
# plt.ylabel('learning rate')
# plt.plot(range(11250), rates)


global_step = tf.train.get_or_create_global_step() # Number of batches seen.
lr_func = lambda: lr_schedule(rates[global_step/1])
opt = tf.train.MomentumOptimizer(lr_func, momentum=MOMENTUM, use_nesterov=True)
data_aug = lambda x, y: (tf.image.random_flip_left_right(tf.random_crop(x, [32, 32, 3])), y)

print(global_step)


<tf.Variable 'global_step:0' shape=() dtype=int64, numpy=0>


In [30]:
t = time.time()
test_set = tf.data.Dataset.from_tensor_slices((x_test, y_test)).batch(BATCH_SIZE)

for epoch in range(EPOCHS):
  train_loss = test_loss = train_acc = test_acc = 0.0
  train_set = tf.data.Dataset.from_tensor_slices((x_train, y_train)).map(data_aug).shuffle(len_train).batch(BATCH_SIZE).prefetch(1)

  tf.keras.backend.set_learning_phase(1)
  for (x, y) in tqdm(train_set):
    with tf.GradientTape() as tape:
      loss, correct = model(x, y)

    var = model.trainable_variables
    grads = tape.gradient(loss, var)
    for g, v in zip(grads, var):
      g += v * WEIGHT_DECAY * BATCH_SIZE
    opt.apply_gradients(zip(grads, var), global_step=global_step)

    train_loss += loss.numpy()
    train_acc += correct.numpy()

  tf.keras.backend.set_learning_phase(0)
  for (x, y) in test_set:
    loss, correct = model(x, y)
    test_loss += loss.numpy()
    test_acc += correct.numpy()
    
  print('epoch:', epoch+1, 'lr:', lr_schedule(epoch+1), 'train loss:', train_loss / len_train, 'train acc:', train_acc / len_train, 'val loss:', test_loss / len_test, 'val acc:', test_acc / len_test, 'time:', time.time() - t)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


epoch: 1 lr: 0.06666666666666667 train loss: 1.5038237765502929 train acc: 0.448 val loss: 1.2151404479980468 val acc: 0.5803 time: 43.07599139213562


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


epoch: 2 lr: 0.13333333333333333 train loss: 0.8989567930603027 train acc: 0.68322 val loss: 0.8161148460388183 val acc: 0.7194 time: 85.62177014350891


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


epoch: 3 lr: 0.2 train loss: 0.671749254989624 train acc: 0.76804 val loss: 0.6313745498657226 val acc: 0.7894 time: 128.7939326763153


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


epoch: 4 lr: 0.26666666666666666 train loss: 0.5578641076660156 train acc: 0.80658 val loss: 0.4782954850673676 val acc: 0.838 time: 171.98927235603333


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


epoch: 5 lr: 0.3333333333333333 train loss: 0.4849929810333252 train acc: 0.83192 val loss: 0.5161942651748658 val acc: 0.8257 time: 215.44320464134216


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


epoch: 6 lr: 0.4 train loss: 0.4279890584564209 train acc: 0.85086 val loss: 0.6287419741630554 val acc: 0.7956 time: 258.8276550769806


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


epoch: 7 lr: 0.4666666666666667 train loss: 0.38667984577178954 train acc: 0.8672 val loss: 0.477518648147583 val acc: 0.8435 time: 302.8385970592499


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


epoch: 8 lr: 0.5333333333333333 train loss: 0.35406581352233885 train acc: 0.87716 val loss: 0.3731740145683289 val acc: 0.8736 time: 345.7735712528229


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


epoch: 9 lr: 0.6 train loss: 0.31773585983276365 train acc: 0.89068 val loss: 0.4142393620491028 val acc: 0.8644 time: 388.52957105636597


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


epoch: 10 lr: 0.6666666666666666 train loss: 0.2990514247894287 train acc: 0.89672 val loss: 0.4687415581226349 val acc: 0.8507 time: 431.0772604942322


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


epoch: 11 lr: 0.7333333333333333 train loss: 0.27389418518066405 train acc: 0.90548 val loss: 0.5538924253463745 val acc: 0.8289 time: 473.3466465473175


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


epoch: 12 lr: 0.8 train loss: 0.26043057695388794 train acc: 0.90972 val loss: 0.37922341661453246 val acc: 0.8806 time: 515.8748760223389


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


epoch: 13 lr: 0.7931034482758621 train loss: 0.2404197744178772 train acc: 0.9162 val loss: 0.3383291801452637 val acc: 0.891 time: 558.6286861896515


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


epoch: 14 lr: 0.7862068965517242 train loss: 0.21914359477996825 train acc: 0.92458 val loss: 0.4363450415611267 val acc: 0.8715 time: 601.9751224517822


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


epoch: 15 lr: 0.7793103448275862 train loss: 0.20848481552124024 train acc: 0.92722 val loss: 0.43996654143333436 val acc: 0.8715 time: 644.4106891155243


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


epoch: 16 lr: 0.7724137931034483 train loss: 0.19450756180763246 train acc: 0.93022 val loss: 0.6318338095664978 val acc: 0.8371 time: 686.7824549674988


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


epoch: 17 lr: 0.7655172413793104 train loss: 0.18681525524139403 train acc: 0.93624 val loss: 0.31039792051315307 val acc: 0.9045 time: 729.483072757721


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


epoch: 18 lr: 0.7586206896551725 train loss: 0.16976296954154968 train acc: 0.9399 val loss: 0.3584294054031372 val acc: 0.8951 time: 772.1009809970856


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


epoch: 19 lr: 0.7517241379310345 train loss: 0.16596036288261415 train acc: 0.94102 val loss: 0.43061868162155154 val acc: 0.8769 time: 814.9828717708588


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


epoch: 20 lr: 0.7448275862068966 train loss: 0.15539349521636964 train acc: 0.94502 val loss: 0.5248019565582276 val acc: 0.8544 time: 857.7091579437256


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


epoch: 21 lr: 0.7379310344827587 train loss: 0.14806273062705994 train acc: 0.94698 val loss: 0.3710252786397934 val acc: 0.8942 time: 901.703337430954


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


epoch: 22 lr: 0.7310344827586207 train loss: 0.14397074360847473 train acc: 0.94912 val loss: 0.3457326096534729 val acc: 0.9017 time: 944.0067682266235


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


epoch: 23 lr: 0.7241379310344828 train loss: 0.12670192286491394 train acc: 0.95502 val loss: 0.4312111001968384 val acc: 0.8825 time: 986.281660079956


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


epoch: 24 lr: 0.7172413793103448 train loss: 0.12614163744926452 train acc: 0.95592 val loss: 0.5276438403129577 val acc: 0.8638 time: 1028.7087233066559


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


epoch: 25 lr: 0.710344827586207 train loss: 0.11684790924072265 train acc: 0.9591 val loss: 0.3791629760742187 val acc: 0.8953 time: 1071.2080278396606


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


epoch: 26 lr: 0.703448275862069 train loss: 0.11587140006065369 train acc: 0.95976 val loss: 0.337919495010376 val acc: 0.9091 time: 1113.8568823337555


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


epoch: 27 lr: 0.6965517241379311 train loss: 0.11020081942558288 train acc: 0.9613 val loss: 0.359435844039917 val acc: 0.9059 time: 1156.6763968467712


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


epoch: 28 lr: 0.6896551724137931 train loss: 0.10389060346126557 train acc: 0.96382 val loss: 0.47026828804016113 val acc: 0.8857 time: 1199.397867679596


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

IndexError: ignored