# @fchollet's Tensorflow 2.0 Crash Course

In [3]:
import tensorflow as tf
from tensorflow.keras.layers import Layer
import math

## Manually Create A Layer

In [4]:
class Linear(Layer):
    def __init__(self, input_dim=784, output_dim=32, c=2):
        super(Linear,self).__init__()
        w_init = tf.random_normal_initializer()
        self.w = tf.Variable(
            initial_value=w_init(shape=(input_dim,output_dim), dtype='float32') * c/math.sqrt(input_dim),
            trainable=True
        )
        b_init = tf.zeros_initializer()
        self.b = tf.Variable(
            initial_value=b_init(shape=(output_dim,),dtype='float32'),
            trainable=True
        )
    def call(self,x):
        return tf.matmul(x,self.w) + self.b

In [5]:
x = tf.random.normal((10,784))
l = Linear(784,32)
l(x).shape

TensorShape([10, 32])

## Build Method

In [1]:
class Linear(Layer):
    def __init__(self, output_dim=32):
        super(Linear,self).__init__()
        self.output_dim = output_dim
    def build(self,x_shape):
        self.w = self.add_weight(shape=(x_shape[-1],self.output_dim),
                                initializer='he_normal',trainable=True)
        self.b = self.add_weight(shape=(self.output_dim,),
                                initializer='zeros',trainable=True)
    def call(self,x):
        return tf.matmul(x,self.w) + self.b

NameError: name 'Layer' is not defined

In [7]:
x = tf.random.normal((10,784))
l = Linear(32)
l(x).shape

TensorShape([10, 32])

## Dataset

In [10]:
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.fashion_mnist.load_data()
x_train.shape,x_test.shape,y_train.shape,y_test.shape

((60000, 28, 28), (10000, 28, 28), (60000,), (10000,))

In [31]:
#datasets
dataset = tf.data.Dataset.from_tensor_slices(
    (x_train.reshape(60000,-1).astype('float32') / 255.,y_train),
    ).shuffle(buffer_size=y_train.shape[0]).batch(64)
dataset

<BatchDataset shapes: ((None, 784), (None,)), types: (tf.float32, tf.uint8)>

## Loss Function

In [32]:
#sparse means not one-hot encoded
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

In [33]:
loss_fn(tf.constant([0,1,2]), #y
        tf.constant([[0,1,0], #pred for each y; tensorflow calls raw prediction logits
         [1,0.5,0.2],
         [0.5,0.5,0]]))

<tf.Tensor: id=172948, shape=(), dtype=float32, numpy=1.4100529>

## Optimizer

In [34]:
optimizer = tf.keras.optimizers.Adam(learning_rate = 1e-3,
                                    beta_1=0.9, beta_2=0.999)

## Training Loop

In [72]:
l = Linear(10)
for i,(x,y) in enumerate(dataset):
    with tf.GradientTape() as tape:
        preds = l(x) #prediction
        loss = loss_fn(y,preds) #record loss
        
        gradients = tape.gradient(loss,l.trainable_weights) #record gradients
    #update 
    optimizer.apply_gradients(zip(gradients,l.trainable_weights))
    #log
    if i % 100==0:
        acc = (tf.nn.softmax(preds).numpy().argmax(1)==y.numpy()).mean()
        print(i,loss.numpy(), acc)

0 2.550394 0.015625
100 0.5202041 0.828125
200 0.5742572 0.796875
300 0.54601276 0.828125
400 0.401887 0.859375
500 0.454468 0.8125
600 0.4281699 0.875
700 0.45130214 0.828125
800 0.63539547 0.765625
900 0.5161672 0.8125


## Non-trainable Layers

In [114]:
class ReduceSum(Layer):
    def __init__(self,input_dim):
        super(ReduceSum,self).__init__()
        self.total = tf.Variable(initial_value=tf.zeros(input_dim), trainable=False)
    def __call__(self,x):
        col_sum = tf.reduce_sum(x,axis=0) #reduce
        self.total.assign_add(col_sum) #add 
#         self.total.assign_add(tf.ones(10)) #add 
        return self.total

In [115]:
l = ReduceSum(10)
l.get_weights()

[array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=float32)]

In [116]:
for i in range(10): 
    print(l(tf.ones((3,10))).numpy())

[3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]
[6. 6. 6. 6. 6. 6. 6. 6. 6. 6.]
[9. 9. 9. 9. 9. 9. 9. 9. 9. 9.]
[12. 12. 12. 12. 12. 12. 12. 12. 12. 12.]
[15. 15. 15. 15. 15. 15. 15. 15. 15. 15.]
[18. 18. 18. 18. 18. 18. 18. 18. 18. 18.]
[21. 21. 21. 21. 21. 21. 21. 21. 21. 21.]
[24. 24. 24. 24. 24. 24. 24. 24. 24. 24.]
[27. 27. 27. 27. 27. 27. 27. 27. 27. 27.]
[30. 30. 30. 30. 30. 30. 30. 30. 30. 30.]


## Stack Them Together

In [124]:
#how to do nn.ModuleList
class MLP(Layer):
    def __init__(self,output_dims=[32,32,10]):
        super(MLP,self).__init__()
        self.linears = [Linear(i) for i in output_dims]
    def call(self,x):
        for l in self.linears[:-1]:
            x = tf.nn.relu(l(x))
        return self.linears[-1](x)

In [129]:
m = MLP([32,32,10])
m(x).shape

TensorShape([32, 10])

In [130]:
for i,(x,y) in enumerate(dataset):
    with tf.GradientTape() as tape:
        preds = m(x) #prediction
        loss = loss_fn(y,preds) #record loss
        gradients = tape.gradient(loss,m.trainable_weights) #record gradients
    #update 
    optimizer.apply_gradients(zip(gradients,m.trainable_weights))
    #log
    if i % 100==0:
        acc = (tf.nn.softmax(preds).numpy().argmax(1)==y.numpy()).mean()
        print(i,loss.numpy(), acc)

0 2.3783333 0.0625
100 0.5672692 0.78125
200 0.57003796 0.78125
300 0.4793804 0.875
400 0.3243881 0.890625
500 0.47998983 0.765625
600 0.38265362 0.890625
700 0.36116746 0.859375
800 0.5555291 0.78125
900 0.45142463 0.84375


## Layer-specific Loss

In [176]:
#regularize based on sum of input; penalize dense features, we prefer less features
class ActivityRegularization(Layer):
    def __init__(self, rate=1e-2):
        super(ActivityRegularization,self).__init__()
        self.rate = rate
    def call(self,x):
        self.add_loss(self.rate * tf.reduce_sum(x))
        self.add_loss(0)
        return x

In [177]:
#how to do nn.ModuleList
class SparseMLP(Layer):
    def __init__(self,output_dims=[32,32,10], rate=1e-2):
        super(SparseMLP,self).__init__()
        self.linears = [Linear(i) for i in output_dims]
        self.reg = ActivityRegularization(rate)
    def call(self,x):
        for l in self.linears[:-1]:
            x = tf.nn.relu(l(x))
        x = self.reg(x) #return x; record loss
        x = self.linears[-1](x)
        return x

In [182]:
from datetime import datetime
start_time = datetime.now()

#setup
m = SparseMLP([32,10], 1e-4)
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam(learning_rate = 1e-3,
                                    beta_1=0.9, beta_2=0.999)

#loop
for i,(x,y) in enumerate(dataset):
    with tf.GradientTape() as tape:
        preds = m(x) #prediction
        loss = loss_fn(y,preds) #record loss
        loss += sum(m.losses) #add losses from layers
        gradients = tape.gradient(loss,m.trainable_weights) #record gradients
    #update 
    optimizer.apply_gradients(zip(gradients,m.trainable_weights))
    #log
    if i % 100==0:
        acc = (tf.nn.softmax(preds).numpy().argmax(1)==y.numpy()).mean()
        print(i,loss.numpy(), acc)

print(f'Done in {datetime.now() - start_time}')

0 2.4885657 0.0625
100 0.84381044 0.765625
200 0.8076217 0.75
300 0.69481254 0.84375
400 0.5209669 0.875
500 0.5760761 0.859375
600 0.5034058 0.875
700 0.5139364 0.875
800 0.7161176 0.765625
900 0.55102414 0.859375
Done in 0:01:19.100688


## Static Graph

In [189]:
from datetime import datetime
start_time = datetime.now()

#set up
m = SparseMLP([32,32,10],1e-4)
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam(learning_rate = 1e-3,
                                    beta_1=0.9, beta_2=0.999)
#graph
@tf.function
def train_batch(x,y):
    with tf.GradientTape() as tape:
        preds = m(x)
        loss = loss_fn(y,preds)
        loss += sum(m.losses)
        gradients = tape.gradient(loss,m.trainable_weights)
    optimizer.apply_gradients(zip(gradients,m.trainable_weights))
    return loss,preds

#loop
for i, (x,y) in enumerate(dataset):
    if i % 100==0:
        loss,preds = train_batch(x,y)
        acc = (tf.nn.softmax(preds).numpy().argmax(1)==y.numpy()).mean()
        print(i,loss.numpy(), acc)

print(f'Done in {datetime.now() - start_time}')

0 2.5101354 0.0625
100 2.4080944 0.0625
200 2.2871478 0.171875
300 2.1330879 0.34375
400 2.1797462 0.1875
500 2.1442578 0.28125
600 2.1344922 0.203125
700 1.9918184 0.40625
800 1.981747 0.34375
900 1.9799223 0.328125
Done in 0:00:03.034253


## Turn Off Layers

In [190]:
class Dropout(Layer):
    def __init__(self, p = 0.5):
        super(Dropout,self).__init__()
        self.p = p
    #to remind it that if needs to be put in the graph
    @tf.function
    def __call__(self,x, training=True):
        if training:
            return tf.nn.dropout(x,rate=self.p)
        return x
    
#how to do nn.ModuleList
class DropoutMLP(Layer):
    def __init__(self,output_dims=[32,32,10],p=0.5):
        super(DropoutMLP,self).__init__()
        self.linears = [Linear(i) for i in output_dims]
        self.dropout = Dropout(p)
    def call(self,x):
        for l in self.linears[:-1]:
            x = tf.nn.relu(l(x))
        x = self.dropout(x)
        return self.linears[-1](x)

In [193]:
from datetime import datetime
start_time = datetime.now()

#set up
m = DropoutMLP([32,32,10],0.1)
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam(learning_rate = 1e-3,
                                    beta_1=0.9, beta_2=0.999)
#graph
@tf.function
def train_batch(x,y):
    with tf.GradientTape() as tape:
        preds = m(x)
        loss = loss_fn(y,preds)
        loss += sum(m.losses)
        gradients = tape.gradient(loss,m.trainable_weights)
    optimizer.apply_gradients(zip(gradients,m.trainable_weights))
    return loss,preds

#loop
for i, (x,y) in enumerate(dataset):
    if i % 100==0:
        loss,preds = train_batch(x,y)
        acc = (tf.nn.softmax(preds).numpy().argmax(1)==y.numpy()).mean()
        print(i,loss.numpy(), acc)

print(f'Done in {datetime.now() - start_time}')

0 2.3755062 0.1875
100 2.3854315 0.046875
200 2.302129 0.109375
300 2.2056253 0.265625
400 2.153767 0.21875
500 2.1245136 0.28125
600 2.1070633 0.265625
700 2.0205166 0.46875
800 1.9951811 0.375
900 1.8572261 0.53125
Done in 0:00:03.383339
