In [1]:
import numpy as np
import matplotlib.pyplot as plt

import tensorflow as tf


# Computing Gradients
To differentiate automatically, TF needs to remember which operations happened, and in which order, during the forward pass; during backpropagation it then traverses this list of operations in reverse order. <br>
For that, TF uses a gradient tape (`tf.GradientTape`) to keep track of these operations.

In [34]:
# Simple linear function y = x @ w + b with a scalar loss = mean(y**2).
# NOTE: w is initialised from an unseeded random normal, so the gradient
# values differ from run to run.
w = tf.Variable(tf.random.normal((3, 2)), name='w')
b = tf.Variable(tf.zeros(2, dtype=tf.float32), name='b')
# Use an explicit tensor (as the Dense-layer example further down does) instead of
# a nested Python list, so `x @ w` does not depend on the reflected operator.
x = tf.constant([[1., 2., 3.]])

# Forward pass: operations executed inside the context are recorded on the tape.
with tf.GradientTape() as tape:
    y = x @ w + b
    loss = tf.reduce_mean(y**2)

# Backpropagation: one gradient is returned per source, in the order [w, b].
dl_dw, dl_db = tape.gradient(loss, [w, b])

In [35]:
# Show both gradients (d loss / d w first, then d loss / d b), one per line.
print(dl_dw, dl_db, sep='\n')

tf.Tensor(
[[ 4.3425727   0.26980785]
 [ 8.685145    0.5396157 ]
 [13.027718    0.80942357]], shape=(3, 2), dtype=float32)
tf.Tensor([4.3425727  0.26980785], shape=(2,), dtype=float32)


In [36]:
# By default tf.GradientTape is created with persistent=False: the resources it holds are released
# after the first tape.gradient() call, so the tape can only be used once.
# With persistent=True the tape is kept, and tape.gradient() can be called multiple times.

# Gradients with respect to a Model

In [37]:
# Usually a gradient tape is used together with a Model or Layer, and the gradients are
# computed with respect to that model's variables.
# Models and their subclasses aggregate their variables in model.trainable_variables

In [51]:
# A Dense layer with 2 output units and a ReLU activation, applied to a single 3-feature input row.
layer = tf.keras.layers.Dense(units=2, activation='relu')
x = tf.constant([[1., 2., 3.]])

# Forward pass, recorded on the tape
with tf.GradientTape() as tape:
    y = layer(x)
    loss = tf.reduce_mean(y ** 2)

# Gradients of the loss with respect to every trainable variable of the layer
grad = tape.gradient(target=loss, sources=layer.trainable_variables)

In [53]:
# The trainable variables come from our layer: its kernel and its bias.
# Pair each variable with its gradient and report the gradient's shape.
report = [f'{var.name}, shape: {g.shape}' for var, g in zip(layer.trainable_variables, grad)]
print('\n'.join(report))


kernel, shape: (3, 2)
bias, shape: (2,)


# What the Tape watches

In [58]:
# Four candidate gradient sources:
x0 = tf.Variable(3.0, name='x0')                   # a trainable variable
x1 = tf.Variable(3.0, name='x1', trainable=False)  # a variable, but not trainable
x2 = tf.Variable(2.0, name='x2') + 1.0             # not a Variable: variable + tensor returns a tensor
x3 = tf.constant(3.0, name='x3')                   # not a variable (and not used in y below)

with tf.GradientTape() as tape:
    y = (x0 ** 2) + (x1 ** 2) + (x2 ** 2)

grad = tape.gradient(y, [x0, x1, x2, x3])

# The tape only computed a gradient for x0, since it's a trainable Variable and those
# are watched automatically. For every source without a computed gradient,
# tape.gradient returns None.
print(*grad, sep='\n')


tf.Tensor(6.0, shape=(), dtype=float32)
None
None
None


In [57]:
# To record gradients with respect to a plain tensor, tell the tape to watch it explicitly.
x = tf.constant(3.0)
with tf.GradientTape() as tape:
    tape.watch(x)
    y = x ** 2

# y = x^2  =>  dy/dx = 2x
dy_dx = tape.gradient(target=y, sources=x)
print(dy_dx.numpy())
# To watch only what you explicitly ask for, turn off the automatic watching of variables:
# tf.GradientTape(watch_accessed_variables=False)

6.0


In [None]:
# More on gradient tape here: https://www.tensorflow.org/guide/autodiff
# Further topics covered there:
#   - Gradients of non-scalar targets
#   - Control flow (if statements)
#   - Common errors