In [1]:
import tensorflow as tf
print(tf.__version__)

2.3.0


# 自动求导机制

- **GradientTape**是eager模式下计算梯度用的

上面的例子中的梯度计算部分可以更直观的理解这个函数的用法。
```
tf.GradientTape(
    persistent=False,
    watch_accessed_variables=True
)
```
* `persistent` : 用来指定新创建的 `gradient tape`是否是可持续性的。默认是False，意味着只能够调用一次gradient()函数。
* `watch_accessed_variables`: 表明这个`GradientTape`是不是会自动追踪任何能被训练（trainable）的变量。默认是True。要是为False的话，意味着你需要手动去指定你想追踪的那些变量。

In [3]:
x = tf.constant(3.0)

with tf.GradientTape() as g:
    g.watch(x)
    y = x * x
    
dy_dx = g.gradient(y, x)  # y’ = 2*x = 2*3 = 6

```
watch(tensor)
```
* 作用：确保某个`tensor` 被 `tape` 追踪 
* 参数：`tensor` -> 一个 `Tensor` 或者一个 `Tensor` 列表

```
gradient(target, sources)
```
* 作用：根据tape上面的上下文来计算某个或者某些tensor的梯度参数

      * target: 被微分的Tensor或者Tensor列表，你可以理解为经过某个函数之后的值
      * sources: Tensors 或者Variables列表（当然可以只有一个值）. 你可以理解为函数的某个变量
      
* 返回:
      一个列表表示各个变量的梯度值，和source中的变量列表一一对应，表明这个变量的梯度。

In [4]:
dy_dx

<tf.Tensor: shape=(), dtype=float32, numpy=6.0>

## 案例1、模型自动求导

构建模型（神经网络的前向传播） -> 定义损失函数 -> 定义优化函数 -> 定义tape ->  模型得到预测值 -> 前向传播得到loss -> 反向传播 -> 用优化函数将计算出来的梯度更新到变量上面去

In [5]:
class MyModel(tf.keras.Model):

    def __init__(self, num_classes=10):
        super(MyModel, self).__init__(name='my_model')
        self.num_classes = num_classes
        # 定义自己需要的层
        self.dense_1 = tf.keras.layers.Dense(32, activation='relu')  # 隐藏层
        self.dense_2 = tf.keras.layers.Dense(num_classes)  # 输出层

    def call(self, inputs):
        # 定义前向传播
        # 使用在 __init__() 定义的层
        x = self.dense_1(inputs)
        return self.dense_2(x)

In [6]:
import numpy as np
# 10分类问题
data = np.random.random((1000, 32))
labels = np.random.random((1000, 10))

一般在网络中使用时，不需要显式调用watch函数 ，使用默认设置，GradientTape会监控可训练变量。
```
apply_gradients(grads_and_vars,name=None)
```
作用：把计算出来的梯度更新到变量上面去。
参数含义 :
* `grads_and_vars`：(gradient, variable) 对的列表 .
* name: 操作名

In [7]:
model = MyModel(num_classes=10)  # 构建前向传播模型

loss_object = tf.keras.losses.CategoricalCrossentropy(from_logits=True)  # 定义损失函数
optimizer = tf.keras.optimizers.Adam()  # 定义优化器


with tf.GradientTape() as tape:
    predictions = model(data)
    loss = loss_object(labels, predictions)
    
gradients = tape.gradient(loss, model.trainable_variables)  # 求梯度

optimizer.apply_gradients(zip(gradients, model.trainable_variables))  # 将梯度更新到每个值上



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



<tf.Variable 'UnreadVariable' shape=() dtype=int64, numpy=1>

In [8]:
model.trainable_variables

[<tf.Variable 'my_model/dense/kernel:0' shape=(32, 32) dtype=float32, numpy=
 array([[ 0.2986059 , -0.22218348, -0.22293933, ...,  0.14016736,
          0.01760756, -0.11621397],
        [-0.05881743, -0.25921795,  0.2829372 , ..., -0.30361545,
          0.18618977, -0.15834057],
        [-0.2872937 , -0.14355828, -0.29375783, ..., -0.13330583,
         -0.07523468,  0.2320581 ],
        ...,
        [-0.26255342,  0.08072431, -0.04701015, ..., -0.04800224,
          0.26560968, -0.2825388 ],
        [-0.11009346, -0.25503758,  0.2563971 , ...,  0.06066217,
          0.13818401, -0.00197141],
        [ 0.1448306 , -0.06286307, -0.02791359, ..., -0.17092887,
         -0.24999507,  0.19709735]], dtype=float32)>,
 <tf.Variable 'my_model/dense/bias:0' shape=(32,) dtype=float32, numpy=
 array([-0.00099993, -0.00099947, -0.001     ,  0.0009918 ,  0.00099998,
         0.001     ,  0.00099769, -0.001     ,  0.00100002, -0.00099975,
         0.00100001,  0.00099961, -0.00099998, -0.001     , -0

```
apply_gradients(grads_and_vars, name=None)
```
* 作用：把计算出来的梯度更新到变量上面去。
* 参数含义:
    * grads_and_vars: (gradient, variable) 对的列表.
    * name: 操作名

## 案例2：使用GradientTape自定义训练模型

In [9]:
class MyModel(tf.keras.Model):

    def __init__(self, num_classes=10):
        super(MyModel, self).__init__(name='my_model')
        self.num_classes = num_classes
        # 定义自己需要的层
        self.dense_1 = tf.keras.layers.Dense(32, activation='relu')
        self.dense_2 = tf.keras.layers.Dense(num_classes)

    def call(self, inputs):
        #定义前向传播
        # 使用在 (in `__init__`)定义的层
        x = self.dense_1(inputs)
        return self.dense_2(x)

In [10]:
import numpy as np

data = np.random.random((1000, 32))
labels = np.random.random((1000, 10))

In [11]:
model = MyModel(num_classes=10)

# Instantiate an optimizer.
optimizer = tf.keras.optimizers.SGD(learning_rate=1e-3)
# Instantiate a loss function.
loss_fn = tf.keras.losses.CategoricalCrossentropy()

# Prepare the training dataset.
batch_size = 64
train_dataset = tf.data.Dataset.from_tensor_slices((data, labels))
train_dataset = train_dataset.shuffle(buffer_size=1024).batch(batch_size)

In [11]:
# epoch
# batch_size
# tape 求梯度  梯度更新

In [12]:
epochs = 3
for epoch in range(epochs):
    print('Start of epoch %d' % (epoch,))
    
    # 遍历数据集的batch_size
    for step, (x_batch_train, y_batch_train) in enumerate(train_dataset):
        
        # 打开GradientTape以记录正向传递期间运行的操作，这将启用自动区分。
        with tf.GradientTape() as tape:

            # 运行该模型的前向传播。 模型应用于其输入的操作将记录在GradientTape上。
            logits = model(x_batch_train, training=True)  # 这个minibatch的预测值

            # 计算这个minibatch的损失值
            loss_value = loss_fn(y_batch_train, logits)

        # 使用GradientTape自动获取可训练变量相对于损失的梯度。
        grads = tape.gradient(loss_value, model.trainable_weights)

        # 通过更新变量的值来最大程度地减少损失，从而执行梯度下降的一步。
        optimizer.apply_gradients(zip(grads, model.trainable_weights))

        # 每200 batches打印一次.
        if step % 200 == 0:
            print('Training loss (for one batch) at step %s: %s' % (step, float(loss_value)))
            print('Seen so far: %s samples' % ((step + 1) * 64))

Start of epoch 0


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

Training loss (for one batch) at step 0: 39.01036071777344
Seen so far: 64 samples
Start of epoch 1
Training loss (for one batch) at step 0: 37.70331573486328
Seen so far: 64 samples
Start of epoch 2
Training loss (for one batch) at step 0: 24.322628021240234
Seen so far: 64 samples


案例二相当于`model.fit()`，但是缺少评估函数`metrics`

## 案例3：使用GradientTape自定义训练模型进阶（加入评估函数）
让我们将metric添加到组合中。下面可以在从头开始编写的训练循环中随时使用内置指标（或编写的自定义指标）。流程如下：

- 在循环开始时初始化metrics
- metric.update_state()：每batch之后更新
- metric.result()：需要显示metrics的当前值时调用
- metric.reset_states()：需要清除metrics状态时重置（通常在每个epoch的结尾）


In [13]:
class MyModel(tf.keras.Model):

    def __init__(self, num_classes=10):
        super(MyModel, self).__init__(name='my_model')
        self.num_classes = num_classes
        # 定义自己需要的层
        self.dense_1 = tf.keras.layers.Dense(32, activation='relu')
        self.dense_2 = tf.keras.layers.Dense(num_classes)
    
    def call(self, inputs):
        #定义前向传播
        # 使用在 (in `__init__`)定义的层
        x = self.dense_1(inputs)
        return self.dense_2(x)


In [15]:
import numpy as np
x_train = np.random.random((1000, 32))
y_train = np.random.random((1000, 10))
x_val = np.random.random((200, 32))
y_val = np.random.random((200, 10))
x_test = np.random.random((200, 32))
y_test = np.random.random((200, 10))


# 优化器
optimizer = tf.keras.optimizers.SGD(learning_rate=1e-3)
# 损失函数
loss_fn = tf.keras.losses.CategoricalCrossentropy(from_logits=True)

# 准备metrics函数
train_acc_metric = tf.keras.metrics.CategoricalAccuracy()
val_acc_metric = tf.keras.metrics.CategoricalAccuracy()

# 准备训练数据集
batch_size = 64
train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
train_dataset = train_dataset.shuffle(buffer_size=1024).batch(batch_size)

# 准备测试数据集
val_dataset = tf.data.Dataset.from_tensor_slices((x_val, y_val))
val_dataset = val_dataset.batch(64)

进行几个epoch运行训练循环：

In [16]:
model = MyModel(num_classes=10)
epochs = 3
for epoch in range(epochs):
    print('Start of epoch %d' % (epoch,))

    # 遍历数据集的batch_size
    for step, (x_batch_train, y_batch_train) in enumerate(train_dataset):
        
        # 一个batch
        with tf.GradientTape() as tape:
            logits = model(x_batch_train)
            loss_value = loss_fn(y_batch_train, logits)
        grads = tape.gradient(loss_value, model.trainable_weights)
        optimizer.apply_gradients(zip(grads, model.trainable_weights))

        # 更新训练集的metrics
        train_acc_metric(y_batch_train, logits)     
            
            
    # 在每个epoch结束时显示metrics。
    train_acc = train_acc_metric.result()
    print('Training acc over epoch: %s' % (float(train_acc),))
    # 在每个epoch结束时重置训练指标
    train_acc_metric.reset_states()  #!!!!!!!!!!!!!!! 一定要重置

    # 在每个epoch结束时运行一个验证集。
    for x_batch_val, y_batch_val in val_dataset:
        val_logits = model(x_batch_val)
        # 更新验证集merics
        val_acc_metric(y_batch_val, val_logits)
    val_acc = val_acc_metric.result()
    print('Validation acc: %s' % (float(val_acc),))
    val_acc_metric.reset_states()

    #显示测试集


Start of epoch 0


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

Training acc over epoch: 0.10199999809265137
Validation acc: 0.11999999731779099
Start of epoch 1
Training acc over epoch: 0.10000000149011612
Validation acc: 0.11999999731779099
Start of epoch 2
Training acc over epoch: 0.10000000149011612
Validation acc: 0.14000000059604645


![](media/大纲.png)