In [1]:
import torch 
import torch.nn as nn 
import torch.nn.functional as F 

In [2]:
# Assume the final Generator layer returns a tensor of shape [batch_size, seq_len, vocab_size].
BATCH_SIZE, SEQ_LEN, VOCAB_SIZE = 2, 3, 4

# A dedicated, seeded generator makes this demo reproducible under "Restart & Run All"
# without touching the global RNG state.
# NOTE: numeric outputs recorded in this notebook came from an earlier unseeded run;
# re-execute the notebook to refresh them.
rng = torch.Generator().manual_seed(0)

# Unnormalized scores for every vocabulary entry at every position.
logits = torch.randn(BATCH_SIZE, SEQ_LEN, VOCAB_SIZE, generator=rng)
# Ground-truth class index in [0, VOCAB_SIZE) for every position.
label = torch.randint(0, VOCAB_SIZE, (BATCH_SIZE, SEQ_LEN), generator=rng)

**交叉熵计算**

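注意：在当前语境下，这是一个 vocab_size 个类别的分类问题，所以 C = vocab_size。根据下面的文档，类别维必须位于第 1 维，因此 logits 需要从 [batch_size, seq_len, vocab_size] 转置为 [batch_size, vocab_size, seq_len]。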

In [3]:
# Excerpt from the `nn.CrossEntropyLoss` docstring, quoted here for reference.
# The bare name on the next line is an expression with no effect; the raw string that
# follows is the cell's last expression, so the notebook echoes it back as the output.
nn.CrossEntropyLoss
r'''
It is useful when training a classification problem with `C` classes.
    If provided, the optional argument :attr:`weight` should be a 1D `Tensor`
    assigning weight to each of the classes.
    This is particularly useful when you have an unbalanced training set.

    The `input` is expected to contain the unnormalized logits for each class (which do `not` need
    to be positive or sum to 1, in general).
    `input` has to be a Tensor of size :math:`(C)` for unbatched input,
    :math:`(minibatch, C)` or :math:`(minibatch, C, d_1, d_2, ..., d_K)` with :math:`K \geq 1` for the
    `K`-dimensional case. The last being useful for higher dimension inputs, such
    as computing cross entropy loss per-pixel for 2D images.
'''

'\nIt is useful when training a classification problem with `C` classes.\n    If provided, the optional argument :attr:`weight` should be a 1D `Tensor`\n    assigning weight to each of the classes.\n    This is particularly useful when you have an unbalanced training set.\n\n    The `input` is expected to contain the unnormalized logits for each class (which do `not` need\n    to be positive or sum to 1, in general).\n    `input` has to be a Tensor of size :math:`(C)` for unbatched input,\n    :math:`(minibatch, C)` or :math:`(minibatch, C, d_1, d_2, ..., d_K)` with :math:`K \\geq 1` for the\n    `K`-dimensional case. The last being useful for higher dimension inputs, such\n    as computing cross entropy loss per-pixel for 2D images.\n'

In [6]:
# nn.CrossEntropyLoss expects the class axis at dim 1, so swap the last two axes:
# (batch, seq_len, vocab) -> (batch, vocab, seq_len).
# Gotcha: this rebinds `logits`, so running the cell a second time swaps the axes back.
logits = logits.permute(0, 2, 1)

In [None]:
# "mean" is the default reduction: the per-position losses are averaged into one scalar.
loss_fn_mean = nn.CrossEntropyLoss(reduction="mean")
loss_fn_mean(input=logits, target=label)

tensor(1.1688)

如果设置 `reduction='none'`（即不做归约，默认为 `'mean'`），就会返回每个元素（每个位置）的交叉熵

In [8]:
# With reduction="none" the loss keeps one value per (batch, position) entry
# instead of collapsing them into a scalar.
loss_fn_none = nn.CrossEntropyLoss(reduction="none")
loss_fn_none(input=logits, target=label)

tensor([[0.7650, 0.7929, 0.9647],
        [1.1548, 1.6785, 1.6566]])

实际中，我们需要区分每个位置是有效 token 还是 padding，所以会专门记录每个序列的有效长度

In [None]:
# Valid (non-padded) length of each sequence: 2 for the first one, 3 for the second.
tgt_len = torch.tensor([2, 3], dtype=torch.int32)

**法一:** 制作mask矩阵

In [12]:
# Padding mask of shape (num_sequences, max(tgt_len)): 1.0 on valid positions, 0.0 on padding.
# A row of position indices compared against a column of lengths broadcasts to the whole
# mask at once, replacing the per-sequence ones + pad + unsqueeze + cat loop.
positions = torch.arange(int(tgt_len.max()), device=tgt_len.device)
mask = (positions.unsqueeze(0) < tgt_len.unsqueeze(1)).to(torch.float32)
mask

tensor([[1., 1., 0.],
        [1., 1., 1.]])

In [13]:
# Element-wise product with the mask zeroes the loss at padded positions.
torch.mul(loss_fn_none(input=logits, target=label), mask)

tensor([[0.7650, 0.7929, 0.0000],
        [1.1548, 1.6785, 1.6566]])

**法二:** 使用类参数 `ignore_index`

在 `nn.CrossEntropyLoss` 中可以通过 `ignore_index` 参数设置要忽略的标签编号，默认为 -100；标签等于该值的位置损失为 0，并且不参与 `reduction='mean'` 时的平均。

In [16]:
# Mark every padded position with the loss's ignore value instead of hard-coding one index.
# `ignore_index` is -100 here because `loss_fn_none` was built without overriding it.
ignore_index = loss_fn_none.ignore_index
seq_positions = torch.arange(label.size(1), device=label.device)
is_padding = seq_positions.unsqueeze(0) >= tgt_len.unsqueeze(1)  # (batch, seq_len) bool
# Out-of-place fill: re-running this cell gives the same result.
label = label.masked_fill(is_padding, ignore_index)
label
    

tensor([[   1,    2, -100],
        [   1,    2,    0]])

In [17]:
# Positions whose label equals the ignore value come back with a loss of 0.
loss_fn_none(input=logits, target=label)

tensor([[0.7650, 0.7929, 0.0000],
        [1.1548, 1.6785, 1.6566]])