In [1]:
import collections
import math
import torch
from torch import nn
from d2l import torch as d2l

## Loss Function

At each time step, the decoder
predicts a probability distribution for the output tokens.
Similar to language modeling,
we can apply softmax to obtain the distribution
and calculate the cross-entropy loss for optimization.
Recall :numref:`sec_machine_translation`
that the special padding tokens
are appended to the end of sequences
so sequences of varying lengths
can be efficiently loaded
in minibatches of the same shape.
However,
prediction of padding tokens
should be excluded from loss calculations.

To this end,
we can use the following
`sequence_mask` function
to [**mask irrelevant entries with zero values**]
so later
multiplication of any irrelevant prediction
with zero equals to zero.
For example,
if the valid length of two sequences
excluding padding tokens
are one and two, respectively,
the remaining entries after
the first one
and the first two entries are cleared to zeros.


In [2]:
#@save
def sequence_mask(X, valid_len, value=0):
    """Mask irrelevant entries in sequences."""
    maxlen = X.size(1)
    mask = torch.arange((maxlen), dtype=torch.float32,
                        device=X.device)[None, :] < valid_len[:, None]
    X[~mask] = value
    return X

X = torch.tensor([[1, 2, 3], [4, 5, 6]])
sequence_mask(X, torch.tensor([1, 2]))

tensor([[1, 0, 0],
        [4, 5, 0]])

In [27]:
X = torch.tensor([1,2,3])
print("X[None,:]")
print(X[None,:].shape)
print(X[None,:])
print("X[:, None]")
print(X[:, None].shape)
print(X[:, None])
print("X[None,:]<X[:, None]")
print(X[None,:]<X[:, None])
print(~(X[None,:]<X[:, None]))
# 小于会触发Broadcasting, 生成一个3x3的tensor
Y = torch.ones(3,3)
print(Y)
Y[~(X[None,:]<X[:, None])]=0;
print(Y)

Z = torch.arange(1,10).reshape(-1,3)
print(Z[~(X[None,:]<X[:, None])]) #直接索引返回的是所有满足条件的元素
Z[~(X[None,:]<X[:, None])]=0 # 也可以利用mask直接赋值
print(Z)

X[None,:]
torch.Size([1, 3])
tensor([[1, 2, 3]])
X[:, None]
torch.Size([3, 1])
tensor([[1],
        [2],
        [3]])
X[None,:]<X[:, None]
tensor([[False, False, False],
        [ True, False, False],
        [ True,  True, False]])
tensor([[ True,  True,  True],
        [False,  True,  True],
        [False, False,  True]])
tensor([[1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.]])
tensor([[0., 0., 0.],
        [1., 0., 0.],
        [1., 1., 0.]])
tensor([1, 2, 3, 5, 6, 9])
tensor([[0, 0, 0],
        [4, 0, 0],
        [7, 8, 0]])


(**We can also mask all the entries across the last
few axes.**)
If you like, you may even specify
to replace such entries with a non-zero value.


In [None]:
X = torch.ones(2, 3, 4)
sequence_mask(X, torch.tensor([1, 2]), value=-1)

tensor([[[ 1.,  1.,  1.,  1.],
         [-1., -1., -1., -1.],
         [-1., -1., -1., -1.]],

        [[ 1.,  1.,  1.,  1.],
         [ 1.,  1.,  1.,  1.],
         [-1., -1., -1., -1.]]])

Now we can [**extend the softmax cross-entropy loss
to allow the masking of irrelevant predictions.**]
Initially,
masks for all the predicted tokens are set to one.
Once the valid length is given,
the mask corresponding to any padding token
will be cleared to zero.
In the end,
the loss for all the tokens
will be multipled by the mask to filter out
irrelevant predictions of padding tokens in the loss.


In [37]:
#@save
class MaskedSoftmaxCELoss(nn.CrossEntropyLoss):
    """The softmax cross-entropy loss with masks."""
    # `pred` shape: (`batch_size`, `num_steps`, `vocab_size`)
    # `label` shape: (`batch_size`, `num_steps`)
    # `valid_len` shape: (`batch_size`,)
    def forward(self, pred, label, valid_len):
        weights = torch.ones_like(label)
        weights = sequence_mask(weights, valid_len)
        self.reduction='none'
        unweighted_loss = super(MaskedSoftmaxCELoss, self).forward(
            pred.permute(0, 2, 1), label)
        print(unweighted_loss.shape)
        print(unweighted_loss)
        weighted_loss = (unweighted_loss * weights).mean(dim=1)
        print(unweighted_loss * weights)
        print(weighted_loss.shape)
        print(weighted_loss)
        return weighted_loss
        # return unweighted_loss

For [**a sanity check**], we can create three identical sequences.
Then we can
specify that the valid lengths of these sequences
are 4, 2, and 0, respectively.
As a result,
the loss of the first sequence
should be twice as large as that of the second sequence,
while the third sequence should have a zero loss.


In [38]:
loss = MaskedSoftmaxCELoss()
loss(torch.ones(3, 4, 10), torch.ones((3, 4), dtype=torch.long),
     torch.tensor([4, 2, 0]))

torch.Size([3, 4])
tensor([[2.3026, 2.3026, 2.3026, 2.3026],
        [2.3026, 2.3026, 2.3026, 2.3026],
        [2.3026, 2.3026, 2.3026, 2.3026]])
tensor([[2.3026, 2.3026, 2.3026, 2.3026],
        [2.3026, 2.3026, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000]])
torch.Size([3])
tensor([2.3026, 1.1513, 0.0000])


tensor([2.3026, 1.1513, 0.0000])