##### El mecanismo de auto-atención con enmascaramiento del modelo Transformer

In [94]:
import torch
import torch.nn as nn
import math

In [95]:
# Toy inputs for the walk-through: four positions, three features each.
# Every matrix is written one row per line so rows line up with the
# attention-matrix rows printed in later cells.
Q = torch.tensor([
    [0.0, 0.0, 0.0],
    [1.0, 1.0, 1.0],
    [0.2, 0.2, 0.2],
    [0.3, 0.3, 0.3],
])
K = torch.tensor([
    [0.1, 0.1, 0.1],
    [0.2, 0.2, 0.2],
    [0.3, 0.3, 0.3],
    [0.4, 0.4, 0.4],
])
V = torch.tensor([
    [1.0, 0.0, 0.0],
    [0.0, 1.0, 0.0],
    [0.0, 0.0, 1.0],
    [0.0, 1.0, 1.0],
])

In [None]:
class AutoAtencionConEnmasc:
    """Masked (causal) scaled dot-product self-attention, one step per method.

    The full computation is ``softmax(mask(Q @ K^T / sqrt(d_k))) @ V``. Each
    stage is exposed as its own method so the intermediate tensors can be
    inspected, and ``calculate`` chains the stages together.

    Every method works on the last two dimensions of its inputs, so plain
    ``(seq, dim)`` matrices and tensors with leading batch dimensions are
    both accepted.
    """

    def score(self, Q: torch.Tensor, K: torch.Tensor) -> torch.Tensor:
        """Return the raw attention scores ``Q @ K^T``.

        Args:
            Q: Query tensor of shape ``(..., n_queries, dim)``.
            K: Key tensor of shape ``(..., n_keys, dim)``.

        Returns:
            Tensor of shape ``(..., n_queries, n_keys)``.
        """
        # Swap the last two dims (not dims 0 and 1) so that any leading
        # batch dimensions stay where they are.
        return torch.matmul(Q, K.transpose(-2, -1))

    def scale(self, score: torch.Tensor, size) -> torch.Tensor:
        """Divide ``score`` element-wise by ``sqrt(size)``.

        Args:
            score: Raw attention scores.
            size: Key dimensionality ``d_k`` used as the scaling factor.

        Returns:
            A tensor with the same shape as ``score``.
        """
        return score.div(math.sqrt(size))

    def mask(self, scaled: torch.Tensor) -> torch.Tensor:
        """Zero every entry strictly above the main diagonal.

        Args:
            scaled: Scaled attention scores of shape ``(..., rows, cols)``.

        Returns:
            A tensor of the same shape whose upper triangle is 0.
        """
        # tril() writes zeros directly. Multiplying by a 0/1 matrix instead
        # would turn inf/nan entries above the diagonal into nan.
        return scaled.tril()

    def softmax(self, scores: torch.Tensor) -> torch.Tensor:
        """Apply a causal softmax over the last dimension.

        Entries strictly above the main diagonal are replaced by ``-inf``
        before the softmax, so they receive exactly zero weight and each row
        is normalised only over positions at or before its own index.

        Args:
            scores: Attention scores of shape ``(..., rows, cols)``.

        Returns:
            Attention weights of the same shape; every row sums to 1.
        """
        # Boolean mask that is True strictly above the diagonal.
        future = torch.ones_like(scores).triu(diagonal=1).bool()
        # masked_fill overwrites rather than adds, so non-finite values in
        # the masked region cannot leak nan into the result.
        pre_soft = scores.masked_fill(future, -math.inf)
        return torch.softmax(pre_soft, dim=-1)

    def multiply_by_values(self, scores: torch.Tensor, V: torch.Tensor) -> torch.Tensor:
        """Return the attention-weighted combination of the value rows.

        Args:
            scores: Attention weights of shape ``(..., n_queries, n_keys)``.
            V: Value tensor of shape ``(..., n_keys, dim_v)``.

        Returns:
            Tensor of shape ``(..., n_queries, dim_v)``.
        """
        return torch.matmul(scores, V)

    def calculate(self, Q: torch.Tensor, K: torch.Tensor, V: torch.Tensor) -> torch.Tensor:
        """Run the complete masked self-attention pipeline.

        Chains ``score`` -> ``scale`` -> ``mask`` -> ``softmax`` ->
        ``multiply_by_values``, so the result always agrees with calling the
        individual steps by hand.

        Args:
            Q: Query tensor of shape ``(..., seq, dim)``.
            K: Key tensor of shape ``(..., seq, dim)``.
            V: Value tensor of shape ``(..., seq, dim_v)``.

        Returns:
            Attention output of shape ``(..., seq, dim_v)``.
        """
        scored = self.score(Q, K)
        # The scaling factor is the key dimensionality (size of the last axis).
        scaled = self.scale(scored, K.size(-1))
        # Zeroing the upper triangle is redundant with the -inf fill done in
        # softmax(), but is kept so calculate() mirrors the step-by-step cells.
        masked = self.mask(scaled)
        weights = self.softmax(masked)
        return self.multiply_by_values(weights, V)

In [105]:
# Step 1: build the attention helper and compute the raw scores Q @ K^T.
transformer = AutoAtencionConEnmasc()

score = transformer.score(Q, K)
score

tensor([[0.0000, 0.0000, 0.0000, 0.0000],
        [0.3000, 0.6000, 0.9000, 1.2000],
        [0.0600, 0.1200, 0.1800, 0.2400],
        [0.0900, 0.1800, 0.2700, 0.3600]])

In [98]:
# Step 2: divide the scores by sqrt(K.size()[1]), i.e. the square root of the
# number of columns of K (3 here).
scaled = transformer.scale(score, K.size()[1])
scaled

tensor([[0.0000, 0.0000, 0.0000, 0.0000],
        [0.1732, 0.3464, 0.5196, 0.6928],
        [0.0346, 0.0693, 0.1039, 0.1386],
        [0.0520, 0.1039, 0.1559, 0.2078]])

In [99]:
# Step 3: zero out every entry above the main diagonal.
masked = transformer.mask(scaled)
masked

tensor([[0.0000, 0.0000, 0.0000, 0.0000],
        [0.1732, 0.3464, 0.0000, 0.0000],
        [0.0346, 0.0693, 0.1039, 0.0000],
        [0.0520, 0.1039, 0.1559, 0.2078]])

In [103]:
# Illustration of an additive causal mask: -inf strictly above the diagonal and
# 0 elsewhere, so masked positions get zero weight after a softmax.
torch.full_like(masked, -math.inf).triu(diagonal=1)

tensor([[0., -inf, -inf, -inf],
        [0., 0., -inf, -inf],
        [0., 0., 0., -inf],
        [0., 0., 0., 0.]])

In [100]:
# Step 4: causal softmax over each row; entries above the diagonal end up at 0.
softmaxed = transformer.softmax(masked)
softmaxed

tensor([[1.0000, 0.0000, 0.0000, 0.0000],
        [0.4568, 0.5432, 0.0000, 0.0000],
        [0.3219, 0.3332, 0.3449, 0.0000],
        [0.2309, 0.2432, 0.2561, 0.2698]])

In [101]:
# Step 5: weight the rows of V by the attention weights.
multiplied = transformer.multiply_by_values(softmaxed, V)
multiplied

tensor([[1.0000, 0.0000, 0.0000],
        [0.4568, 0.5432, 0.0000],
        [0.3219, 0.3332, 0.3449],
        [0.2309, 0.5130, 0.5260]])

In [106]:
# Run the whole pipeline in one call; the result matches `multiplied` from the
# step-by-step cells.
transformer.calculate(Q, K, V)

tensor([[1.0000, 0.0000, 0.0000],
        [0.4568, 0.5432, 0.0000],
        [0.3219, 0.3332, 0.3449],
        [0.2309, 0.5130, 0.5260]])