In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from collections import Counter
import math
import numpy as np
import re

# Seed PyTorch's default random generator; the torch.rand inputs and the
# nn.Linear weight initialisation further down draw from it.
torch.manual_seed(23)

<torch._C.Generator at 0x72cdf6c8df70>

In [2]:
def check_gpu():
    """Print whether CUDA is usable and, if so, the index and name of every visible GPU.

    Returns:
        None. The report is written to stdout only.
    """
    # Guard clause: nothing to enumerate when CUDA cannot be used.
    if not torch.cuda.is_available():
        print("CUDA no está disponible. No hay GPU accesible.")
        return

    print("CUDA está disponible.")
    gpu_count = torch.cuda.device_count()
    print(f"Hay {gpu_count} GPU(s) disponible(s).")
    for gpu_index in range(gpu_count):
        print(f"GPU {gpu_index}: {torch.cuda.get_device_name(gpu_index)}")

# Print the hardware report, then pick the compute device:
# the GPU when CUDA is usable, otherwise the CPU.
check_gpu()
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

CUDA está disponible.
Hay 1 GPU(s) disponible(s).
GPU 0: NVIDIA GeForce RTX 2060


In [61]:
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention with optional step-by-step tracing.

    Queries, keys and values are each linearly projected, split into
    ``num_heads`` heads of width ``d_model // num_heads``, attended per head,
    merged back to ``d_model`` features and passed through an output projection.

    Args:
        d_model: Feature size of the inputs and of the output. Must be
            divisible by ``num_heads``.
        num_heads: Number of attention heads.
        verbose: When True (the default, which keeps the original printout),
            every intermediate tensor is printed. Pass False to silence the
            trace, e.g. when the module is used inside a training loop.

    Raises:
        AssertionError: If ``d_model`` is not divisible by ``num_heads``.
    """

    def __init__(self, d_model = 512, num_heads = 8, verbose = True):
        super().__init__()
        assert d_model % num_heads == 0, 'Embedding size not compatible with num heads'

        self.verbose = verbose

        # Per-head width; keys/queries and values share the same size here.
        self.d_v = d_model // num_heads
        if self.verbose:
            print(f"Mi d_v {self.d_v}")
        self.d_k = self.d_v
        self.num_heads = num_heads

        # Layers are created in the same order as before so that a seeded RNG
        # yields the same initial weights.
        self.W_q = nn.Linear(d_model, d_model)
        if self.verbose:
            print(f"Mi W_q {self.W_q.weight.data}")
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)

    def _trace(self, *parts):
        """Print ``parts`` like ``print`` would, but only when tracing is enabled."""
        if self.verbose:
            print(*parts)

    def forward(self, Q, K, V, mask = None):
        """Apply multi-head attention.

        Args:
            Q, K, V: Tensors of shape ``[batch_size, seq_len, d_model]``
                (``d_model == num_heads * d_k``).
            mask: Optional tensor that must broadcast against the score tensor
                of shape ``[batch_size, num_heads, seq_len_q, seq_len_k]``;
                positions where ``mask == 0`` receive no attention.

        Returns:
            A tuple ``(output, attention)`` where ``output`` has shape
            ``[batch_size, seq_len_q, d_model]`` and ``attention`` has shape
            ``[batch_size, num_heads, seq_len_q, seq_len_k]``.
        """
        batch_size = Q.size(0)

        # Project once and reuse the result for both the trace and the real
        # computation (the projection used to be evaluated twice).
        q_proj = self.W_q(Q)
        q_split = q_proj.view(batch_size, -1, self.num_heads, self.d_k)
        self._trace("-----PRUEBAS------")
        self._trace("Mi Q_test:", q_proj)
        self._trace("Mi Q_test_dims:", q_proj.shape)
        self._trace("Q_test.view():", q_split)

        # [batch, seq, heads, d_k] -> [batch, heads, seq, d_k]
        Q = q_split.transpose(1, 2)
        self._trace("Q_test.view().transpose():", Q)
        K = self.W_k(K).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        V = self.W_v(V).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)

        weighted_values, attention = self.scale_dot_product(Q, K, V, mask)

        # Merge the heads back: [batch, heads, seq, d_k] -> [batch, seq, heads * d_k].
        self._trace("------------- PRUEBAS EN WEIGHTED_VALES -------------")
        merged = weighted_values.transpose(1, 2)
        self._trace("Matrix Weight_val_transpose:", merged)
        # .view() below requires contiguous memory, which transpose does not give.
        merged = merged.contiguous()
        self._trace("Matrix Weight_val_transpose_contiguous:", merged)
        merged = merged.view(batch_size, -1, self.num_heads*self.d_k)
        self._trace("Matrix Weight_val_transpose_contiguous_views:", merged)

        self._trace("Matrix weights W_o:", self.W_o.weight.data)
        weighted_values = self.W_o(merged)
        self._trace("Matrix weighted_values after W_o:", weighted_values)

        return weighted_values, attention


    def scale_dot_product(self, Q, K, V, mask = None):
        """Compute ``softmax(Q K^T / sqrt(d_k)) V`` over the last two dimensions.

        Args:
            Q, K, V: Per-head tensors whose last two dimensions are
                ``[seq_len, d_k]``.
            mask: Optional tensor broadcastable to the score tensor; positions
                where ``mask == 0`` are excluded from the softmax.

        Returns:
            A tuple ``(weighted_values, attention)``.
        """
        self._trace("-------- PRUEBAS EN SCALE_DOT_PRODUCT ------------")
        self._trace("Mi matriz K original:", K)
        k_trans = K.transpose(-2, -1)
        self._trace("Mi matriz Q:", Q)
        self._trace("Mi matriz K transposed", k_trans)
        qk = torch.matmul(Q, k_trans)
        self._trace("Matrix Q*K_trans:", qk)

        # Reuse the product above instead of running the matmul a second time.
        scores = qk / math.sqrt(self.d_k)
        self._trace("Scores:", scores)
        if mask is not None:
            # Use the most negative finite value of the score dtype: a literal
            # -1e9 is not representable in float16.
            scores = scores.masked_fill(mask == 0, torch.finfo(scores.dtype).min)
        attention = F.softmax(scores, dim = -1)
        self._trace("Attention:", attention)
        weighted_values = torch.matmul(attention, V)
        self._trace("Matrix V:", V)
        self._trace("Weighted Values:", weighted_values)

        return weighted_values, attention

In [62]:
# Smoke test of the class with tiny dimensions so the printed tensors stay readable.
d_model = 4 # Original: 512
num_heads = 2 # Original: 8
seq_len = 2 # Original: 10
batch_size = 1 # Original: 32

# Create random Q, K, V tensors
Q = torch.rand(batch_size, seq_len, d_model)
print(f"Q: {Q}")
print(f"Q shape: {Q.shape}")
K = torch.rand(batch_size, seq_len, d_model)
V = torch.rand(batch_size, seq_len, d_model)

# Create a MultiHeadAttention instance
mha = MultiHeadAttention(d_model, num_heads)

# Run the forward pass
output, attention = mha(Q, K, V)

print("Output shape:", output.shape)
print("Attention shape:", attention.shape)

# Check the expected shapes instead of only stating them in comments.
assert output.shape == (batch_size, seq_len, d_model), output.shape
assert attention.shape == (batch_size, num_heads, seq_len, seq_len), attention.shape
# Each attention row is a softmax over the keys, so it must sum to 1.
assert torch.allclose(attention.sum(dim=-1), torch.ones(batch_size, num_heads, seq_len))

Q: tensor([[[0.4173, 0.5896, 0.0422, 0.3887],
         [0.9875, 0.7190, 0.3843, 0.0279]]])
Q shape: torch.Size([1, 2, 4])
Mi d_v 2
Mi W_q tensor([[-0.3608,  0.1253,  0.0319,  0.4562],
        [ 0.1704, -0.4905, -0.3813, -0.3382],
        [-0.0396, -0.4052, -0.0390, -0.4643],
        [ 0.1030,  0.4129, -0.3891, -0.2233]])
-----PRUEBAS------
Mi Q_test: tensor([[[-0.3142, -0.4374, -0.8772, -0.2649],
         [-0.6574, -0.4121, -0.7980, -0.2054]]], grad_fn=<ViewBackward0>)
Mi Q_test_dims: torch.Size([1, 2, 4])
Q_test.view(): tensor([[[[-0.3142, -0.4374],
          [-0.8772, -0.2649]],

         [[-0.6574, -0.4121],
          [-0.7980, -0.2054]]]], grad_fn=<ViewBackward0>)
Q_test.view().transpose(): tensor([[[[-0.3142, -0.4374],
          [-0.6574, -0.4121]],

         [[-0.8772, -0.2649],
          [-0.7980, -0.2054]]]], grad_fn=<TransposeBackward0>)
-------- PRUEBAS EN SCALE_DOT_PRODUCT ------------
Mi matriz K original: tensor([[[[-0.0566,  0.1305],
          [ 0.0128,  0.0497]],

      