RuntimeError: The expanded size of the tensor (4) must match the existing size (2) at non-singleton dimension 1.  Target sizes: [-1, 4, -1, -1].  Tensor sizes: [2, 1, 10]

In [9]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math


class MultiHeadAttention(nn.Module):
    def __init__(self, d_model: int, h: int, dropout: float) -> None:
        super().__init__()
        self.d_model = d_model  # Embedding vector size
        self.h = h  # Number of heads
        assert d_model % h == 0, "d_model is not divisible by h"
        self.dropout = nn.Dropout(dropout)
        self.d_k = d_model // h  # Dimension of vector seen by each head
        self.q_lin = nn.Linear(d_model, d_model, bias=True)  # Wq
        self.k_lin = nn.Linear(d_model, d_model, bias=True)  # Wk
        self.v_lin = nn.Linear(d_model, d_model, bias=True)  # Wv
        self.out_lin = nn.Linear(d_model, d_model, bias=True)  # Wo

    @staticmethod
    def attention(query, key, value, mask, dropout):
        d_k = query.shape[-1]
        attention_scores = (query @ key.transpose(-2, -1)) / math.sqrt(d_k)
        if mask is not None:
            batch_size, num_heads, seq_len, _ = attention_scores.size()
            mask = mask.unsqueeze(1)  # Chỉ cần một lần unsqueeze ở đây
            mask = mask.expand(batch_size, num_heads, seq_len, seq_len)
            attention_scores = attention_scores.masked_fill(mask == 0, -1e9)
        attention_scores = attention_scores.softmax(dim=-1)
        if dropout is not None:
            attention_scores = dropout(attention_scores)
        return (attention_scores @ value), attention_scores

    def forward(self, q, k, v, mask):
        query = self.q_lin(q)
        key = self.k_lin(k)
        value = self.v_lin(v)

        query = query.view(query.shape[0], query.shape[1], self.h, self.d_k).transpose(1, 2)
        key = key.view(key.shape[0], key.shape[1], self.h, self.d_k).transpose(1, 2)
        value = value.view(value.shape[0], value.shape[1], self.h, self.d_k).transpose(1, 2)

        x, self.attention_scores = MultiHeadAttention.attention(query, key, value, mask, self.dropout)

        x = x.transpose(1, 2).contiguous().view(x.shape[0], -1, self.d_model)
        return self.out_lin(x)

In [14]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import Dropout
import math

class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, h, dropout_rate):
        super().__init__()
        self.d_model = d_model
        self.h = h
        self.d_k = d_model // h
        self.dropout = nn.Dropout(dropout_rate)
        self.q_lin = nn.Linear(d_model, d_model)
        self.k_lin = nn.Linear(d_model, d_model)
        self.v_lin = nn.Linear(d_model, d_model)
        self.out_lin = nn.Linear(d_model, d_model)

    def attention(self, query, key, value, mask, dropout):
        d_k = query.shape[-1]
        # Compute attention scores
        attention_scores = (query @ key.transpose(-2, -1)) / math.sqrt(d_k)
        if mask is not None:
            attention_scores = attention_scores.masked_fill(mask == 0, float('-inf'))
        attention_scores = F.softmax(attention_scores, dim=-1)
        if dropout is not None:
            attention_scores = dropout(attention_scores)
        # Multiply scores with value
        output = attention_scores @ value
        return output, attention_scores

    def forward(self, q, k, v, mask):
        batch_size = q.size(0)
        # Apply linear projection and split into h heads
        q = self.q_lin(q).view(batch_size, -1, self.h, self.d_k).transpose(1, 2)
        k = self.k_lin(k).view(batch_size, -1, self.h, self.d_k).transpose(1, 2)
        v = self.v_lin(v).view(batch_size, -1, self.h, self.d_k).transpose(1, 2)
        # Calculate attention
        outputs, scores = self.attention(q, k, v, mask, self.dropout)
        # Concatenate heads and put through final linear layer
        outputs = outputs.transpose(1, 2).contiguous().view(batch_size, -1, self.d_model)
        return self.out_lin(outputs)

# Parameters
batch_size = 2
seq_len = 10
d_model = 64  # Embedding size
num_heads = 4
dropout_rate = 0.1

# Create query, key, value
query = torch.rand(batch_size, seq_len, d_model)
key = torch.rand(batch_size, seq_len, d_model)
value = torch.rand(batch_size, seq_len, d_model)

# Create mask
mask = torch.zeros(batch_size, seq_len).bool()  # Assuming no padding
mask = mask.unsqueeze(1).unsqueeze(2).expand(-1, num_heads, -1, -1)  # Dimension: [batch_size, num_heads, seq_len, seq_len]

# Create dropout module
dropout = Dropout(p=dropout_rate)

# Initialize MultiHeadAttention module
mha = MultiHeadAttention(d_model, num_heads, dropout_rate)

# Call attention method
output, attention_scores = mha(query, key, value, mask)
print("Output shape:", output.shape)  # Expected shape: [batch_size, seq_len, d_model]
print("Attention Scores shape:", attention_scores.shape)  # Expected shape: [batch_size, num_heads, seq_len, seq_len]


Output shape: torch.Size([10, 64])
Attention Scores shape: torch.Size([10, 64])


In [None]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, h, dropout_rate):
        super().__init__()
        self.d_model = d_model
        self.h = h
        self.d_k = d_model // h
        self.dropout = nn.Dropout(dropout_rate)
        self.q_lin = nn.Linear(d_model, d_model)
        self.k_lin = nn.Linear(d_model, d_model)
        self.v_lin = nn.Linear(d_model, d_model)
        self.out_lin = nn.Linear(d_model, d_model)

    def attention(self, query, key, value, mask, dropout):
        d_k = query.shape[-1]
        attention_scores = (query @ key.transpose(-2, -1)) / math.sqrt(d_k)
        if mask is not None:
            attention_scores = attention_scores.masked_fill(mask == 0, float('-inf'))
        attention_scores = F.softmax(attention_scores, dim=-1)
        if dropout is not None:
            attention_scores = dropout(attention_scores)
        output = attention_scores @ value
        return output, attention_scores

    def forward(self, q, k, v, mask):
        batch_size = q.size(0)
        q = self.q_lin(q).view(batch_size, -1, self.h, self.d_k).transpose(1, 2)
        k = self.k_lin(k).view(batch_size, -1, self.h, self.d_k).transpose(1, 2)
        v = self.v_lin(v).view(batch_size, -1, self.h, self.d_k).transpose(1, 2)

        # Adjust mask for multi-head
        if mask is not None:
            mask = mask.unsqueeze(1).expand(batch_size, self.h, -1, -1)  # Adjust mask shape for multi-head attention

        outputs, scores = self.attention(q, k, v, mask, self.dropout)
        outputs = outputs.transpose(1, 2).contiguous().view(batch_size, -1, self.d_model)
        return self.out_lin(outputs)


In [19]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import Dropout
import math

# Define MultiHeadAttention
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads, dropout_rate):
        super().__init__()
        self.d_model = d_model
        self.num_heads = num_heads
        self.depth = d_model // num_heads
        
        self.q_lin = nn.Linear(d_model, d_model)
        self.k_lin = nn.Linear(d_model, d_model)
        self.v_lin = nn.Linear(d_model, d_model)
        
        self.dropout = nn.Dropout(dropout_rate)
        self.out_lin = nn.Linear(d_model, d_model)

    def split_heads(self, x, batch_size):
        x = x.view(batch_size, -1, self.num_heads, self.depth)
        return x.permute(0, 2, 1, 3)

    def attention(self, query, key, value, mask):
        # Scale the dot product by the dimensions of the key
        scaled_attention_scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(self.depth)
        if mask is not None:
            scaled_attention_scores += (mask * -1e9)
        attention_weights = F.softmax(scaled_attention_scores, dim=-1)
        attention_weights = self.dropout(attention_weights)
        output = torch.matmul(attention_weights, value)
        return output, attention_weights

    def forward(self, q, k, v, mask):
        batch_size = q.size(0)
        q = self.split_heads(self.q_lin(q), batch_size)
        k = self.split_heads(self.k_lin(k), batch_size)
        v = self.split_heads(self.v_lin(v), batch_size)

        attention_output, _ = self.attention(q, k, v, mask)
        attention_output = attention_output.transpose(1, 2).contiguous().view(batch_size, -1, self.d_model)
        output = self.out_lin(attention_output)
        return output

# Dummy data
batch_size = 1
seq_len = 60
d_model = 512
num_heads = 8

# Initialize model
dropout_rate = 0.1
model = MultiHeadAttention(d_model, num_heads, dropout_rate)

# Create query, key, value
query = torch.rand(batch_size, seq_len, d_model)
key = torch.rand(batch_size, seq_len, d_model)
value = torch.rand(batch_size, seq_len, d_model)

# Create mask
def create_causal_mask(size):
    mask = torch.triu(torch.ones(size, size) * float('-inf'), diagonal=1)
    return mask == 0

mask = create_causal_mask(seq_len).to(torch.bool)
mask = mask.unsqueeze(0).unsqueeze(1)  # fit the mask shape for the attention heads: [batch_size, 1, seq_len, seq_len]

# Test the forward pass
output = model(query, key, value, mask)
print("Output shape:", output.shape)


Output shape: torch.Size([1, 60, 512])


In [None]:
def causal_mask(size):
    mask = torch.triu(torch.ones((1, size, size)), diagonal=1).type(torch.int)
    return mask == 0

def greedy_decode(model, source, source_mask, tokenizer_src, tokenizer_tgt, max_len, device):
    sos_idx = tokenizer_tgt.convert_tokens_to_ids('[SOS]')
    eos_idx = tokenizer_tgt.convert_tokens_to_ids('[EOS]')

    encoder_output = model.encoder(source, source_mask)
    decoder_input = torch.tensor([[sos_idx]], dtype=torch.long, device=device)

    while True:
        if decoder_input.size(1) == max_len:
            break

        decoder_mask = causal_mask(decoder_input.size(1)).type_as(source_mask).to(device)
        out = model.decoder(decoder_input, encoder_output, source_mask, decoder_mask)
        prob = model.projection(out[:, -1])
        _, next_word = torch.max(prob, dim=1)

        decoder_input = torch.cat(
            [decoder_input, torch.tensor([[next_word.item()]], dtype=torch.long, device=device)], dim=1
        )

        if next_word.item() == eos_idx:
            break

    return decoder_input.squeeze(0)

In [23]:
from datasets import load_dataset
from utils.dataset import TranslationDataset

datasets = load_dataset(
    'csv', 
    data_files={
        'train': '/home/chaos/Documents/ChaosAIVision/dataset/viet2eng/train_dataset.csv',
        'validation': '/home/chaos/Documents/ChaosAIVision/dataset/viet2eng/validation_dataset.csv'
    }
)
train_dataset = datasets['train']
valid_dataset = datasets['validation']

In [24]:
from transformers import AutoTokenizer

# Giả sử bạn sử dụng tokenizer của BERT
tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")

# Tạo dataset
train_data = TranslationDataset(train_dataset, tokenizer)
valid_data = TranslationDataset(valid_dataset, tokenizer)

In [43]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads, dropout_rate):
        super().__init__()
        self.d_model = d_model
        self.num_heads = num_heads
        self.depth = d_model // num_heads        
        self.q_lin = nn.Linear(d_model, d_model)
        self.k_lin = nn.Linear(d_model, d_model)
        self.v_lin = nn.Linear(d_model, d_model)
        
        self.dropout = nn.Dropout(dropout_rate)
        self.out_lin = nn.Linear(d_model, d_model)

    def split_heads(self, x, batch_size):
        x = x.view(batch_size, -1, self.num_heads, self.depth)
        return x.permute(0, 2, 1, 3)

    def attention(self, query, key, value, mask):
        # Scale the dot product by the dimensions of the key
        scaled_attention_scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(self.depth)
        if mask is not None:
            scaled_attention_scores += (mask * -1e9)
        attention_weights = F.softmax(scaled_attention_scores, dim=-1)
        attention_weights = self.dropout(attention_weights)
        output = torch.matmul(attention_weights, value)
        return output, attention_weights

    def forward(self, q, k, v, mask):
        batch_size = q.size(0)
        q = self.split_heads(self.q_lin(q), batch_size)
        k = self.split_heads(self.k_lin(k), batch_size)
        v = self.split_heads(self.v_lin(v), batch_size)

        attention_output, attention_weights = self.attention(q, k, v, mask)
        attention_output = attention_output.transpose(1, 2).contiguous().view(batch_size, -1, self.d_model)
        output = self.out_lin(attention_output)
        return output, attention_weights




class FFN(nn.Module):
    def __init__(self, embed_dim, ff_dim, dropout):
        super(FFN, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        self.lin1 = nn.Linear(embed_dim, ff_dim)
        self.lin2 = nn.Linear(ff_dim, embed_dim)
        self.activation = nn.GELU()

    def forward(self, x):
        x = self.lin1(x)
        x = self.activation(x)
        x = self.dropout(x)
        x = self.lin2(x)
        return x
    
class TransformerBlock(nn.Module):
    def __init__(self, d_model, num_heads, ff_dim, dropout):
        super().__init__()
        self.attention = MultiHeadAttention(d_model, num_heads, dropout)
        self.sa_layer_norm = nn.LayerNorm(d_model, eps=1e-12)  # Chuẩn hóa lớp Self-Attention
        self.ffn = FFN(d_model, ff_dim, dropout)
        self.output_layer_norm = nn.LayerNorm(d_model, eps=1e-12)  # Chuẩn hóa lớp đầu ra

    def forward(self, x, mask):
        # Self-Attention và residual connection
        attn_output, _ = self.attention(x, x, x, mask)  # (batch_size, seq_len, d_model)
        x = self.sa_layer_norm(x + attn_output)  # Residual connection và chuẩn hóa lớp

        # Mạng Feed-Forward và residual connection
        ffn_output = self.ffn(x)  # (batch_size, seq_len, d_model)
        x = self.output_layer_norm(x + ffn_output)  # Residual connection và chuẩn hóa lớp

        return x


class ProjectionLayer(nn.Module):

    def __init__(self, d_model, vocab_size) -> None:
        super().__init__()
        self.proj = nn.Linear(d_model, vocab_size)

    def forward(self, x):
        # (batch, seq_len, d_model) --> (batch, seq_len, vocab_size)
        return self.proj(x)


class TokenAndPositionEmbedding(nn.Module):
    def __init__(self, vocab_size, d_model, max_length, dropout, device):
        super().__init__()
        self.device = device
        self.word_embeddings = nn.Embedding(num_embeddings=vocab_size, embedding_dim=d_model)
        self.position_embeddings = nn.Embedding(num_embeddings=max_length, embedding_dim=d_model)
        self.Layer_norm = nn.LayerNorm(d_model, eps=1e-12)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x):
        N, seq_len = x.size()
        positions = torch.arange(0, seq_len, device=self.device).unsqueeze(0)
        word_emb = self.word_embeddings(x)
        pos_emb = self.position_embeddings(positions)
        x = word_emb + pos_emb
        return self.Layer_norm(self.dropout(x))


class TransformerEncoder(nn.Module):
    def __init__(self, vocab_size, max_length, d_model, num_heads, ff_dim, dropout, device):
        super().__init__()
        self.embd_layer = TokenAndPositionEmbedding(vocab_size, d_model, max_length, dropout, device)
        self.transformer_layers = nn.ModuleList([
            TransformerBlock(d_model, num_heads, ff_dim, dropout)
            for _ in range(6)  # Number of transformer layers
        ])
        
    def forward(self, x, mask=None):
        x = self.embd_layer(x)
        for layer in self.transformer_layers:
            x = layer(x, mask)
        return x

class TransformerDecoder(nn.Module):
    def __init__(self, vocab_size, max_length, d_model, num_heads, ff_dim, dropout, device):
        super().__init__()
        self.embd_layer = TokenAndPositionEmbedding(vocab_size, d_model, max_length, dropout, device)
        self.transformer_layers = nn.ModuleList([
            TransformerBlock(d_model, num_heads, ff_dim, dropout)
            for _ in range(3)  # Number of transformer layers
        ])
        self.layer_norm = nn.LayerNorm(d_model)
        self.attention = MultiHeadAttention(d_model, num_heads, dropout)

    def forward(self, x, enc_output, src_mask=None, tgt_mask=None):
        x = self.embd_layer(x)
        for layer in self.transformer_layers:
            x = layer(x, tgt_mask)
            # Add cross attention layer
            cross_attn_output, _ = self.attention(x, enc_output, enc_output, src_mask)
            x = x + cross_attn_output  # Residual connection after cross attention
            x = self.layer_norm(x)  # Apply layer normalization after residual connection
        return x  # Trả về kích thước (batch_size, seq_len, d_model)


class TransformerSeq2Seq(nn.Module):
    def __init__(self, vocab_size, max_length, d_model, num_heads, ff_dim, dropout, device):
        super().__init__()
        self.encoder = TransformerEncoder(vocab_size, max_length, d_model, num_heads, ff_dim, dropout, device)
        self.decoder = TransformerDecoder(vocab_size, max_length, d_model, num_heads, ff_dim, dropout, device)
        self.projection = nn.Linear(d_model, vocab_size)  # Projection layer

    def forward(self, src, tgt, src_mask=None, tgt_mask=None):
        enc_output = self.encoder(src, src_mask)
        dec_output = self.decoder(tgt, enc_output, src_mask, tgt_mask)
        return self.projection(dec_output)  # Dùng lớp projection để chuyển đổi từ d_model -> vocab_size


In [44]:
import torch
d_model = 768
num_heads = 12
ff_dim = 3072
dropout = 0.1

model = TransformerSeq2Seq(
        vocab_size=vocab_size,
        max_length=max_length,
        d_model=d_model,
        num_heads=num_heads,
        ff_dim=ff_dim,
        dropout=dropout,
        device=device
    ).to(device)
# Các thông tin cần thiết
max_length = 128  # Độ dài tối đa của chuỗi
vocab_size = 30522  # Giả sử vocab size của bạn là 30522 (ví dụ cho BERT tokenizer)
batch_size = 1  # Batch size của bạn, có thể là 1 hoặc hơn tùy ý

# Tạo dummy input giống như dataset
input_ids = torch.randint(1, vocab_size, (batch_size, max_length))  # Giả định input_ids có các giá trị ngẫu nhiên từ 1 đến vocab_size
attention_mask = torch.ones((batch_size, 1, 1, max_length))  # Giả định tất cả các từ đều được chú ý (1s)
decoder_input = torch.randint(1, vocab_size, (batch_size, max_length))  # Giả định input_ids cho decoder
decoder_mask = torch.ones((batch_size, 1, max_length, max_length))  # Giả định mask cho decoder
labels = torch.randint(1, vocab_size, (batch_size, max_length))  # Giả định nhãn (labels) có các giá trị ngẫu nhiên từ 1 đến vocab_size

# Đưa dummy input vào model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

input_ids = input_ids.to(device)
attention_mask = attention_mask.to(device)
decoder_input = decoder_input.to(device)
decoder_mask = decoder_mask.to(device)
labels = labels.to(device)

# Đặt mô hình vào chế độ đánh giá
model.eval()
with torch.no_grad():
    encoder_output = model.encoder(input_ids, attention_mask)
    decoder_output = model.decoder(decoder_input, encoder_output, attention_mask, decoder_mask)
    # Kiểm tra kích thước đầu ra của decoder trước khi đưa vào ProjectionLayer
    print("Decoder output size:", decoder_output.size())  # Nên là (batch_size, seq_len, d_model)

    # Sau đó truyền nó vào lớp ProjectionLayer
    output = model.projection(decoder_output)

    # Kiểm tra lại kích thước đầu ra sau ProjectionLayer
    print("Final output size:", output.size())  # Nên là (batch_size, seq_len, vocab_size)


# In ra kích thước đầu ra và giá trị của đầu ra
print("Output size:", output.size())
print("Output:", output)


Decoder output size: torch.Size([1, 128, 768])
Final output size: torch.Size([1, 128, 30522])
Output size: torch.Size([1, 128, 30522])
Output: tensor([[[-0.0514, -0.1942, -0.7749,  ..., -0.9980,  0.0474, -0.4377],
         [-0.7259, -0.8567, -0.4070,  ..., -1.0523,  1.1871, -0.7497],
         [ 0.4172,  0.4006,  0.4665,  ...,  0.0643, -1.1330,  0.1484],
         ...,
         [ 0.8848, -1.2318, -0.1485,  ..., -0.1547,  0.5888, -0.9604],
         [-0.0530, -0.2978, -0.4419,  ...,  1.0947,  0.1357, -0.0667],
         [-0.2117, -0.4139,  0.0339,  ...,  0.2559,  0.5129, -0.6952]]],
       device='cuda:0')
