In [66]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [67]:
import torch
import torch.nn as nn

In [68]:
class LayerNormalization(nn.Module):
    """Layer normalization over the last dimension with a learnable affine transform.

    Each vector along the last axis is shifted to zero mean and scaled to unit
    (biased) variance, then multiplied by ``scale`` and offset by ``shift``.

    Args:
        emb_dim: Size of the last dimension of the input; also the length of
            the learnable ``scale`` and ``shift`` vectors.
        eps: Small constant added to the variance before the square root to
            avoid division by zero. Defaults to ``1e-5``.
    """

    def __init__(self, emb_dim, eps=1e-5):
        super().__init__()
        self.eps = eps
        # Initialised to the identity transform: scale = 1, shift = 0.
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))

    def forward(self, x):
        """Normalize ``x`` along its last dimension.

        Args:
            x: Tensor whose last dimension has size ``emb_dim``.

        Returns:
            Tensor of the same shape as ``x``.
        """
        mean = x.mean(dim=-1, keepdim=True)
        # unbiased=False -> divide by N (population variance), not N - 1.
        var = x.var(dim=-1, keepdim=True, unbiased=False)
        norm_x = (x - mean) / torch.sqrt(var + self.eps)
        return self.scale * norm_x + self.shift

In [69]:
class GELU(nn.Module):
    """GELU activation using the tanh approximation.

    Computes ``0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x**3)))``
    element-wise.
    """

    # sqrt(2 / pi) kept as a plain Python float: it is computed once instead
    # of allocating a new tensor on every forward pass, and it is applied at
    # the precision of the input instead of being rounded to float32 first.
    _SQRT_2_OVER_PI = (2.0 / torch.pi) ** 0.5

    def __init__(self):
        super().__init__()

    def forward(self, x):
        """Apply the activation element-wise.

        Args:
            x: Input tensor of any shape.

        Returns:
            Tensor with the same shape and dtype as ``x``.
        """
        inner = self._SQRT_2_OVER_PI * (x + 0.044715 * torch.pow(x, 3))
        return 0.5 * x * (1 + torch.tanh(inner))

In [70]:
class FeedForward(nn.Module):
    """Position-wise feed-forward block: Linear -> GELU -> Linear.

    The hidden width is four times ``d_model`` and the output width is
    ``d_model`` again.

    Args:
        d_model: Input and output feature size.
        dropout: Accepted for signature compatibility; this module does not
            use it.
    """

    def __init__(self, d_model, dropout):
        super().__init__()
        self.d_model = d_model
        hidden_dim = 4 * self.d_model
        # Order inside the Sequential is fixed: expand, activate, project back.
        expand = nn.Linear(self.d_model, hidden_dim)
        activation = GELU()
        project = nn.Linear(hidden_dim, self.d_model)
        self.layers = nn.Sequential(expand, activation, project)

    def forward(self, x):
        """Run ``x`` through the expand / activate / project stack."""
        out = self.layers(x)
        return out

In [71]:
class Embedding(nn.Module):
    """Sum of token, position and segment embeddings followed by layer norm.

    Args:
        d_model: Size of every embedding vector.
        vocab_size: Number of rows in the token embedding table.
        max_seq_len: Number of rows in the position embedding table; inputs
            longer than this cannot be embedded.
        n_segments: Number of rows in the segment embedding table.
    """

    def __init__(self, d_model, vocab_size, max_seq_len, n_segments):
        super().__init__()
        self.tok_emb = nn.Embedding(vocab_size, d_model)
        self.pos_emb = nn.Embedding(max_seq_len, d_model)
        self.seg_emb = nn.Embedding(n_segments, d_model)
        self.norm = LayerNormalization(d_model)

    def forward(self, x, seg):
        """Embed token ids and segment ids.

        Args:
            x: Integer token ids of shape (batch_size, seq_len).
            seg: Integer segment ids with the same shape as ``x``.

        Returns:
            Normalized embeddings of shape (batch_size, seq_len, d_model).
        """
        seq_len = x.size(1)
        # Create the position ids on the same device as the input; otherwise
        # they land on the default (CPU) device and the lookup fails with a
        # device mismatch when the model and inputs live on a GPU.
        pos = torch.arange(seq_len, dtype=torch.long, device=x.device)
        pos = pos.unsqueeze(0).expand_as(x)  # (seq_len,) -> (batch_size, seq_len)
        embedding = self.tok_emb(x) + self.pos_emb(pos) + self.seg_emb(seg)
        return self.norm(embedding)

In [72]:
class BERTMultiheadAttention(nn.Module):
    """Bidirectional multi-head self-attention with an optional padding mask.

    Args:
        d_model: Feature size of the input and output.
        num_heads: Number of attention heads; must divide ``d_model``.
        dropout: Dropout probability applied to the attention weights.

    Raises:
        AssertionError: If ``d_model`` is not divisible by ``num_heads``.
    """

    def __init__(self, d_model, num_heads, dropout):
        super().__init__()
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"

        self.d_model = d_model
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads

        # Linear projections for Query, Key, Value and the output (all bias-free)
        self.W_query = nn.Linear(d_model, d_model, bias=False)
        self.W_key = nn.Linear(d_model, d_model, bias=False)
        self.W_value = nn.Linear(d_model, d_model, bias=False)
        self.out_prj = nn.Linear(d_model, d_model, bias=False)

        # Dropout for regularization of the attention weights
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, attention_mask=None):
        """Apply self-attention to ``x``.

        Args:
            x: Input tensor of shape (batch_size, seq_len, d_model).
            attention_mask: Optional mask of shape (batch_size, seq_len), where
                1 indicates valid tokens and 0 indicates padding. Padded
                positions are excluded as keys. If every position of a
                sequence is padding, attention for that sequence is spread
                uniformly over its positions instead of producing NaN.

        Returns:
            Tensor of shape (batch_size, seq_len, d_model).
        """
        b, seq_len, _ = x.shape

        # Compute Q, K, V
        queries = self.W_query(x)  # Shape: (batch_size, seq_len, d_model)
        keys = self.W_key(x)
        values = self.W_value(x)

        # Split into multiple heads
        queries = queries.view(b, seq_len, self.num_heads, self.head_dim).transpose(1, 2)  # (b, num_heads, seq_len, head_dim)
        keys = keys.view(b, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
        values = values.view(b, seq_len, self.num_heads, self.head_dim).transpose(1, 2)

        # Compute scaled dot-product attention
        attn_scores = torch.matmul(queries, keys.transpose(-2, -1))  # (b, num_heads, seq_len, seq_len)
        attn_scores = attn_scores / (self.head_dim ** 0.5)  # Scale by sqrt(d_k)

        # Apply attention mask
        if attention_mask is not None:
            # Expand mask to match attention scores shape
            # attention_mask: (b, seq_len) -> (b, 1, 1, seq_len)
            key_is_padding = (attention_mask == 0).unsqueeze(1).unsqueeze(2)
            # Use the most negative finite value rather than -inf: masked keys
            # still receive exactly zero weight whenever a valid key exists,
            # but a row whose keys are all masked no longer makes softmax
            # return NaN.
            mask_value = torch.finfo(attn_scores.dtype).min
            attn_scores = attn_scores.masked_fill(key_is_padding, mask_value)

        # Compute attention weights
        attn_weights = torch.softmax(attn_scores, dim=-1)  # (b, num_heads, seq_len, seq_len)
        attn_weights = self.dropout(attn_weights)

        # Compute context vector
        context = torch.matmul(attn_weights, values)  # (b, num_heads, seq_len, head_dim)
        context = context.transpose(1, 2).contiguous()  # (b, seq_len, num_heads, head_dim)
        context = context.view(b, seq_len, self.d_model)  # Combine heads

        # Output projection
        output = self.out_prj(context)
        return output


In [73]:
class TransformerEncoderLayer(nn.Module):
    """One post-norm encoder block: self-attention then feed-forward.

    Each sub-layer's output goes through dropout, is added to the sub-layer's
    input, and the sum is layer-normalized.

    Args:
        d_model: Feature size of the hidden states.
        num_heads: Number of attention heads.
        dropout: Dropout probability passed to the attention module, the
            feed-forward module and the two residual dropouts.
    """

    def __init__(self, d_model, num_heads, dropout):
        super().__init__()
        # Attention sub-layer and its residual dropout / norm.
        self.attention = BERTMultiheadAttention(d_model, num_heads, dropout)
        self.norm1 = LayerNormalization(d_model)
        self.dropout1 = nn.Dropout(dropout)

        # Feed-forward sub-layer and its residual dropout / norm.
        self.feed_forward = FeedForward(d_model, dropout)
        self.norm2 = LayerNormalization(d_model)
        self.dropout2 = nn.Dropout(dropout)

    def forward(self, x, attention_mask=None):
        """Return the transformed hidden states for ``x``.

        ``attention_mask`` is forwarded unchanged to the attention module.
        """
        # Sub-layer 1: attention -> dropout -> residual add -> norm
        attended = self.dropout1(self.attention(x, attention_mask))
        hidden = self.norm1(x + attended)

        # Sub-layer 2: feed-forward -> dropout -> residual add -> norm
        transformed = self.dropout2(self.feed_forward(hidden))
        return self.norm2(hidden + transformed)

In [74]:
class TransformerEncoder(nn.Module):
    """A stack of ``num_layers`` encoder layers applied in sequence.

    Args:
        d_model: Feature size of the hidden states.
        num_heads: Number of attention heads in every layer.
        num_layers: How many encoder layers to stack.
        dropout: Dropout probability passed to every layer.
    """

    def __init__(self, d_model, num_heads, num_layers, dropout):
        super().__init__()
        self.layers = nn.ModuleList()
        for _ in range(num_layers):
            self.layers.append(TransformerEncoderLayer(d_model, num_heads, dropout))

    def forward(self, x, attention_mask=None):
        """Feed ``x`` through every layer, passing the same mask to each."""
        hidden = x
        for block in self.layers:
            hidden = block(hidden, attention_mask)
        return hidden

In [75]:
class BERT(nn.Module):
    """Encoder-only model: input embeddings followed by a transformer encoder.

    Args:
        d_model: Feature size of embeddings and hidden states.
        vocab_size: Size of the token vocabulary.
        max_seq_len: Number of positions the position embedding supports.
        n_segments: Number of distinct segment ids.
        num_heads: Attention heads per encoder layer.
        num_layers: Number of encoder layers.
        dropout: Dropout probability used inside the encoder.
    """

    def __init__(self, d_model, vocab_size, max_seq_len, n_segments, num_heads, num_layers, dropout):
        super().__init__()
        self.embedding = Embedding(d_model, vocab_size, max_seq_len, n_segments)
        self.encoder = TransformerEncoder(d_model, num_heads, num_layers, dropout)

    def forward(self, input_ids, segment_ids, attention_mask=None):
        """Embed the inputs, then encode them.

        Returns the encoder output for ``input_ids`` / ``segment_ids``;
        ``attention_mask`` is handed to the encoder as-is.
        """
        embedded = self.embedding(input_ids, segment_ids)
        return self.encoder(embedded, attention_mask)

In [76]:
# Hyperparameters for a small BERT-style model.
vocab_size = 30522   # rows in the token embedding table
max_seq_len = 128    # rows in the position embedding table
n_segments = 2       # rows in the segment embedding table
d_model = 64         # hidden size
num_heads = 8        # attention heads per layer
num_layers = 6       # stacked encoder layers
dropout = 0.1        # dropout probability

bert_kwargs = dict(
    d_model=d_model,
    vocab_size=vocab_size,
    max_seq_len=max_seq_len,
    n_segments=n_segments,
    num_heads=num_heads,
    num_layers=num_layers,
    dropout=dropout,
)
model = BERT(**bert_kwargs)
model

BERT(
  (embedding): Embedding(
    (tok_emb): Embedding(30522, 64)
    (pos_emb): Embedding(128, 64)
    (seg_emb): Embedding(2, 64)
    (norm): LayerNormalization()
  )
  (encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-5): 6 x TransformerEncoderLayer(
        (attention): BERTMultiheadAttention(
          (W_query): Linear(in_features=64, out_features=64, bias=False)
          (W_key): Linear(in_features=64, out_features=64, bias=False)
          (W_value): Linear(in_features=64, out_features=64, bias=False)
          (out_prj): Linear(in_features=64, out_features=64, bias=False)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (norm1): LayerNormalization()
        (dropout1): Dropout(p=0.1, inplace=False)
        (feed_forward): FeedForward(
          (layers): Sequential(
            (0): Linear(in_features=64, out_features=256, bias=True)
            (1): GELU()
            (2): Linear(in_features=256, out_features=64, bias=True)
          )
 

In [77]:
batch_size = 2
seq_len = 10  # Length of each input sequence

# Random token IDs -> (batch_size, seq_len)
input_ids = torch.randint(0, vocab_size, (batch_size, seq_len))

# Segment IDs distinguish sentence A (0) from sentence B (1) -> (batch_size, seq_len)
segment_ids = torch.randint(0, n_segments, (batch_size, seq_len))

# attention_mask marks real tokens with 1 and padding with 0.
# Row 1: 5 real tokens then 5 padding; row 2: 4 real tokens then 6 padding.
real_token_counts = torch.tensor([5, 4])
attention_mask = (torch.arange(seq_len) < real_token_counts.unsqueeze(1)).long()
output = model(input_ids, segment_ids, attention_mask)
output

tensor([[[ 7.2763e-04, -2.7103e+00,  1.1278e+00,  ..., -3.4072e-01,
          -6.3131e-01,  1.5203e-01],
         [-4.9745e-01, -1.0996e+00,  8.4621e-01,  ...,  9.9676e-01,
           2.9121e-01,  6.2709e-01],
         [ 4.8468e-01, -1.2053e+00,  9.1026e-01,  ..., -4.7352e-01,
          -1.2227e-01,  2.6009e-02],
         ...,
         [ 2.0146e+00, -2.3029e+00,  2.9124e-01,  ...,  3.7293e-01,
           1.4887e-02,  8.8913e-01],
         [ 9.6224e-01, -1.5969e+00,  1.5195e+00,  ..., -4.8423e-02,
          -1.2846e+00, -2.3044e-02],
         [-7.8419e-01, -1.9584e+00,  1.4113e+00,  ..., -1.7711e-01,
          -1.6480e+00,  2.2315e-01]],

        [[ 1.9133e-01, -2.0663e+00,  1.3880e+00,  ..., -1.2733e+00,
          -9.7928e-01, -5.2106e-01],
         [ 5.3954e-02, -2.5436e+00,  4.3138e-01,  ...,  5.3130e-01,
           2.0112e-01,  1.8335e+00],
         [ 6.6519e-03, -1.3078e+00,  1.7699e-01,  ..., -1.9498e-02,
           4.7979e-01,  8.5968e-01],
         ...,
         [ 7.5574e-01, -6