In [3]:
import pandas as pd
import re
from underthesea import word_tokenize

class Tokenizer:
    def __init__(self):
        self.use_re = False
        self.stop_words = set()
    
    def preprocess_text(self, text, re_pattern=None, stop_words=None):
        words = word_tokenize(text)
        if self.use_re and re_pattern:
            pattern = "[" + re.escape(re_pattern) + "]"
            text = re.sub(pattern, '', text)
            words = word_tokenize(text)
        if self.stop_words and stop_words:
            words = [word for word in words if word.lower() not in self.stop_words]
        return words

    def tokenize_df(self, df, input_column, output_column, re_pattern=None, stop_words=None):
        if re_pattern:
            self.use_re = True
        else:
            self.use_re = False
        if stop_words:
            with open(stop_words, "r", encoding="utf-8") as file:
                self.stop_words.update(line.strip() for line in file)
        else:
            self.stop_words = set()

        df['input'] = df[input_column].apply(lambda x: self.preprocess_text(x, re_pattern, stop_words))
        df['output'] = df[output_column].apply(lambda x: self.preprocess_text(x, re_pattern, stop_words))

        df.drop(columns=[input_column, output_column], inplace=True)
        return df

# Example usage
data = {
    'patterns0': ['Hello, how are you?', 'I am fine, thank you!'],
    'tag': ['greetings', 'responses']
}

df = pd.DataFrame(data)

tokenizer = Tokenizer()

# With re pattern and stop words
df_processed_with_re = tokenizer.tokenize_df(df.copy(), 'patterns0', 'tag', "[.;\]=]!?", "vietnamese.txt")
print("Processed with re pattern and stop words:")
print(df_processed_with_re[['input', 'output']])

# Without re pattern but with stop words
df_processed_with_stop_words = tokenizer.tokenize_df(df.copy(), 'patterns0', 'tag', stop_words="vietnamese.txt")
print("\nProcessed without re pattern but with stop words:")
print(df_processed_with_stop_words[['input', 'output']])

Processed with re pattern and stop words:
                          input       output
0     [Hello, ,, how, are, you]  [greetings]
1  [I, am, fine, ,, thank, you]  [responses]

Processed without re pattern but with stop words:
                             input       output
0     [Hello, ,, how, are, you, ?]  [greetings]
1  [I, am, fine, ,, thank, you, !]  [responses]


In [4]:
index_dict = {}
index = 2

# Gán chỉ số riêng biệt cho mỗi từ trong cột "patterns" và nhãn trong cột "tag"
for i, row in df_processed_with_stop_words.iterrows():
    indices = []
    for word in row['input']:
        if word not in index_dict:
            index_dict[word] = index
            index += 1
        indices.append(index_dict[word])
    df_processed_with_stop_words.at[i, 'input'] = indices
    
    label = row['output'][0]
    if label not in index_dict:
        index_dict[label] = index
        index += 1
    df_processed_with_stop_words.at[i, 'output'] = index_dict[label]


df_processed_with_stop_words


Unnamed: 0,input,output
0,"[2, 3, 4, 5, 6, 7]",8
1,"[9, 10, 11, 3, 12, 6, 13]",14


In [7]:
with open("vietnamese.txt", "r", encoding="utf-8") as file:
    stop_words = set(line.strip() for line in file)
print(stop_words)
class Tokenizer:
    def __init__(self):
        pass
    def preprocess_text(self, text, re_pattern=None, stop_words=None):
        # words = word_tokenize(text)
        if re_pattern:
            pattern = "[" + re.escape(re_pattern) + "]"
            text = re.sub(pattern, '', text)
        words = word_tokenize(text)
        if stop_words:
            words = [word for word in words if word.lower() not in stop_words]
        return words

    def tokenize_df(self, df, input_column, output_column, re_pattern=None, stop_words=None):

        df['input'] = df[input_column].apply(lambda x: self.preprocess_text(x, re_pattern, stop_words))
        df['output'] = df[output_column].apply(lambda x: self.preprocess_text(x, re_pattern, stop_words))

        df.drop(columns=[input_column, output_column], inplace=True)
        return df


tokenizer = Tokenizer()

# With re pattern and stop words
df_processed_with_re = tokenizer.tokenize_df(df.copy(), 'patterns0', 'tag', "[.;\]=]?", stop_words)
print("Processed with re pattern and stop words:")
print(df_processed_with_re[['input', 'output']])

# Without re pattern but with stop words
df_processed_with_stop_words = tokenizer.tokenize_df(df.copy(), 'patterns0', 'tag',  stop_words=stop_words)
print("\nProcessed without re pattern but with stop words:")
print(df_processed_with_stop_words[['input', 'output']])

{'dạ con', 'sáng rõ', 'ừ ào', 'chao ôi', 'gây giống', 'bấy giờ', 'có ngày', 'tới gần', 'bao lâu', 'lần khác', 'bất kỳ', 'ngay từ', 'nhân tiện', 'nhìn xuống', 'xon xón', 'thốc', 'nhân dịp', 'chứ không phải', 'tìm việc', 'ra vào', 'chết thật', 'không nhận', 'alô', 'dở chừng', 'ở đây', 'tên chính', 'ví phỏng', 'nhiều', 'từ thế', 'bỗng không', 'hiện tại', 'tính căn', 'tìm', 'lên ngôi', 'biết mình', 'thậm từ', 'gì', 'đúng ra', 'mở mang', 'rồi nữa', 'phóc', 'cơ cùng', 'chu cha', 'xem số', 'ngồi không', 'bỏ mẹ', 'loại từ', 'vụt', 'không ai', 'đạt', 'sau đây', 'quả', 'chậc', 'ắt là', 'nhờ chuyển', 'tránh', 'trước kia', 'cả đến', 'chùn chũn', 'lâu ngày', 'cật lực', 'giờ lâu', 'chúng mình', 'chứ như', 'chuyển tự', 'được nước', 'lấy thêm', 'lên', 'quả là', 'tính từ', 'đáng lí', 'nào phải', 'chăng', 'là thế nào', 'sao cho', 'chung nhau', 'sì', 'cảm thấy', 'phía sau', 'ba họ', 'làm bằng', 'tuy có', 'hơn', 'không được', 'tuy rằng', 'hay', 'dễ nghe', 'nghe đâu', 'qua chuyện', 'mọi nơi', 'anh ấy', 'cụ

In [2]:
import torch.nn as nn
import torch
import math

class EmbeddiingInput(nn.Module):
    def __init__(self, vocab_size, d_model):
        super().__init__()
        self.vocal_size = vocab_size
        self.d_model = d_model
        self.embed = nn.Embedding(vocab_size, d_model)
    
    def forward(self, x):
        return self.embed(x) *math.sqrt(self.d_model)
    
class PositionEncoding(nn.Module):
    def __init__(self, d_model :int, seq_len: int, dropout: float) -> None:
        super().__init__()
        self.d_model =d_model
        self.seq_len = seq_len
        self.dropout = nn.Dropout(p=dropout)
        
        """creat a matrix of shape(seq_len, d_model)"""
        pe = torch.zeros(seq_len, d_model)
        position = torch.arange(0,seq_len,dtype = torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0,d_model,2).float(*(-math.log(10000.0)/d_model)))
        
        pe[ :,0::2] = torch.sin(position * div_term)
        pe[ :,1::2] = torch.cos(position * div_term)
        
        pe = pe.unsqueeze(0)
        self.register_buffer("pe", pe)
        
    def forward(self,x):
        x = x +(self.pe[ :,:x.shape(1):]).requires_gard_(False)
        return self.Dropout(x)
    
class LayerNormalize(nn.Module):
    def __init__(self, eps :float=10**-6):
        super().__init__()
        self.eps = eps
        self.alpha = nn.Parameter(torch.ones(1))
        self.bias = nn.Parameter(torch.zeros(1))
        
    def forward(self,x):
        mean = x.mean(dim =-1,keepdim=True)
        std = x.std(dim=-1,keepdim = True)
        return self.alpha * (x -mean)/(std + self.eps) + self.bias
class FeedForwardBlock(nn.Module):

    def __init__(self, d_model: int, d_ff: int, dropout: float) -> None:
        super().__init__()
        self.linear_1 = nn.Linear(d_model, d_ff) # w1 and b1
        self.dropout = nn.Dropout(dropout)
        self.linear_2 = nn.Linear(d_ff, d_model) # w2 and b2

    def forward(self, x):
        # (batch, seq_len, d_model) --> (batch, seq_len, d_ff) --> (batch, seq_len, d_model)
        return self.linear_2(self.dropout(torch.relu(self.linear_1(x))))

class InputEmbeddings(nn.Module):
    def __init__(self, d_model: int, vocab_size: int) -> None:
        super().__init__()
        self.d_model = d_model
        self.vocab_size = vocab_size
        self.embedding = nn.Embedding(vocab_size, d_model)

    def forward(self, x):
        # (batch, seq_len) --> (batch, seq_len, d_model)
        # Multiply by sqrt(d_model) to scale the embeddings according to the paper
        return self.embedding(x) * math.sqrt(self.d_model)        
            
class ResidualConnection(nn.Module):

    def __init__(self, dropout: float) -> None:
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        self.norm = LayerNormalize()

    def forward(self, x, sublayer):
        return x + self.dropout(sublayer(self.norm(x)))    
class MultiHeadAttentionBlock(nn.Module):

    def __init__(self, d_model: int, h: int, dropout: float) -> None:
        super().__init__()
        self.d_model = d_model # Embedding vector size
        self.h = h # Number of heads
        # Make sure d_model is divisible by h
        assert d_model % h == 0, "d_model is not divisible by h"

        self.d_k = d_model // h # Dimension of vector seen by each head
        self.w_q = nn.Linear(d_model, d_model, bias=False) # Wq
        self.w_k = nn.Linear(d_model, d_model, bias=False) # Wk
        self.w_v = nn.Linear(d_model, d_model, bias=False) # Wv
        self.w_o = nn.Linear(d_model, d_model, bias=False) # Wo
        self.dropout = nn.Dropout(dropout)

    @staticmethod
    def attention(query, key, value, mask, dropout: nn.Dropout):
        d_k = query.shape[-1]
        # Just apply the formula from the paper
        # (batch, h, seq_len, d_k) --> (batch, h, seq_len, seq_len)
        attention_scores = (query @ key.transpose(-2, -1)) / math.sqrt(d_k)
        if mask is not None:
            # Write a very low value (indicating -inf) to the positions where mask == 0
            attention_scores.masked_fill_(mask == 0, -1e9)
        attention_scores = attention_scores.softmax(dim=-1) # (batch, h, seq_len, seq_len) # Apply softmax
        if dropout is not None:
            attention_scores = dropout(attention_scores)
        # (batch, h, seq_len, seq_len) --> (batch, h, seq_len, d_k)
        # return attention scores which can be used for visualization
        return (attention_scores @ value), attention_scores

    def forward(self, q, k, v, mask):
        query = self.w_q(q) # (batch, seq_len, d_model) --> (batch, seq_len, d_model)
        key = self.w_k(k) # (batch, seq_len, d_model) --> (batch, seq_len, d_model)
        value = self.w_v(v) # (batch, seq_len, d_model) --> (batch, seq_len, d_model)

        # (batch, seq_len, d_model) --> (batch, seq_len, h, d_k) --> (batch, h, seq_len, d_k)
        query = query.view(query.shape[0], query.shape[1], self.h, self.d_k).transpose(1, 2)
        key = key.view(key.shape[0], key.shape[1], self.h, self.d_k).transpose(1, 2)
        value = value.view(value.shape[0], value.shape[1], self.h, self.d_k).transpose(1, 2)

        # Calculate attention
        x, self.attention_scores = MultiHeadAttentionBlock.attention(query, key, value, mask, self.dropout)
        
        # Combine all the heads together
        # (batch, h, seq_len, d_k) --> (batch, seq_len, h, d_k) --> (batch, seq_len, d_model)
        x = x.transpose(1, 2).contiguous().view(x.shape[0], -1, self.h * self.d_k)

        # Multiply by Wo
        # (batch, seq_len, d_model) --> (batch, seq_len, d_model)  
        return self.w_o(x)
class EncoderBlock(nn.Module):

    def __init__(self, self_attention_block: MultiHeadAttentionBlock, feed_forward_block: FeedForwardBlock, dropout: float) -> None:
        super().__init__()
        self.self_attention_block = self_attention_block
        self.feed_forward_block = feed_forward_block
        self.residual_connections = nn.ModuleList([ResidualConnection(dropout) for _ in range(2)])

    def forward(self, x, src_mask):
        x = self.residual_connections[0](x, lambda x: self.self_attention_block(x, x, x, src_mask))
        x = self.residual_connections[1](x, self.feed_forward_block)
        return x
    
class Encoder(nn.Module):

    def __init__(self, layers: nn.ModuleList) -> None:
        super().__init__()
        self.layers = layers
        self.norm = LayerNormalize()

    def forward(self, x, mask):
        for layer in self.layers:
            x = layer(x, mask)
        return self.norm(x)

class DecoderBlock(nn.Module):

    def __init__(self, self_attention_block: MultiHeadAttentionBlock, cross_attention_block: MultiHeadAttentionBlock, feed_forward_block: FeedForwardBlock, dropout: float) -> None:
        super().__init__()
        self.self_attention_block = self_attention_block
        self.cross_attention_block = cross_attention_block
        self.feed_forward_block = feed_forward_block
        self.residual_connections = nn.ModuleList([ResidualConnection(dropout) for _ in range(3)])

    def forward(self, x, encoder_output, src_mask, tgt_mask):
        x = self.residual_connections[0](x, lambda x: self.self_attention_block(x, x, x, tgt_mask))
        x = self.residual_connections[1](x, lambda x: self.cross_attention_block(x, encoder_output, encoder_output, src_mask))
        x = self.residual_connections[2](x, self.feed_forward_block)
        return x
    
class Decoder(nn.Module):

    def __init__(self, layers: nn.ModuleList) -> None:
        super().__init__()
        self.layers = layers
        self.norm = LayerNormalize()

    def forward(self, x, encoder_output, src_mask, tgt_mask):
        for layer in self.layers:
            x = layer(x, encoder_output, src_mask, tgt_mask)
        return self.norm(x)

class ProjectionLayer(nn.Module):

    def __init__(self, d_model, vocab_size) -> None:
        super().__init__()
        self.proj = nn.Linear(d_model, vocab_size)

    def forward(self, x) -> None:
        # (batch, seq_len, d_model) --> (batch, seq_len, vocab_size)
        return torch.log_softmax(self.proj(x), dim = -1)
    
class Transformer(nn.Module):

    def __init__(self, encoder: Encoder, decoder: Decoder, src_embed: InputEmbeddings, tgt_embed: InputEmbeddings, src_pos:  PositionEncoding, tgt_pos:  PositionEncoding, projection_layer: ProjectionLayer) -> None:
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.src_embed = src_embed
        self.tgt_embed = tgt_embed
        self.src_pos = src_pos
        self.tgt_pos = tgt_pos
        self.projection_layer = projection_layer

    def encode(self, src, src_mask):
        # (batch, seq_len, d_model)
        src = self.src_embed(src)
        src = self.src_pos(src)
        return self.encoder(src, src_mask)
    
    def decode(self, encoder_output: torch.Tensor, src_mask: torch.Tensor, tgt: torch.Tensor, tgt_mask: torch.Tensor):
        # (batch, seq_len, d_model)
        tgt = self.tgt_embed(tgt)
        tgt = self.tgt_pos(tgt)
        return self.decoder(tgt, encoder_output, src_mask, tgt_mask)
    
    def project(self, x):
        # (batch, seq_len, vocab_size)
        return self.projection_layer(x)
    
def build_transformer(src_vocab_size: int, tgt_vocab_size: int, src_seq_len: int, tgt_seq_len: int, d_model: int=512, N: int=6, h: int=8, dropout: float=0.1, d_ff: int=2048) -> Transformer:

    # Create the embedding layers
    src_embed = InputEmbeddings(d_model, src_vocab_size)
    tgt_embed = InputEmbeddings(d_model, tgt_vocab_size)

    # Create the positional encoding layers
    src_pos = PositionEncoding(d_model, src_seq_len, dropout)
    tgt_pos = PositionEncoding(d_model, tgt_seq_len, dropout)
    
    # Create the encoder blocks
    encoder_blocks = []
    for _ in range(N):
        encoder_self_attention_block = MultiHeadAttentionBlock(d_model, h, dropout)
        feed_forward_block = FeedForwardBlock(d_model, d_ff, dropout)
        encoder_block = EncoderBlock(encoder_self_attention_block, feed_forward_block, dropout)
        encoder_blocks.append(encoder_block)

    # Create the decoder blocks
    decoder_blocks = []
    for _ in range(N):
        decoder_self_attention_block = MultiHeadAttentionBlock(d_model, h, dropout)
        decoder_cross_attention_block = MultiHeadAttentionBlock(d_model, h, dropout)
        feed_forward_block = FeedForwardBlock(d_model, d_ff, dropout)
        decoder_block = DecoderBlock(decoder_self_attention_block, decoder_cross_attention_block, feed_forward_block, dropout)
        decoder_blocks.append(decoder_block)
    
    # Create the encoder and decoder
    encoder = Encoder(nn.ModuleList(encoder_blocks))
    decoder = Decoder(nn.ModuleList(decoder_blocks))
    
    # Create the projection layer
    projection_layer = ProjectionLayer(d_model, tgt_vocab_size)
    
    # Create the transformer
    transformer = Transformer(encoder, decoder, src_embed, tgt_embed, src_pos, tgt_pos, projection_layer)
    
    # Initialize the parameters
    for p in transformer.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)
    
    return transformer


class T5(nn.Module):
    def __init__(self, vocab_size, d_model, num_layers, num_heads, d_ff, max_seq_len, dropout=0.1):
        super().__init__()
        self.embedding = EmbeddiingInput(vocab_size, d_model)
        self.positional_encoding = PositionEncoding(d_model, max_seq_len, dropout)
        self.encoder = Encoder(self.make_encoder_layers(d_model, num_layers, num_heads, d_ff, dropout))
        self.decoder = Decoder(self.make_decoder_layers(d_model, num_layers, num_heads, d_ff, dropout))
        self.projection_layer = ProjectionLayer(d_model, vocab_size)

    def make_encoder_layers(self, d_model, num_layers, num_heads, d_ff, dropout):
        encoder_layers = []
        for _ in range(num_layers):
            encoder_self_attention_block = MultiHeadAttentionBlock(d_model, num_heads, dropout)
            feed_forward_block = FeedForwardBlock(d_model, d_ff, dropout)
            encoder_block = EncoderBlock(encoder_self_attention_block, feed_forward_block, dropout)
            encoder_layers.append(encoder_block)
        return nn.ModuleList(encoder_layers)

    def make_decoder_layers(self, d_model, num_layers, num_heads, d_ff, dropout):
        decoder_layers = []
        for _ in range(num_layers):
            decoder_self_attention_block = MultiHeadAttentionBlock(d_model, num_heads, dropout)
            decoder_cross_attention_block = MultiHeadAttentionBlock(d_model, num_heads, dropout)
            feed_forward_block = FeedForwardBlock(d_model, d_ff, dropout)
            decoder_block = DecoderBlock(decoder_self_attention_block, decoder_cross_attention_block, feed_forward_block, dropout)
            decoder_layers.append(decoder_block)
        return nn.ModuleList(decoder_layers)

    def forward(self, src, tgt, src_mask, tgt_mask):
        src_embed = self.embedding(src)
        src_embed = self.positional_encoding(src_embed)
        encoder_output = self.encoder(src_embed, src_mask)

        tgt_embed = self.embedding(tgt)
        tgt_embed = self.positional_encoding(tgt_embed)
        decoder_output = self.decoder(tgt_embed, encoder_output, src_mask, tgt_mask)

        logits = self.projection_layer(decoder_output)
        return logits

# Sử dụng mô hình T5
src_vocab_size = 10000  # Thay đổi tùy theo từ điển nguồn của bạn
tgt_vocab_size = 10000  # Thay đổi tùy theo từ điển đích của bạn
src_seq_len = 256       # Thay đổi tùy theo độ dài nguồn của bạn
tgt_seq_len = 256       # Thay đổi tùy theo độ dài đích của bạn
d_model = 512
num_layers = 6
num_heads = 8
d_ff = 2048
max_seq_len = max(src_seq_len, tgt_seq_len)
dropout = 0.1

t5_model = T5(src_vocab_size, d_model, num_layers, num_heads, d_ff, max_seq_len, dropout)

# Sử dụng t5_model cho các tác vụ xử lý ngôn ngữ tự nhiên khác nhau

     

In [4]:
class TransformerMultiTask(nn.Module):
    def __init__(self, src_vocab_size: int, tgt_vocab_size: int, d_model, nhead, num_encoder_layers, num_decoder_layers, dim_feedforward, src_seq_len, tgt_seq_len, num_classes, dropout=0.1):
        super(TransformerMultiTask, self).__init__()

        # Create separate embeddings for source and target
        self.src_embedding = InputEmbeddings(d_model, src_vocab_size)
        self.tgt_embedding = InputEmbeddings(d_model, tgt_vocab_size)

        # Create separate positional encoding layers for source and target
        self.src_positional_encoding = PositionEncoding(d_model, src_seq_len, dropout=dropout)
        self.tgt_positional_encoding = PositionEncoding(d_model, tgt_seq_len, dropout=dropout)

        self.encoder = Encoder([EncoderBlock(
            MultiHeadAttentionBlock(d_model, nhead, dropout=dropout),
            FeedForwardBlock(d_model, dim_feedforward, dropout=dropout),
            dropout=dropout
        ) for _ in range(num_encoder_layers)])

        self.decoder = Decoder([DecoderBlock(
            MultiHeadAttentionBlock(d_model, nhead, dropout=dropout),
            MultiHeadAttentionBlock(d_model, nhead, dropout=dropout),
            FeedForwardBlock(d_model, dim_feedforward, dropout=dropout),
            dropout=dropout
        ) for _ in range(num_decoder_layers)])

        # Classification layer for text classification task
        self.classification_projection = nn.Linear(d_model, num_classes)

        # Translation projection layer for language translation task
        self.translation_projection = ProjectionLayer(d_model, tgt_vocab_size)

        # QA projection layer for question answering task
        self.qa_projection = nn.Linear(d_model, 2)  # Predict start and end positions

        # Text generation projection layer
        self.text_generation_projection = ProjectionLayer(d_model, tgt_vocab_size)

    def forward(self, src_input_ids, tgt_input_ids, src_mask, tgt_mask, task):
        src_embed = self.src_embedding(src_input_ids)
        src_embed = src_embed + self.src_positional_encoding(src_embed)

        tgt_embed = self.tgt_embedding(tgt_input_ids)
        tgt_embed = tgt_embed + self.tgt_positional_encoding(tgt_embed)

        encoder_output = self.encoder(src_embed, src_mask)

        if task == 'classification':
            logits = self.classification_projection(encoder_output.mean(dim=1))
            return logits
        elif task == 'translation':
            decoder_output = self.decoder(tgt_embed, encoder_output, src_mask, tgt_mask)
            logits = self.translation_projection(decoder_output)
            return logits
        elif task == 'qa':
            decoder_output = self.decoder(tgt_embed, encoder_output, src_mask, tgt_mask)
            logits = self.qa_projection(decoder_output)
            return logits
        elif task == 'text_generation':
            decoder_output = self.decoder(tgt_embed, encoder_output, src_mask, tgt_mask)
            logits = self.text_generation_projection(decoder_output)
            return logits
        else:
            raise ValueError("Task not supported.")

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

# Xác định các siêu tham số huấn luyện
src_vocab_size = 10000  # Kích thước từ vựng nguồn
tgt_vocab_size = 10000  # Kích thước từ vựng đích
d_model = 512  # Kích thước của vector nhúng
nhead = 8  # Số lượng head trong multi-head attention
num_encoder_layers = 6  # Số lớp mã hóa
num_decoder_layers = 6  # Số lớp giải mã
dim_feedforward = 2048  # Kích thước lớp feedforward ẩn
src_seq_len = 50  # Độ dài tối đa của chuỗi đầu vào
tgt_seq_len = 60  # Độ dài tối đa của chuỗi đầu ra
num_classes = 10  # Số lớp trong tác vụ phân loại
dropout = 0.1  # Tỷ lệ dropout

# Khởi tạo mô hình
model = TransformerMultiTask(src_vocab_size, tgt_vocab_size, d_model, nhead, num_encoder_layers, num_decoder_layers, dim_feedforward, src_seq_len, tgt_seq_len, num_classes, dropout=dropout)

# Xác định hàm mất mát và tối ưu hóa
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Dữ liệu mẫu (bạn cần thay thế bằng dữ liệu thực tế)
train_src_input_ids = torch.randint(0, src_vocab_size, (100, src_seq_len))  # Mã hóa nguồn
train_tgt_input_ids = torch.randint(0, tgt_vocab_size, (100, tgt_seq_len))  # Mã hóa đích
train_src_mask = torch.ones(100, src_seq_len).bool()  # Mặt nạ nguồn
train_tgt_mask = torch.ones(100, tgt_seq_len).bool()  # Mặt nạ đích
train_labels = torch.randint(0, num_classes, (100,))  # Nhãn phân loại

# Huấn luyện mô hình
num_epochs = 10
for epoch in range(num_epochs):
    model.train()  # Chuyển sang chế độ huấn luyện
    optimizer.zero_grad()


In [None]:
class QADecoderHead(nn.Module):
    def __init__(self, d_model):
        super().__init__()
        self.d_model = d_model
        self.start_token_projection = nn.Linear(d_model, 1)  # Dự đoán vị trí bắt đầu
        self.end_token_projection = nn.Linear(d_model, 1)    # Dự đoán vị trí kết thúc
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x):
        # Đầu ra cho câu hỏi và trả lời
        start_logits = self.start_token_projection(x).squeeze(-1)  # Loại bỏ kích thước cuối cùng
        end_logits = self.end_token_projection(x).squeeze(-1)      # Loại bỏ kích thước cuối cùng
        
        # Áp dụng softmax để tính xác suất vị trí bắt đầu và kết thúc
        start_probs = self.softmax(start_logits)
        end_probs = self.softmax(end_logits)
        
        return start_probs, end_probs
class TextGenerationHead(nn.Module):
    def __init__(self, d_model, max_output_length, vocab_size):
        super().__init__()
        self.d_model = d_model
        self.max_output_length = max_output_length
        self.vocab_size = vocab_size
        self.decoder_layer = nn.Linear(d_model, vocab_size)

    def forward(self, x):
        # Đầu ra cho tạo văn bản
        logits = self.decoder_layer(x)
        return logits
class Transformer(nn.Module):

    def __init__(self, encoder: Encoder, decoder: Decoder, src_embed: InputEmbeddings, tgt_embed: InputEmbeddings, src_pos:  PositionEncoding, tgt_pos:  PositionEncoding, projection_layer: ProjectionLayer) -> None:
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.src_embed = src_embed
        self.tgt_embed = tgt_embed
        self.src_pos = src_pos
        self.tgt_pos = tgt_pos
        self.projection_layer = projection_layer

    def encode(self, src, src_mask):
        # (batch, seq_len, d_model)
        src = self.src_embed(src)
        src = self.src_pos(src)
        return self.encoder(src, src_mask)
    
    def decode(self, encoder_output: torch.Tensor, src_mask: torch.Tensor, tgt: torch.Tensor, tgt_mask: torch.Tensor):
        # (batch, seq_len, d_model)
        tgt = self.tgt_embed(tgt)
        tgt = self.tgt_pos(tgt)
        return self.decoder(tgt, encoder_output, src_mask, tgt_mask)
    
    def project(self, x):
        # (batch, seq_len, vocab_size)
        return self.projection_layer(x)
class TransformerTasks(nn.Module):
    
    def __init__(self, task, src_vocab_size: int, tgt_vocab_size: int,src_seq_len: int, tgt_seq_len: int,d_model: int=512, N: int=6, h: int=8, dropout: float=0.1,d_ff: int=2048, num_classes= None)-> Transformer:
        super().__init__()
        self.task = task
        self.d_model = d_model
        
        # Input Embedding
        self.src_embed = InputEmbeddings(d_model, src_vocab_size)
        self.tgt_embed = InputEmbeddings(d_model, tgt_vocab_size)
        
        # Positional Encoding
        self.src_pos = PositionEncoding(d_model, src_seq_len, dropout)
        self.tgt_pos = PositionEncoding(d_model, tgt_seq_len, dropout)
        
        # Encoder and Decoder layers (you can customize these as needed)
        self.encoder = Encoder([EncoderBlock(MultiHeadAttentionBlock(d_model, h, dropout),
                                             FeedForwardBlock(d_model, d_ff, dropout),dropout) for _ in range(N)])
        
        self.decoder = Decoder([DecoderBlock(MultiHeadAttentionBlock(d_model, h, dropout),
                                             MultiHeadAttentionBlock(d_model, h, dropout),
                                             FeedForwardBlock(d_model, d_ff, dropout),
                                             dropout) for _ in range(N)])
         # Create the encoder and decoder
        self.encoder = Encoder(nn.ModuleList(self.encoder))
        self.decoder = Decoder(nn.ModuleList(self.decoder))
        

        projection_layer = ProjectionLayer(d_model, tgt_vocab_size)
        if task == "classification":
            # Nhiệm vụ phân loại văn bản
            self.classification_layer = nn.Linear(d_model, num_classes)
        elif task == "translation":
            # Nhiệm vụ dịch tiếng
            self.projection_layer = ProjectionLayer(d_model, src_vocab_size)
        elif task == "qa":
            # Nhiệm vụ câu hỏi và trả lời
            self.qa_decoder = QADecoderHead(d_model)
        elif task == "summarization":
            # Nhiệm vụ tóm tắt văn bản
            self.projection_layer = ProjectionLayer(d_model, src_vocab_size)
        elif task == "generation":
            # Nhiệm vụ tạo văn bản
            self.text_generation_head = TextGenerationHead(d_model, max_output_length,src_vocab_size)
        else:
            raise ValueError("Invalid task specified")

    def forward(self, src, tgt, src_mask, tgt_mask):
        src_embedded = self.src_embed(src) * math.sqrt(self.d_model)
        src_encoded = self.encoder(self.src_pos(src_embedded), src_mask)
        
        tgt_embedded = self.tgt_embed(tgt) * math.sqrt(self.d_model)
        tgt_pos_encoded = self.tgt_pos(tgt_embedded)
        
        if self.task == "classification":
            # Nhiệm vụ phân loại văn bản
            cls_representation = src_encoded[:, 0, :]
            logits = self.classification_layer(cls_representation)
            return logits
        elif self.task == "translation":
            # Nhiệm vụ dịch tiếng
            translation_output = self.decoder(tgt_pos_encoded, src_encoded, src_mask, tgt_mask)
            translation_logits = self.projection_layer(translation_output)
            return translation_logits
        elif self.task == "qa":
            # Nhiệm vụ câu hỏi và trả lời
            qa_output = self.qa_decoder(tgt_pos_encoded)
            return qa_output
        elif self.task == "summarization":
            # Nhiệm vụ tóm tắt văn bản
            summarization_output = self.decoder(tgt_pos_encoded, src_encoded, src_mask, tgt_mask)
            summarization_logits = self.projection_layer(summarization_output)
            return summarization_logits
        elif self.task == "generation":
            # Nhiệm vụ tạo văn bản
            generation_output = self.decoder(tgt_pos_encoded, src_encoded, src_mask, tgt_mask)
            generation_logits = self.text_generation_head(generation_output)
            return generation_logits
        else:
            raise ValueError("Invalid task specified")

In [5]:
import torch

# Tạo một vector có kích thước (5,)
vector = torch.tensor([[1, 2, 3, 4, 5],[1, 2, 3, 4, 5]])
print(vector) 
# Sử dụng unsqueeze(1) để chuyển thành ma trận cột
matrix_column = vector.unsqueeze(1)

# In kích thước của ma trận cột
print(matrix_column)  # Output: torch.Size([5, 1])

tensor([[1, 2, 3, 4, 5],
        [1, 2, 3, 4, 5]])
tensor([[[1, 2, 3, 4, 5]],

        [[1, 2, 3, 4, 5]]])


In [3]:
from gensim.models import Word2Vec

# Tạo một danh sách các câu văn bản từ dữ liệu
sentences = [['this', 'is', 'an', 'example', 'sentence'],
            ['another', 'example', 'sentence'],
            ['yet', 'another', 'sentence']]

# Huấn luyện mô hình Word2Vec
model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, sg=0)

# Lưu mô hình để sử dụng sau này
model.save("word2vec_model")

In [4]:
# Tìm vectơ biểu diễn của từ "example"
vector = model.wv["example"]
print(vector)

[ 9.4563962e-05  3.0773198e-03 -6.8126451e-03 -1.3754654e-03
  7.6685809e-03  7.3464094e-03 -3.6732971e-03  2.6427018e-03
 -8.3171297e-03  6.2054861e-03 -4.6373224e-03 -3.1641065e-03
  9.3113566e-03  8.7338570e-04  7.4907029e-03 -6.0740625e-03
  5.1605068e-03  9.9228229e-03 -8.4573915e-03 -5.1356913e-03
 -7.0648370e-03 -4.8626517e-03 -3.7785638e-03 -8.5361991e-03
  7.9556061e-03 -4.8439382e-03  8.4236134e-03  5.2625705e-03
 -6.5500261e-03  3.9578713e-03  5.4701497e-03 -7.4265362e-03
 -7.4057197e-03 -2.4752307e-03 -8.6257253e-03 -1.5815723e-03
 -4.0343284e-04  3.2996845e-03  1.4418805e-03 -8.8142155e-04
 -5.5940580e-03  1.7303658e-03 -8.9737179e-04  6.7936908e-03
  3.9735902e-03  4.5294715e-03  1.4343059e-03 -2.6998555e-03
 -4.3668128e-03 -1.0320747e-03  1.4370275e-03 -2.6460087e-03
 -7.0737829e-03 -7.8053069e-03 -9.1217868e-03 -5.9351693e-03
 -1.8474245e-03 -4.3238713e-03 -6.4606704e-03 -3.7173224e-03
  4.2891586e-03 -3.7390434e-03  8.3781751e-03  1.5339935e-03
 -7.2423196e-03  9.43379

In [None]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, T5ForSequenceClassification, AdamW
from torch.utils.data import DataLoader, Dataset

# Khởi tạo mô hình và tokenizer T5
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")

# Định nghĩa dữ liệu huấn luyện dạng Dataset cho cả hai tác vụ
class MultiTaskDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128, is_classification=False):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.is_classification = is_classification

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        if self.is_classification:
            return {
                "input_ids": encoding.input_ids.squeeze(),
                "attention_mask": encoding.attention_mask.squeeze(),
                "label": label
            }
        else:
            return {
                "input_ids": encoding.input_ids.squeeze(),
                "attention_mask": encoding.attention_mask.squeeze(),
                "decoder_input_ids": encoding.input_ids.squeeze(),
                "decoder_attention_mask": encoding.attention_mask.squeeze(),
            }

# Dữ liệu cho tác vụ "Tạo văn bản"
texts_generation = ["This is a positive example.", "This is a negative example.", "This is another positive example."]
labels_generation = ["This is a positive generated text.", "This is a negative generated text.", "This is another positive generated text."]

# Dữ liệu cho tác vụ "Phân loại văn bản"
texts_classification = ["This is a positive text.", "This is a negative text.", "This is another positive text."]
labels_classification = [1, 0, 1]  # 1 cho tích cực, 0 cho tiêu cực

# Tạo datasets
generation_dataset = MultiTaskDataset(texts_generation, labels_generation, tokenizer, is_classification=False)
classification_dataset = MultiTaskDataset(texts_classification, labels_classification, tokenizer, is_classification=True)

# DataLoader để tải dữ liệu và xử lý batch cho cả hai tác vụ
generation_dataloader = DataLoader(generation_dataset, batch_size=2, shuffle=True)
classification_dataloader = DataLoader(classification_dataset, batch_size=2, shuffle=True)

# Khởi tạo mô hình T5 cho tác vụ "Tạo văn bản"
generation_model = T5ForConditionalGeneration.from_pretrained("t5-small")

# Khởi tạo mô hình T5 cho tác vụ "Phân loại văn bản"
classification_model = T5ForSequenceClassification.from_pretrained("t5-small")

# Huấn luyện cả hai tác vụ
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

generation_model.to(device)
classification_model.to(device)

optimizer_generation = AdamW(generation_model.parameters(), lr=1e-4)
optimizer_classification = AdamW(classification_model.parameters(), lr=1e-4)

def train_generation(model, dataloader, optimizer, device):
    model.train()
    total_loss = 0.0

    for batch in dataloader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        decoder_input_ids = batch["decoder_input_ids"].to(device)
        decoder_attention_mask = batch["decoder_attention_mask"].to(device)

        optimizer.zero_grad()
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            labels=decoder_input_ids
        )

        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    return total_loss / len(dataloader)

def train_classification(model, dataloader, optimizer, device):
    model.train()
    total_loss = 0.0

    for batch in dataloader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        optimizer.zero_grad()
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    return total_loss / len(dataloader)

# Huấn luyện mô hình cho tác vụ "Tạo văn bản"
for epoch in range(5):
    loss = train_generation(generation_model, generation_dataloader, optimizer_generation, device)
    print(f"Epoch {epoch + 1} - Text Generation Loss: {loss:.4f}")

# Huấn luyện mô hình cho tác vụ "Phân loại văn bản"
for epoch in range(5):
    loss = train_classification(classification_model, classification_dataloader, optimizer_classification, device)
    print(f"Epoch {epoch + 1} - Text Classification Loss: {loss:.4f}")

# Lưu mô hình đã huấn luyện cho cả hai tác vụ
generation_model.save_pretrained("text_generation_model")
classification_model.save_pretrained("text_classification_model")
