In [2]:
import numpy as np
def load_embedding(emb_path, emb_dim = 300):
    """Load word vectors from a whitespace-separated text file.

    A usable line has the form ``word v1 v2 ... vN`` with ``N == emb_dim``.
    Any line whose token count is not ``emb_dim + 1`` is skipped. That covers
    blank lines, entries of a different width, words that themselves contain
    whitespace, and (whenever ``emb_dim > 1``) a two-token ``count dim``
    header line.

    Args:
        emb_path: Path to the UTF-8 encoded embedding file.
        emb_dim: Number of components a vector must have to be kept.

    Returns:
        dict mapping each word to a 1-D ``np.float32`` array of length
        ``emb_dim``. If a word occurs more than once, the last line wins.

    Raises:
        ValueError: If a line with the expected token count has a component
            that cannot be parsed as a float.
    """
    embeddings = {}
    expected_tokens = emb_dim + 1
    with open(emb_path, "r", encoding='utf-8') as f:
        for line in f:
            values = line.split()
            # Check the width *before* touching values[0] or converting:
            # a blank line has no values[0], and a malformed line would make
            # np.asarray raise on a non-numeric token.
            if len(values) != expected_tokens:
                continue
            embeddings[values[0]] = np.asarray(values[1:], dtype=np.float32)
    return embeddings
# One pretrained 300-dimensional vector table per language.
vi_emb=load_embedding("/kaggle/input/embedding/cc.vi.300.vec", emb_dim=300)
# Fixed copy-paste bug: the English table used to be read from the Vietnamese
# file (cc.vi.300.vec), so English lookups hit Vietnamese vectors.
# NOTE(review): assumes the English file sits next to the Vietnamese one as
# cc.en.300.vec -- confirm the file name in the attached dataset.
en_emb=load_embedding("/kaggle/input/embedding/cc.en.300.vec", emb_dim=300)

# Spot-check one common word per language.
print(vi_emb.get("xin", "Not found"))
print(en_emb.get("hello", "Not found"))


[-0.0773  0.0038  0.2197  0.0483 -0.0276 -0.0284  0.0559 -0.0435  0.0486
  0.0483 -0.1832 -0.0886  0.037   0.0301 -0.2261  0.2115  0.0942  0.1687
  0.0144 -0.078  -0.0376 -0.0617  0.041  -0.0558  0.1403 -0.1129 -0.0191
 -0.1314  0.0698  0.1036 -0.1541 -0.1535  0.0891 -0.0012 -0.0125  0.1032
 -0.0274 -0.0043  0.0149 -0.1026  0.024   0.0583  0.0162  0.0269 -0.0778
 -0.0694 -0.0035 -0.0453 -0.0755 -0.0589  0.1125  0.1777 -0.2076  0.054
 -0.0712 -0.2649 -0.0561 -0.0493 -0.09    0.0517  0.1251 -0.0806 -0.083
 -0.2259  0.0186 -0.0627 -0.0883 -0.0049  0.1506 -0.0081  0.0632 -0.0183
 -0.0592 -0.204   0.0016 -0.044  -0.1258  0.0098  0.0485 -0.0206 -0.25
  0.0724 -0.0894  0.0792 -0.2011  0.0731 -0.0675  0.0333  0.1101  0.1286
  0.0445 -0.2718 -0.1094  0.0372  0.0687 -0.0385  0.0642 -0.0129 -0.0773
  0.0529 -0.0212 -0.0278  0.0197 -0.1464  0.0162 -0.2767  0.0146  0.1724
 -0.0102 -0.036  -0.1316 -0.0276  0.0529 -0.0365 -0.0197 -0.0526 -0.0369
 -0.069   0.0169 -0.1916  0.0019  0.0583  0.0873  0.094

In [None]:
def load_dataset(vi_path, en_path):
    """Read a pair of text files into whitespace-tokenised sentences.

    Each line of a file becomes one sentence: leading and trailing whitespace
    is stripped and the remainder is split on whitespace. A blank line
    therefore yields an empty token list.

    Args:
        vi_path: Path to the UTF-8 Vietnamese file.
        en_path: Path to the UTF-8 English file.

    Returns:
        ``(vi_sentences, en_sentences)``, each a list of token lists in file
        order. The two lists are not checked for equal length.
    """
    def read_tokens(path):
        # One token list per line, in the order the lines appear.
        with open(path, "r", encoding='utf-8') as handle:
            return [raw.strip().split() for raw in handle]

    return read_tokens(vi_path), read_tokens(en_path)
# Load the train / dev / test splits; every call returns
# (Vietnamese sentences, English sentences).
vi_train, en_train = load_dataset(
    vi_path="/kaggle/input/phomtdataset/PhoMT/tokenization/train/train.vi",
    en_path="/kaggle/input/phomtdataset/PhoMT/tokenization/train/train.en",
)
vi_val, en_val = load_dataset(
    vi_path="/kaggle/input/phomtdataset/PhoMT/tokenization/dev/dev.vi",
    en_path="/kaggle/input/phomtdataset/PhoMT/tokenization/dev/dev.en",
)
vi_test, en_test = load_dataset(
    vi_path="/kaggle/input/phomtdataset/PhoMT/tokenization/test/test.vi",
    en_path="/kaggle/input/phomtdataset/PhoMT/tokenization/test/test.en",
)

# Show the first two training sentences of each language.
print(vi_train[:2])
print(en_train[:2])

In [None]:
from collections import Counter
def build_vocab(sentences, min_freq=3, specials=("<oes>", "<pad>", "<sos>", "<unk>")):
    """Build a word -> index mapping from tokenised sentences.

    Special tokens take the first indices in the order given. Every other
    word that occurs at least ``min_freq`` times is then appended in order of
    first appearance.

    A corpus word that equals a special token keeps the special's index. It
    used to overwrite that index, and the next word then reused the same
    number, so two words shared one id.

    Args:
        sentences: Iterable of token sequences.
        min_freq: Minimum corpus count for a word to enter the vocabulary.
        specials: Tokens that are always present and numbered first.
            NOTE(review): "<oes>" looks like a typo for "<eos>"; it is left
            unchanged because it is a runtime dictionary key.

    Returns:
        dict mapping each token to a unique, contiguous integer index.
    """
    word_counter = Counter(word for sent in sentences for word in sent)

    vocab = {}
    for token in specials:
        # setdefault keeps indices contiguous even if a special is repeated.
        vocab.setdefault(token, len(vocab))

    for word, count in word_counter.items():
        if count >= min_freq and word not in vocab:
            vocab[word] = len(vocab)
    return vocab
# Vocabularies come from the training split; words seen fewer than 3 times are left out.
vi_vocab = build_vocab(sentences=vi_train, min_freq=3)
en_vocab = build_vocab(sentences=en_train, min_freq=3)

def sentence_to_indices(sentence, vocab):
    """Map each token of ``sentence`` to its index in ``vocab``.

    Tokens missing from ``vocab`` are mapped to the index stored under
    ``"<unk>"``.

    Args:
        sentence: Iterable of tokens.
        vocab: Mapping from token to index; must contain ``"<unk>"`` when
            ``sentence`` is non-empty.

    Returns:
        List of indices, one per token, in the original order.
    """
    indices = []
    for token in sentence:
        indices.append(vocab.get(token, vocab["<unk>"]))
    return indices

# Turn every training sentence into a list of vocabulary indices.
vi_data_indices = [sentence_to_indices(tokens, vocab=vi_vocab) for tokens in vi_train]
en_data_indices = [sentence_to_indices(tokens, vocab=en_vocab) for tokens in en_train]

# Show the first two encoded sentences of each language.
print(vi_data_indices[:2])
print(en_data_indices[:2])

In [None]:
import torch
from torch.utils.data import DataLoader, Dataset
import torch.nn.utils.rnn as rnn_utils

class TranslationDataset(Dataset):
    """Parallel dataset over index-encoded sentence pairs.

    Item ``i`` is the pair ``(vi_data[i], en_data[i])`` converted to 1-D
    ``torch.long`` tensors.

    NOTE: a second class with this same name is defined in a later cell and
    replaces this one once that cell has run.

    Args:
        vi_data: Sequence of Vietnamese sentences, each a list of int indices.
        en_data: Sequence of English sentences, each a list of int indices.
    """

    def __init__(self, vi_data, en_data):
        self.vi_data = vi_data
        self.en_data = en_data

    def __len__(self):
        """Return the number of Vietnamese sentences (``en_data`` is not checked)."""
        return len(self.vi_data)

    def __getitem__(self, idx):
        """Return ``(vi_tensor, en_tensor)`` for position ``idx``."""
        # dtype is pinned: torch.tensor([]) would otherwise infer a float
        # dtype for an empty sentence, which is not a valid index dtype.
        return (
            torch.tensor(self.vi_data[idx], dtype=torch.long),
            torch.tensor(self.en_data[idx], dtype=torch.long),
        )

def collate_fn(batch):
    """Pad a list of ``(vi, en)`` tensor pairs into two batch-first tensors.

    Vietnamese sequences are padded with ``vi_vocab["<pad>"]`` and English
    sequences with ``en_vocab["<pad>"]``, both read from module-level
    vocabularies.

    Args:
        batch: Non-empty list of ``(vi_tensor, en_tensor)`` pairs.

    Returns:
        ``(vi_batch, en_batch)``, each padded to the longest sequence of its
        own language within the batch.
    """
    vi_seqs, en_seqs = zip(*batch)
    padded_vi = rnn_utils.pad_sequence(
        vi_seqs, batch_first=True, padding_value=vi_vocab["<pad>"]
    )
    padded_en = rnn_utils.pad_sequence(
        en_seqs, batch_first=True, padding_value=en_vocab["<pad>"]
    )
    return padded_vi, padded_en

train_dataset = TranslationDataset(vi_data=vi_data_indices, en_data=en_data_indices)
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=32,
    shuffle=True,
    collate_fn=collate_fn,
)

# Pull a single batch and show the padded shapes.
vi_batch, en_batch = next(iter(train_loader))
print(vi_batch.shape, en_batch.shape)


In [None]:
def emb_matrix(vocab, embeddings_dict, embedding_dim=300):
    """Build an embedding weight matrix aligned with ``vocab``.

    Rows start as standard-normal noise. Any word found in
    ``embeddings_dict`` has its row replaced by the pretrained vector. The
    ``"<pad>"`` row is always all zeros.

    Args:
        vocab: Mapping from token to row index; must contain ``"<pad>"``.
        embeddings_dict: Mapping from token to a vector of length
            ``embedding_dim``.
        embedding_dim: Width of each embedding row.

    Returns:
        ``torch.float32`` tensor of shape ``(len(vocab), embedding_dim)``.

    Raises:
        KeyError: If ``vocab`` has no ``"<pad>"`` entry.
    """
    pad_idx = vocab["<pad>"]  # looked up first so a missing key fails early
    matrix = torch.randn(len(vocab), embedding_dim)
    for word, idx in vocab.items():
        if word in embeddings_dict:
            matrix[idx] = torch.tensor(embeddings_dict[word], dtype=torch.float32)
    # Zero the padding row last: doing it before the loop let a pretrained
    # "<pad>" entry overwrite the zeros.
    matrix[pad_idx] = torch.zeros(embedding_dim)
    return matrix
# One weight matrix per language, with rows aligned to that language's vocabulary.
vi_embedding_matrix = emb_matrix(
    vocab=vi_vocab, embeddings_dict=vi_emb, embedding_dim=300
)
en_embedding_matrix = emb_matrix(
    vocab=en_vocab, embeddings_dict=en_emb, embedding_dim=300
)

print(vi_embedding_matrix.shape)
print(en_embedding_matrix.shape)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm  

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
class TranslationDataset(Dataset):
    """Parallel dataset that yields fixed-length matrices of word vectors.

    Each sentence becomes a ``(max_len, emb_dim)`` float32 tensor: one row
    per token, zero rows for out-of-vocabulary tokens and for padding, and
    tokens beyond ``max_len`` dropped.

    NOTE: this class reuses the name of the index-based dataset defined in an
    earlier cell and replaces it once this cell has run.

    Args:
        vi_sentences: Sequence of Vietnamese sentences (lists of tokens).
        en_sentences: Sequence of English sentences (lists of tokens).
        vi_emb: Mapping from Vietnamese token to vector.
        en_emb: Mapping from English token to vector.
        pad_idx: Stored on the instance; not used by this class.
        max_len: Number of rows in every returned tensor.
        emb_dim: Width of every word vector (previously hard-coded to 300).
    """

    def __init__(self, vi_sentences, en_sentences, vi_emb, en_emb, pad_idx, max_len=50, emb_dim=300):
        self.vi_sentences = vi_sentences
        self.en_sentences = en_sentences
        self.vi_emb = vi_emb
        self.en_emb = en_emb
        self.pad_idx = pad_idx
        self.max_len = max_len
        self.emb_dim = emb_dim

    def __len__(self):
        """Return the number of Vietnamese sentences."""
        return len(self.vi_sentences)

    def sentence_to_tensor(self, sentence, emb):
        """Embed ``sentence`` into a ``(max_len, emb_dim)`` float32 tensor.

        Args:
            sentence: List of tokens; may be empty.
            emb: Mapping from token to vector of length ``emb_dim``.

        Returns:
            Tensor whose row ``i`` is the vector of token ``i``, or zeros if
            the token is unknown or position ``i`` is past the sentence end.
        """
        # Start from zeros so padding and unknown words need no special
        # casing. This also handles an empty sentence, which used to crash in
        # torch.stack([]).
        tensor = torch.zeros(self.max_len, self.emb_dim, dtype=torch.float32)
        # Truncate before the lookup so over-long sentences are not embedded
        # only to be thrown away.
        for pos, word in enumerate(sentence[:self.max_len]):
            if word in emb:
                tensor[pos] = torch.tensor(emb[word], dtype=torch.float32)
        return tensor

    def __getitem__(self, idx):
        """Return ``(vi_tensor, en_tensor)`` for the sentence pair at ``idx``."""
        vi_tensor = self.sentence_to_tensor(self.vi_sentences[idx], self.vi_emb)
        en_tensor = self.sentence_to_tensor(self.en_sentences[idx], self.en_emb)
        return vi_tensor, en_tensor

class TransformerModel(nn.Module):
    """Encoder-decoder Transformer over pre-embedded sequences.

    Inputs and outputs are continuous vectors of width ``emb_dim``. The model
    has no embedding lookup, positional encoding, or vocabulary projection.

    Args:
        emb_dim: Width of the input and output vectors (``d_model``).
        num_heads: Attention heads per layer.
        num_layers: Number of encoder layers and of decoder layers.
        hidden_dim: Width of each layer's feed-forward sub-block.
    """

    def __init__(self, emb_dim=300, num_heads=6, num_layers=6, hidden_dim=512):
        super().__init__()
        # NOTE(review): nn.TransformerEncoder / nn.TransformerDecoder appear
        # to operate on their own copies of the layer passed in, so these two
        # attributes likely hold extra, unused parameters. They are kept
        # because removing them would change the state_dict keys -- confirm
        # before cleaning up.
        self.encoder_layer = nn.TransformerEncoderLayer(
            d_model=emb_dim, nhead=num_heads, dim_feedforward=hidden_dim, batch_first=True
        )
        self.encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=num_layers)
        self.decoder_layer = nn.TransformerDecoderLayer(
            d_model=emb_dim, nhead=num_heads, dim_feedforward=hidden_dim, batch_first=True
        )
        self.decoder = nn.TransformerDecoder(self.decoder_layer, num_layers=num_layers)
        self.output_layer = nn.Linear(emb_dim, emb_dim)

    def forward(self, src, tgt, tgt_mask=None):
        """Encode ``src`` and decode ``tgt`` against it.

        Args:
            src: Source tensor with ``emb_dim`` as the last dimension and the
                sequence dimension second to last.
            tgt: Target tensor laid out like ``src``.
            tgt_mask: Optional decoder self-attention mask. When omitted, a
                causal mask is built so position ``t`` attends only to
                positions ``<= t``. The decoder used to run unmasked, letting
                every position see future target positions.

        Returns:
            Tensor shaped like ``tgt``.
        """
        if tgt_mask is None:
            tgt_len = tgt.size(-2)
            # Boolean mask: True marks (query, key) pairs that must NOT attend,
            # i.e. everything strictly above the diagonal.
            tgt_mask = torch.ones(tgt_len, tgt_len, device=tgt.device).triu(1).bool()
        memory = self.encoder(src)
        output = self.decoder(tgt, memory, tgt_mask=tgt_mask)
        return self.output_layer(output)

def train_model(model, dataloader, optimizer, criterion, device, num_epochs=10):
    """Train ``model`` for ``num_epochs`` passes over ``dataloader``.

    Each batch is moved to ``device``, run through ``model(vi, en)``, scored
    with ``criterion(output, en)``, and used for one optimizer step. A tqdm
    bar shows the running mean loss, and the epoch mean is printed when the
    epoch ends.

    NOTE(review): the decoder input and the loss target are the same tensor
    (``en_tensor``), so the model is asked to reproduce what it is given.
    Consider shifting the target by one step.

    Args:
        model: Module called as ``model(src, tgt)``.
        dataloader: Iterable of ``(vi_tensor, en_tensor)`` batches.
        optimizer: Optimizer over ``model``'s parameters.
        criterion: Loss callable taking ``(output, target)``.
        device: Device the batches are moved to.
        num_epochs: Number of passes over ``dataloader``.

    Returns:
        None.
    """
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0.0
        num_batches = 0
        loop = tqdm(dataloader, desc=f"Epoch {epoch+1}/{num_epochs}", leave=True)
        for vi_tensor, en_tensor in loop:
            vi_tensor = vi_tensor.to(device, non_blocking=True)
            en_tensor = en_tensor.to(device, non_blocking=True)
            optimizer.zero_grad()
            output = model(vi_tensor, en_tensor)
            loss = criterion(output, en_tensor)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            num_batches += 1
            # Average over the batches seen so far. Dividing by
            # len(dataloader) made the displayed loss start near zero and
            # creep up over the epoch.
            loop.set_postfix(loss=total_loss / num_batches)
        # An empty dataloader reports NaN instead of dividing by zero.
        epoch_loss = total_loss / num_batches if num_batches else float("nan")
        print(f"Epoch {epoch+1}, Loss: {epoch_loss:.4f}")
# Wire up the embedding-based dataset, the model and the optimiser, then train.
train_dataset = TranslationDataset(
    vi_sentences=vi_train,
    en_sentences=en_train,
    vi_emb=vi_emb,
    en_emb=en_emb,
    pad_idx=0,
)
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=16,
    shuffle=True,
    num_workers=2,
    pin_memory=True,
)
model = TransformerModel().to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)
train_model(
    model=model,
    dataloader=train_loader,
    optimizer=optimizer,
    criterion=criterion,
    device=device,
)
