In [1]:
!cp -r "/kaggle/input/vi-tone-no-tone/data" /kaggle/working/

In [2]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import matplotlib.pyplot as plt
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import time
import random
import re
from torch import amp

In [None]:
# Tạo thư mục lưu kết quả
os.makedirs("/kaggle/working/results", exist_ok=True)

# Định nghĩa lớp Dataset
class TranslationDatasetFull(Dataset):
    def __init__(self, in_file, out_file, in_vocab, out_vocab, max_len=50):
        self.in_sentences = self._load_sentences(in_file)
        self.out_sentences = self._load_sentences(out_file)
        self.in_vocab = self._load_vocab(in_vocab) if isinstance(in_vocab, str) else in_vocab
        self.out_vocab = self._load_vocab(out_vocab) if isinstance(out_vocab, str) else out_vocab
        self.max_len = max_len

    def _load_sentences(self, file_path):
        df = pd.read_csv(file_path, encoding='utf-8', header=None, names=['ID', 'Sentence'])
        sentences = df['Sentence'].tolist()
        return [str(s).strip() for s in sentences if str(s).strip()]

    def _load_vocab(self, vocab_path):
        vocab = {}
        with open(vocab_path, 'r', encoding='utf-8') as f:
            words = [line.strip() for line in f if line.strip()]
        for idx, word in enumerate(words):
            vocab[word] = idx
        required_tokens = ['<unk>', '<pad>', '<sos>', '<eos>']
        max_idx = max(vocab.values()) if vocab else -1
        for token in required_tokens:
            if token not in vocab:
                max_idx += 1
                vocab[token] = max_idx
        return vocab

    def _encode_sentence(self, sentence, vocab, max_len):
        tokens = sentence.strip().split()
        token_ids = [vocab.get(token, vocab['<unk>']) for token in tokens]
        token_ids = token_ids[:max_len] + [vocab['<pad>']] * (max_len - len(token_ids))
        return token_ids

    def _encode_decoder_sentence(self, sentence, vocab, max_len):
        tokens = sentence.strip().split()
        full_tokens = [vocab['<sos>']] + [vocab.get(token, vocab['<unk>']) for token in tokens] + [vocab['<eos>']]
        if len(full_tokens) < max_len:
            full_tokens += [vocab['<pad>']] * (max_len - len(full_tokens))
        else:
            full_tokens = full_tokens[:max_len]
        return full_tokens

    def __len__(self):
        return len(self.in_sentences)

    def __getitem__(self, idx):
        in_sentence = self.in_sentences[idx]
        out_sentence = self.out_sentences[idx]
        src = self._encode_sentence(in_sentence, self.in_vocab, self.max_len)
        tgt = self._encode_decoder_sentence(out_sentence, self.out_vocab, self.max_len)
        return torch.tensor(src, dtype=torch.long), torch.tensor(tgt, dtype=torch.long)

# Tạo dataset và DataLoader
train_dataset = TranslationDatasetFull(
    "/kaggle/working/data/train/source.csv",
    "/kaggle/working/data/train/target.csv",
    "/kaggle/working/data/vocab/input_vocab.txt",
    "/kaggle/working/data/vocab/output_vocab.txt",
    max_len=50
)
val_dataset = TranslationDatasetFull(
    "/kaggle/working/data/val/source.csv",
    "/kaggle/working/data/val/target.csv",
    "/kaggle/working/data/vocab/input_vocab.txt",
    "/kaggle/working/data/vocab/output_vocab.txt",
    max_len=50
)
test_dataset = TranslationDatasetFull(
    "/kaggle/working/data/test/source.csv",
    "/kaggle/working/data/test/target.csv",
    "/kaggle/working/data/vocab/input_vocab.txt",
    "/kaggle/working/data/vocab/output_vocab.txt",
    max_len=50
)

batch_size = 128
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, pin_memory=True, num_workers=2)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, pin_memory=True, num_workers=2)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, pin_memory=True, num_workers=2)

# Debug dataset và vocab
print(f"Number of sentences in train_dataset: {len(train_dataset)}")
print(f"Number of sentences in val_dataset: {len(val_dataset)}")
print(f"Number of sentences in test_dataset: {len(test_dataset)}")
print(f"Expected train batches: {len(train_loader)}")
print("Input vocab size:", len(train_dataset.in_vocab))
print("Output vocab size:", len(train_dataset.out_vocab))
print("Sample input vocab:", list(train_dataset.in_vocab.items())[:5])
print("Sample output vocab:", list(train_dataset.out_vocab.items())[:5])
print("Value of <pad>:", train_dataset.out_vocab.get('<pad>', "Not found"))
print("Sample source:", train_dataset.in_sentences[:5])
print("Sample target:", train_dataset.out_sentences[:5])

Number of sentences in train_dataset: 4393646
Number of sentences in val_dataset: 549205
Number of sentences in test_dataset: 549207
Expected train batches: 34326
Input vocab size: 1450
Output vocab size: 5805
Sample input vocab: [('a', 0), ('ac', 1), ('ach', 2), ('ai', 3), ('am', 4)]
Sample output vocab: [('a', 0), ('a1', 1), ('a1c', 2), ('a1ch', 3), ('a1i', 4)]
Value of <pad>: 5802
Sample source: ['tenedos barronus uoc ralph vary chamberlin mieu ta nam', 'ngay giao su tran van huong uoc quoc truong phan khac suu bo nhiem lam thu tuong', 'trong noi inh cac vi quy nhan cung cung tan co ngau nhien lam sai ieu gi quach hau cung khong truy cuu con o truoc mat tao phi bao che', 'chung uoc su dung cho che tao cac cam bien tia hong ngoai hoac nhiet ien', 'uchukeiji gyaban uoc bat au tu mot tam hinh minh hoa cua murakami katsushi mot nhan vien thiet ke cua hang bandai nguoi a e lai ten tuoi minh trong lich su nganh o choi voi nhieu san pham oc ao']
Sample target: ['tenedos barronus d9u7o75c r

In [None]:
# Định nghĩa mô hình LSTM
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class EncoderLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers, dropout=0.5):
        super(EncoderLSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=train_dataset.in_vocab['<pad>'])
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True, dropout=dropout if num_layers > 1 else 0)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        embedded = self.dropout(self.embedding(src))
        outputs, (hidden, cell) = self.lstm(embedded)
        return outputs, hidden, cell

class Attention(nn.Module):
    def __init__(self, hidden_dim):
        super(Attention, self).__init__()
        self.attn = nn.Linear(hidden_dim * 2, hidden_dim)
        self.v = nn.Parameter(torch.rand(hidden_dim))
        stdv = 1. / (self.v.size(0) ** 0.5)
        self.v.data.uniform_(-stdv, stdv)

    def forward(self, hidden, encoder_outputs):
        batch_size = encoder_outputs.shape[0]
        src_len = encoder_outputs.shape[1]
        hidden = hidden[-1].unsqueeze(1).repeat(1, src_len, 1)
        energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim=2)))
        attention = torch.sum(self.v * energy, dim=2)
        return F.softmax(attention, dim=1)

class DecoderLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers, dropout=0.5):
        super(DecoderLSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=train_dataset.out_vocab['<pad>'])
        self.lstm = nn.LSTM(embedding_dim + hidden_dim, hidden_dim, num_layers, batch_first=True, dropout=dropout if num_layers > 1 else 0)
        self.fc = nn.Linear(hidden_dim, vocab_size)
        self.dropout = nn.Dropout(dropout)
        self.attention = Attention(hidden_dim)

    def forward(self, tgt, hidden, cell, encoder_outputs):
        embedded = self.dropout(self.embedding(tgt))
        attn_weights = self.attention(hidden, encoder_outputs)
        attn_weights = attn_weights.unsqueeze(1)
        context = torch.bmm(attn_weights, encoder_outputs)
        lstm_input = torch.cat((embedded, context), dim=2)
        output, (hidden, cell) = self.lstm(lstm_input, (hidden, cell))
        prediction = self.fc(output.squeeze(1))
        return prediction, hidden, cell, attn_weights

class Seq2SeqLSTM(nn.Module):
    def __init__(self, encoder, decoder, device):
        super(Seq2SeqLSTM, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, tgt, teacher_forcing_ratio=0.8):
        batch_size = src.shape[0]
        tgt_len = tgt.shape[1]
        tgt_vocab_size = self.decoder.fc.out_features
        outputs = torch.zeros(batch_size, tgt_len, tgt_vocab_size).to(self.device)
        encoder_outputs, hidden, cell = self.encoder(src)
        input = tgt[:, 0]
        for t in range(1, tgt_len):
            output, hidden, cell, _ = self.decoder(input.unsqueeze(1), hidden, cell, encoder_outputs)
            outputs[:, t, :] = output
            teacher_force = random.random() < teacher_forcing_ratio
            top1 = output.argmax(1)
            input = tgt[:, t] if teacher_force else top1
        return outputs

# Khởi tạo mô hình
embedding_dim = 256
hidden_dim = 512
num_layers = 1
dropout = 0.5

input_vocab_size = len(train_dataset.in_vocab)
output_vocab_size = len(train_dataset.out_vocab)

encoder = EncoderLSTM(input_vocab_size, embedding_dim, hidden_dim, num_layers, dropout).to(device)
decoder = DecoderLSTM(output_vocab_size, embedding_dim, hidden_dim, num_layers, dropout).to(device)
model = Seq2SeqLSTM(encoder, decoder, device).to(device)

# Định nghĩa loss, optimizer và scaler
criterion = nn.CrossEntropyLoss(ignore_index=train_dataset.out_vocab['<pad>'])
optimizer = optim.Adam(model.parameters(), lr=0.0005)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=3, verbose=True)
scaler = torch.amp.GradScaler('cuda')

# Hàm beam search (giữ đầu ra mã hóa)
def beam_search(model, src, in_vocab, out_vocab, max_len, device, beam_size=3):
    model.eval()
    src = src.to(device)
    start_token = out_vocab['<sos>']
    end_token = out_vocab['<eos>']
    with torch.no_grad():
        encoder_outputs, hidden, cell = model.encoder(src)
    sequences = [[torch.tensor([start_token], dtype=torch.long, device=device), 0.0, hidden, cell]]
    completed_sequences = []
    for _ in range(max_len):
        all_candidates = []
        for seq, score, hidden, cell in sequences:
            if seq[-1].item() == end_token:
                completed_sequences.append([seq, score])
                continue
            with torch.no_grad():
                output, hidden, cell, _ = model.decoder(seq[-1].unsqueeze(0).unsqueeze(1), hidden, cell, encoder_outputs)
            probs = F.softmax(output.squeeze(1), dim=1)
            top_probs, top_indices = probs.topk(beam_size, dim=1)
            for i in range(beam_size):
                token = top_indices[0, i].unsqueeze(0)
                token_prob = top_probs[0, i].item()
                new_seq = torch.cat((seq, token), dim=0)
                new_score = score + torch.log(torch.tensor(token_prob))
                all_candidates.append([new_seq, new_score, hidden, cell])
        all_candidates = sorted(all_candidates, key=lambda x: x[1], reverse=True)
        sequences = all_candidates[:beam_size]
        if len(sequences) == 0:
            break
    if completed_sequences:
        best_seq = max(completed_sequences, key=lambda x: x[1])[0]
    else:
        best_seq = sequences[0][0]
    idx2word = {idx: word for word, idx in out_vocab.items()}
    translated = [idx2word.get(token.item(), '<unk>') for token in best_seq[1:] if token.item() != end_token]
    return translated

# Hàm suy luận với câu đầu vào
def predict_sentence(model, sentence, in_vocab, out_vocab, max_len, device, beam_size=3):
    model.eval()
    tokens = sentence.strip().split()
    token_ids = [in_vocab.get(token, in_vocab['<unk>']) for token in tokens]
    token_ids = token_ids[:max_len] + [in_vocab['<pad>']] * (max_len - len(token_ids))
    src = torch.tensor([token_ids], dtype=torch.long).to(device)
    pred_tokens = beam_search(model, src, in_vocab, out_vocab, max_len, device, beam_size)
    normalized_sentence = ' '.join(pred_tokens)
    return normalized_sentence

# Hàm tính metrics với beam search
def calculate_metrics(model, iterator, in_vocab, out_vocab, device, max_len, criterion):
    model.eval()
    total_loss = 0
    total_correct = 0
    total_tokens = 0
    bleu_scores = []
    idx2word = {idx: word for word, idx in out_vocab.items()}
    smoothie = SmoothingFunction().method4
    total_samples = len(iterator.dataset)
    sample_size = max(1, total_samples // 100)
    sampled_indices = random.sample(range(total_samples), sample_size)
    sampled_dataset = torch.utils.data.Subset(iterator.dataset, sampled_indices)
    sampled_loader = DataLoader(sampled_dataset, batch_size=iterator.batch_size, shuffle=False, pin_memory=True, num_workers=4)
    with torch.no_grad():
        for src, tgt in iterator:
            src, tgt = src.to(device), tgt.to(device)
            output = model(src, tgt, teacher_forcing_ratio=0)
            output_dim = output.shape[-1]
            output = output[:, 1:].reshape(-1, output_dim)
            tgt_flat = tgt[:, 1:].reshape(-1)
            loss = criterion(output, tgt_flat)
            total_loss += loss.item()
            preds = output.argmax(dim=1)
            non_pad_mask = tgt_flat != out_vocab['<pad>']
            correct = (preds == tgt_flat) & non_pad_mask
            total_correct += correct.sum().item()
            total_tokens += non_pad_mask.sum().item()
        for src, tgt in sampled_loader:
            src, tgt = src.to(device), tgt.to(device)
            for i in range(src.shape[0]):
                src_sent = src[i].unsqueeze(0)
                tgt_sent = tgt[i].cpu().numpy()
                pred_sent = beam_search(model, src_sent, in_vocab, out_vocab, max_len, device, beam_size=3)
                ref_sent = [idx2word.get(idx, '<unk>') for idx in tgt_sent if idx not in [out_vocab['<pad>'], out_vocab['<sos>'], out_vocab['<eos>']]]
                bleu = sentence_bleu([ref_sent], pred_sent, smoothing_function=smoothie)
                bleu_scores.append(bleu)
    avg_loss = total_loss / len(iterator)
    accuracy = total_correct / total_tokens if total_tokens > 0 else 0
    avg_bleu = sum(bleu_scores) / len(bleu_scores) if bleu_scores else 0
    return avg_loss, accuracy, avg_bleu

# Hàm huấn luyện
def train(model, train_loader, val_loader, optimizer, criterion, in_vocab, out_vocab, max_len, device, num_epochs=15, clip=1, patience=3):
    train_losses = []
    val_losses = []
    val_accuracies = []
    val_bleu_scores = []
    best_val_loss = float('inf')
    patience_counter = 0
    log_file = "/kaggle/working/results/training_log.txt"
    with open(log_file, 'w', encoding='utf-8') as f:
        f.write("Epoch,Train Loss,Val Loss,Val Accuracy,Val BLEU,VRAM (MB),Epoch Time (s)\n")
    for epoch in range(num_epochs):
        model.train()
        epoch_loss = 0
        batch_count = 0
        start_time = time.time()
        for src, tgt in train_loader:
            src, tgt = src.to(device, non_blocking=True), tgt.to(device, non_blocking=True)
            optimizer.zero_grad()
            with torch.amp.autocast('cuda'):
                output = model(src, tgt, teacher_forcing_ratio=0.8)
                output_dim = output.shape[-1]
                output = output[:, 1:].reshape(-1, output_dim)
                tgt = tgt[:, 1:].reshape(-1)
                loss = criterion(output, tgt)
            scaler.scale(loss).backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
            scaler.step(optimizer)
            scaler.update()
            epoch_loss += loss.item()
            batch_count += 1
        train_loss = epoch_loss / batch_count
        val_loss, val_accuracy, val_bleu = calculate_metrics(model, val_loader, in_vocab, out_vocab, device, max_len, criterion)
        epoch_time = time.time() - start_time
        train_losses.append(train_loss)
        val_losses.append(val_loss)
        val_accuracies.append(val_accuracy)
        val_bleu_scores.append(val_bleu)
        scheduler.step(val_loss)
        with open(log_file, 'a', encoding='utf-8') as f:
            f.write(f"{epoch+1},{train_loss:.3f},{val_loss:.3f},{val_accuracy:.3f},{val_bleu:.3f},{torch.cuda.memory_allocated()/1024**2:.2f},{epoch_time:.2f}\n")
        print(f'Epoch: {epoch+1:02}')
        print(f'\tTrain Loss: {train_loss:.3f}')
        print(f'\tVal Loss: {val_loss:.3f}')
        print(f'\tVal Accuracy: {val_accuracy:.3f}')
        print(f'\tVal BLEU: {val_bleu:.3f}')
        print(f'\tEpoch Time: {epoch_time:.2f} seconds')
        print(f'\tVRAM allocated: {torch.cuda.memory_allocated()/1024**2:.2f} MB')
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            patience_counter = 0
            torch.save(model, '/kaggle/working/results/seq2seq_lstm_best.pt')
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print(f'Early stopping triggered after epoch {epoch+1}')
                break
    return train_losses, val_losses, val_accuracies, val_bleu_scores

# Hàm vẽ và lưu biểu đồ
def plot_and_save_metrics(train_losses, val_losses, val_accuracies, val_bleu_scores):
    epochs = range(1, len(train_losses) + 1)
    plt.figure(figsize=(10, 5))
    plt.plot(epochs, train_losses, label='Train Loss')
    plt.plot(epochs, val_losses, label='Validation Loss')
    plt.title('Loss over Epochs')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()
    plt.grid()
    plt.savefig('/kaggle/working/results/loss_plot.png')
    plt.close()
    plt.figure(figsize=(10, 5))
    plt.plot(epochs, val_accuracies, label='Validation Accuracy', color='green')
    plt.title('Validation Accuracy over Epochs')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.legend()
    plt.grid()
    plt.savefig('/kaggle/working/results/accuracy_plot.png')
    plt.close()
    plt.figure(figsize=(10, 5))
    plt.plot(epochs, val_bleu_scores, label='Validation BLEU', color='blue')
    plt.title('Validation BLEU Score over Epochs')
    plt.xlabel('Epoch')
    plt.ylabel('BLEU Score')
    plt.legend()
    plt.grid()
    plt.savefig('/kaggle/working/results/bleu_plot.png')
    plt.close()



In [None]:
# Huấn luyện
num_epochs = 2
max_len = 50
train_losses, val_losses, val_accuracies, val_bleu_scores = train(
    model, train_loader, val_loader, optimizer, criterion,
    train_dataset.in_vocab, train_dataset.out_vocab, max_len, device, num_epochs=num_epochs, patience=3
)

# Vẽ và lưu biểu đồ
plot_and_save_metrics(train_losses, val_losses, val_accuracies, val_bleu_scores)

# Kiểm tra trên test set
test_loss, test_accuracy, test_bleu = calculate_metrics(model, test_loader, train_dataset.in_vocab, train_dataset.out_vocab, device, max_len, criterion)
print(f'\nTest Loss: {test_loss:.3f}')
print(f'Test Accuracy: {test_accuracy:.3f}')
print(f'Test BLEU: {test_bleu:.3f}')

# Test suy luận trên test set (giữ mã hóa)
for i in range(5):
    src, tgt = test_dataset[i]
    src = src.unsqueeze(0)
    pred_tokens = beam_search(model, src, train_dataset.in_vocab, train_dataset.out_vocab, max_len, device, beam_size=3)
    src_words = [train_dataset.in_vocab.get(k, '<unk>') for k in src[0].cpu().numpy() if k != train_dataset.in_vocab['<pad>']]
    tgt_words = [train_dataset.out_vocab.get(k, '<unk>') for k in tgt.cpu().numpy() if k not in [train_dataset.out_vocab['<pad>'], train_dataset.out_vocab['<sos>'], train_dataset.out_vocab['<eos>']]]
    print(f"Source: {' '.join(src_words)}")
    print(f"Target (normalized): {' '.join(tgt_words)}")
    print(f"Predicted (normalized): {' '.join(pred_tokens)}\n")


Epoch: 01
	Train Loss: 1.806
	Val Loss: 0.304
	Val Accuracy: 0.910
	Val BLEU: 0.835
	Epoch Time: 15007.65 seconds
	VRAM allocated: 218.56 MB
Epoch: 02
	Train Loss: 0.278
	Val Loss: 0.270
	Val Accuracy: 0.927
	Val BLEU: 0.872
	Epoch Time: 15028.77 seconds
	VRAM allocated: 218.56 MB


KeyboardInterrupt: 