<a href="https://colab.research.google.com/github/BDH-teacher/Deep_Learning_Audit_code/blob/main/MachineTranslation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
pairs = [
    ("나는 고양이를 사랑한다", "I love cats"),
    ("이것은 기계 번역이다", "This is machine translation"),
    ("나는 학교에 간다", "I go to school"),
    ("사과는 맛있다", "Apples are delicious"),
]
def build_vocab(pairs):
    src_vocab = set()
    tgt_vocab = set()

    for src, tgt in pairs:
        src_vocab.update(src.split())
        tgt_vocab.update(tgt.split())

    src_vocab = {word: i for i, word in enumerate(src_vocab, start=4)}
    tgt_vocab = {word: i for i, word in enumerate(tgt_vocab, start=4)}
    src_vocab["<bos>"], src_vocab["<eos>"], src_vocab["<pad>"], src_vocab["<unk>"]   = 0, 1, 2, 3
    tgt_vocab["<bos>"], tgt_vocab["<eos>"], tgt_vocab["<pad>"], tgt_vocab["<unk>"]  = 0, 1, 2, 3

    return src_vocab, tgt_vocab

src_vocab, tgt_vocab = build_vocab(pairs)

In [2]:
import torch
import torch.nn as nn

class Encoder(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(Encoder, self).__init__()
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.rnn = nn.GRU(hidden_size, hidden_size)

    def forward(self, input_seq):
        embedded = self.embedding(input_seq)
        outputs, hidden = self.rnn(embedded)
        return hidden

class Decoder(nn.Module):
    def __init__(self, output_size, hidden_size):
        super(Decoder, self).__init__()
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.rnn = nn.GRU(hidden_size, hidden_size)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, input_seq, hidden):
        embedded = self.embedding(input_seq)
        output, hidden = self.rnn(embedded, hidden)
        output = self.fc(output)
        return output, hidden


In [3]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, src, tgt):
        hidden = self.encoder(src)
        output, hidden = self.decoder(tgt, hidden)
        return output

In [4]:

def train(model, pairs, src_vocab, tgt_vocab, epochs=10, lr=0.01):
    model.train()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()

    for epoch in range(epochs):
        for src, tgt in pairs:
            src_indices = torch.tensor([src_vocab["<bos>"]] + [src_vocab[word] for word in src.split()] + [src_vocab["<eos>"]]).unsqueeze(1)
            tgt_indices = torch.tensor([tgt_vocab["<bos>"]] + [tgt_vocab[word] for word in tgt.split()] + [tgt_vocab["<eos>"]]).unsqueeze(1)

            optimizer.zero_grad()
            outputs = model(src_indices, tgt_indices[:-1])  # 마지막 <eos>는 제외
            loss = criterion(outputs.view(-1, len(tgt_vocab)), tgt_indices[1:].view(-1))  # <bos> 이후부터 계산
            loss.backward()
            optimizer.step()
        print(f'Epoch [{epoch + 1}/{epochs}], Loss: {loss.item():.4f}')


hidden_size = 256
embedding_size = 128
encoder = Encoder(len(src_vocab), hidden_size)
decoder = Decoder(len(tgt_vocab), hidden_size)
model = Seq2Seq(encoder, decoder)

train(model, pairs, src_vocab, tgt_vocab)


Epoch [1/10], Loss: 3.8048
Epoch [2/10], Loss: 0.3319
Epoch [3/10], Loss: 0.1129
Epoch [4/10], Loss: 0.0076
Epoch [5/10], Loss: 0.0035
Epoch [6/10], Loss: 0.0021
Epoch [7/10], Loss: 0.0015
Epoch [8/10], Loss: 0.0011
Epoch [9/10], Loss: 0.0009
Epoch [10/10], Loss: 0.0008


In [5]:
#|startoftext|, |endoftext|
# Test API
def translate(model, sentence, src_vocab, tgt_vocab):
    model.eval()
    with torch.no_grad():
        src_indices = torch.tensor([src_vocab[word] for word in sentence.split()] + [src_vocab["<eos>"]]).unsqueeze(1)
        hidden = model.encoder(src_indices)

        tgt_indices = [tgt_vocab['<bos>']]
        for _ in range(10):
            tgt_tensor = torch.tensor([tgt_indices[-1]]).unsqueeze(1)
            output, hidden = model.decoder(tgt_tensor, hidden)
            top_word = output.argmax(2)[-1, 0].item()
            if top_word == tgt_vocab['<eos>']:
                break
            tgt_indices.append(top_word)

    return ' '.join([list(tgt_vocab.keys())[list(tgt_vocab.values()).index(idx)] for idx in tgt_indices[1:]])


print(translate(model, "나는 고양이를 사랑한다", src_vocab, tgt_vocab))

I love cats
