In [None]:
import pandas as pd
from opencc import OpenCC

def convert_news_to_csv(data_path, csv_file_path):
    """Convert a tab-separated parallel corpus into a CSV of sentence pairs.

    Every non-empty line of ``data_path`` must hold the English sentence in
    its first tab-separated field and the Chinese sentence in its second.
    Any further fields (for example an attribution column) are ignored, so
    both two-column and multi-column dumps are accepted. The Chinese text is
    passed through OpenCC's ``s2tw`` conversion before being stored.

    Args:
        data_path: Path of the UTF-8 encoded, tab-separated input file.
        csv_file_path: Destination of the CSV. It is written with the columns
            ``chinese`` and ``english`` (plus pandas' default index column).

    Raises:
        ValueError: If a non-empty line has fewer than two tab-separated fields.
    """
    cc = OpenCC('s2tw')  # Simplified Chinese -> Traditional Chinese
    english, chinese = [], []
    with open(data_path, 'r', encoding="utf-8") as f:
        # Stream the file line by line instead of loading it whole.
        for line_no, raw_line in enumerate(f, start=1):
            line = raw_line.rstrip('\n')
            if not line:
                continue
            fields = line.split('\t')  # the data is tab-separated
            if len(fields) < 2:
                raise ValueError(
                    f"{data_path}, line {line_no}: expected at least 2 "
                    f"tab-separated fields, got {len(fields)}"
                )
            english.append(fields[0])
            chinese.append(cc.convert(fields[1]))
    df = pd.DataFrame({'chinese': chinese, 'english': english})
    df.to_csv(csv_file_path)
    
# Build translate.csv from the raw cmn.txt corpus; a later cell reads this CSV back with pandas.
convert_news_to_csv('cmn.txt', 'translate.csv')

In [None]:
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
import torch
import pandas as pd

class TranslateDataset(Dataset):
    """Paired source/target sentences with a batch-tokenizing collate function.

    Items are returned as raw ``(source, target)`` pairs; tokenization is
    deferred to :meth:`collate_fn` so each batch is padded only to its own
    longest sequence.

    Args:
        x: Indexable collection of source sentences.
        y: Indexable collection of target sentences, aligned with ``x``.
        src_tokenizer: Callable tokenizer applied to the source sentences.
        tgt_tokenizer: Callable tokenizer applied to the target sentences.
        max_length: Truncation length handed to both tokenizers
            (defaults to 256, the value that used to be hard-coded).
    """

    def __init__(self, x, y, src_tokenizer, tgt_tokenizer, max_length=256):
        self.x = x
        self.y = y
        self.src_tokenizer = src_tokenizer
        self.tgt_tokenizer = tgt_tokenizer
        self.max_length = max_length

    def __getitem__(self, index):
        """Return the untokenized ``(source, target)`` pair at ``index``."""
        return self.x[index], self.y[index]

    def __len__(self):
        """Return the number of sentence pairs."""
        return len(self.x)

    def _encode(self, tokenizer, texts):
        """Tokenize ``texts`` into one padded tensor of token ids."""
        return tokenizer(
            list(texts),
            max_length=self.max_length,
            truncation=True,
            padding="longest",
            return_tensors='pt',
        ).input_ids

    def collate_fn(self, batch):
        """Turn a list of ``(source, target)`` pairs into padded id tensors.

        Returns:
            dict with ``src_input_ids`` and ``tgt_input_ids``.
        """
        batch_x, batch_y = zip(*batch)
        # The first source column is dropped -- presumably the leading [CLS]
        # added by the BERT tokenizer; the target side keeps its first column.
        inputs = self._encode(self.src_tokenizer, batch_x)[:, 1:]
        targets = self._encode(self.tgt_tokenizer, batch_y)

        return {'src_input_ids': inputs, 'tgt_input_ids': targets}

# Load the sentence pairs from the CSV produced by the conversion step.
df = pd.read_csv('translate.csv')
input_texts = df['chinese'].values
target_texts = df['english'].values
# Hold out 20% of the pairs for validation; the fixed seed keeps the split reproducible.
x_train, x_valid, y_train, y_valid = train_test_split(input_texts, target_texts, train_size=0.8, random_state=46, shuffle=True)

# One tokenizer per language: Chinese for the source side, English for the target side.
src_tokenizer = AutoTokenizer.from_pretrained('bert-base-chinese')
tgt_tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

trainset = TranslateDataset(x_train, y_train, src_tokenizer, tgt_tokenizer)
validset = TranslateDataset(x_valid, y_valid, src_tokenizer, tgt_tokenizer)

# Pinned host memory only helps when batches are copied to a CUDA device;
# requesting it on a CPU-only machine just produces a warning.
use_pinned_memory = torch.cuda.is_available()

train_loader = DataLoader(trainset, batch_size = 64, shuffle = True, num_workers = 0, pin_memory = use_pinned_memory, collate_fn=trainset.collate_fn)
# Validation is not shuffled so every evaluation pass sees identical batches.
valid_loader = DataLoader(validset, batch_size = 64, shuffle = False, num_workers = 0, pin_memory = use_pinned_memory, collate_fn=validset.collate_fn)

In [None]:
import torch.nn as nn

class EncoderGRU(nn.Module):
    """Single-layer, unidirectional GRU encoder over embedded token ids.

    The embedding width equals ``hidden_size``, so the GRU's input size and
    state size are the same.
    """

    def __init__(self, vocab_size, hidden_size, padding_idx):
        super().__init__()
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=hidden_size,
            padding_idx=padding_idx,
        )
        self.gru = nn.GRU(
            input_size=hidden_size,
            hidden_size=hidden_size,
            batch_first=True,
        )
        self.dropout = nn.Dropout(p=0.1)

    def forward(self, token_ids):
        """Encode a batch of token-id sequences.

        Args:
            token_ids: Integer tensor, expected shape (batch_size, time_step).

        Returns:
            Tuple ``(output, hidden)`` straight from the GRU:
            ``output`` is (batch_size, time_step, hidden_size) and
            ``hidden`` is (1, batch_size, hidden_size).
        """
        token_vectors = self.embedding(token_ids)      # (batch_size, time_step, hidden_size)
        token_vectors = self.dropout(token_vectors)
        return self.gru(token_vectors)
    
class BahdanauAttention(nn.Module):
    """Additive attention: score = v^T tanh(W1 * first + W2 * second).

    Note on naming: in this file ``DecoderGRU.forward`` calls this module as
    ``attention(decoder_state, encoder_outputs)``. The parameter called
    ``encoder_hidden`` therefore receives the decoder state, and
    ``decoder_hidden`` receives the encoder outputs. The computation only
    needs the two inputs to broadcast when added and the second one to be
    the sequence that is averaged.
    """

    def __init__(self, hidden_size):
        super().__init__()
        self.encoder_projection = nn.Linear(hidden_size, hidden_size)
        self.decoder_projection = nn.Linear(hidden_size, hidden_size)
        self.attention_v = nn.Linear(hidden_size, 1)
        self.tanh = nn.Tanh()
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, encoder_hidden, decoder_hidden):
        """Return the attention-weighted sum of ``decoder_hidden``.

        Args:
            encoder_hidden: First input, passed through ``encoder_projection``;
                as called in this file, (batch_size, 1, hidden_size).
            decoder_hidden: Second input, passed through ``decoder_projection``
                and also the sequence being averaged; as called in this file,
                (batch_size, time_step, hidden_size).

        Returns:
            Context tensor of shape (batch_size, 1, hidden_size).
        """
        projected_first = self.encoder_projection(encoder_hidden)
        projected_second = self.decoder_projection(decoder_hidden)
        # The sum broadcasts to (batch_size, time_step, hidden_size).
        energy = self.tanh(projected_first + projected_second)

        # (batch_size, time_step, 1) -> (batch_size, 1, time_step)
        scores = self.attention_v(energy).transpose(1, 2)
        attention_weights = self.softmax(scores)  # normalized over time_step

        # (batch_size, 1, time_step) x (batch_size, time_step, hidden_size)
        return torch.bmm(attention_weights, decoder_hidden)
    
class DecoderGRU(nn.Module):
    """One-step GRU decoder that conditions each step on an attention context.

    The GRU input is the embedded input token concatenated with the context
    vector returned by ``attention``, hence its input width of
    ``2 * hidden_size``.
    """

    def __init__(self, attention, hidden_size, output_size, padding_idx):
        super().__init__()
        self.embedding = nn.Embedding(output_size, hidden_size, padding_idx=padding_idx)
        self.gru = nn.GRU(2 * hidden_size, hidden_size, batch_first=True)
        self.output_projection = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(0.1)
        self.attention = attention

    def forward(self, encoder_outputs, decoder_hidden, decoder_input_ids):
        """Run a single decoding step.

        Args:
            encoder_outputs: Encoder states, (batch_size, time_step, hidden_size).
            decoder_hidden: Current GRU state, (1, batch_size, hidden_size).
            decoder_input_ids: Input token ids for this step, (batch_size, 1).

        Returns:
            Tuple ``(decoder_output, decoder_hidden)``: unnormalized vocabulary
            scores of shape (batch_size, 1, output_size) and the updated GRU
            state of shape (1, batch_size, hidden_size).
        """
        token_vectors = self.embedding(decoder_input_ids)   # (batch_size, 1, hidden_size)
        token_vectors = self.dropout(token_vectors)

        # Move the batch axis first so the state can act as the attention query.
        query = decoder_hidden.transpose(0, 1)              # (batch_size, 1, hidden_size)
        context = self.attention(query, encoder_outputs)    # (batch_size, 1, hidden_size)

        gru_input = torch.cat([token_vectors, context], dim=-1)  # (batch_size, 1, 2 * hidden_size)
        gru_output, next_hidden = self.gru(gru_input, decoder_hidden)

        logits = self.output_projection(gru_output)         # (batch_size, 1, output_size)
        return logits, next_hidden

In [None]:
class Attentionseq2seq(nn.Module):
    """Encoder-decoder wrapper: teacher-forced training loss and greedy decoding.

    Args:
        encoder: Module mapping source ids to ``(encoder_outputs, hidden)``.
        decoder: Module called as ``decoder(encoder_outputs, hidden, input_ids)``
            and returning ``(logits, hidden)`` for a single step.
        padding_idx: Target id that the loss ignores.
    """

    def __init__(self, encoder, decoder, padding_idx):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.criterion = nn.NLLLoss(ignore_index=padding_idx)
        self.logsoftmax = nn.LogSoftmax(dim=-1)

    def forward(self, src_input_ids, tgt_input_ids, sos_token=101):
        """Compute the teacher-forced loss for one batch.

        At step ``i`` the decoder output is scored against
        ``tgt_input_ids[:, i]``, and that same column is then fed in as the
        next input. The very first input is ``sos_token``.

        NOTE(review): if ``tgt_input_ids`` already starts with the start token
        (as a BERT tokenizer's [CLS] would), the first step is trained to
        reproduce that token -- confirm this is intended.

        Args:
            src_input_ids: Source token ids, (batch_size, src_time_step).
            tgt_input_ids: Target token ids, (batch_size, tgt_time_step).
            sos_token: Id of the first decoder input (defaults to 101, the
                [CLS] id used by the original code).

        Returns:
            Tuple ``(loss, decoder_outputs)`` where ``decoder_outputs`` holds
            log-probabilities of shape (batch_size, tgt_time_step, output_dim).
        """
        encoder_outputs, decoder_hidden = self.encoder(src_input_ids)

        # Create the start token directly on the inputs' device. Going through
        # ``device.type`` would lose the device index (e.g. cuda:1 -> cuda).
        decoder_input = torch.full(
            (tgt_input_ids.shape[0], 1),
            sos_token,
            dtype=torch.long,
            device=src_input_ids.device,
        )

        step_logits = []
        for step in range(tgt_input_ids.shape[1]):
            logits, decoder_hidden = self.decoder(encoder_outputs, decoder_hidden, decoder_input)
            step_logits.append(logits)                          # (batch_size, 1, output_dim)
            decoder_input = tgt_input_ids[:, step].unsqueeze(1)  # teacher forcing, (batch_size, 1)

        # (batch_size, tgt_time_step, output_dim) log-probabilities
        decoder_outputs = self.logsoftmax(torch.cat(step_logits, dim=1))

        # Flatten batch and time so NLLLoss sees one prediction per row.
        loss = self.criterion(
            decoder_outputs.reshape(-1, decoder_outputs.size(-1)),
            tgt_input_ids.reshape(-1),
        )

        return loss, decoder_outputs

    def generate(self, input_ids, sos_token=101, eos_token=102, max_len=50):
        """Greedily decode a single source sequence.

        Args:
            input_ids: Source token ids for ONE sequence, (1, src_time_step).
            sos_token: Id fed to the decoder at the first step.
            eos_token: Id that ends decoding; it is included in the result.
            max_len: Maximum number of tokens to generate.

        Returns:
            1-D long tensor with the generated token ids. It is always 1-D,
            even when only one token is produced, and empty when
            ``max_len <= 0``.
        """
        with torch.no_grad():
            encoder_outputs, decoder_hidden = self.encoder(input_ids)
            decoder_input = torch.full(
                (1, 1), sos_token, dtype=torch.long, device=input_ids.device
            )

            generated = []
            for _ in range(max_len):
                logits, decoder_hidden = self.decoder(encoder_outputs, decoder_hidden, decoder_input)
                # Softmax is monotonic, so the arg-max of the raw scores is
                # already the most probable token.
                next_token = logits.argmax(dim=-1)  # (1, 1)
                generated.append(next_token)
                if next_token.item() == eos_token:
                    break
                decoder_input = next_token

        if not generated:
            return torch.empty(0, dtype=torch.long, device=input_ids.device)
        # Drop only the batch axis; a bare squeeze() would collapse a
        # single-token result to a 0-dim scalar.
        return torch.cat(generated, dim=1).squeeze(0)

In [None]:
import torch.optim as optim
from Trainer import Trainer

# Main program: build the model, create its optimizers, and run training.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
hidden_size = 768  # width shared by the embeddings, both GRUs and the attention layers

encoder = EncoderGRU(
    vocab_size=len(src_tokenizer),
    hidden_size=hidden_size,
    padding_idx=src_tokenizer.pad_token_id,
)

# The attention module is created between encoder and decoder, which is the
# same order in which the nested constructor call instantiated the modules.
attention = BahdanauAttention(hidden_size=hidden_size)
decoder = DecoderGRU(
    attention=attention,
    hidden_size=hidden_size,
    output_size=len(tgt_tokenizer),
    padding_idx=tgt_tokenizer.pad_token_id,
)

model = Attentionseq2seq(
    encoder=encoder,
    decoder=decoder,
    padding_idx=tgt_tokenizer.pad_token_id,
)
model = model.to(device)

# Separate Adam optimizers for the encoder and decoder parameters, same learning rate.
optimizer_e = optim.Adam(encoder.parameters(), lr=1e-4)
optimizer_d = optim.Adam(decoder.parameters(), lr=1e-4)

trainer = Trainer(
    epochs=30,
    train_loader=train_loader,
    valid_loader=valid_loader,
    model=model,
    optimizer=[optimizer_e, optimizer_d],
    early_stopping=3,
)
trainer.train()

In [None]:
# Restore the trained weights. map_location remaps the stored tensors onto the
# device in use now, so a checkpoint saved on a GPU also loads on a CPU-only machine.
# NOTE(review): torch.load unpickles the file; only load checkpoints from a trusted source.
model.load_state_dict(torch.load('model.ckpt', map_location=device))
model.eval()  # switch dropout off for inference

# Translate the first few validation sentences (fewer if the set is smaller).
for idx in range(min(3, len(x_valid))):
    encoded = src_tokenizer(x_valid[idx], max_length=256, truncation=True, padding="longest", return_tensors='pt').to(device)
    # Drop the first column, mirroring how the training batches are built.
    input_ids = encoded.input_ids[:, 1:]
    generated_ids = model.generate(input_ids, max_len=20)
    print('\n輸入文字:', x_valid[idx])
    print('目標文字:', y_valid[idx])
    # Flatten so decode() always receives a 1-D id sequence, even for a single token.
    print('翻譯文字:', tgt_tokenizer.decode(generated_ids.reshape(-1)))