In [1]:
# 必要なライブラリのインストール
pip install jieba

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Collecting jieba
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/c6/cb/18eeb235f833b726522d7ebed54f2278ce28ba9438e3135ab0278d9792a2/jieba-0.42.1.tar.gz (19.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.2/19.2 MB[0m [31m68.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: jieba
  Building wheel for jieba (setup.py) ... [?25ldone
[?25h  Created wheel for jieba: filename=jieba-0.42.1-py3-none-any.whl size=19314458 sha256=7245ee8156b70f33d4354de04053c3d8f5fbb425e33f8633fd009ead72fc270d
  Stored in directory: /home/featurize/.cache/pip/wheels/37/08/79/ea7c0d2ca823affa13f89586a5a9eff8dd6ad589640396e1b5
Successfully built jieba
Installing collected packages: jieba
Successfully installed jieba-0.42.1
Note: you may need to restart the kernel to use updated packages.


In [4]:
pip install sentencepiece

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Collecting sentencepiece
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/fb/12/2f5c8d4764b00033cf1c935b702d3bb878d10be9f0b87f0253495832d85f/sentencepiece-0.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m46.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.2.0
Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install torchtext==0.4.0 

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Collecting torchtext==0.4.0
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/43/94/929d6bd236a4fb5c435982a7eb9730b78dcd8659acf328fd2ef9de85f483/torchtext-0.4.0-py3-none-any.whl (53 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.1/53.1 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: torchtext
Successfully installed torchtext-0.4.0
Note: you may need to restart the kernel to use updated packages.


In [36]:
# ライブラリのインポート
import math
import torchtext
import torch
import torch.nn as nn
from torch import Tensor
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
from collections import Counter
from torchtext.vocab import Vocab
from torch.nn import TransformerEncoder, TransformerDecoder, TransformerEncoderLayer, TransformerDecoderLayer
import io
import time
import pandas as pd 
import numpy as np
import pickle
import tqdm
import sentencepiece as spm
# 再現性のためにシードを固定
torch.manual_seed(0)
# GPUが使える場合はGPUを使用
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [37]:
# 日中対訳データセットの読み込み
df = pd.read_csv('zh-ja.bicleaner05.txt', sep='\\t', engine='python', header=None)
trainzh = df[2].values.tolist()#[:10000]
trainja = df[3].values.tolist()#[:10000]

In [38]:
# データの確認
print(trainzh[500])
print(trainja[500])

Chinese HS Code Harmonized Code System < HS编码 2905 无环醇及其卤化、磺化、硝化或亚硝化衍生物 HS Code List (Harmonized System Code) for US, UK, EU, China, India, France, Japan, Russia, Germany, Korea, Canada ...
Japanese HS Code Harmonized Code System < HSコード 2905 非環式アルコール並びにそのハロゲン化誘導体、スルホン化誘導体、ニトロ化誘導体及びニトロソ化誘導体 HS Code List (Harmonized System Code) for US, UK, EU, China, India, France, Japan, Russia, Germany, Korea, Canada ...


In [39]:
# SentencePieceトークナイザのモデル読み込み
zh_tokenizer = spm.SentencePieceProcessor(model_file='spm.zh.nopretok.model')
ja_tokenizer = spm.SentencePieceProcessor(model_file='spm.ja.nopretok.model')

In [41]:
# 中国語のトークン化テスト
zh_tokenizer.encode("年金 在日本居住的20岁到60岁的人必须加入公共年金制度。")

[45,
 11042,
 22559,
 45,
 22523,
 21367,
 23664,
 23194,
 13313,
 1483,
 29132,
 22977,
 2723,
 29132,
 13313,
 11726,
 23044,
 26716,
 22631,
 19869,
 16973,
 22835,
 11042,
 22559,
 22828,
 22615,
 4776]

In [42]:
# 日本語のトークン化テスト
ja_tokenizer.encode("年金 日本に住んでいる20歳~60歳の全ての人は、公的年金制度に加入しなければなりません。")

[4,
 6866,
 714,
 12628,
 210,
 1550,
 306,
 1077,
 5231,
 1092,
 830,
 3,
 7503,
 6866,
 786,
 10,
 8556,
 4600,
 5]

In [43]:
# 語彙集（ボキャブラリー）の構築関数
def build_vocab(sentences, tokenizer):
    counter = Counter()
    for sentence in sentences:
        counter.update(tokenizer.encode(sentence, out_type=str))
    # 特殊トークン（未知語、パディング、文頭、文末）を追加
    return Vocab(counter, specials=['<unk>', '<pad>', '<bos>', '<eos>'])

# 語彙集の作成
ja_vocab = build_vocab(trainja, ja_tokenizer)
zh_vocab = build_vocab(trainzh, zh_tokenizer)

In [44]:
# データをテンソル（数値）に変換する処理
def data_process(ja, en):
    data = []
    for (raw_ja, raw_en) in zip(ja, en):
        # 文末の改行を削除し、ID列に変換
        ja_tensor_ = torch.tensor([ja_vocab[token] for token in ja_tokenizer.encode(raw_ja.rstrip("\n"), out_type=str)],
                                  dtype=torch.long)
        zh_tensor_ = torch.tensor([zh_vocab[token] for token in zh_tokenizer.encode(raw_en.rstrip("\n"), out_type=str)],
                                  dtype=torch.long)
        data.append((ja_tensor_, zh_tensor_))
    return data

train_data = data_process(trainja, trainzh)


In [45]:
# バッチサイズと特殊トークンのID定義
BATCH_SIZE = 8

PAD_IDX = ja_vocab['<pad>']
BOS_IDX = ja_vocab['<bos>']
EOS_IDX = ja_vocab['<eos>']
# バッチデータの生成関数（パディング処理含む）
def generate_batch(data_batch):
    ja_batch, zh_batch = [], []
    for (ja_item, zh_item) in data_batch:
        # 文頭と文末にトークンを追加
        ja_batch.append(torch.cat([torch.tensor([BOS_IDX]), ja_item, torch.tensor([EOS_IDX])], dim=0))
        zh_batch.append(torch.cat([torch.tensor([BOS_IDX]), zh_item, torch.tensor([EOS_IDX])], dim=0))
        # パディングを行って長さを揃える
    ja_batch = pad_sequence(ja_batch, padding_value=PAD_IDX)
    zh_batch = pad_sequence(zh_batch, padding_value=PAD_IDX)
    return ja_batch, zh_batch
# データローダーの作成
train_iter = DataLoader(train_data, batch_size=BATCH_SIZE,
                        shuffle=True, collate_fn=generate_batch)

In [46]:
# Transformerモデルの定義
from torch.nn import (TransformerEncoder, TransformerDecoder,
                      TransformerEncoderLayer, TransformerDecoderLayer)

class Seq2SeqTransformer(nn.Module):
    def __init__(self, num_encoder_layers: int, num_decoder_layers: int,
                 emb_size: int, src_vocab_size: int, tgt_vocab_size: int,
                 dim_feedforward:int = 512, dropout:float = 0.1):
        super(Seq2SeqTransformer, self).__init__()
        # エンコーダ層
        encoder_layer = TransformerEncoderLayer(d_model=emb_size, nhead=NHEAD,
                                                dim_feedforward=dim_feedforward)
        self.transformer_encoder = TransformerEncoder(encoder_layer, num_layers=num_encoder_layers)
        # デコーダ層
        decoder_layer = TransformerDecoderLayer(d_model=emb_size, nhead=NHEAD,
                                                dim_feedforward=dim_feedforward)
        self.transformer_decoder = TransformerDecoder(decoder_layer, num_layers=num_decoder_layers)
        # 出力層
        self.generator = nn.Linear(emb_size, tgt_vocab_size)
        # 埋め込み層（Embedding）
        self.src_tok_emb = TokenEmbedding(src_vocab_size, emb_size)
        self.tgt_tok_emb = TokenEmbedding(tgt_vocab_size, emb_size)
        # 位置エンコーディング
        self.positional_encoding = PositionalEncoding(emb_size, dropout=dropout)

    def forward(self, src: Tensor, trg: Tensor, src_mask: Tensor,
                tgt_mask: Tensor, src_padding_mask: Tensor,
                tgt_padding_mask: Tensor, memory_key_padding_mask: Tensor):
        # 順伝播処理
        src_emb = self.positional_encoding(self.src_tok_emb(src))
        tgt_emb = self.positional_encoding(self.tgt_tok_emb(trg))
        memory = self.transformer_encoder(src_emb, src_mask, src_padding_mask)
        outs = self.transformer_decoder(tgt_emb, memory, tgt_mask, None,
                                        tgt_padding_mask, memory_key_padding_mask)
        return self.generator(outs)

    def encode(self, src: Tensor, src_mask: Tensor):
        return self.transformer_encoder(self.positional_encoding(
                            self.src_tok_emb(src)), src_mask)

    def decode(self, tgt: Tensor, memory: Tensor, tgt_mask: Tensor):
        return self.transformer_decoder(self.positional_encoding(
                          self.tgt_tok_emb(tgt)), memory,
                          tgt_mask)

In [47]:
# 位置エンコーディング（Positional Encoding）クラス
# Transformerは順序情報を持たないため、位置情報を加算する
class PositionalEncoding(nn.Module):
    def __init__(self, emb_size: int, dropout, maxlen: int = 5000):
        super(PositionalEncoding, self).__init__()
        den = torch.exp(- torch.arange(0, emb_size, 2) * math.log(10000) / emb_size)
        pos = torch.arange(0, maxlen).reshape(maxlen, 1)
        pos_embedding = torch.zeros((maxlen, emb_size))
        pos_embedding[:, 0::2] = torch.sin(pos * den)
        pos_embedding[:, 1::2] = torch.cos(pos * den)
        pos_embedding = pos_embedding.unsqueeze(-2)

        self.dropout = nn.Dropout(dropout)
        self.register_buffer('pos_embedding', pos_embedding)

    def forward(self, token_embedding: Tensor):
        return self.dropout(token_embedding +
                            self.pos_embedding[:token_embedding.size(0),:])
# トークン埋め込みクラス
class TokenEmbedding(nn.Module):
    def __init__(self, vocab_size: int, emb_size):
        super(TokenEmbedding, self).__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.emb_size = emb_size

    def forward(self, tokens: Tensor):
        return self.embedding(tokens.long()) * math.sqrt(self.emb_size)

In [48]:
# マスク生成関数
# デコーダが未来の単語を見ないようにするためのマスク（Look-ahead Mask）などを作成
def generate_square_subsequent_mask(sz):

    mask = (torch.triu(torch.ones((sz, sz), device=device)) == 1).transpose(0, 1)
    mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
    return mask

def create_mask(src, tgt):
    src_seq_len = src.shape[0]
    tgt_seq_len = tgt.shape[0]

    tgt_mask = generate_square_subsequent_mask(tgt_seq_len)
    src_mask = torch.zeros((src_seq_len, src_seq_len), device=device).type(torch.bool)

    src_padding_mask = (src == PAD_IDX).transpose(0, 1)

    tgt_padding_mask = (tgt == PAD_IDX).transpose(0, 1)
    
    return src_mask, tgt_mask, src_padding_mask, tgt_padding_mask

In [49]:
# ハイパーパラメータの設定
SRC_VOCAB_SIZE = len(ja_vocab)
TGT_VOCAB_SIZE = len(zh_vocab)
EMB_SIZE = 512
NHEAD = 8
FFN_HID_DIM = 512
BATCH_SIZE = 16
NUM_ENCODER_LAYERS = 3
NUM_DECODER_LAYERS = 3
NUM_EPOCHS = 16
# モデルのインスタンス化
transformer = Seq2SeqTransformer(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS,
                                 EMB_SIZE, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE,
                                 FFN_HID_DIM)
# パラメータの初期化（Xavier Initialization）
for p in transformer.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)

transformer = transformer.to(device)
# 損失関数の定義（パディング部分は無視する）
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=PAD_IDX)
# 最適化アルゴリズム（Adam）の設定
optimizer = torch.optim.Adam(
    transformer.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9
)
# 1エポック分の学習関数
def train_epoch(model, train_iter, optimizer):
    model.train()
    losses = 0
    for idx, (src, tgt) in enumerate(train_iter):
        src = src.to(device)
        tgt = tgt.to(device)

        tgt_input = tgt[:-1, :]

        src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)

        logits = model(src, tgt_input, src_mask, tgt_mask,
                       src_padding_mask, tgt_padding_mask, src_padding_mask)

        optimizer.zero_grad()

        tgt_out = tgt[1:, :]
        loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))
        loss.backward()

        optimizer.step()
        losses += loss.item()
    return losses / len(train_iter)

# 検証関数
def evaluate(model, val_iter):
    model.eval()
    losses = 0
    for idx, (src, tgt) in enumerate(val_iter):
        src = src.to(device)
        tgt = tgt.to(device)

        tgt_input = tgt[:-1, :]

        src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)

        logits = model(src, tgt_input, src_mask, tgt_mask,
                       src_padding_mask, tgt_padding_mask, src_padding_mask)
        
        tgt_out = tgt[1:, :]
        loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))
        losses += loss.item()
    return losses / len(val_iter)



In [51]:
# 学習の実行ループ
for epoch in tqdm.tqdm(range(1, NUM_EPOCHS + 1)):
    start_time = time.time()  
    train_loss = train_epoch(transformer, train_iter, optimizer)
    end_time = time.time()  
    print((f"Epoch: {epoch}, Train loss: {train_loss:.3f}, "
           f"Epoch time = {(end_time - start_time):.3f}s"))

  6%|▋         | 1/16 [03:50<57:32, 230.19s/it]

Epoch: 1, Train loss: 4.498, Epoch time = 230.192s


 12%|█▎        | 2/16 [07:42<54:03, 231.66s/it]

Epoch: 2, Train loss: 3.510, Epoch time = 232.688s


 19%|█▉        | 3/16 [11:45<51:17, 236.71s/it]

Epoch: 3, Train loss: 3.096, Epoch time = 242.716s


 25%|██▌       | 4/16 [15:44<47:29, 237.48s/it]

Epoch: 4, Train loss: 2.796, Epoch time = 238.662s


 31%|███▏      | 5/16 [19:40<43:27, 237.07s/it]

Epoch: 5, Train loss: 2.574, Epoch time = 236.348s


 38%|███▊      | 6/16 [24:06<41:08, 246.83s/it]

Epoch: 6, Train loss: 2.408, Epoch time = 265.779s


 44%|████▍     | 7/16 [28:06<36:40, 244.52s/it]

Epoch: 7, Train loss: 2.293, Epoch time = 239.749s


 50%|█████     | 8/16 [32:09<32:32, 244.04s/it]

Epoch: 8, Train loss: 2.200, Epoch time = 243.011s


 56%|█████▋    | 9/16 [36:05<28:10, 241.55s/it]

Epoch: 9, Train loss: 2.116, Epoch time = 236.068s


 62%|██████▎   | 10/16 [39:59<23:55, 239.27s/it]

Epoch: 10, Train loss: 2.041, Epoch time = 234.184s


 69%|██████▉   | 11/16 [43:57<19:55, 239.00s/it]

Epoch: 11, Train loss: 1.979, Epoch time = 238.383s


 75%|███████▌  | 12/16 [47:47<15:45, 236.30s/it]

Epoch: 12, Train loss: 1.926, Epoch time = 230.122s


 81%|████████▏ | 13/16 [51:45<11:50, 236.69s/it]

Epoch: 13, Train loss: 1.878, Epoch time = 237.577s


 88%|████████▊ | 14/16 [55:41<07:52, 236.42s/it]

Epoch: 14, Train loss: 1.836, Epoch time = 235.804s


 94%|█████████▍| 15/16 [59:59<04:03, 243.03s/it]

Epoch: 15, Train loss: 1.797, Epoch time = 258.346s


100%|██████████| 16/16 [1:04:21<00:00, 241.35s/it]

Epoch: 16, Train loss: 1.763, Epoch time = 262.011s





In [52]:
# 貪欲法によるデコード（翻訳文生成）関数
def greedy_decode(model, src, src_mask, max_len, start_symbol):
    src = src.to(device)
    src_mask = src_mask.to(device)
    memory = model.encode(src, src_mask)
    ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).to(device)
    for i in range(max_len-1):
        memory = memory.to(device)
        memory_mask = torch.zeros(ys.shape[0], memory.shape[0]).to(device).type(torch.bool)
        tgt_mask = (generate_square_subsequent_mask(ys.size(0))
                                    .type(torch.bool)).to(device)
        out = model.decode(ys, memory, tgt_mask)
        out = out.transpose(0, 1)
        prob = model.generator(out[:, -1])
        _, next_word = torch.max(prob, dim = 1)
        next_word = next_word.item()
        ys = torch.cat([ys,
                        torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0)
        if next_word == EOS_IDX:
            break
    return ys
# 翻訳実行のラッパー関数
def translate(model, src, src_vocab, tgt_vocab, src_tokenizer):
    model.eval()  
    tokens = [BOS_IDX] + [src_vocab.stoi[tok] for tok in src_tokenizer.encode(src, out_type=str)] + [EOS_IDX]
    num_tokens = len(tokens)
    src = (torch.LongTensor(tokens).reshape(num_tokens, 1))
    src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool)
    tgt_tokens = greedy_decode(model, src, src_mask, max_len=num_tokens + 5, start_symbol=BOS_IDX).flatten()
    return " ".join([tgt_vocab.itos[tok] for tok in tgt_tokens]).replace("<bos>", "").replace("<eos>", "")


In [101]:
# テストデータの抽出（全体の1%を使用）
TEST_RATIO = 0.01 
NUM_SAMPLES = 100 
import random
random.seed(42) 
test_data = random.sample(train_data, min(NUM_SAMPLES, int(len(train_data) * TEST_RATIO)))

test_iter = DataLoader(test_data, batch_size=1,
                       shuffle=False, collate_fn=generate_batch)

# BLEUスコア計算関数（機械翻訳の評価指標）
def calculate_bleu(pred_tokens, target_tokens):
    pred_set = set(pred_tokens)
    target_set = set(target_tokens)
    correct = sum(1 for token in pred_tokens if token in target_set)
    precision = correct / len(pred_tokens) if len(pred_tokens) > 0 else 0
    bleu = precision * math.exp(min(0, 1 - len(target_tokens) / len(pred_tokens))) if precision > 0 else 0
    return bleu


In [104]:
# 学習済みモデルの評価関数
def evaluate_trained_model(model, test_loader, num_samples=None):
    model.eval()
    total_bleu = 0
    total_exact_match = 0
    sample_count = 0
    
    examples = []
    
    with torch.no_grad():
        for idx, (src, tgt) in enumerate(test_loader):
            if num_samples and idx >= num_samples:
                break
                
            ja_sentence = ' '.join([ja_vocab.itos[token.item()] for token in src.squeeze(1) 
                                  if token.item() not in [BOS_IDX, EOS_IDX, PAD_IDX]])
            
            target_zh = [zh_vocab.itos[token.item()] for token in tgt.squeeze(1) 
                        if token.item() not in [BOS_IDX, EOS_IDX, PAD_IDX]]
            target_zh_str = ''.join(target_zh)  
            
            src_mask = (torch.zeros(src.shape[0], src.shape[0])).type(torch.bool).to(device)
            translated = greedy_decode(model, src, src_mask, max_len=50, start_symbol=BOS_IDX).flatten()
            
            pred_zh = [zh_vocab.itos[token.item()] for token in translated 
                      if token.item() not in [BOS_IDX, EOS_IDX, PAD_IDX]]
            pred_zh_str = ''.join(pred_zh)  
            
            bleu, precision, recall = calculate_bleu(pred_zh, target_zh)
            
            exact_match = int(pred_zh_str == target_zh_str)
            
            total_bleu += bleu
            total_exact_match += exact_match
            sample_count += 1
            
            if idx < 5:
                # 以下のキーは出力結果の各項目に対応しています
                examples.append({
                    "日语": ja_sentence,# 日本語原文
                    "预测中文": pred_zh_str,# 予測された中国語訳
                    "真实中文": target_zh_str,# 正解の中国語訳
                    "完全匹配": "是" if exact_match else "否",# 完全一致したか（是=はい, 否=いいえ）
                    "BLEU分数": f"{bleu:.4f}"# BLEUスコア
                })
        
    avg_bleu = total_bleu / sample_count if sample_count > 0 else 0
    exact_match_rate = total_exact_match / sample_count if sample_count > 0 else 0
    # モデル評価結果の出力
    print("\n模型性能评估:")# 日本語訳：モデル性能評価
    print(f"测试句子数: {sample_count}")# 日本語訳：テストした文の数
    print(f"平均BLEU分数: {avg_bleu:.4f}")# 日本語訳：平均BLEUスコア
    
    return avg_bleu, exact_match_rate

In [105]:
print("\n开始评估训练好的模型...")# 日本語訳：学習済みモデルの評価を開始します...
avg_bleu, exact_match_rate = evaluate_trained_model(transformer, test_iter)


开始评估训练好的模型...

模型性能评估:
测试句子数: 100
平均BLEU分数: 0.5414


In [4]:
translated_sentence = translate(
    transformer, 
    "日本のコンビニのサービスには 本当に感心させられる。",  
    ja_vocab, 
    zh_vocab, 
    ja_tokenizer  
)

print(translated_sentence)

_日本 便利店 的 服务 真 的 令 人 佩服
