In [1]:
from datasets import load_dataset, load_metric
from transformers import MarianMTModel, MarianTokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments
from translate.storage.tmx import tmxfile
import pandas as pd
import os
from tqdm import tqdm
rawdata_path = '/Users/alexlo/Desktop/Project/Chai_Trans/rawdata/trados_tmx/'
workdata_path = '/Users/alexlo/Desktop/Project/Chai_Trans/workdata/'
logdata_path = '/Users/alexlo/Desktop/Project/Chai_Trans/logdata'
os.chdir(rawdata_path)

  from .autonotebook import tqdm as notebook_tqdm


# Data Wrangling

In [56]:
def tmx_to_df(path: str, from_: str, to_:str) -> pd.DataFrame:
    with open(path, 'rb') as fin:
        tmx_file = tmxfile(fin, from_, to_)
    data = []
    for node in tmx_file.unit_iter():
        data.append([node.source, node.target])
    df = pd.DataFrame(data, columns=[from_, to_])
    return df

In [57]:
file1 = 'CH(Simplified)-EN.tmx'
file2 = 'CH(Simplified)-EN_who.tmx'
df1 = tmx_to_df(file1, 'zh', 'en')
df2 = tmx_to_df(file2, 'zh', 'en')
df = pd.concat([df1, df2], axis=0).reset_index(drop=True)

In [58]:
print(df.shape)
df.head()

(48894, 2)


Unnamed: 0,zh,en
0,焦距調節機構,FOCAL LENGTH ADJUSTMENT MECHANISM
1,一種焦距調節機構，適於調節位於一投影機的一殼體內的一焦距調節件。,"A focal length adjustment mechanism, adapted f..."
2,焦距調節機構包括一旋鈕及一第一行程調整件。,The focal length adjustment mechanism includes...
3,旋鈕局部地外露於殼體。,The knob is partially exposed to the housing.
4,旋鈕轉動，帶動第一行程調整件以第一樞軸為中心轉動。,The knob rotates to drive the first stroke adj...


In [59]:
os.chdir(workdata_path)
df.to_json('tmx_zh_en.json', orient='records', lines=True)

# Load the dataset in Hugging Face

In [2]:
os.chdir(workdata_path)
raw_datasets = load_dataset('json', data_files='tmx_zh_en.json') 

Generating train split: 24447 examples [00:00, 215962.99 examples/s]


In [7]:
# 試著只用100行來訓練
raw_datasets['train'] = raw_datasets['train'].select(range(100, 200))


In [8]:
raw_datasets['train'][0:10]

{'zh': ['此外，在本實施例中，阻尼組件140還設置於第二行程調整件130。',
  '具體地說，阻尼組件140的抵壓件142(左方的抵壓件142)設置於第二樞軸135上以對第二樞軸135向下施壓。',
  '如此，第一端部121的位置便能夠被保持。',
  '因此，第一行程調整件120在第一樞軸127處便會受到抵壓件142的下壓力所形成的摩擦力，而不會隨意轉動。',
  '因此，第二行程調整件130在第二樞軸135處便會受到抵壓件142的下壓力所形成的摩擦力而不會隨意轉動，第四端部133的位置便能夠被保持。',
  '因此，旋鈕110可連帶地保持位置，待使用者施力轉動旋鈕110，克服上述摩擦力，而可使第二行程調整件130與第一行程調整件120轉動。',
  '由圖1可見，阻尼組件140還可選擇地包括一緩衝件144。',
  '緩衝件144設置於抵壓件142與第一樞軸127之間，緩衝件144受抵壓件142擠壓變形。',
  '這樣的設計可使得抵壓件142透過緩衝件144來調整下壓力，以免第一樞軸127處的摩擦力過大。',
  '當然，組裝者也可以透過調整抵壓件142在沿著軸向A的螺接位置，來提供不同的下壓力。'],
 'en': ['In addition, in this embodiment, the 140 is also set on the 130.',
  'Specifically, the 142 of the 140 (the 142 on the left) is set on the 135 to puts downward pressure on the 135.',
  'As a result, the position of the 121 can be remained.',
  'Therefore, the 120 on the 127 is subjected to the friction formed by the downward pressure of the 142, and will not rotate freely.',
  'Therefore, the 130 on the 135 is subjected to the friction formed by the downward pressure 

In [9]:
split_datasets = raw_datasets["train"].train_test_split(train_size=0.9, seed=20)
split_datasets["validation"] = split_datasets.pop("test")
split_datasets

DatasetDict({
    train: Dataset({
        features: ['zh', 'en'],
        num_rows: 90
    })
    validation: Dataset({
        features: ['zh', 'en'],
        num_rows: 10
    })
})

In [10]:
split_datasets["train"][1]

{'zh': '在一實施例中，磁吸件146、147也可以是其中一者是磁鐵，另一者是可被磁鐵吸引的金屬。',
 'en': 'In an embodiment, one of the magnetic attraction members 146 and 147 can also be a magnet, while the other one is a metal that can be attracted by the magnet.'}

In [11]:
from transformers import pipeline

model_checkpoint = "charliealex123/marian-finetuned-kde4-zh-to-en"
translator = pipeline("translation", model=model_checkpoint)
translator("第二組線路結構適合連接另一組接合器。")

[{'translation_text': 'The second line structure is suitable to connect to another group of connectors.'}]

In [12]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt")

In [13]:
split_datasets["train"][1]["en"]

'In an embodiment, one of the magnetic attraction members 146 and 147 can also be a magnet, while the other one is a metal that can be attracted by the magnet.'

In [14]:
zh_sentence = split_datasets["train"][1]["zh"]
en_sentence = split_datasets["train"][1]["en"]

inputs = tokenizer(zh_sentence)
with tokenizer.as_target_tokenizer():
    targets = tokenizer(en_sentence)



In [15]:
max_input_length = 128
max_target_length = 128

def preprocess_function(examples):
    inputs = [ex for ex in examples["zh"]]
    targets = [ex for ex in examples["en"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # Set up the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [16]:
tokenized_datasets = split_datasets.map(
    preprocess_function,
    batched=True,
    remove_columns=split_datasets["train"].column_names,
)

In [17]:
from transformers import AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [18]:
batch = data_collator([tokenized_datasets["train"][i] for i in range(1, 3)])
batch.keys()

dict_keys(['input_ids', 'attention_mask', 'labels', 'decoder_input_ids'])

In [19]:
batch["labels"]

tensor([[   66,    57, 61273,     2,   139,     4,     3, 49328, 49546,   298,
         33709,     6, 33154,   122,    79,    32,    12, 51279,     2,   599,
             3,    85,   139,    30,    12, 21202,    19,   122,    32, 27457,
            29,     3, 51279,     5,     0,  -100,  -100],
        [ 5063,     2,     3, 21581,    18,     3, 31103,    30,  6295,     8,
             3, 55047,  7327,    29,     3, 32220,  5988,     4,     3, 34626,
             2,     6,    73,    54, 48118,  9335,     2,     3,  1445,     4,
             3, 30294,   122,    32,  2912,     5,     0]])

In [20]:
for i in range(1, 3):
    print(tokenized_datasets["train"][i]["labels"])

[66, 57, 61273, 2, 139, 4, 3, 49328, 49546, 298, 33709, 6, 33154, 122, 79, 32, 12, 51279, 2, 599, 3, 85, 139, 30, 12, 21202, 19, 122, 32, 27457, 29, 3, 51279, 5, 0]
[5063, 2, 3, 21581, 18, 3, 31103, 30, 6295, 8, 3, 55047, 7327, 29, 3, 32220, 5988, 4, 3, 34626, 2, 6, 73, 54, 48118, 9335, 2, 3, 1445, 4, 3, 30294, 122, 32, 2912, 5, 0]


In [21]:
from datasets import load_metric

metric = load_metric("sacrebleu")

  metric = load_metric("sacrebleu")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [15]:
predictions = [
    "This plugin lets you translate web pages between several languages automatically."
]
references = [
    [
        "This plugin allows you to automatically translate web pages between several languages."
    ]
]
metric.compute(predictions=predictions, references=references)

{'score': 46.750469682990165,
 'counts': [11, 6, 4, 3],
 'totals': [12, 11, 10, 9],
 'precisions': [91.66666666666667,
  54.54545454545455,
  40.0,
  33.333333333333336],
 'bp': 0.9200444146293233,
 'sys_len': 12,
 'ref_len': 13}

In [16]:
predictions = ["This This This This"]
references = [
    [
        "This plugin allows you to automatically translate web pages between several languages."
    ]
]
metric.compute(predictions=predictions, references=references)

{'score': 1.683602693167689,
 'counts': [1, 0, 0, 0],
 'totals': [4, 3, 2, 1],
 'precisions': [25.0, 16.666666666666668, 12.5, 12.5],
 'bp': 0.10539922456186433,
 'sys_len': 4,
 'ref_len': 13}

In [22]:
import numpy as np


def compute_metrics(eval_preds):
    preds, labels = eval_preds
    # In case the model returns more than the prediction logits
    if isinstance(preds, tuple):
        preds = preds[0]

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    # Replace -100s in the labels as we can't decode them
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [[label.strip()] for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    return {"bleu": result["score"]}

In [23]:
from transformers import Seq2SeqTrainingArguments

args = Seq2SeqTrainingArguments(
    f"marian-finetuned-kde4-zh-to-en",
    evaluation_strategy="no",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
    fp16=False,
    push_to_hub=True,
)

In [24]:
from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

#Logging
1. 100 句話前後評估都差不多 15s
2. 訓練要 9m14s(6min跑循環最後3min不知道衝啥小)

In [20]:
trainer.evaluate(max_length=max_target_length)

100%|██████████| 1/1 [00:00<00:00, 13.31it/s]


{'eval_loss': 2.314296007156372,
 'eval_bleu': 28.452433575353787,
 'eval_runtime': 14.5938,
 'eval_samples_per_second': 0.685,
 'eval_steps_per_second': 0.069}

In [21]:
trainer.train()

  0%|          | 0/6 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}
100%|██████████| 6/6 [05:56<00:00, 59.46s/it]


{'train_runtime': 356.7753, 'train_samples_per_second': 0.757, 'train_steps_per_second': 0.017, 'train_loss': 1.84809414545695, 'epoch': 3.0}


TrainOutput(global_step=6, training_loss=1.84809414545695, metrics={'train_runtime': 356.7753, 'train_samples_per_second': 0.757, 'train_steps_per_second': 0.017, 'train_loss': 1.84809414545695, 'epoch': 3.0})

In [22]:
trainer.evaluate(max_length=max_target_length)

100%|██████████| 1/1 [00:00<00:00,  5.15it/s]


{'eval_loss': 1.182519555091858,
 'eval_bleu': 46.86788757848948,
 'eval_runtime': 17.5937,
 'eval_samples_per_second': 0.568,
 'eval_steps_per_second': 0.057,
 'epoch': 3.0}

In [23]:
trainer.push_to_hub(tags="translation", commit_message="Training complete")

Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}
model.safetensors: 100%|██████████| 310M/310M [04:16<00:00, 1.21MB/s]   


CommitInfo(commit_url='https://huggingface.co/charliealex123/marian-finetuned-kde4-zh-to-en/commit/b395ea3d3c31553cc64dd4d210c8a41f2ce6bbdc', commit_message='Training complete', commit_description='', oid='b395ea3d3c31553cc64dd4d210c8a41f2ce6bbdc', pr_url=None, pr_revision=None, pr_num=None)

# 完整循環

In [25]:
from torch.utils.data import DataLoader

tokenized_datasets.set_format("torch")
train_dataloader = DataLoader(
    tokenized_datasets["train"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=8,
)
eval_dataloader = DataLoader(
    tokenized_datasets["validation"], collate_fn=data_collator, batch_size=8
)

In [26]:
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

In [27]:
from transformers import AdamW

optimizer = AdamW(model.parameters(), lr=2e-5)



In [28]:
from accelerate import Accelerator

accelerator = Accelerator()
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

In [29]:
from transformers import get_scheduler

num_train_epochs = 3
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [30]:
from huggingface_hub import Repository, get_full_repo_name

model_name = "marian-finetuned-kde4-zh-to-en"
repo_name = get_full_repo_name(model_name)
repo_name

'charliealex123/marian-finetuned-kde4-zh-to-en'

In [4]:
output_dir = "/Users/alexlo/Desktop/Project/Chai_Trans/marian-finetuned-kde4-zh-to-en-local"
repo = Repository(output_dir, clone_from=repo_name)

For more details, please read https://huggingface.co/docs/huggingface_hub/concepts/git_vs_http.
Cloning https://huggingface.co/charliealex123/marian-finetuned-kde4-zh-to-en into local empty directory.
Download file model.safetensors:   0%|          | 17.5k/296M [00:00<?, ?B/s]
Download file model.safetensors: 100%|█████████▉| 294M/296M [02:55<00:00, 1.45MB/s] 

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

Download file model.safetensors: 100%|██████████| 296M/296M [03:10<00:00, 1.45MB/s]

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A


In [31]:
def postprocess(predictions, labels):
    predictions = predictions.cpu().numpy()
    labels = labels.cpu().numpy()

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)

    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [[label.strip()] for label in decoded_labels]
    return decoded_preds, decoded_labels

In [24]:
from huggingface_hub import HfApi
from getpass import getpass

# 填寫用戶名和密碼
# username = input("Enter your username: ")
# password = getpass("Enter your password: ")
model_path = "/Users/alexlo/Desktop/Project/Chai_Trans/marian-finetuned-kde4-zh-to-en"  # 模型文件的路徑

# 認證並創建API
api = HfApi()


In [32]:
from tqdm.auto import tqdm
import torch
import numpy as np

progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_train_epochs):
    # Training
    model.train()
    for batch in train_dataloader:
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # Evaluation
    model.eval()
    for batch in tqdm(eval_dataloader):
        with torch.no_grad():
            generated_tokens = accelerator.unwrap_model(model).generate(
                batch["input_ids"],
                attention_mask=batch["attention_mask"],
                max_length=128,
            )
        labels = batch["labels"]

        # Necessary to pad predictions and labels for being gathered
        generated_tokens = accelerator.pad_across_processes(
            generated_tokens, dim=1, pad_index=tokenizer.pad_token_id
        )
        labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)

        predictions_gathered = accelerator.gather(generated_tokens)
        labels_gathered = accelerator.gather(labels)

        decoded_preds, decoded_labels = postprocess(predictions_gathered, labels_gathered)
        metric.add_batch(predictions=decoded_preds, references=decoded_labels)

    results = metric.compute()
    print(f"epoch {epoch}, BLEU score: {results['score']:.2f}")

    # Save and upload
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
    if accelerator.is_main_process:
        tokenizer.save_pretrained(output_dir)
        # api.upload_folder(token="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",
        #         folder_path='/Users/alexlo/Desktop/Project/Chai_Trans/marian-finetuned-kde4-zh-to-en',
        #         repo_id=repo_name,
        #         commit_message=f"Training in progress epoch {epoch}", 
        #         )
        repo.push_to_hub(
            commit_message=f"Training in progress epoch {epoch}", blocking=False
        )

100%|██████████| 2/2 [00:15<00:00,  7.52s/it]t]
Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}


epoch 0, BLEU score: 40.27


100%|██████████| 2/2 [00:14<00:00,  7.15s/it]t]
Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}


epoch 1, BLEU score: 49.34


Several commits (2) will be pushed upstream.
100%|██████████| 2/2 [00:14<00:00,  7.32s/it]t]
Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}


epoch 2, BLEU score: 48.21


Several commits (3) will be pushed upstream.


In [None]:
9:55

In [33]:
model

MarianMTModel(
  (model): MarianModel(
    (shared): Embedding(65001, 512, padding_idx=65000)
    (encoder): MarianEncoder(
      (embed_tokens): Embedding(65001, 512, padding_idx=65000)
      (embed_positions): MarianSinusoidalPositionalEmbedding(512, 512)
      (layers): ModuleList(
        (0-5): 6 x MarianEncoderLayer(
          (self_attn): MarianAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation_fn): SiLU()
          (fc1): Linear(in_features=512, out_features=2048, bias=True)
          (fc2): Linear(in_features=2048, out_features=512, bias=True)
          (final_layer_norm): LayerNorm((512,), eps=1e-05

In [None]:
api.upload_folder(token="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",
        folder_path='/Users/alexlo/Desktop/Project/Chai_Trans/marian-finetuned-kde4-zh-to-en',
        repo_id=repo_name,
        commit_message=f"Training in progress epoch {epoch}", 
        )

In [None]:
repo.push_to_hub(
    commit_message=f"Training in progress epoch {epoch}", blocking=False
)

In [29]:
api.upload_file(token="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",
        repo_id=repo_name,
        commit_message=f"Training in progress epoch {epoch}", 
        )

TypeError: HfApi.upload_file() missing 2 required keyword-only arguments: 'path_or_fileobj' and 'path_in_repo'

# 使用模型

In [2]:
os.chdir(workdata_path)
raw_datasets = load_dataset('json', data_files='tmx_zh_en.json') 

In [3]:
from transformers import pipeline

model_checkpoint = "charliealex123/marian-finetuned-kde4-zh-to-en"
translator = pipeline("translation", model=model_checkpoint)

model.safetensors: 100%|██████████| 310M/310M [04:16<00:00, 1.21MB/s] 
special_tokens_map.json: 100%|██████████| 416/416 [00:00<00:00, 537kB/s]


In [8]:
raw_datasets['train'].num_rows

48894

In [7]:
# 訓練前0~10
for i in range(1000, 1005):
    # i = 3
    zh_sentence = raw_datasets['train']['zh'][i]
    print('需翻譯句:', zh_sentence)
    print('原翻譯:', raw_datasets['train']['en'][i])
    print('機器學習翻譯:', translator(zh_sentence)[0]['translation_text'])
    print()

需翻譯句: 如請求項1所述的電路板模組，其中該至少一凹槽包括兩凹槽，該些攝像元件分別設置於該兩凹槽內。
原翻譯: The circuit board module according to claim 1, wherein the at least one groove has two grooves, and the two camera elements are respectively disposed in the two grooves.
機器學習翻譯: A circuit board module, as described in request 1, in which at least one groove includes two grooves, each of which is set separately in the two grooves.

需翻譯句: 在本發明的一實施例中，上述的電路板本體在兩凹槽之間包括一無兩元件導通走線區段，兩攝像元件透過該接電路板彼此電性連接，而無法透過該兩元件導通走線區段電性連接。
原翻譯: In an embodiment of the disclosure, the circuit board body includes a conducting wiring section without two elements between the two grooves. The two camera elements are electrically connected to each other through the circuit board, but not electronically connecting to each other through the conducting wiring section without two elements.
機器學習翻譯: In an illustration of the disclosure, the above-mentioned circuit board body includes a no-two-component steering line segment between the two grooves, the two camera ele

In [5]:
# 拿100句話訓練後
for i in range(0, 10):
    # i = 3
    zh_sentence = raw_datasets['train']['zh'][i]
    print('需翻譯句:', zh_sentence)
    print('原翻譯:', raw_datasets['train']['en'][i])
    print('機器學習翻譯:', translator(zh_sentence)[0]['translation_text'])
    print()

需翻譯句: 焦距調節機構
原翻譯: FOCAL LENGTH ADJUSTMENT MECHANISM
機器學習翻譯: Focal length adjustment mechanism

需翻譯句: 一種焦距調節機構，適於調節位於一投影機的一殼體內的一焦距調節件。
原翻譯: A focal length adjustment mechanism, adapted for adjusting a focal length adjustment device located in a housing of a projector.
機器學習翻譯: A focal length adjustment mechanism, suitable for adjusting a focal length adjustment device located in a shell of a projector.

需翻譯句: 焦距調節機構包括一旋鈕及一第一行程調整件。
原翻譯: The focal length adjustment mechanism includes a knob and a first stroke adjustment member.
機器學習翻譯: The focal length adjustment mechanism includes a knob and a first process adjustment member.

需翻譯句: 旋鈕局部地外露於殼體。
原翻譯: The knob is partially exposed to the housing.
機器學習翻譯: The knob is partially exposed to the shell.

需翻譯句: 旋鈕轉動，帶動第一行程調整件以第一樞軸為中心轉動。
原翻譯: The knob rotates to drive the first stroke adjustment member to rotate along the first pivot.
機器學習翻譯: The knob rotates, with the first movement adjustment member rotates along the first pivot.

需翻譯句: Descripti

In [49]:
# 訓練前1000~1010
for i in range(1000, 1010):
    # i = 3
    zh_sentence = raw_datasets['train']['zh'][i]
    print('需翻譯句:', zh_sentence)
    print('原翻譯:', raw_datasets['train']['en'][i])
    print('機器學習翻譯:', translator(zh_sentence)[0]['translation_text'])
    print()

需翻譯句: 如請求項1所述的電路板模組，其中該至少一凹槽包括兩凹槽，該些攝像元件分別設置於該兩凹槽內。
原翻譯: The circuit board module according to claim 1, wherein the at least one groove has two grooves, and the two camera elements are respectively disposed in the two grooves.
機器學習翻譯: If requested, the circuit board module described in item 1 should include at least two diagonals in at least one groove in which the camera element is separated into two diagonal slots.

需翻譯句: 在本發明的一實施例中，上述的電路板本體在兩凹槽之間包括一無兩元件導通走線區段，兩攝像元件透過該接電路板彼此電性連接，而無法透過該兩元件導通走線區段電性連接。
原翻譯: In an embodiment of the disclosure, the circuit board body includes a conducting wiring section without two elements between the two grooves. The two camera elements are electrically connected to each other through the circuit board, but not electronically connecting to each other through the conducting wiring section without two elements.
機器學習翻譯: In the case of this invention, the above-mentioned circuit board body consists of a no-two-part line link between the two dents, which is 

In [51]:
# 訓練後1000~1010
for i in range(1000, 1010):
    # i = 3
    zh_sentence = raw_datasets['train']['zh'][i]
    print('需翻譯句:', zh_sentence)
    print('原翻譯:', raw_datasets['train']['en'][i])
    print('機器學習翻譯:', translator(zh_sentence)[0]['translation_text'])
    print()


需翻譯句: 如請求項1所述的電路板模組，其中該至少一凹槽包括兩凹槽，該些攝像元件分別設置於該兩凹槽內。
原翻譯: The circuit board module according to claim 1, wherein the at least one groove has two grooves, and the two camera elements are respectively disposed in the two grooves.
機器學習翻譯: If requested, the circuit board module described in item 1 should include at least two diagonals in at least one groove in which the camera element is separated into two diagonal slots.

需翻譯句: 在本發明的一實施例中，上述的電路板本體在兩凹槽之間包括一無兩元件導通走線區段，兩攝像元件透過該接電路板彼此電性連接，而無法透過該兩元件導通走線區段電性連接。
原翻譯: In an embodiment of the disclosure, the circuit board body includes a conducting wiring section without two elements between the two grooves. The two camera elements are electrically connected to each other through the circuit board, but not electronically connecting to each other through the conducting wiring section without two elements.
機器學習翻譯: In the case of this invention, the above-mentioned circuit board body consists of a no-two-part line link between the two dents, which is 