In [33]:
# Job settings: the sub-folder that holds the document, the Word file to
# translate, and the Hugging Face checkpoint loaded by the translation pipeline.
directory = "Untrans"
file_name = "test.docx"
model_checkpoint = 'charliealex123/marian-finetuned-kde4-zh-to-en'

In [34]:
# 讀取套件與函數
from t0_config import rawdata_path, workdata_path
from transformers import pipeline
from docx import Document
import os
from tqdm import tqdm, trange
import re

# Per-job input and output folders: "<base path>/<directory>/" (trailing slash kept).
rawdata = '{}/{}/'.format(rawdata_path, directory)
workdata = '{}/{}/'.format(workdata_path, directory)

def is_english_start(text):
    r"""
    Tell whether a paragraph should be left untranslated.

    Returns True when any of the following holds, otherwise False:
    1. the string is empty or contains only whitespace;
    2. the string starts with an ASCII letter, optionally after leading
       whitespace;
    3. the string starts with a literal backslash sequence that is followed
       by an ASCII letter, e.g. \tThis is an apple.
    """
    # Optional leading whitespace, or a backslash followed by non-space
    # characters, and then an ASCII letter.
    leading_letter = re.match(r'^(\s*|\\[^\s]*)[A-Za-z]', text)
    if leading_letter is not None:
        return True

    # Nothing left after stripping whitespace means the paragraph is blank.
    return not text.strip()

def split_string_into_chunks(text, max_length=500):
    """
    Split a string into consecutive pieces of at most ``max_length`` characters.

    The pieces are returned in order and concatenate back to ``text``. Every
    piece except possibly the last has exactly ``max_length`` characters. An
    empty ``text`` yields an empty list.

    Args:
        text: The string to split.
        max_length: Maximum number of characters per piece; must be >= 1.

    Returns:
        A list of substrings of ``text``.

    Raises:
        ValueError: If ``max_length`` is smaller than 1. Such a value can
            never consume the input, so it is rejected instead of looping
            forever.
    """
    if max_length < 1:
        raise ValueError(f"max_length must be at least 1, got {max_length!r}")

    # Slicing past the end of the string is safe, so the final (shorter)
    # piece needs no special case.
    return [
        text[start:start + max_length]
        for start in range(0, len(text), max_length)
    ]


In [35]:
# Load the Word document from the raw-data folder.
# The working directory is switched first, so file_name is resolved
# relative to that folder.
os.chdir(rawdata)
doc = Document(file_name)

# Plain text of every paragraph, in document order.
raw_text_list = [
    para.text
    for para in doc.paragraphs
]

In [36]:
# Build the translation pipeline from the checkpoint named in model_checkpoint.
translator = pipeline(
    task="translation",
    model=model_checkpoint,
)

config.json: 100%|██████████| 1.43k/1.43k [00:00<00:00, 2.70MB/s]
model.safetensors: 100%|██████████| 310M/310M [00:37<00:00, 8.33MB/s] 
generation_config.json: 100%|██████████| 288/288 [00:00<00:00, 553kB/s]


In [37]:
# Second pipeline: the Helsinki-NLP/opus-mt-zh-en model combined with the
# tokenizer loaded from model_checkpoint.
translator2 = pipeline(
    task="translation",
    model='Helsinki-NLP/opus-mt-zh-en',
    tokenizer=model_checkpoint,
)

config.json: 100%|██████████| 1.39k/1.39k [00:00<00:00, 1.85MB/s]
pytorch_model.bin: 100%|██████████| 312M/312M [00:29<00:00, 10.6MB/s] 
generation_config.json: 100%|██████████| 293/293 [00:00<00:00, 839kB/s]


In [15]:
# Choose which paragraphs to translate (the full slice keeps all of them),
# then show a short preview of the first and last one plus the total count.
raw_text_list = raw_text_list[:]
first_preview = raw_text_list[0][:50]
last_preview = raw_text_list[-1][:50]
paragraph_count = len(raw_text_list)
print('第一句前五十字:', first_preview)
print('最後一句前五十字:', last_preview)
print('總共有', paragraph_count, '句')

第一句前五十字: 防振橡胶的性能评估装置
最後一句前五十字: 
總共有 154 句


In [7]:
# Translate the document one paragraph at a time.
max_length = 250  # longest input (in characters) sent to the model in one call
trans_text_list = []
for raw_text in tqdm(raw_text_list):
    # Blank paragraphs and paragraphs that already start in English are
    # copied through unchanged.
    if is_english_start(raw_text):
        trans_text_list.append(raw_text)
        continue

    # Short paragraphs are translated in one call; longer ones are cut into
    # pieces of at most max_length characters and translated piece by piece.
    if len(raw_text) < max_length:
        pieces = [raw_text]
    else:
        # Inner progress bar only for multi-piece paragraphs; leave=False
        # clears it once the paragraph is done.
        pieces = tqdm(split_string_into_chunks(raw_text, max_length), leave=False)

    translated_pieces = [
        translator(piece, max_length=1000, return_text=True)[0]['translation_text']
        for piece in pieces
    ]

    # The output is English, so pieces are joined with a space; gluing them
    # together directly would merge the words on either side of each cut.
    paragraph = ' '.join(translated_pieces)
    trans_text_list.append(paragraph)

  0%|          | 0/154 [00:00<?, ?it/s]

  3%|▎         | 5/154 [00:01<00:44,  3.34it/s]

In [206]:
# Sanity check: the translated list should have as many entries as the source list.
print(f'len of raw_text_list: {len(raw_text_list)}')
print(f'len of trans_text_list: {len(trans_text_list)}')

len of raw_text_list: 154
len of trans_text_list: 154


In [207]:
# Write the translations back into the loaded document, paragraph by paragraph.
# doc.paragraphs is read once up front: python-docx builds a fresh list on
# every access, so indexing it inside the loop repeats that work per paragraph.
paragraphs = doc.paragraphs
for paragraph, translated_text in tqdm(
    zip(paragraphs, trans_text_list), total=len(trans_text_list)
):
    paragraph.text = translated_text

    # Paragraphs styled "Heading 2" are made bold.
    # NOTE(review): this sets bold on the paragraph's style object, not on the
    # paragraph's own runs - confirm that a style-wide change is intended.
    if paragraph.style.name == 'Heading 2':
        paragraph.style.font.bold = True

# Save under a new name. The path is relative, so the file is written to the
# current working directory.
doc.save('translated.docx')

  0%|          | 0/154 [00:00<?, ?it/s]

100%|██████████| 154/154 [00:00<00:00, 1214.39it/s]
