In [1]:
!pip install ctranslate2[gpu] pyonmttok huggingface_hub psutil punctfix wordsegment pyspellchecker language_tool_python --quiet


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.0/17.0 MB[0m [31m58.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.8/4.8 MB[0m [31m47.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m61.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m192.3/192.3 MB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.3/21.3 MB[0m [31m38.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [11]:
import time
import psutil
import torch
import ctranslate2
import pyonmttok
from huggingface_hub import snapshot_download
from wordsegment import load, segment
from punctfix import PunctFixer
from spellchecker import SpellChecker
#import language_tool_python
from concurrent.futures import ThreadPoolExecutor
# Module-level handles for the cached translator and tokenizer; both stay
# None until initialize_model() assigns the loaded objects.
ctranslate2_translator = pyonmttok_tokenizer = None

# Language-tool integration is currently disabled.
#tool = language_tool_python.LanguageTool('en-US')

# Create the PunctFixer once so it can be reused.
punct_fixer = PunctFixer()

# Create the spell checker.
spell = SpellChecker()

# Load the wordsegment data.
load()




In [14]:

def process_text_with_model(text):
    """Split run-together words in ``text`` and spell-correct every word.

    The text is passed through wordsegment's ``segment`` and re-joined with
    single spaces; each resulting word is then looked up with the module-level
    ``spell`` checker.

    Args:
        text: The raw input string.

    Returns:
        The space-joined words, where a word is replaced by the checker's
        suggestion when it returns one and kept unchanged when it returns
        ``None``.
    """
    # Segment the text with wordsegment and normalise to space-separated words.
    text = ' '.join(segment(text))
    words = text.split()

    corrected_words = []
    for word in words:
        # Ask the checker once per word (the lookup was previously done twice).
        suggestion = spell.correction(word)
        corrected_words.append(word if suggestion is None else suggestion)

    return ' '.join(corrected_words)


In [15]:


def initialize_model(repo_id="jordimas/gec-opennmt-english", revision="main",
                     inter_threads=4, intra_threads=4):
    """Download the model snapshot and populate the module-level globals.

    Assigns ``pyonmttok_tokenizer`` and ``ctranslate2_translator``. Calling
    this with no arguments behaves exactly as before.

    Args:
        repo_id: Hugging Face Hub repository passed to ``snapshot_download``.
        revision: Repository revision passed to ``snapshot_download``.
        inter_threads: Forwarded to ``ctranslate2.Translator``.
        intra_threads: Forwarded to ``ctranslate2.Translator``.
    """
    global ctranslate2_translator, pyonmttok_tokenizer

    import os  # local import: only needed to build the SentencePiece model path

    # Download the snapshot and get its local directory.
    model_dir = snapshot_download(repo_id=repo_id, revision=revision)

    # Build the tokenizer from the SentencePiece model shipped in the snapshot.
    pyonmttok_tokenizer = pyonmttok.Tokenizer(
        mode="none",
        sp_model_path=os.path.join(model_dir, "sp_m.model"),
    )

    # Use the GPU when torch reports CUDA as available, otherwise the CPU.
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Create the CTranslate2 translator with the requested thread settings.
    ctranslate2_translator = ctranslate2.Translator(
        model_dir,
        device=device,
        inter_threads=inter_threads,
        intra_threads=intra_threads,
    )

In [16]:


def translate_batch_with_model(texts):
    """Run the correction pipeline on a batch of texts.

    Steps: preprocess each text with ``process_text_with_model`` in a thread
    pool, tokenize, translate with the module-level CTranslate2 translator,
    detokenize the first hypothesis, then apply ``punct_fixer.punctuate`` in a
    thread pool.

    Args:
        texts: Iterable of input strings.

    Returns:
        A tuple ``(punctuated_texts, time_taken, memory_used)`` where
        ``time_taken`` is the elapsed time in seconds and ``memory_used`` is
        the change in this process's RSS (it may be negative). Both
        measurements cover tokenization, translation, detokenization and
        punctuation; the preprocessing step runs before the measurement
        starts.

    Raises:
        RuntimeError: If the translator or tokenizer has not been set up yet.
    """
    # Fail with a clear message instead of an AttributeError on None.
    if ctranslate2_translator is None or pyonmttok_tokenizer is None:
        raise RuntimeError(
            "Model is not initialized; call initialize_model() first."
        )

    # Preprocess the input texts in parallel.
    with ThreadPoolExecutor() as executor:
        processed_texts = list(executor.map(process_text_with_model, texts))

    # Start measuring; perf_counter is the clock intended for timing
    # intervals and is unaffected by system clock adjustments.
    start_time = time.perf_counter()
    start_memory = psutil.Process().memory_info().rss

    # Tokenize and translate the processed texts.
    tokenized_batch = [pyonmttok_tokenizer.tokenize(text)[0] for text in processed_texts]
    translated_batch = ctranslate2_translator.translate_batch(tokenized_batch)
    gec_corrected_texts = [
        pyonmttok_tokenizer.detokenize(translated.hypotheses[0])
        for translated in translated_batch
    ]

    # Restore punctuation in parallel.
    with ThreadPoolExecutor() as executor:
        punctuated_texts = list(executor.map(punct_fixer.punctuate, gec_corrected_texts))

    # Stop measuring.
    end_time = time.perf_counter()
    end_memory = psutil.Process().memory_info().rss

    time_taken = end_time - start_time
    memory_used = end_memory - start_memory

    return punctuated_texts, time_taken, memory_used


In [17]:
# Initialize the model.
initialize_model()

# Sample batch of sentences to correct.
input_texts = [
    "The water are hot.",
    "My friends are going to be late.",
    "Today mine mother is in Barcelona.",
    "I have arecieve a letter.",
    "She didnt go tothe party.",
    "The car is loosing speed.",
    "I like to eat appl.",
    "They are planinga trip.",
    "Although it wasraining, they decided to go for a hike in the mountains.",
    "Despite beingtired, he finished his homework before going to bed.",
    "She enjoys reading books, especially mystery novels and historical fiction.",
    "The company's profits have increased significantly over the past year.",
    "He is considering applying to several universities, including Harvard and MIT.",
    "The quick brown fox jumps over the lazy dog while the cat watches from the window and the birds sing in the trees.",
    "In a small village nestled in the mountains, there lived a young girl who dreamed of exploring the world beyond her home.",
    "As the sun set over the horizon, casting a warm golden glow across the landscape, the children played in the fields, laughing and chasing each other.",
    "The conference was attended by experts from various fields, including technology, medicine, and education, who shared their insights and discussed the latest advancements.",
    "After a long and tiring journey, they finally arrived at their destination, a beautiful coastal town with sandy beaches and crystal-clear waters.",
    "Wait! Are you coming with us?",
    "He said, 'I'll be there at 6:00 PM.'",
    "Wow, that's amazing! How did you do it?",
    "Let's meet at the café on Main Street.",
    "She asked, 'Can you help me with this problem?'"
]

# Run the batch through the model pipeline.
output_texts_model, time_taken_model, memory_used_model = translate_batch_with_model(input_texts)

# Print each input next to its output. The loop uses its own variable names so
# the result list is not overwritten and this cell can be re-run safely.
for source_text, corrected_text in zip(input_texts, output_texts_model):
    print(f"Input: {source_text}")
    print(f"Output: {corrected_text}")
    print()

# Print the performance figures (memory is converted from bytes to MB).
print(f"Time taken (Model): {time_taken_model:.2f} seconds")
print(f"Memory used (Model): {memory_used_model / (1024 ** 2):.2f} MB")



Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

Input: The water are hot.
Output: The water is hot.

Input: My friends are going to be late.
Output: My friends are going to be late.

Input: Today mine mother is in Barcelona.
Output: Today my mother is in Barcelona.

Input: I have arecieve a letter.
Output: I have received a letter.

Input: She didnt go tothe party.
Output: She didnt go to the party.

Input: The car is loosing speed.
Output: The car is losing speed.

Input: I like to eat appl.
Output: I like to eat and apply.

Input: They are planinga trip.
Output: They are planning a trip.

Input: Although it wasraining, they decided to go for a hike in the mountains.
Output: Although it was raining, they decided to go for a hike in the mountains.

Input: Despite beingtired, he finished his homework before going to bed.
Output: Despite being tired, he finished his homework before going to bed.

Input: She enjoys reading books, especially mystery novels and historical fiction.
Output: She enjoys reading books, especially mystery nove