In [None]:
!pip install ctranslate2[gpu] pyonmttok huggingface_hub psutil punctfix wordsegment pyspellchecker language_tool_python --quiet


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m192.3/192.3 MB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.0/17.0 MB[0m [31m37.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.8/4.8 MB[0m [31m74.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m70.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.3/21.3 MB[0m [31m44.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import time
import psutil
import torch
import ctranslate2
import pyonmttok
from huggingface_hub import snapshot_download
from wordsegment import load, segment
from punctfix import PunctFixer
from spellchecker import SpellChecker
import language_tool_python

# Load the wordsegment data (required before segment() is called)
load()

# Shared helper objects, each created a single time and reused by the
# processing functions below.
spell = SpellChecker()                              # spell checker
punct_fixer = PunctFixer()                          # punctuation fixer
tool = language_tool_python.LanguageTool('en-US')   # LanguageTool, US English

# Module-level slots that cache the translation model and its tokenizer;
# both stay None until initialize_model() assigns them.
ctranslate2_translator = pyonmttok_tokenizer = None




The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/379 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/253k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Downloading LanguageTool 6.4: 100%|██████████| 246M/246M [00:04<00:00, 55.1MB/s]
INFO:language_tool_python.download_lt:Unzipping /tmp/tmpfudduzqb.zip to /root/.cache/language_tool_python.
INFO:language_tool_python.download_lt:Downloaded https://www.languagetool.org/download/LanguageTool-6.4.zip to /root/.cache/language_tool_python.


In [None]:
def process_text_with_tool(text):
    """Correct ``text`` using word segmentation, spell checking, LanguageTool and punctfix.

    The steps run in this order: ``segment`` (wordsegment) splits the input
    into words, ``spell.correction`` is applied to every word, the first
    replacement of each LanguageTool match is applied, and finally
    ``punct_fixer.punctuate`` processes the punctuation.

    Args:
        text: The raw input string.

    Returns:
        The string returned by ``punct_fixer.punctuate`` for the corrected text.
    """
    # Split the input into words with wordsegment.
    text = ' '.join(segment(text))

    # Spell-correct every word. The correction is looked up once per word
    # (the original evaluated it twice); when the checker has no suggestion
    # (None) the word is kept unchanged.
    corrected_words = []
    for word in text.split():
        suggestion = spell.correction(word)
        corrected_words.append(word if suggestion is None else suggestion)
    text = ' '.join(corrected_words)

    # Grammar/spelling pass with language_tool_python: apply the first
    # suggested replacement of every match. Matches are walked in reverse so
    # that a replacement of a different length does not shift the offsets of
    # the matches still to be applied (presumes matches are ordered by offset).
    matches = tool.check(text)
    corrected_text = list(text)
    for match in reversed(matches):
        if match.replacements:
            corrected_text[match.offset:match.offset + match.errorLength] = match.replacements[0]
    text = ''.join(corrected_text)

    # Process punctuation with punctfix.
    text = punct_fixer.punctuate(text)

    return text

def process_text_with_model(text):
    """Pre-process ``text`` with word segmentation and spell checking only.

    No grammar or punctuation step is applied here; the caller on the model
    path does that afterwards.

    Args:
        text: The raw input string.

    Returns:
        The segmented, spell-corrected words joined by single spaces.
    """
    # Split the input into words with wordsegment.
    text = ' '.join(segment(text))

    # Spell-correct every word. The correction is looked up once per word
    # (the original evaluated it twice); when the checker has no suggestion
    # (None) the word is kept unchanged.
    corrected_words = []
    for word in text.split():
        suggestion = spell.correction(word)
        corrected_words.append(word if suggestion is None else suggestion)

    return ' '.join(corrected_words)

def process_text_without_tool_and_model(text):
    """Correct ``text`` using only word segmentation, spell checking and punctfix.

    Neither LanguageTool nor the translation model is involved.

    Args:
        text: The raw input string.

    Returns:
        The string returned by ``punct_fixer.punctuate`` for the
        segmented, spell-corrected text.
    """
    # Split the input into words with wordsegment.
    text = ' '.join(segment(text))

    # Spell-correct every word. The correction is looked up once per word
    # (the original evaluated it twice); when the checker has no suggestion
    # (None) the word is kept unchanged.
    corrected_words = []
    for word in text.split():
        suggestion = spell.correction(word)
        corrected_words.append(word if suggestion is None else suggestion)
    text = ' '.join(corrected_words)

    # Process punctuation with punctfix.
    return punct_fixer.punctuate(text)

In [None]:
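'''
initialize_model downloads (and caches) the CTranslate2 model from the Hugging Face Hub, then initializes the tokenizer and the translator. It uses the GPU when one is available and the CPU otherwise, and enables multi-threading in either case.
Note: no quantization is configured in the code below, although it was originally planned.
'''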

def initialize_model(repo_id="jordimas/gec-opennmt-english", revision="main",
                     inter_threads=4, intra_threads=4, force=False):
    """Load the translation model and its tokenizer into the module-level cache.

    Downloads the model snapshot from the Hugging Face Hub, builds the
    pyonmttok tokenizer from the ``sp_m.model`` file inside it, and builds a
    CTranslate2 translator on CUDA when ``torch`` reports it as available,
    otherwise on the CPU. The results are stored in the globals
    ``ctranslate2_translator`` and ``pyonmttok_tokenizer``.

    Calling the function again is a no-op once both globals are set, so
    re-running the notebook cell does not rebuild anything. Pass
    ``force=True`` to rebuild, e.g. after changing ``repo_id``.

    Args:
        repo_id: Hugging Face Hub repository holding the model.
        revision: Repository revision to download.
        inter_threads: Forwarded to ``ctranslate2.Translator``.
        intra_threads: Forwarded to ``ctranslate2.Translator``.
        force: Rebuild even when a model is already cached.

    Returns:
        None. The loaded objects are published through the two globals.
    """
    global ctranslate2_translator, pyonmttok_tokenizer
    import os  # local import: only needed to build the tokenizer model path

    # globals().get() keeps this working even if the cell that pre-declares
    # the two cache variables has not been run.
    already_loaded = (
        globals().get("ctranslate2_translator") is not None
        and globals().get("pyonmttok_tokenizer") is not None
    )
    if already_loaded and not force:
        return

    # Download the model snapshot (or reuse the copy already downloaded).
    model_dir = snapshot_download(repo_id=repo_id, revision=revision)

    # Build both objects in locals first so that a failure half-way through
    # never leaves the globals in a partially initialized state.
    tokenizer = pyonmttok.Tokenizer(
        mode="none",
        sp_model_path=os.path.join(model_dir, "sp_m.model"),
    )

    # Use the GPU when available.
    device = "cuda" if torch.cuda.is_available() else "cpu"

    translator = ctranslate2.Translator(
        model_dir,
        device=device,
        inter_threads=inter_threads,
        intra_threads=intra_threads,
    )

    pyonmttok_tokenizer = tokenizer
    ctranslate2_translator = translator

In [None]:
from concurrent.futures import ThreadPoolExecutor

def translate_batch_with_model(texts):
    """Correct a batch of texts with the translation model and measure the cost.

    Each text is pre-processed with ``process_text_with_model``, tokenized,
    translated with the cached CTranslate2 translator, detokenized and passed
    through ``punct_fixer.punctuate``.

    The measurement covers the whole pipeline, pre-processing included, so
    the numbers are comparable with ``translate_batch_with_tool`` and
    ``translate_batch_without_model_and_tool``, which also time their
    pre-processing. (Previously the timer started only after pre-processing.)

    Args:
        texts: Iterable of input strings.

    Returns:
        A tuple ``(punctuated_texts, time_taken, memory_used)``: the list of
        corrected strings, the elapsed wall-clock seconds, and the change in
        the process's resident set size in bytes (may be negative).

    Raises:
        RuntimeError: If the model or tokenizer has not been initialized.
    """
    if ctranslate2_translator is None or pyonmttok_tokenizer is None:
        raise RuntimeError("Model is not initialized; call initialize_model() first.")

    process = psutil.Process()

    # perf_counter() is monotonic, unlike time.time(), so the interval cannot
    # be distorted by system clock adjustments.
    start_time = time.perf_counter()
    start_memory = process.memory_info().rss

    # Pre-process the inputs in parallel threads.
    with ThreadPoolExecutor() as executor:
        processed_texts = list(executor.map(process_text_with_model, texts))

    # Tokenize and translate the processed texts; tokenize() returns a tuple
    # whose first element is the token list.
    tokenized_batch = [pyonmttok_tokenizer.tokenize(text)[0] for text in processed_texts]
    translated_batch = ctranslate2_translator.translate_batch(tokenized_batch)
    gec_corrected_texts = [
        pyonmttok_tokenizer.detokenize(translated.hypotheses[0])
        for translated in translated_batch
    ]

    # Process punctuation with punctfix, again in parallel threads.
    with ThreadPoolExecutor() as executor:
        punctuated_texts = list(executor.map(punct_fixer.punctuate, gec_corrected_texts))

    time_taken = time.perf_counter() - start_time
    memory_used = process.memory_info().rss - start_memory

    return punctuated_texts, time_taken, memory_used

def translate_batch_with_tool(texts):
    """Correct a batch of texts with ``process_text_with_tool`` and measure the cost.

    Args:
        texts: Iterable of input strings.

    Returns:
        A tuple ``(processed_texts, time_taken, memory_used)``: the list of
        corrected strings, the elapsed wall-clock seconds, and the change in
        the process's resident set size in bytes (may be negative).
    """
    process = psutil.Process()

    # perf_counter() is monotonic, unlike time.time(), so the interval cannot
    # be distorted by system clock adjustments.
    start_time = time.perf_counter()
    start_memory = process.memory_info().rss

    # Process the inputs in parallel threads.
    with ThreadPoolExecutor() as executor:
        processed_texts = list(executor.map(process_text_with_tool, texts))

    time_taken = time.perf_counter() - start_time
    memory_used = process.memory_info().rss - start_memory

    return processed_texts, time_taken, memory_used

def translate_batch_without_model_and_tool(texts):
    """Correct a batch of texts with ``process_text_without_tool_and_model`` and measure the cost.

    Neither the translation model nor LanguageTool is used on this path, so
    the function no longer declares the model globals it never touched.

    Args:
        texts: Iterable of input strings.

    Returns:
        A tuple ``(punctuated_texts, time_taken, memory_used)``: the list of
        corrected strings, the elapsed wall-clock seconds, and the change in
        the process's resident set size in bytes (may be negative).
    """
    process = psutil.Process()

    # perf_counter() is monotonic, unlike time.time(), so the interval cannot
    # be distorted by system clock adjustments.
    start_time = time.perf_counter()
    start_memory = process.memory_info().rss

    # Segment, spell-correct and punctuate the inputs in parallel threads.
    with ThreadPoolExecutor() as executor:
        punctuated_texts = list(executor.map(process_text_without_tool_and_model, texts))

    time_taken = time.perf_counter() - start_time
    memory_used = process.memory_info().rss - start_memory

    return punctuated_texts, time_taken, memory_used

In [None]:
# Initialize the model
initialize_model()

# Whole-sentence samples for batch runs. They are not used by default; assign
# this list to `input_texts` to benchmark on complete sentences instead of the
# incremental-typing prefixes built below.
sample_texts = [
    "The water are hot.",
    "My friends are going to be late.",
    "Today mine mother is in Barcelona.",
    "I have arecieve a letter.",
    "She didnt go tothe party.",
    "The car is loosing speed.",
    "I like to eat appl.",
    "They are planinga trip.",
    "Although it wasraining, they decided to go for a hike in the mountains.",
    "Despite beingtired, he finished his homework before going to bed.",
    "She enjoys reading books, especially mystery novels and historical fiction.",
    "The company's profits have increased significantly over the past year.",
    "He is considering applying to several universities, including Harvard and MIT.",
    "The quick brown fox jumps over the lazy dog while the cat watches from the window and the birds sing in the trees.",
    "In a small village nestled in the mountains, there lived a young girl who dreamed of exploring the world beyond her home.",
    "As the sun set over the horizon, casting a warm golden glow across the landscape, the children played in the fields, laughing and chasing each other.",
    "The conference was attended by experts from various fields, including technology, medicine, and education, who shared their insights and discussed the latest advancements.",
    "After a long and tiring journey, they finally arrived at their destination, a beautiful coastal town with sandy beaches and crystal-clear waters.",
    "Wait! Are you coming with us?",
    "He said, 'I'll be there at 6:00 PM.'",
    "Wow, that's amazing! How did you do it?",
    "Let's meet at the café on Main Street.",
    "She asked, 'Can you help me with this problem?'"
]

# Simulate incremental typing: every prefix of the test sentence, from the
# empty string up to AND including the complete sentence. The original
# range(len(test)) stopped one character short, so the full sentence was
# never processed.
test = "My friendsare goiinng to be late."
input_texts = [test[:i] for i in range(len(test) + 1)]

# Batch run with the GEC model (currently disabled).
# output_texts_model, time_taken_model, memory_used_model = translate_batch_with_model(input_texts)

# Batch run with language_tool_python
output_texts_tool, time_taken_tool, memory_used_tool = translate_batch_with_tool(input_texts)

# Batch run with neither the model nor language_tool_python
output_texts, time_taken, memory_used = translate_batch_without_model_and_tool(input_texts)

# Print the results and performance figures of every run that was executed.
# Each report is labelled with the approach that produced it (the "neither"
# run used to be reported under "(Model)" labels, and the tool run was
# computed but never shown). To include the model run, enable it above and
# add: ("\nUsing GEC Model:", "Model", output_texts_model, time_taken_model, memory_used_model)
reports = [
    ("neither", "Neither", output_texts, time_taken, memory_used),
    ("\nUsing language_tool_python:", "Tool", output_texts_tool, time_taken_tool, memory_used_tool),
]
for heading, label, outputs, seconds, memory_bytes in reports:
    print(heading)
    for input_text, output_text in zip(input_texts, outputs):
        print(f"Input: {input_text}")
        print(f"Output: {output_text}")
        print()

    print(f"Time taken ({label}): {seconds:.2f} seconds")
    # memory_bytes is an RSS delta in bytes; convert to MB for display.
    print(f"Memory used ({label}): {memory_bytes / (1024 ** 2):.2f} MB")


Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

neither
Input: 
Output: 

Input: M
Output: M.

Input: My
Output: My

Input: My 
Output: My

Input: My f
Output: My F.

Input: My fr
Output: My for.

Input: My fri
Output: My fry.

Input: My frie
Output: My Free.

Input: My frien
Output: My friend.

Input: My friend
Output: My friend.

Input: My friends
Output: My friends.

Input: My friendsa
Output: My friends A.

Input: My friendsar
Output: My friends A.

Input: My friendsare
Output: My friends are.

Input: My friendsare 
Output: My friends are.

Input: My friendsare g
Output: My friends are G.

Input: My friendsare go
Output: My friends are go.

Input: My friendsare goi
Output: My friends are go i.

Input: My friendsare goii
Output: My friends are go i.

Input: My friendsare goiin
Output: My friends are got in.

Input: My friendsare goiinn
Output: My friends are go i inn.

Input: My friendsare goiinng
Output: My friends are got in no.

Input: My friendsare goiinng 
Output: My friends are got in no.

Input: My friendsare goiinng t
Out

'\nprint("Using GEC Model:")\nfor i, output_text in enumerate(output_texts_model):\n    print(f"Input: {input_texts[i]}")\n    print(f"Output: {output_text}")\n    print()\n\nprint(f"Time taken (Model): {time_taken_model:.2f} seconds")\nprint(f"Memory used (Model): {memory_used_model / (1024 ** 2):.2f} MB")\n\nprint("\nUsing language_tool_python:")\nfor i, output_text in enumerate(output_texts_tool):\n    print(f"Input: {input_texts[i]}")\n    print(f"Output: {output_text}")\n    print()\n\nprint(f"Time taken (Tool): {time_taken_tool:.2f} seconds")\nprint(f"Memory used (Tool): {memory_used_tool / (1024 ** 2):.2f} MB")\n'