From 349e1d2dbf1c7fe8c01d5bad328f1bfe01bcfb4f Mon Sep 17 00:00:00 2001
From: DoodleBears
Date: Sat, 29 Jun 2024 05:25:39 +0900
Subject: [PATCH] version(v0.1.0): init langsplit

---
 .gitignore                        |   4 +
 README.md                         |  58 ++++++++
 langsplit/__init__.py             |   1 +
 langsplit/detect_lang/detector.py |  24 ++++
 langsplit/split/splitter.py       | 210 ++++++++++++++++++++++++++++++
 setup.py                          |  35 +++++
 tests/test_split.py               |  39 ++++++
 7 files changed, 371 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 README.md
 create mode 100644 langsplit/__init__.py
 create mode 100644 langsplit/detect_lang/detector.py
 create mode 100644 langsplit/split/splitter.py
 create mode 100644 setup.py
 create mode 100644 tests/test_split.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..74ab87b
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,4 @@
+__pycache__
+build/
+dist/
+langsplit.egg-info/
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..165af45
--- /dev/null
+++ b/README.md
@@ -0,0 +1,58 @@
+# `langsplit`
+
+Splitting sentences by language (concatenating over-split substrings based on their language)
+
+# Motivation
+1. TTS (Text-To-Speech) models often fail on multi-language sentences; splitting a sentence by language gives better results
+2. Existing NLP toolkits (e.g. SpaCy) are helpful for parsing text in a single language, but multi-language text like the example below is hard to handle:
+
+```
+你最近好吗、最近どうですか?요즘 어떻게 지내요?sky is clear and sunny。
+```
+
+# Usage
+
+## Installation
+
+You can install the package using pip:
+
+```bash
+pip install langsplit
+```
+
+```python
+from langsplit import split
+
+texts = [
+    "我是 VGroupChatBot,一个旨在支持多人通信的助手,通过可视化消息来帮助团队成员更好地交流。我可以帮助团队成员更好地整理和共享信息,特别是在讨论、会议和Brainstorming等情况下。你好我的名字是西野くまですmy name is bob很高兴认识你どうぞよろしくお願いいたします「こんにちは」是什么意思。",
+    "你好,我的名字是西野くまです。I am from Tokyo, 日本の首都。今天的天气非常好,sky is clear and sunny。おはようございます、皆さん!我们一起来学习吧。Learning languages can be fun and exciting。昨日はとても忙しかったので、今日は少しリラックスしたいです。Let's take a break and enjoy some coffee。中文、日本語、and English are three distinct languages, each with its own unique charm。希望我们能一起进步,一起成长。Let's keep studying and improving our language skills together. ありがとう!",
+    "你好,今日はどこへ行きますか?",
+    "我的名字是田中さんです。",
+    "我喜欢吃寿司和拉面おいしいです。",
+    "今天の天気はとてもいいですね。",
+    "我在学习日本語少し難しいです。",
+    "日语真是おもしろい啊",
+    "你喜欢看アニメ吗?",
+    "我想去日本旅行、特に京都に行きたいです。",
+    "昨天見た映画はとても感動的でした。" "我朋友是日本人、彼はとても優しいです。",
+    "我们一起去カラオケ吧、楽しそうです。",
+    "你今天吃了什么、朝ごはんは何ですか?",
+    "我的家在北京、でも、仕事で東京に住んでいます。",
+    "我在学做日本料理、日本料理を作るのを習っています。",
+    "你会说几种语言、何ヶ国語話せますか?",
+    "我昨天看了一本书、その本はとても面白かったです。",
+    "我们一起去逛街、買い物に行きましょう。",
+    "你最近好吗、最近どうですか?",
+    "我在学做日本料理와 한국 요리、日本料理を作るのを習っています。",
+    "你会说几种语言、何ヶ国語話せますか?몇 개 언어를 할 수 있어요?",
+    "我昨天看了一本书、その本はとても面白かったです。어제 책을 읽었는데, 정말 재미있었어요。",
+    "我们一起去逛街와 쇼핑、買い物に行きましょう。쇼핑하러 가요。",
+    "你最近好吗、最近どうですか?요즘 어떻게 지내요?",
+]
+
+for text in texts:
+    substr_list = split(text, verbose=True)
+    for index, substr in enumerate(substr_list):
+        print(f"{substr.lang}|{index}: {substr.text}")
+    print("----------------------")
+```
\ No newline at end of file
diff --git a/langsplit/__init__.py b/langsplit/__init__.py
new file mode 100644
index 0000000..b13eaf1
--- /dev/null
+++ b/langsplit/__init__.py
@@ -0,0 +1 @@
+from .split.splitter import split
diff --git a/langsplit/detect_lang/detector.py b/langsplit/detect_lang/detector.py
new file mode 100644
index 0000000..18373e5
--- /dev/null
+++ b/langsplit/detect_lang/detector.py
@@ -0,0 +1,24 @@
+from langdetect import detect
+import fast_langdetect
+
+lang_map = {
+    "zh": "zh",
+    "zh-cn": "zh",
+    "zh-tw": "x",
+    "ko": "ko",
+    "ja": "ja",
+}
+
+
+def detect_lang(text: str) -> str:
+    result = str(detect(text))
+    result = result.lower()
+    return result
+
+
+def fast_detect_lang(text: str, text_len_threshold=3) -> str:
+    if len(text) <= text_len_threshold:
+        return detect_lang(text)
+    result = str(fast_langdetect.detect(text, low_memory=False)["lang"])
+    result = result.lower()
+    return result
diff --git a/langsplit/split/splitter.py b/langsplit/split/splitter.py
new file mode 100644
index 0000000..7be1e29
--- /dev/null
+++ b/langsplit/split/splitter.py
@@ -0,0 +1,210 @@
+from typing import List, Union
+from dataclasses import dataclass
+
+from langdetect.lang_detect_exception import LangDetectException
+from wtpsplit import SaT, WtP
+
+from langsplit.detect_lang.detector import detect_lang, fast_detect_lang, lang_map
+
+
+@dataclass
+class SubString:
+    lang: str
+    text: str
+
+
+class SentenceSplitter:
+    def __init__(self, wtp_split_model: Union[WtP, SaT] = WtP("wtp-bert-mini")):
+        self.wtp_split_model = wtp_split_model
+
+    def split(self, text: str, threshold: float = 5e-5, verbose=False):
+        return self.wtp_split_model.split(
+            text_or_texts=text, threshold=threshold, verbose=verbose
+        )
+
+
+default_sentence_splitter = SentenceSplitter()
+
+
+def split(
+    text: str,
+    threshold: float = 5e-5,
+    verbose=False,
+    splitter: SentenceSplitter = default_sentence_splitter,
+):
+    """Split text into substrings and concatenate them by language, using
+    1. `wtpsplit` to split the text into small substrings
+    2. `fast_langdetect` and `langdetect` to detect each substring's language and concatenate neighbours that share one
+
+    Args:
+        text (str): text to split
+        threshold (float, optional): threshold passed to `wtpsplit`; the lower it is, the more (and smaller) substrings are returned. Defaults to 5e-5.
+    """
+    substr_list = splitter.split(text=text, threshold=threshold, verbose=verbose)
+    if verbose:
+        print(f"substr_list: {substr_list}")
+    substr_list = _init_substr_lang(substr_list)
+    if verbose:
+        print(f"substr_list: {substr_list}")
+    substr_list = _smart_concat(substr_list)
+    if verbose:
+        print(f"split_result: {substr_list}")
+    return substr_list
+
+
+def _smart_concat(substr_list: List[SubString]):
+    is_concat_complete = False
+    while is_concat_complete is False:
+        substr_list = _smart_concat_logic(substr_list)
+        is_concat_complete = True
+        for index, block in enumerate(substr_list):
+            if block.lang == "x":
+                is_concat_complete = False
+                break
+            if index < len(substr_list) - 1:
+                if substr_list[index].lang == substr_list[index + 1].lang:
+                    is_concat_complete = False
+                    break
+    return substr_list
+
+
+def _init_substr_lang(substr: List[str]) -> List[SubString]:
+    concat_result = []
+    lang = ""
+    for block in substr:
+        try:
+            cur_lang = detect_lang(block)
+        except LangDetectException:
+            cur_lang = lang
+        cur_lang = lang_map.get(cur_lang, "en")
+        concat_result.append(SubString(cur_lang, block))
+        lang = cur_lang
+    return concat_result
+
+
+def _merge_middle_substr_to_two_side(substr_list: List[SubString]):
+    for index in range(len(substr_list) - 2):
+        left_block = substr_list[index]
+        middle_block = substr_list[index + 1]
+        right_block = substr_list[index + 2]
+        if left_block.lang == right_block.lang and left_block.lang != "x":
+            if len(middle_block.text) <= 1 or middle_block.lang == "x":
+                substr_list[index + 1].lang = left_block.lang
+    return substr_list
+
+
+def _merge_two_side_substr_to_near(concat_result: List[SubString]):
+    if concat_result[0].lang == "x":
+        for substr in concat_result:
+            if substr.lang != "x":
+                concat_result[0].lang = substr.lang
+                break
+    elif len(concat_result[0].text) <= 1:
+        concat_result[0].lang = _find_nearest_lang_with_direction(
+            concat_result, 0, is_left=False
+        )
+    if concat_result[-1].lang == "x":
+        concat_result[-1].lang = _find_nearest_lang_with_direction(
+            concat_result, len(concat_result) - 1, is_left=True
+        )
+    return concat_result
+
+
+def _fill_missing_languages(concat_result: List[SubString]):
+    for index, substr in enumerate(concat_result):
+        if substr.lang == "x":
+            if index == 0:
+                # For head substring, find right substring
+                concat_result[index].lang = _find_nearest_lang_with_direction(
+                    concat_result, index, is_left=False
+                )
+            elif index == len(concat_result) - 1:
+                # For tail substring, find left substring
+                concat_result[index].lang = _find_nearest_lang_with_direction(
+                    concat_result, index, is_left=True
+                )
+            else:
+                # For body (middle) substring, find based on rule
+                is_left = _get_find_direction(concat_result, index)
+                concat_result[index].lang = _find_nearest_lang_with_direction(
+                    concat_result, index, is_left
+                )
+    return concat_result
+
+
+def _find_nearest_lang_with_direction(
+    concat_result: List[SubString], index: int, is_left: bool
+):
+    if is_left:
+        for i in range(1, len(concat_result)):
+            if index - i >= 0 and concat_result[index - i].lang != "x":
+                return concat_result[index - i].lang
+    else:
+        for i in range(1, len(concat_result)):
+            if index + i < len(concat_result) and concat_result[index + i].lang != "x":
+                return concat_result[index + i].lang
+    return "en"
+
+
+def _get_find_direction(substr_list: List[SubString], index: int) -> bool:
+    is_left = False
+    if index == 0:
+        is_left = False
+        return is_left
+    elif index == len(substr_list) - 1:
+        is_left = True
+        return is_left
+    left_block = substr_list[index - 1]
+    right_block = substr_list[index + 1]
+    if len(left_block.text) < len(right_block.text) or right_block.lang not in [
+        "ja",
+        "zh",
+    ]:
+        is_left = True
+    else:
+        is_left = False
+    return is_left
+
+
+def _merge_blocks(concat_result: List[SubString]):
+    smart_concat_result = []
+    lang = ""
+    for block in concat_result:
+        cur_lang = block.lang
+        if cur_lang != lang:
+            smart_concat_result.append(block)
+        else:
+            smart_concat_result[-1].text += block.text
+        lang = cur_lang
+    return smart_concat_result
+
+
+def _check_languages(lang_text_list: List[SubString]):
+    for index, block in enumerate(lang_text_list):
+        try:
+            cur_lang = fast_detect_lang(block.text)
+        except LangDetectException:
+            cur_lang = "en"
+        cur_lang = lang_map.get(cur_lang, "en")
+        if cur_lang == "ko":
+            fast_lang = fast_detect_lang(block.text, text_len_threshold=0)
+            if fast_lang != "ko":
+                is_left = _get_find_direction(lang_text_list, index)
+                cur_lang = _find_nearest_lang_with_direction(
+                    lang_text_list, index, is_left
+                )
+        if cur_lang != "x":
+            block.lang = cur_lang
+    return lang_text_list
+
+
+def _smart_concat_logic(lang_text_list: List[SubString]):
+    lang_text_list = _merge_middle_substr_to_two_side(lang_text_list)
+    lang_text_list = _merge_blocks(lang_text_list)
+    lang_text_list = _check_languages(lang_text_list)
+    lang_text_list = _merge_middle_substr_to_two_side(lang_text_list)
+    lang_text_list = _fill_missing_languages(lang_text_list)
+    lang_text_list = _merge_two_side_substr_to_near(lang_text_list)
+    lang_text_list = _merge_blocks(lang_text_list)
+    lang_text_list = _check_languages(lang_text_list)
+    return lang_text_list
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..fae9210
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,35 @@
+from setuptools import setup, find_packages
+from os import path
+
+
+def packagefile(*relpath):
+    return path.join(path.dirname(__file__), *relpath)
+
+
+def read(*relpath):
+    with open(packagefile(*relpath), encoding="utf-8") as f:
+        return f.read()
+
+
+setup(
+    name="langsplit",
+    version="0.1.0",
+    description="A package for splitting sentences by language (concatenating over-split substrings based on their language)",
+    long_description=read("README.md"),
+    long_description_content_type="text/markdown",
+    url="https://github.com/DoodleBears/langsplit",
+    author="DoodleBear",
+    author_email="yangmufeng233@gmail.com",
+    packages=find_packages(),
+    install_requires=[
+        "langdetect",
+        "fast_langdetect",
+        "wtpsplit",
+    ],
+    classifiers=[
+        "Programming Language :: Python :: 3",
+        "License :: OSI Approved :: MIT License",
+        "Operating System :: OS Independent",
+    ],
+    python_requires=">=3.9",
+)
diff --git a/tests/test_split.py b/tests/test_split.py
new file mode 100644
index 0000000..817ca99
--- /dev/null
+++ b/tests/test_split.py
@@ -0,0 +1,39 @@
+from langsplit import split
+
+texts = [
+    "我是 VGroupChatBot,一个旨在支持多人通信的助手,通过可视化消息来帮助团队成员更好地交流。我可以帮助团队成员更好地整理和共享信息,特别是在讨论、会议和Brainstorming等情况下。你好我的名字是西野くまですmy name is bob很高兴认识你どうぞよろしくお願いいたします「こんにちは」是什么意思。",
+    "你好,我的名字是西野くまです。I am from Tokyo, 日本の首都。今天的天气非常好,sky is clear and sunny。おはようございます、皆さん!我们一起来学习吧。Learning languages can be fun and exciting。昨日はとても忙しかったので、今日は少しリラックスしたいです。Let's take a break and enjoy some coffee。中文、日本語、and English are three distinct languages, each with its own unique charm。希望我们能一起进步,一起成长。Let's keep studying and improving our language skills together. ありがとう!",
+    "你好,今日はどこへ行きますか?",
+    "我的名字是田中さんです。",
+    "我喜欢吃寿司和拉面、おいしいです。",
+    "我喜欢吃寿司和拉面おいしいです。",
+    "今天の天気はとてもいいですね。",
+    "我在学习日本語、少し難しいです。",
+    "我在学习日本語少し難しいです。",
+    "日语真是おもしろい啊",
+    "你喜欢看アニメ吗?",
+    "我想去日本旅行、特に京都に行きたいです。",
+    "昨天見た映画はとても感動的でした。" "我朋友是日本人、彼はとても優しいです。",
+    "我们一起去カラオケ吧、楽しそうです。",
+    "你今天吃了什么、朝ごはんは何ですか?",
+    "我的家在北京、でも、仕事で東京に住んでいます。",
+    "我喜欢读书、本を読むのが好きです。",
+    "这个周末、一緒に公園へ行きましょうか?",
+    "你的猫很可爱、あなたの猫はかわいいです。",
+    "我在学做日本料理、日本料理を作るのを習っています。",
+    "你会说几种语言、何ヶ国語話せますか?",
+    "我昨天看了一本书、その本はとても面白かったです。",
+    "我们一起去逛街、買い物に行きましょう。",
+    "你最近好吗、最近どうですか?",
+    "我在学做日本料理와 한국 요리、日本料理を作るのを習っています。",
+    "你会说几种语言、何ヶ国語話せますか?몇 개 언어를 할 수 있어요?",
+    "我昨天看了一本书、その本はとても面白かったです。어제 책을 읽었는데, 정말 재미있었어요。",
+    "我们一起去逛街와 쇼핑、買い物に行きましょう。쇼핑하러 가요。",
+    "你最近好吗、最近どうですか?요즘 어떻게 지내요?",
+]
+
+for text in texts:
+    substr_list = split(text, verbose=True)
+    for index, substr in enumerate(substr_list):
+        print(f"{substr.lang}|{index}: {substr.text}")
+    print("----------------------")
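
As a usage sketch to accompany this patch (not part of the diff above): each `SubString` returned by `split` carries a `lang` code and the `text` it covers, so a caller can route every language run to a per-language handler, e.g. one TTS voice per language as described in the README motivation. `synthesize_speech` below is a hypothetical placeholder, and the exact segmentation and language codes depend on the underlying `wtpsplit` and `fast_langdetect` models.

```python
# Usage sketch (illustrative only): route each detected-language run
# to a per-language handler such as a TTS voice.
from langsplit import split


def synthesize_speech(text: str, lang: str) -> None:
    # Hypothetical placeholder: call a real TTS engine for `lang` here.
    print(f"[TTS:{lang}] {text}")


mixed = "你最近好吗、最近どうですか?요즘 어떻게 지내요?sky is clear and sunny。"
for substr in split(mixed):
    # Each SubString exposes a language code ("zh", "ja", "ko", "en", ...) and its text.
    synthesize_speech(substr.text, substr.lang)
```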