-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
4355530
commit 349e1d2
Showing
7 changed files
with
369 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
__pycache__ | ||
build/ | ||
dist/ | ||
langsplit.egg-info/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
# `langsplit` | ||
|
||
splitting sentences by language (concatenating over-split substrings based on their language) | ||
|
||
# Motivation | ||
1. TTS (Text-To-Speech) models often fail on multi-language sentences; splitting the text by language first gives better results | ||
2. Existing NLP toolkits (e.g. spaCy) are helpful for parsing text in a single language, but multi-language text like the example below is hard for them to handle: | ||
|
||
``` | ||
你最近好吗、最近どうですか?요즘 어떻게 지내요?sky is clear and sunny。 | ||
``` | ||
|
||
# Usage | ||
|
||
## Installation | ||
|
||
You can install the package using pip: | ||
|
||
```bash | ||
pip install langsplit | ||
``` | ||
|
||
```python | ||
from langsplit import split | ||
texts = [ | ||
"我是 VGroupChatBot,一个旨在支持多人通信的助手,通过可视化消息来帮助团队成员更好地交流。我可以帮助团队成员更好地整理和共享信息,特别是在讨论、会议和Brainstorming等情况下。你好我的名字是西野くまですmy name is bob很高兴认识你どうぞよろしくお願いいたします「こんにちは」是什么意思。", | ||
"你好,我的名字是西野くまです。I am from Tokyo, 日本の首都。今天的天气非常好,sky is clear and sunny。おはようございます、皆さん!我们一起来学习吧。Learning languages can be fun and exciting。昨日はとても忙しかったので、今日は少しリラックスしたいです。Let's take a break and enjoy some coffee。中文、日本語、and English are three distinct languages, each with its own unique charm。希望我们能一起进步,一起成长。Let's keep studying and improving our language skills together. ありがとう!", | ||
"你好,今日はどこへ行きますか?", | ||
"我的名字是田中さんです。", | ||
"我喜欢吃寿司和拉面おいしいです。", | ||
"今天の天気はとてもいいですね。", | ||
"我在学习日本語少し難しいです。", | ||
"日语真是おもしろい啊", | ||
"你喜欢看アニメ吗?", | ||
"我想去日本旅行、特に京都に行きたいです。", | ||
"昨天見た映画はとても感動的でした。", | ||
"我朋友是日本人、彼はとても優しいです。", | ||
"我们一起去カラオケ吧、楽しそうです。", | ||
"你今天吃了什么、朝ごはんは何ですか?", | ||
"我的家在北京、でも、仕事で東京に住んでいます。", | ||
"我在学做日本料理、日本料理を作るのを習っています。", | ||
"你会说几种语言、何ヶ国語話せますか?", | ||
"我昨天看了一本书、その本はとても面白かったです。", | ||
"我们一起去逛街、買い物に行きましょう。", | ||
"你最近好吗、最近どうですか?", | ||
"我在学做日本料理와 한국 요리、日本料理を作るのを習っています。", | ||
"你会说几种语言、何ヶ国語話せますか?몇 개 언어를 할 수 있어요?", | ||
"我昨天看了一本书、その本はとても面白かったです。어제 책을 읽었는데, 정말 재미있었어요。", | ||
"我们一起去逛街와 쇼핑、買い物に行きましょう。쇼핑하러 가요。", | ||
"你最近好吗、最近どうですか?요즘 어떻게 지내요?", | ||
] | ||
|
||
for text in texts: | ||
substr_list = split(text, verbose=True) | ||
for index, substr in enumerate(substr_list): | ||
print(f"{substr.lang}|{index}: {substr.text}") | ||
print("----------------------") | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
from .split.splitter import split |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
from langdetect import detect | ||
import fast_langdetect | ||
|
||
# Normalises detector output to the language tags used by the splitter.
# Codes that are not listed here are mapped to "en" by the callers, which
# look values up with ``lang_map.get(code, "en")``.
lang_map = {
    "zh": "zh",
    "zh-cn": "zh",
    # "x" is the placeholder tag that the splitter later treats as
    # "language still undecided" and fills in from neighbouring substrings.
    "zh-tw": "x",
    "ko": "ko",
    "ja": "ja",
}
|
||
|
||
def detect_lang(text: str) -> str:
    """Return the language code that ``langdetect.detect`` reports for *text*.

    The code is converted to ``str`` and lower-cased. Any exception raised by
    ``detect`` is propagated to the caller unchanged.
    """
    return str(detect(text)).lower()
|
||
|
||
def fast_detect_lang(text: str, text_len_threshold=3) -> str:
    """Detect the language of *text*, picking the detector by text length.

    Args:
        text: The string to classify.
        text_len_threshold: Texts whose length is at most this value are
            handed to ``detect_lang``; longer texts go to
            ``fast_langdetect.detect``.

    Returns:
        The lower-cased language code.
    """
    if len(text) > text_len_threshold:
        prediction = fast_langdetect.detect(text, low_memory=False)
        return str(prediction["lang"]).lower()
    # Short inputs are delegated to the langdetect-based helper.
    return detect_lang(text)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,210 @@ | ||
from typing import List | ||
from dataclasses import dataclass | ||
|
||
from langdetect.lang_detect_exception import LangDetectException | ||
from wtpsplit import SaT, WtP | ||
|
||
from langsplit.detect_lang.detector import detect_lang, fast_detect_lang, lang_map | ||
|
||
|
||
@dataclass
class SubString:
    """A piece of the input text together with the language tag assigned to it."""

    # Language tag, e.g. "zh", "ja", "ko", "en", or the placeholder "x" that
    # the merge passes replace with a neighbouring language.
    lang: str
    # The text of this piece; merge passes append to it in place.
    text: str
|
||
|
||
class SentenceSplitter:
    """Thin wrapper around a ``wtpsplit`` model used to cut text into small pieces.

    Args:
        wtp_split_model: A ``WtP`` or ``SaT`` model (or any object exposing a
            compatible ``split`` method). When omitted, a
            ``WtP("wtp-bert-mini")`` model is created.
    """

    def __init__(self, wtp_split_model: "WtP | SaT | None" = None):
        # The annotation is a string and the default is built lazily on purpose:
        # a bare ``WtP | SaT`` expression is evaluated when the ``def`` runs and
        # raises TypeError on Python 3.9 (which the package declares support
        # for), and a model instance in the signature would be constructed at
        # import time even when the caller supplies their own model.
        if wtp_split_model is None:
            wtp_split_model = WtP("wtp-bert-mini")
        self.wtp_split_model = wtp_split_model

    def split(self, text: str, threshold: float = 5e-5, verbose=False):
        """Split *text* with the wrapped model.

        Args:
            text: The text to segment.
            threshold: Forwarded to the model's ``split`` as ``threshold``.
            verbose: Forwarded to the model's ``split`` as ``verbose``.

        Returns:
            Whatever the wrapped model's ``split`` returns for *text*.
        """
        return self.wtp_split_model.split(
            text_or_texts=text, threshold=threshold, verbose=verbose
        )
|
||
|
||
# Module-level splitter built once at import time; it is the default value of
# the ``splitter`` parameter of ``split`` below.
default_sentence_splitter = SentenceSplitter()
|
||
|
||
def split(
    text: str,
    threshold: float = 5e-5,
    verbose=False,
    splitter: SentenceSplitter = default_sentence_splitter,
):
    """Split *text* into language-tagged substrings.

    The text is first cut into small pieces by *splitter*, each piece gets an
    initial language tag, and neighbouring pieces are then merged according to
    their language.

    Args:
        text (str): text to split
        threshold (float, optional): forwarded to the sentence splitter; the
            lower the value, the more (smaller) pieces are produced.
            Defaults to 5e-5.
        verbose (bool, optional): when true, print the intermediate lists
            after each stage. Defaults to False.
        splitter (SentenceSplitter, optional): splitter used for the first
            stage. Defaults to the module-level ``default_sentence_splitter``.

    Returns:
        The merged list of ``SubString`` items.
    """
    raw_pieces = splitter.split(text=text, threshold=threshold, verbose=verbose)
    if verbose:
        print(f"substr_list: {raw_pieces}")

    tagged_pieces = _init_substr_lang(raw_pieces)
    if verbose:
        print(f"substr_list: {tagged_pieces}")

    merged_pieces = _smart_concat(tagged_pieces)
    if verbose:
        print(f"split_result: {merged_pieces}")
    return merged_pieces
|
||
|
||
def _smart_concat(substr_list: List[SubString]):
    """Run ``_smart_concat_logic`` repeatedly until the result is stable.

    A pass is repeated while any block is still tagged ``"x"`` or while two
    adjacent blocks share the same language tag.
    """
    while True:
        substr_list = _smart_concat_logic(substr_list)
        has_unresolved = any(block.lang == "x" for block in substr_list)
        has_adjacent_same = any(
            current.lang == following.lang
            for current, following in zip(substr_list, substr_list[1:])
        )
        if not (has_unresolved or has_adjacent_same):
            return substr_list
|
||
|
||
def _init_substr_lang(substr: List[str]) -> List[SubString]:
    """Attach an initial language tag to every raw piece of text.

    Each piece is classified with ``detect_lang`` and normalised through
    ``lang_map`` (unlisted codes become ``"en"``). When detection raises
    ``LangDetectException`` the tag of the previous piece is reused.
    """
    tagged = []
    previous_lang = ""
    for piece in substr:
        try:
            detected = detect_lang(piece)
        except LangDetectException:
            detected = previous_lang
        # NOTE: the fallback value is normalised through lang_map as well, so
        # a previous tag that is not a key of the map ("x", or the initial "")
        # ends up as "en".
        previous_lang = lang_map.get(detected, "en")
        tagged.append(SubString(previous_lang, piece))
    return tagged
|
||
|
||
def _merge_middle_substr_to_two_side(substr_list: List[SubString]):
    """Re-tag a block that is sandwiched between two blocks of the same language.

    For every window of three consecutive blocks whose outer blocks share a
    language other than ``"x"``, the middle block takes that language when its
    text is at most one character long or its own tag is ``"x"``. Blocks are
    modified in place and the same list is returned.
    """
    windows = zip(substr_list, substr_list[1:], substr_list[2:])
    for left, middle, right in windows:
        if left.lang != right.lang or left.lang == "x":
            continue
        if len(middle.text) <= 1 or middle.lang == "x":
            middle.lang = left.lang
    return substr_list
|
||
|
||
def _merge_two_side_substr_to_near(concat_result: List[SubString]):
    """Resolve the language tag of the first and last block from their neighbours.

    * A first block tagged ``"x"`` takes the tag of the first block in the
      list that is not ``"x"`` (it is left unchanged when there is none).
    * Otherwise, a first block of at most one character takes the nearest
      resolved tag to its right.
    * A last block tagged ``"x"`` takes the nearest resolved tag to its left.

    Blocks are modified in place and the same list is returned. An empty list
    is returned unchanged.
    """
    # Nothing to inspect; indexing [0] / [-1] below would raise IndexError.
    if not concat_result:
        return concat_result

    if concat_result[0].lang == "x":
        for substr in concat_result:
            if substr.lang != "x":
                concat_result[0].lang = substr.lang
                break
    elif len(concat_result[0].text) <= 1:
        concat_result[0].lang = _find_nearest_lang_with_direction(
            concat_result, 0, is_left=False
        )
    if concat_result[-1].lang == "x":
        concat_result[-1].lang = _find_nearest_lang_with_direction(
            concat_result, len(concat_result) - 1, is_left=True
        )
    return concat_result
|
||
|
||
def _fill_missing_languages(concat_result: List[SubString]):
    """Replace every ``"x"`` tag with the nearest resolved neighbouring language.

    The first block looks to the right, the last block looks to the left, and
    blocks in between use ``_get_find_direction`` to choose the side. Blocks
    are modified in place and the same list is returned.
    """
    last_index = len(concat_result) - 1
    for index, substr in enumerate(concat_result):
        if substr.lang != "x":
            continue
        if index == 0:
            # Head of the list: only right-hand neighbours exist.
            look_left = False
        elif index == last_index:
            # Tail of the list: only left-hand neighbours exist.
            look_left = True
        else:
            # Interior block: side is chosen by the direction rule.
            look_left = _get_find_direction(concat_result, index)
        substr.lang = _find_nearest_lang_with_direction(
            concat_result, index, look_left
        )
    return concat_result
|
||
|
||
def _find_nearest_lang_with_direction(
    concat_result: List[SubString], index: int, is_left: bool
):
    """Return the tag of the closest block on one side that is not ``"x"``.

    Args:
        concat_result: The list of blocks to search.
        index: Position to search from (the block at *index* is not examined).
        is_left: Search towards the start of the list when true, towards the
            end otherwise.

    Returns:
        The first tag found that differs from ``"x"``, or ``"en"`` when the
        chosen side has none.
    """
    size = len(concat_result)
    distances = range(1, size)
    if is_left:
        positions = (index - step for step in distances if index - step >= 0)
    else:
        positions = (index + step for step in distances if index + step < size)
    for position in positions:
        candidate = concat_result[position].lang
        if candidate != "x":
            return candidate
    return "en"
|
||
|
||
def _get_find_direction(substr_list: List[SubString], index: int) -> bool:
    """Decide on which side of ``substr_list[index]`` to look for a language.

    Returns:
        ``True`` to look left, ``False`` to look right. The first block always
        looks right and the last block always looks left. An interior block
        looks left when its left neighbour's text is shorter than its right
        neighbour's, or when the right neighbour is not tagged ``"ja"`` or
        ``"zh"``.
    """
    if index == 0:
        return False
    if index == len(substr_list) - 1:
        return True

    left_neighbour = substr_list[index - 1]
    right_neighbour = substr_list[index + 1]
    left_is_shorter = len(left_neighbour.text) < len(right_neighbour.text)
    right_is_ja_or_zh = right_neighbour.lang in ["ja", "zh"]
    return left_is_shorter or not right_is_ja_or_zh
|
||
|
||
def _merge_blocks(concat_result: List[SubString]):
    """Collapse runs of consecutive blocks that share a language tag.

    The first block of each run is kept and the text of the following blocks
    in the run is appended to it in place. A new list holding the kept blocks
    is returned.
    """
    merged = []
    previous_lang = ""
    for block in concat_result:
        if block.lang == previous_lang:
            # Same language as the block before: extend the run's first block.
            merged[-1].text += block.text
        else:
            merged.append(block)
        previous_lang = block.lang
    return merged
|
||
|
||
def _check_languages(lang_text_list: List[SubString]):
    """Re-detect the language of every block and update its tag in place.

    Each block's text is classified with ``fast_detect_lang`` (falling back to
    ``"en"`` on ``LangDetectException``) and normalised through ``lang_map``.
    A ``"ko"`` result is checked a second time with ``text_len_threshold=0``;
    if that second check disagrees, the tag is taken from the nearest resolved
    neighbour instead. A resulting ``"x"`` never overwrites the existing tag.
    """
    for position, block in enumerate(lang_text_list):
        try:
            detected = fast_detect_lang(block.text)
        except LangDetectException:
            detected = "en"
        resolved = lang_map.get(detected, "en")

        if (
            resolved == "ko"
            and fast_detect_lang(block.text, text_len_threshold=0) != "ko"
        ):
            look_left = _get_find_direction(lang_text_list, position)
            resolved = _find_nearest_lang_with_direction(
                lang_text_list, position, look_left
            )

        if resolved == "x":
            continue
        block.lang = resolved
    return lang_text_list
|
||
|
||
def _smart_concat_logic(lang_text_list: List[SubString]):
    """Run one full merge pass over the blocks and return the result.

    The stages are applied in a fixed order; each one receives the list
    produced by the previous stage.
    """
    stages = (
        _merge_middle_substr_to_two_side,
        _merge_blocks,
        _check_languages,
        _merge_middle_substr_to_two_side,
        _fill_missing_languages,
        _merge_two_side_substr_to_near,
        _merge_blocks,
        _check_languages,
    )
    for stage in stages:
        lang_text_list = stage(lang_text_list)
    return lang_text_list
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
from setuptools import setup, find_packages | ||
from os import path | ||
|
||
|
||
def packagefile(*relpath):
    """Join *relpath* onto the directory that contains this file."""
    here = path.dirname(__file__)
    return path.join(here, *relpath)
|
||
|
||
def read(*relpath):
    """Return the UTF-8 text of the file located by ``packagefile(*relpath)``."""
    target = packagefile(*relpath)
    with open(target, encoding="utf-8") as handle:
        contents = handle.read()
    return contents
|
||
|
||
# Package metadata for the ``langsplit`` distribution.
setup(
    name="langsplit",
    version="0.1.0",
    description="A package for splitting sentences by language (concatenating over-split substrings based on their language)",
    # README.md is read from the directory containing this file.
    long_description=read("README.md"),
    long_description_content_type="text/markdown",
    url="https://github.com/DoodleBears/langsplit",
    author="DoodleBear",
    author_email="yangmufeng233@gmail.com",
    packages=find_packages(),
    # Runtime dependencies imported by the detector and splitter modules.
    install_requires=[
        "langdetect",
        "fast_langdetect",
        "wtpsplit",
    ],
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
    ],
    python_requires=">=3.9",
)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
from langsplit import split | ||
|
||
# Sample inputs mixing Chinese, Japanese, Korean and English; one sentence per
# list entry. Every entry must end with a comma: adjacent string literals
# without one are silently concatenated into a single entry.
texts = [
    "我是 VGroupChatBot,一个旨在支持多人通信的助手,通过可视化消息来帮助团队成员更好地交流。我可以帮助团队成员更好地整理和共享信息,特别是在讨论、会议和Brainstorming等情况下。你好我的名字是西野くまですmy name is bob很高兴认识你どうぞよろしくお願いいたします「こんにちは」是什么意思。",
    "你好,我的名字是西野くまです。I am from Tokyo, 日本の首都。今天的天气非常好,sky is clear and sunny。おはようございます、皆さん!我们一起来学习吧。Learning languages can be fun and exciting。昨日はとても忙しかったので、今日は少しリラックスしたいです。Let's take a break and enjoy some coffee。中文、日本語、and English are three distinct languages, each with its own unique charm。希望我们能一起进步,一起成长。Let's keep studying and improving our language skills together. ありがとう!",
    "你好,今日はどこへ行きますか?",
    "我的名字是田中さんです。",
    "我喜欢吃寿司和拉面、おいしいです。",
    "我喜欢吃寿司和拉面おいしいです。",
    "今天の天気はとてもいいですね。",
    "我在学习日本語、少し難しいです。",
    "我在学习日本語少し難しいです。",
    "日语真是おもしろい啊",
    "你喜欢看アニメ吗?",
    "我想去日本旅行、特に京都に行きたいです。",
    "昨天見た映画はとても感動的でした。",
    "我朋友是日本人、彼はとても優しいです。",
    "我们一起去カラオケ吧、楽しそうです。",
    "你今天吃了什么、朝ごはんは何ですか?",
    "我的家在北京、でも、仕事で東京に住んでいます。",
    "我喜欢读书、本を読むのが好きです。",
    "这个周末、一緒に公園へ行きましょうか?",
    "你的猫很可爱、あなたの猫はかわいいです。",
    "我在学做日本料理、日本料理を作るのを習っています。",
    "你会说几种语言、何ヶ国語話せますか?",
    "我昨天看了一本书、その本はとても面白かったです。",
    "我们一起去逛街、買い物に行きましょう。",
    "你最近好吗、最近どうですか?",
    "我在学做日本料理와 한국 요리、日本料理を作るのを習っています。",
    "你会说几种语言、何ヶ国語話せますか?몇 개 언어를 할 수 있어요?",
    "我昨天看了一本书、その本はとても面白かったです。어제 책을 읽었는데, 정말 재미있었어요。",
    "我们一起去逛街와 쇼핑、買い物に行きましょう。쇼핑하러 가요。",
    "你最近好吗、最近どうですか?요즘 어떻게 지내요?",
]
|
||
# Split every sample sentence and print each resulting piece with its
# language tag and position, followed by a separator line per sentence.
for text in texts:
    pieces = split(text, verbose=True)
    for position, piece in enumerate(pieces):
        print(f"{piece.lang}|{position}: {piece.text}")
    print("----------------------")