Skip to content

Commit

Permalink
version(v0.1.0): init langsplit
Browse files Browse the repository at this point in the history
  • Loading branch information
DoodleBears committed Jun 28, 2024
1 parent 4355530 commit 349e1d2
Show file tree
Hide file tree
Showing 7 changed files with 369 additions and 0 deletions.
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
__pycache__
build/
dist/
langsplit.egg-info/
56 changes: 56 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
# `langsplit`

splitting sentences by language (concatenating over-split substrings based on their language)

# Motivation
1. TTS (Text-To-Speech) models often fail on multi-language sentences; splitting a sentence by language first yields better results
2. Existing NLP toolkits (e.g. SpaCy) are helpful for parsing text in a single language, but they struggle with multi-language text like the example below:

```
你最近好吗、最近どうですか?요즘 어떻게 지내요?sky is clear and sunny。
```

# Usage

## Installation

You can install the package using pip:

```bash
pip install langsplit
```

```python
from langsplit import split

texts = [
"我是 VGroupChatBot,一个旨在支持多人通信的助手,通过可视化消息来帮助团队成员更好地交流。我可以帮助团队成员更好地整理和共享信息,特别是在讨论、会议和Brainstorming等情况下。你好我的名字是西野くまですmy name is bob很高兴认识你どうぞよろしくお願いいたします「こんにちは」是什么意思。",
"你好,我的名字是西野くまです。I am from Tokyo, 日本の首都。今天的天气非常好,sky is clear and sunny。おはようございます、皆さん!我们一起来学习吧。Learning languages can be fun and exciting。昨日はとても忙しかったので、今日は少しリラックスしたいです。Let's take a break and enjoy some coffee。中文、日本語、and English are three distinct languages, each with its own unique charm。希望我们能一起进步,一起成长。Let's keep studying and improving our language skills together. ありがとう!",
"你好,今日はどこへ行きますか?",
"我的名字是田中さんです。",
"我喜欢吃寿司和拉面おいしいです。",
"今天の天気はとてもいいですね。",
"我在学习日本語少し難しいです。",
"日语真是おもしろい啊",
"你喜欢看アニメ吗?",
"我想去日本旅行、特に京都に行きたいです。",
"昨天見た映画はとても感動的でした。" "我朋友是日本人、彼はとても優しいです。",
"我们一起去カラオケ吧、楽しそうです。",
"你今天吃了什么、朝ごはんは何ですか?",
"我的家在北京、でも、仕事で東京に住んでいます。",
"我在学做日本料理、日本料理を作るのを習っています。",
"你会说几种语言、何ヶ国語話せますか?",
"我昨天看了一本书、その本はとても面白かったです。",
"我们一起去逛街、買い物に行きましょう。",
"你最近好吗、最近どうですか?",
"我在学做日本料理와 한국 요리、日本料理を作るのを習っています。",
"你会说几种语言、何ヶ国語話せますか?몇 개 언어를 할 수 있어요?",
"我昨天看了一本书、その本はとても面白かったです。어제 책을 읽었는데, 정말 재미있었어요。",
"我们一起去逛街와 쇼핑、買い物に行きましょう。쇼핑하러 가요。",
"你最近好吗、最近どうですか?요즘 어떻게 지내요?",
]

for text in texts:
substr_list = split(text, verbose=True)
for index, substr in enumerate(substr_list):
print(f"{substr.lang}|{index}: {substr.text}")
print("----------------------")
```
1 change: 1 addition & 0 deletions langsplit/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .split.splitter import split
24 changes: 24 additions & 0 deletions langsplit/detect_lang/detector.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
from langdetect import detect
import fast_langdetect

# Normalize detector output codes to the small set this package works with.
# Codes missing from this map fall back to "en" at call sites via
# `lang_map.get(code, "en")` (see splitter._init_substr_lang / _check_languages).
# "x" is a placeholder meaning "ambiguous / classify later from neighbors".
lang_map = {
    "zh": "zh",
    "zh-cn": "zh",
    # NOTE(review): Traditional Chinese is deliberately mapped to the "x"
    # placeholder so it gets re-resolved from context — confirm this is intended.
    "zh-tw": "x",
    "ko": "ko",
    "ja": "ja",
}


def detect_lang(text: str) -> str:
    """Return the language code detected by `langdetect`, lower-cased (e.g. "zh-cn")."""
    return str(detect(text)).lower()


def fast_detect_lang(text: str, text_len_threshold=3) -> str:
    """Return the language code detected by `fast_langdetect`, lower-cased.

    Texts of at most `text_len_threshold` characters are routed to
    `detect_lang` instead — NOTE(review): presumably because the fast
    detector is unreliable on very short inputs; confirm.
    """
    if len(text) <= text_len_threshold:
        return detect_lang(text)
    detection = fast_langdetect.detect(text, low_memory=False)
    return str(detection["lang"]).lower()
210 changes: 210 additions & 0 deletions langsplit/split/splitter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,210 @@
from typing import List
from dataclasses import dataclass

from langdetect.lang_detect_exception import LangDetectException
from wtpsplit import SaT, WtP

from langsplit.detect_lang.detector import detect_lang, fast_detect_lang, lang_map


@dataclass
class SubString:
    """A piece of the input text together with its detected language code."""

    # Language code such as "zh", "ja", "ko", "en", or "x" for not-yet-classified.
    lang: str
    # The raw substring content.
    text: str


class SentenceSplitter:
    """Thin wrapper around a `wtpsplit` sentence-segmentation model."""

    def __init__(self, wtp_split_model: "WtP | SaT | None" = None):
        """Create a splitter.

        Args:
            wtp_split_model: a pre-constructed `WtP` or `SaT` model.  When
                None (the default), `WtP("wtp-bert-mini")` is created here,
                lazily.  The original signature built the model inside the
                default argument, which (a) loaded it at class-definition
                (import) time even if never used, and (b) evaluated the
                `WtP | SaT` union annotation at runtime, which fails on
                Python 3.9 — the minimum declared in setup.py.  A string
                annotation plus a None sentinel fixes both while keeping
                `SentenceSplitter()` and `SentenceSplitter(model)` working.
        """
        if wtp_split_model is None:
            wtp_split_model = WtP("wtp-bert-mini")
        self.wtp_split_model = wtp_split_model

    def split(self, text: str, threshold: float = 5e-5, verbose=False):
        """Segment `text` into sentence fragments via the underlying model.

        Args:
            text: the text to segment.
            threshold: wtpsplit threshold; lower values yield more, smaller
                fragments.
            verbose: forwarded to the wtpsplit model.
        """
        return self.wtp_split_model.split(
            text_or_texts=text, threshold=threshold, verbose=verbose
        )


# Shared module-level splitter used by `split` when callers do not supply one.
# NOTE(review): constructed at import time, so the wtpsplit model is loaded
# as soon as this module is imported.
default_sentence_splitter = SentenceSplitter()


def split(
    text: str,
    threshold: float = 5e-5,
    verbose=False,
    splitter: SentenceSplitter = default_sentence_splitter,
):
    """Split `text` into language-tagged substrings.

    Pipeline:
      1. `wtpsplit` over-splits the text into small fragments,
      2. each fragment gets an initial language guess (`_init_substr_lang`),
      3. fragments are merged and re-tagged by language (`_smart_concat`).

    Args:
        text (str): text to split.
        threshold (float, optional): wtpsplit threshold; the lower it is, the
            more (smaller) fragments are produced. Defaults to 5e-5.
        verbose: when True, print the intermediate result of every stage.
        splitter: sentence splitter to use; defaults to the shared
            module-level instance.

    Returns:
        A list of `SubString` objects covering the input text.
    """
    substr_list = splitter.split(text=text, threshold=threshold, verbose=verbose)
    for stage in (_init_substr_lang, _smart_concat):
        if verbose:
            print(f"substr_list: {substr_list}")
        substr_list = stage(substr_list)
    if verbose:
        print(f"split_result: {substr_list}")
    return substr_list


def _smart_concat(substr_list: List[SubString]):
    """Run the concat pipeline until the result is stable.

    The result is considered complete when no substring is still tagged "x"
    and no two adjacent substrings share the same language.
    """
    while True:
        substr_list = _smart_concat_logic(substr_list)
        no_unknown = all(s.lang != "x" for s in substr_list)
        no_adjacent_dupes = all(
            a.lang != b.lang for a, b in zip(substr_list, substr_list[1:])
        )
        if no_unknown and no_adjacent_dupes:
            return substr_list


def _init_substr_lang(substr: List[str]) -> List[SubString]:
    """Attach an initial (normalized) language guess to every raw substring."""
    tagged: List[SubString] = []
    prev_lang = ""
    for piece in substr:
        try:
            guess = detect_lang(piece)
        except LangDetectException:
            # Detection can fail (e.g. punctuation-only pieces): inherit the
            # previous piece's language instead.
            guess = prev_lang
        prev_lang = lang_map.get(guess, "en")
        tagged.append(SubString(prev_lang, piece))
    return tagged


def _merge_middle_substr_to_two_side(substr_list: List[SubString]):
    """Absorb sandwiched fragments into same-language neighbors.

    A substring flanked on both sides by the same (known) language adopts
    that language when it is unclassified ("x") or at most one character.
    """
    for i in range(len(substr_list) - 2):
        left, mid, right = substr_list[i], substr_list[i + 1], substr_list[i + 2]
        same_known_sides = left.lang == right.lang and left.lang != "x"
        if same_known_sides and (mid.lang == "x" or len(mid.text) <= 1):
            mid.lang = left.lang
    return substr_list


def _merge_two_side_substr_to_near(concat_result: List[SubString]):
    """Resolve the language of the first and last substrings.

    An unclassified head takes the first known language anywhere to its
    right; a single-character head takes the nearest right-hand language.
    An unclassified tail takes the nearest left-hand language.
    """
    head = concat_result[0]
    if head.lang == "x":
        # First concrete language in the list (the head itself is "x").
        for candidate in concat_result:
            if candidate.lang != "x":
                head.lang = candidate.lang
                break
    elif len(head.text) <= 1:
        head.lang = _find_nearest_lang_with_direction(
            concat_result, 0, is_left=False
        )
    tail = concat_result[-1]
    if tail.lang == "x":
        tail.lang = _find_nearest_lang_with_direction(
            concat_result, len(concat_result) - 1, is_left=True
        )
    return concat_result


def _fill_missing_languages(concat_result: List[SubString]):
    """Give every still-unclassified ("x") substring a concrete language."""
    last = len(concat_result) - 1
    for idx, substr in enumerate(concat_result):
        if substr.lang != "x":
            continue
        if idx == 0:
            # Head substring: only right-hand neighbors exist.
            search_left = False
        elif idx == last:
            # Tail substring: only left-hand neighbors exist.
            search_left = True
        else:
            # Body substring: pick a side via the neighbor heuristic.
            search_left = _get_find_direction(concat_result, idx)
        substr.lang = _find_nearest_lang_with_direction(
            concat_result, idx, search_left
        )
    return concat_result


def _find_nearest_lang_with_direction(
    concat_result: List[SubString], index: int, is_left: bool
):
    """Return the language of the nearest classified substring on one side.

    Walks outward from `index` (leftward when `is_left`, else rightward) and
    returns the first language that is not "x"; falls back to "en" when the
    whole side is unclassified.
    """
    step = -1 if is_left else 1
    pos = index + step
    while 0 <= pos < len(concat_result):
        if concat_result[pos].lang != "x":
            return concat_result[pos].lang
        pos += step
    return "en"


def _get_find_direction(substr_list: List[SubString], index: int) -> bool:
    """Choose which side to borrow a language from for the substring at `index`.

    Returns True to search leftward, False to search rightward.  Edge
    substrings can only look inward; a middle substring prefers the left
    side when the left neighbor's text is shorter than the right's, or when
    the right neighbor is not tagged "ja"/"zh".
    """
    if index == 0:
        return False
    if index == len(substr_list) - 1:
        return True
    left = substr_list[index - 1]
    right = substr_list[index + 1]
    return len(left.text) < len(right.text) or right.lang not in ("ja", "zh")


def _merge_blocks(concat_result: List[SubString]):
    """Concatenate runs of adjacent substrings that share the same language."""
    merged: List[SubString] = []
    for block in concat_result:
        if merged and merged[-1].lang == block.lang:
            # Same language as the previous run: extend it in place.
            merged[-1].text += block.text
        else:
            merged.append(block)
    return merged


def _check_languages(lang_text_list: List[SubString]):
    """Re-detect each merged substring's language on its full, longer text.

    A "ko" verdict gets a second opinion with the short-text fallback
    disabled (`text_len_threshold=0`); if that disagrees, the language is
    borrowed from the nearest classified neighbor instead.  An "x" verdict
    never overwrites an existing classification.
    """
    for idx, block in enumerate(lang_text_list):
        try:
            detected = lang_map.get(fast_detect_lang(block.text), "en")
        except LangDetectException:
            detected = "en"
        if detected == "ko":
            recheck = fast_detect_lang(block.text, text_len_threshold=0)
            if recheck != "ko":
                side = _get_find_direction(lang_text_list, idx)
                detected = _find_nearest_lang_with_direction(
                    lang_text_list, idx, side
                )
        if detected != "x":
            block.lang = detected
    return lang_text_list


def _smart_concat_logic(lang_text_list: List[SubString]):
    """Run one full pass of the merge/re-detect pipeline, in fixed order."""
    pipeline = (
        _merge_middle_substr_to_two_side,
        _merge_blocks,
        _check_languages,
        _merge_middle_substr_to_two_side,
        _fill_missing_languages,
        _merge_two_side_substr_to_near,
        _merge_blocks,
        _check_languages,
    )
    for stage in pipeline:
        lang_text_list = stage(lang_text_list)
    return lang_text_list
35 changes: 35 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
from setuptools import setup, find_packages
from os import path


def packagefile(*relpath):
    """Return the path to *relpath* resolved against this file's directory."""
    base = path.dirname(__file__)
    return path.join(base, *relpath)


def read(*relpath):
    """Read and return the UTF-8 text of a file addressed relative to setup.py."""
    with open(packagefile(*relpath), encoding="utf-8") as fh:
        return fh.read()


setup(
    # Distribution metadata for the `langsplit` package.
    name="langsplit",
    version="0.1.0",
    description="A package for splitting sentences by language (concatenating over-split substrings based on their language)",
    # The PyPI long description is taken directly from the README.
    long_description=read("README.md"),
    long_description_content_type="text/markdown",
    url="https://github.com/DoodleBears/langsplit",
    author="DoodleBear",
    author_email="yangmufeng233@gmail.com",
    # Ship every package found in the repository (langsplit and subpackages).
    packages=find_packages(),
    install_requires=[
        "langdetect",
        "fast_langdetect",
        "wtpsplit",
    ],
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
    ],
    # NOTE(review): splitter.py evaluates a `WtP | SaT` union annotation at
    # runtime, which needs Python 3.10 — confirm 3.9 actually works.
    python_requires=">=3.9",
)
39 changes: 39 additions & 0 deletions tests/test_split.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
from langsplit import split

# Sample inputs mixing Chinese, Japanese, Korean, and English in one sentence.
texts = [
    "我是 VGroupChatBot,一个旨在支持多人通信的助手,通过可视化消息来帮助团队成员更好地交流。我可以帮助团队成员更好地整理和共享信息,特别是在讨论、会议和Brainstorming等情况下。你好我的名字是西野くまですmy name is bob很高兴认识你どうぞよろしくお願いいたします「こんにちは」是什么意思。",
    "你好,我的名字是西野くまです。I am from Tokyo, 日本の首都。今天的天气非常好,sky is clear and sunny。おはようございます、皆さん!我们一起来学习吧。Learning languages can be fun and exciting。昨日はとても忙しかったので、今日は少しリラックスしたいです。Let's take a break and enjoy some coffee。中文、日本語、and English are three distinct languages, each with its own unique charm。希望我们能一起进步,一起成长。Let's keep studying and improving our language skills together. ありがとう!",
    "你好,今日はどこへ行きますか?",
    "我的名字是田中さんです。",
    "我喜欢吃寿司和拉面、おいしいです。",
    "我喜欢吃寿司和拉面おいしいです。",
    "今天の天気はとてもいいですね。",
    "我在学习日本語、少し難しいです。",
    "我在学习日本語少し難しいです。",
    "日语真是おもしろい啊",
    "你喜欢看アニメ吗?",
    "我想去日本旅行、特に京都に行きたいです。",
    # NOTE(review): the two adjacent string literals below are implicitly
    # concatenated into ONE list element — likely a missing comma; confirm intent.
    "昨天見た映画はとても感動的でした。" "我朋友是日本人、彼はとても優しいです。",
    "我们一起去カラオケ吧、楽しそうです。",
    "你今天吃了什么、朝ごはんは何ですか?",
    "我的家在北京、でも、仕事で東京に住んでいます。",
    "我喜欢读书、本を読むのが好きです。",
    "这个周末、一緒に公園へ行きましょうか?",
    "你的猫很可爱、あなたの猫はかわいいです。",
    "我在学做日本料理、日本料理を作るのを習っています。",
    "你会说几种语言、何ヶ国語話せますか?",
    "我昨天看了一本书、その本はとても面白かったです。",
    "我们一起去逛街、買い物に行きましょう。",
    "你最近好吗、最近どうですか?",
    "我在学做日本料理와 한국 요리、日本料理を作るのを習っています。",
    "你会说几种语言、何ヶ国語話せますか?몇 개 언어를 할 수 있어요?",
    "我昨天看了一本书、その本はとても面白かったです。어제 책을 읽었는데, 정말 재미있었어요。",
    "我们一起去逛街와 쇼핑、買い物に行きましょう。쇼핑하러 가요。",
    "你最近好吗、最近どうですか?요즘 어떻게 지내요?",
]

# NOTE(review): despite living under tests/, this is a smoke script with no
# assertions — it only verifies that `split` runs without raising and prints
# each language-tagged segment for manual inspection.
for text in texts:
    substr_list = split(text, verbose=True)
    for index, substr in enumerate(substr_list):
        print(f"{substr.lang}|{index}: {substr.text}")
    print("----------------------")

0 comments on commit 349e1d2

Please sign in to comment.