From 4c08e7030d63da500054480afe5b8f6808ad40ae Mon Sep 17 00:00:00 2001 From: Jean-Marc SEVIN Date: Mon, 16 Jun 2025 11:54:08 +0200 Subject: [PATCH 1/4] Add more supported languages to text stats --- welearn_datastack/constants.py | 28 +++++++++++++++++++++ welearn_datastack/utils_/text_stat_utils.py | 6 ++--- 2 files changed, 31 insertions(+), 3 deletions(-) diff --git a/welearn_datastack/constants.py b/welearn_datastack/constants.py index 184bbf6..95b4721 100644 --- a/welearn_datastack/constants.py +++ b/welearn_datastack/constants.py @@ -116,7 +116,15 @@ DICT_READING_SPEEDS_LANG = { "en": 228, + "de": 179, + "es": 218, "fr": 195, + "jp": 193, + "pt": 181, + "ar": 138, + "it": 188, + "nl": 202, + "zh": 158, } FLESCH_KINCAID_CONSTANTS = { @@ -125,11 +133,31 @@ "fre_sentence_length": 1.015, "fre_syll_per_word": 84.6, }, + "de": { + "fre_base": 180, + "fre_sentence_length": 1, + "fre_syll_per_word": 58.5, + }, + "es": { + "fre_base": 206.84, + "fre_sentence_length": 1.02, + "fre_syll_per_word": 60.0, + }, "fr": { "fre_base": 207, "fre_sentence_length": 1.015, "fre_syll_per_word": 73.6, }, + "it": { + "fre_base": 217, + "fre_sentence_length": 1.3, + "fre_syll_per_word": 60.0, + }, + "nl": { + "fre_base": 206.835, + "fre_sentence_length": 0.93, + "fre_syll_per_word": 77, + }, } ANTI_URL_REGEX = r"\(?((www)|((https?|ftp|file):\/\/))[-A-Za-z0-9+&@#/%?=~_|!:,.;]*[-A-Za-z0-9+&@#/%=~_|]\)?" diff --git a/welearn_datastack/utils_/text_stat_utils.py b/welearn_datastack/utils_/text_stat_utils.py index 04ae8f2..f60bfad 100644 --- a/welearn_datastack/utils_/text_stat_utils.py +++ b/welearn_datastack/utils_/text_stat_utils.py @@ -123,7 +123,7 @@ def predict_readability(text: str, lang: str) -> str: Args: text (str): text to evaluate - lang (str): 'en' or 'fr' + lang (str): supported language code, e.g. 
'en', 'fr', 'de', 'es', 'it', 'nl' Returns: float: flesch reading ease score @@ -149,7 +149,7 @@ def predict_duration(text: str, lang: str) -> str: Args: text (str): text for which to evaluate reading time - lang (code): 'en', 'fr', 'es'... + lang (str): supported language code, e.g. 'en', 'fr', 'de', 'es', 'it', 'nl', 'jp', 'pt', 'ar', 'zh' Returns: int: number of seconds necessary to read text @@ -159,7 +159,7 @@ def predict_duration(text: str, lang: str) -> str: if lang in DICT_READING_SPEEDS_LANG: speed = DICT_READING_SPEEDS_LANG[lang] else: - speed = DICT_READING_SPEEDS_LANG["en"] # default reading speed + speed = 184 # average of reading speeds ret = int(n_words / speed * 60) return str(ret) From 7aa8839dc868c9c89c170c921bfbfbfd16fc4f54 Mon Sep 17 00:00:00 2001 From: Jean-Marc SEVIN Date: Mon, 16 Jun 2025 14:22:12 +0200 Subject: [PATCH 2/4] Update readme for poetry usage --- README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index f95baa1..d5f430e 100644 --- a/README.md +++ b/README.md @@ -32,16 +32,16 @@ You need te precreate each collections you gonna need.
Their form is : ## Setup ### Requirements -- Python 3.12 +- **Python** (version >= 3.12) +- **Poetry** (version >= 2.1) - One relationnal database (We use a [PostgreSQL](https://www.postgresql.org/) one) - One [qdrant](https://qdrant.tech/) instance + ### Setup Environment -Create a virtual environment and install the requirements +Install the dependencies using Poetry ```bash -python3 -m venv venv -source venv/bin/activate -pip install -r requirements.txt +poetry install ``` Then create a file .env From 478774fb1b8fa81a86cc0ad0777218e877044f56 Mon Sep 17 00:00:00 2001 From: Jean-Marc SEVIN Date: Mon, 16 Jun 2025 14:39:28 +0200 Subject: [PATCH 3/4] Refactoring --- welearn_datastack/utils_/text_stat_utils.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/welearn_datastack/utils_/text_stat_utils.py b/welearn_datastack/utils_/text_stat_utils.py index f60bfad..aa0f682 100644 --- a/welearn_datastack/utils_/text_stat_utils.py +++ b/welearn_datastack/utils_/text_stat_utils.py @@ -156,10 +156,7 @@ def predict_duration(text: str, lang: str) -> str: """ pattern = r"\w+" n_words = len(re.findall(pattern, text)) - if lang in DICT_READING_SPEEDS_LANG: - speed = DICT_READING_SPEEDS_LANG[lang] - else: - speed = 184 # average of reading speeds + speed = DICT_READING_SPEEDS_LANG.get(lang, 184) # 184 is the average of reading speeds from https://irisreading.com/average-reading-speed-in-various-languages/ ret = int(n_words / speed * 60) return str(ret) From bb8b29d9f4c1b7be54ce06e4be81deb3ea1dc6bb Mon Sep 17 00:00:00 2001 From: Jean-Marc SEVIN Date: Mon, 16 Jun 2025 14:46:47 +0200 Subject: [PATCH 4/4] Fix format issue --- welearn_datastack/utils_/text_stat_utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/welearn_datastack/utils_/text_stat_utils.py b/welearn_datastack/utils_/text_stat_utils.py index aa0f682..7350399 100644 --- a/welearn_datastack/utils_/text_stat_utils.py +++ b/welearn_datastack/utils_/text_stat_utils.py @@ -156,7 
+156,9 @@ def predict_duration(text: str, lang: str) -> str: """ pattern = r"\w+" n_words = len(re.findall(pattern, text)) - speed = DICT_READING_SPEEDS_LANG.get(lang, 184) # 184 is the average of reading speeds from https://irisreading.com/average-reading-speed-in-various-languages/ + speed = DICT_READING_SPEEDS_LANG.get( + lang, 184 + ) # 184 is the average of reading speeds from https://irisreading.com/average-reading-speed-in-various-languages/ ret = int(n_words / speed * 60) return str(ret)