From 4c08e7030d63da500054480afe5b8f6808ad40ae Mon Sep 17 00:00:00 2001
From: Jean-Marc SEVIN <jean-marc.sevin@cri-paris.org>
Date: Mon, 16 Jun 2025 11:54:08 +0200
Subject: [PATCH 1/4] Add more supported languages to text stats

---
 welearn_datastack/constants.py              | 28 +++++++++++++++++++++
 welearn_datastack/utils_/text_stat_utils.py |  6 ++---
 2 files changed, 31 insertions(+), 3 deletions(-)

diff --git a/welearn_datastack/constants.py b/welearn_datastack/constants.py
index 184bbf6..95b4721 100644
--- a/welearn_datastack/constants.py
+++ b/welearn_datastack/constants.py
@@ -116,7 +116,15 @@
 
 DICT_READING_SPEEDS_LANG = {
     "en": 228,
+    "de": 179,
+    "es": 218,
     "fr": 195,
+    "ja": 193,  # Japanese: the ISO 639-1 language code is "ja" ("jp" is the country code)
+    "pt": 181,
+    "ar": 138,
+    "it": 188,
+    "nl": 202,
+    "zh": 158,
 }
 
 FLESCH_KINCAID_CONSTANTS = {
@@ -125,11 +133,31 @@
         "fre_sentence_length": 1.015,
         "fre_syll_per_word": 84.6,
     },
+    "de": {
+        "fre_base": 180,
+        "fre_sentence_length": 1,
+        "fre_syll_per_word": 58.5,
+    },
+    "es": {
+        "fre_base": 206.84,
+        "fre_sentence_length": 1.02,
+        "fre_syll_per_word": 60.0,
+    },
     "fr": {
         "fre_base": 207,
         "fre_sentence_length": 1.015,
         "fre_syll_per_word": 73.6,
     },
+    "it": {
+        "fre_base": 217,
+        "fre_sentence_length": 1.3,
+        "fre_syll_per_word": 60.0,
+    },
+    "nl": {
+        "fre_base": 206.835,
+        "fre_sentence_length": 0.93,
+        "fre_syll_per_word": 77,
+    },
 }
 
 ANTI_URL_REGEX = r"\(?((www)|((https?|ftp|file):\/\/))[-A-Za-z0-9+&@#/%?=~_|!:,.;]*[-A-Za-z0-9+&@#/%=~_|]\)?"
diff --git a/welearn_datastack/utils_/text_stat_utils.py b/welearn_datastack/utils_/text_stat_utils.py
index 04ae8f2..f60bfad 100644
--- a/welearn_datastack/utils_/text_stat_utils.py
+++ b/welearn_datastack/utils_/text_stat_utils.py
@@ -123,7 +123,7 @@ def predict_readability(text: str, lang: str) -> str:
 
     Args:
         text (str): text to evaluate
-        lang (str): 'en' or 'fr'
+        lang (str): supported language code, e.g. 'en', 'fr', 'de', 'es', 'it', 'nl'
 
     Returns:
         float: flesch reading ease score
@@ -149,7 +149,7 @@ def predict_duration(text: str, lang: str) -> str:
 
     Args:
         text (str): text for which to evaluate reading time
-        lang (code): 'en', 'fr', 'es'...
+        lang (str): language code, e.g. 'en', 'fr', 'de', 'es', 'it', 'nl', 'pt', 'ar', 'zh'; codes missing from DICT_READING_SPEEDS_LANG use a default speed
 
     Returns:
         int: number of seconds necessary to read text
@@ -159,7 +159,7 @@ def predict_duration(text: str, lang: str) -> str:
     if lang in DICT_READING_SPEEDS_LANG:
         speed = DICT_READING_SPEEDS_LANG[lang]
     else:
-        speed = DICT_READING_SPEEDS_LANG["en"]  # default reading speed
+        speed = 184  # average of reading speeds
     ret = int(n_words / speed * 60)
     return str(ret)
 

From 7aa8839dc868c9c89c170c921bfbfbfd16fc4f54 Mon Sep 17 00:00:00 2001
From: Jean-Marc SEVIN <jean-marc.sevin@cri-paris.org>
Date: Mon, 16 Jun 2025 14:22:12 +0200
Subject: [PATCH 2/4] Update readme for poetry usage

---
 README.md | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index f95baa1..d5f430e 100644
--- a/README.md
+++ b/README.md
@@ -32,16 +32,16 @@ You need te precreate each collections you gonna need. Their form is :
 
 ## Setup
 ### Requirements
-- Python 3.12
+- **Python** (version >= 3.12)
+- **Poetry** (version >= 2.1)
 - One relationnal database (We use a [PostgreSQL](https://www.postgresql.org/) one)
 - One [qdrant](https://qdrant.tech/) instance
 
+
 ### Setup Environment
-Create a virtual environment and install the requirements
+Install the dependencies using Poetry
 ```bash
-python3 -m venv venv
-source venv/bin/activate
-pip install -r requirements.txt
+poetry install
 ```
 
 Then create a file .env

From 478774fb1b8fa81a86cc0ad0777218e877044f56 Mon Sep 17 00:00:00 2001
From: Jean-Marc SEVIN <jean-marc.sevin@cri-paris.org>
Date: Mon, 16 Jun 2025 14:39:28 +0200
Subject: [PATCH 3/4] Refactoring

---
 welearn_datastack/utils_/text_stat_utils.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/welearn_datastack/utils_/text_stat_utils.py b/welearn_datastack/utils_/text_stat_utils.py
index f60bfad..aa0f682 100644
--- a/welearn_datastack/utils_/text_stat_utils.py
+++ b/welearn_datastack/utils_/text_stat_utils.py
@@ -156,10 +156,7 @@ def predict_duration(text: str, lang: str) -> str:
     """
     pattern = r"\w+"
     n_words = len(re.findall(pattern, text))
-    if lang in DICT_READING_SPEEDS_LANG:
-        speed = DICT_READING_SPEEDS_LANG[lang]
-    else:
-        speed = 184  # average of reading speeds
+    speed = DICT_READING_SPEEDS_LANG.get(lang, 184)  # 184 is the average of reading speeds from https://irisreading.com/average-reading-speed-in-various-languages/
     ret = int(n_words / speed * 60)
     return str(ret)
 

From bb8b29d9f4c1b7be54ce06e4be81deb3ea1dc6bb Mon Sep 17 00:00:00 2001
From: Jean-Marc SEVIN <jean-marc.sevin@cri-paris.org>
Date: Mon, 16 Jun 2025 14:46:47 +0200
Subject: [PATCH 4/4] Fix format issue

---
 welearn_datastack/utils_/text_stat_utils.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/welearn_datastack/utils_/text_stat_utils.py b/welearn_datastack/utils_/text_stat_utils.py
index aa0f682..7350399 100644
--- a/welearn_datastack/utils_/text_stat_utils.py
+++ b/welearn_datastack/utils_/text_stat_utils.py
@@ -156,7 +156,9 @@ def predict_duration(text: str, lang: str) -> str:
     """
     pattern = r"\w+"
     n_words = len(re.findall(pattern, text))
-    speed = DICT_READING_SPEEDS_LANG.get(lang, 184)  # 184 is the average of reading speeds from https://irisreading.com/average-reading-speed-in-various-languages/
+    speed = DICT_READING_SPEEDS_LANG.get(
+        lang, 184
+    )  # 184 is the average of reading speeds from https://irisreading.com/average-reading-speed-in-various-languages/
     ret = int(n_words / speed * 60)
     return str(ret)