Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,16 +32,16 @@ You need to precreate each collection you are going to need. Their form is:

## Setup
### Requirements
- Python 3.12
- **Python** (version >= 3.12)
- **Poetry** (version >= 2.1)
- One relational database (we use [PostgreSQL](https://www.postgresql.org/))
- One [qdrant](https://qdrant.tech/) instance


### Setup Environment
Create a virtual environment and install the requirements
Install the dependencies using Poetry
```bash
python3 -m venv venv
source venv/bin/activate
pip install -r requirements.txt
poetry install
```

Then create a `.env` file
Expand Down
28 changes: 28 additions & 0 deletions welearn_datastack/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,15 @@

# Reading speed in words per minute, keyed by language code.
# Reading-time estimation divides a word count by these values, so every
# entry must be a strictly positive number.
DICT_READING_SPEEDS_LANG = {
    "en": 228,
    "de": 179,
    "es": 218,
    "fr": 195,
    "jp": 193,  # legacy key ("jp" is a country code); kept for backward compatibility
    "ja": 193,  # ISO 639-1 language code for Japanese
    "pt": 181,
    "ar": 138,
    "it": 188,
    "nl": 202,
    "zh": 158,
}

FLESCH_KINCAID_CONSTANTS = {
Expand All @@ -125,11 +133,31 @@
"fre_sentence_length": 1.015,
"fre_syll_per_word": 84.6,
},
"de": {
"fre_base": 180,
"fre_sentence_length": 1,
"fre_syll_per_word": 58.5,
},
"es": {
"fre_base": 206.84,
"fre_sentence_length": 1.02,
"fre_syll_per_word": 60.0,
},
"fr": {
"fre_base": 207,
"fre_sentence_length": 1.015,
"fre_syll_per_word": 73.6,
},
"it": {
"fre_base": 217,
"fre_sentence_length": 1.3,
"fre_syll_per_word": 60.0,
},
"nl": {
"fre_base": 206.835,
"fre_sentence_length": 0.93,
"fre_syll_per_word": 77,
},
}

ANTI_URL_REGEX = r"\(?((www)|((https?|ftp|file):\/\/))[-A-Za-z0-9+&@#/%?=~_|!:,.;]*[-A-Za-z0-9+&@#/%=~_|]\)?"
Expand Down
11 changes: 5 additions & 6 deletions welearn_datastack/utils_/text_stat_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,7 @@ def predict_readability(text: str, lang: str) -> str:

Args:
text (str): text to evaluate
lang (str): 'en' or 'fr'
lang (str): supported language code, e.g. 'en', 'fr', 'de', 'es', 'it', 'nl'

Returns:
float: flesch reading ease score
Expand All @@ -149,17 +149,16 @@ def predict_duration(text: str, lang: str) -> str:

Args:
text (str): text for which to evaluate reading time
lang (code): 'en', 'fr', 'es'...
lang (str): supported language code, e.g. 'en', 'fr', 'de', 'es', 'it', 'nl', 'jp', 'pt', 'ar', 'zh'

Returns:
str: number of seconds necessary to read text, as a string
"""
pattern = r"\w+"
n_words = len(re.findall(pattern, text))
if lang in DICT_READING_SPEEDS_LANG:
speed = DICT_READING_SPEEDS_LANG[lang]
else:
speed = DICT_READING_SPEEDS_LANG["en"] # default reading speed
speed = DICT_READING_SPEEDS_LANG.get(
lang, 184
) # 184 is the average of reading speeds from https://irisreading.com/average-reading-speed-in-various-languages/
ret = int(n_words / speed * 60)
return str(ret)

Expand Down