Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We'll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Generate additional etymology-free dictionaries #1440

Merged
merged 7 commits into from Dec 13, 2022
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
9 changes: 3 additions & 6 deletions .github/workflows/auto-updates.yml
Expand Up @@ -91,26 +91,23 @@ jobs:
uses: svenstaro/upload-release-action@v2
with:
repo_token: ${{ secrets.TOKEN }}
file: data/${{ matrix.locale }}/dict-${{ matrix.locale }}-${{ matrix.locale }}.df.bz2
asset_name: dict-${{ matrix.locale }}-${{ matrix.locale }}.df.bz2
file: data/${{ matrix.locale }}/dict-${{ matrix.locale }}-${{ matrix.locale }}*.df.bz2
tag: ${{ matrix.locale }}
overwrite: true

- name: Upload the dictionary (Kobo)
uses: svenstaro/upload-release-action@v2
with:
repo_token: ${{ secrets.TOKEN }}
file: data/${{ matrix.locale }}/dicthtml-${{ matrix.locale }}-${{ matrix.locale }}.zip
asset_name: dicthtml-${{ matrix.locale }}-${{ matrix.locale }}.zip
file: data/${{ matrix.locale }}/dicthtml-${{ matrix.locale }}-${{ matrix.locale }}*.zip
tag: ${{ matrix.locale }}
overwrite: true

- name: Upload the dictionary (StarDict)
uses: svenstaro/upload-release-action@v2
with:
repo_token: ${{ secrets.TOKEN }}
file: data/${{ matrix.locale }}/dict-${{ matrix.locale }}-${{ matrix.locale }}.zip
asset_name: dict-${{ matrix.locale }}-${{ matrix.locale }}.zip
file: data/${{ matrix.locale }}/dict-${{ matrix.locale }}-${{ matrix.locale }}*.zip
tag: ${{ matrix.locale }}
overwrite: true

Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Expand Up @@ -2,6 +2,7 @@ setuptools>=36.2.1
beautifulsoup4==4.11.1
cachetools==5.2.0
docopt==0.6.2
Jinja2==3.1.2
marisa-trie==0.7.8
mistune==2.0.4 # for DictFile reading
pillow==9.3.0
Expand Down
194 changes: 147 additions & 47 deletions tests/test_3_convert.py
Expand Up @@ -20,16 +20,67 @@

Mis à jour le"""

WORDS = {
"empty": Word.empty(),
"foo": Word(["pron"], ["gender"], ["etyl"], ["def 1", ("sdef 1",)], []),
"foos": Word(
["pron"], ["gender"], ["etyl"], ["def 1", ("sdef 1", ("ssdef 1",))], ["baz"]
),
"baz": Word(["pron"], ["gender"], ["etyl"], ["def 1", ("sdef 1",)], ["foobar"]),
"empty1": Word([], [], [], [], ["foo"]),
"empty2": Word([], [], [], [], ["empty1"]),
"Multiple Etymologies": Word(
["pron"],
["gender"],
["etyl 1", ("setyl 1",)],
["def 1", ("sdef 1",)],
[],
),
"Multiple Etymology": Word(
["pron0"],
["gender0"],
["etyl0"],
["def 0"],
["Multiple Etymologies"],
),
"GIF": Word(
["pron"],
["gender"],
["etyl"],
[
'<img style="height:100%;max-height:0.8em;width:auto;vertical-align:bottom"'
' src="data:image/gif;base64,R0lGODdhNwAZAIEAAAAAAP///wAAAAAAACwAAAAANwAZAE'
"AIwwADCAwAAMDAgwgTKlzIUKDBgwUZFnw4cGLDihEvOjSYseFEigQtLhSpsaNGiSdTQgS5kiVG"
"lwhJeuRoMuHHkDBH1pT4cKdKmSpjUjT50efGnEWTsuxo9KbQnC1TFp051KhNpUid8tR6EijPkC"
"V3en2J9erLoBjRXl1qVS1amTWn6oSK1WfGpnjDQo1q1Wvbs125PgX5l6zctW1JFgas96/FxYwv"
'RnQsODHkyXuPDt5aVihYt5pBr9woGrJktmpNfxUYEAA7"/>'
],
["gif"],
),
}


def test_simple() -> None:
assert convert.main("fr") == 0

# Check for all dictionaries
output_dir = Path(os.environ["CWD"]) / "data" / "fr"
assert (output_dir / "dict-fr-fr.df").is_file() # DictFile
assert (output_dir / "dict-fr-fr.df.bz2").is_file() # DictFile bz2
assert (output_dir / "dict-fr-fr.zip").is_file() # StarDict
dicthtml = output_dir / "dicthtml-fr-fr.zip" # Kobo

# DictFile
assert (output_dir / "dict-fr-fr.df").is_file()
assert (output_dir / "dict-fr-fr-noetym.df").is_file()

# DictFile bz2
assert (output_dir / "dict-fr-fr.df.bz2").is_file()
assert (output_dir / "dict-fr-fr-noetym.df.bz2").is_file()

# StarDict
assert (output_dir / "dict-fr-fr.zip").is_file()
assert (output_dir / "dict-fr-fr-noetym.zip").is_file()

# Kobo
assert (output_dir / "dicthtml-fr-fr-noetym.zip").is_file()
dicthtml = output_dir / "dicthtml-fr-fr.zip"
assert dicthtml.is_file()

# Check the Kobo ZIP content
Expand Down Expand Up @@ -89,66 +140,115 @@ def test_no_json_file() -> None:

@pytest.mark.dependency()
@pytest.mark.parametrize(
"formatter, filename",
"formatter, filename, include_etymology",
[
(convert.DictFileFormat, "dict-fr-fr.df"),
(convert.KoboFormat, "dicthtml-fr-fr.zip"),
(convert.DictFileFormat, "dict-fr-fr.df", True),
(convert.DictFileFormat, "dict-fr-fr-noetym.df", False),
(convert.KoboFormat, "dicthtml-fr-fr.zip", True),
(convert.KoboFormat, "dicthtml-fr-fr-noetym.zip", False),
],
)
def test_generate_primary_dict(
formatter: Type[convert.BaseFormat], filename: str
formatter: Type[convert.BaseFormat], filename: str, include_etymology: bool
) -> None:
output_dir = Path(os.environ["CWD"]) / "data" / "fr"
words = {
"empty": Word.empty(),
"foo": Word(["pron"], ["gender"], ["etyl"], ["def 1", ("sdef 1",)], []),
"foos": Word(
["pron"], ["gender"], ["etyl"], ["def 1", ("sdef 1", ("ssdef 1",))], ["baz"]
),
"baz": Word(["pron"], ["gender"], ["etyl"], ["def 1", ("sdef 1",)], ["foobar"]),
"empty1": Word([], [], [], [], ["foo"]),
"empty2": Word([], [], [], [], ["empty1"]),
"Multiple Etymologies": Word(
["pron"],
["gender"],
["etyl 1", ("setyl 1",)],
["def 1", ("sdef 1",)],
["foobar"],
),
"GIF": Word(
["pron"],
["gender"],
["etyl"],
[
'<img style="height:100%;max-height:0.8em;width:auto;vertical-align:bottom"'
' src="data:image/gif;base64,R0lGODdhNwAZAIEAAAAAAP///wAAAAAAACwAAAAANwAZAE'
"AIwwADCAwAAMDAgwgTKlzIUKDBgwUZFnw4cGLDihEvOjSYseFEigQtLhSpsaNGiSdTQgS5kiVG"
"lwhJeuRoMuHHkDBH1pT4cKdKmSpjUjT50efGnEWTsuxo9KbQnC1TFp051KhNpUid8tR6EijPkC"
"V3en2J9erLoBjRXl1qVS1amTWn6oSK1WfGpnjDQo1q1Wvbs125PgX5l6zctW1JFgas96/FxYwv"
'RnQsODHkyXuPDt5aVihYt5pBr9woGrJktmpNfxUYEAA7"/>'
],
["GIF"],
),
}
variants = convert.make_variants(words)
convert.run_formatter(formatter, "fr", output_dir, words, variants, "20201218")
variants = convert.make_variants(WORDS)
convert.run_formatter(
formatter,
"fr",
output_dir,
WORDS,
variants,
"20201218",
include_etymology=include_etymology,
)

assert (output_dir / filename).is_file()


@pytest.mark.parametrize(
"formatter, filename",
"formatter, filename, include_etymology",
[
(convert.StarDictFormat, "dict-fr-fr.zip"),
(convert.BZ2DictFileFormat, "dict-fr-fr.df.bz2"),
(convert.StarDictFormat, "dict-fr-fr.zip", True),
(convert.StarDictFormat, "dict-fr-fr-noetym.zip", False),
(convert.BZ2DictFileFormat, "dict-fr-fr.df.bz2", True),
(convert.BZ2DictFileFormat, "dict-fr-fr-noetym.df.bz2", False),
],
)
@pytest.mark.dependency(
depends=["test_generate_primary_dict[DictFileFormat-dict-fr-fr.df]"]
depends=[
"test_generate_primary_dict[DictFileFormat-dict-fr-fr.df]",
"test_generate_primary_dict[DictFileFormat-dict-fr-fr-noetym.df]",
]
)
def test_generate_secondary_dict(
formatter: Type[convert.BaseFormat], filename: str
formatter: Type[convert.BaseFormat], filename: str, include_etymology: bool
) -> None:
output_dir = Path(os.environ["CWD"]) / "data" / "fr"
convert.run_formatter(formatter, "fr", output_dir, {}, {}, "20201218")
convert.run_formatter(
formatter,
"fr",
output_dir,
{},
{},
"20201218",
include_etymology=include_etymology,
)
assert (output_dir / filename).is_file()


FORMATTED_WORD_KOBO = """\
<w><p><a name="Multiple Etymologies"/><b>Multiple Etymologies</b> pron <i>gender</i>.<br/><br/><p>etyl 1</p><ol><li>setyl 1</li></ol><br/><ol><li>def 1</li><ol style="list-style-type:lower-alpha"><li>sdef 1</li></ol></ol></p><var><variant name="multiple etymology"/></var></w>
""" # noqa
FORMATTED_WORD_KOBO_NO_ETIMOLOGY = """\
<w><p><a name="Multiple Etymologies"/><b>Multiple Etymologies</b> pron <i>gender</i>.<br/><ol><li>def 1</li><ol style="list-style-type:lower-alpha"><li>sdef 1</li></ol></ol></p><var><variant name="multiple etymology"/></var></w>
""" # noqa
FORMATTED_WORD_DICTFILE = """\
@ Multiple Etymologies
: pron <i>gender</i>.
& Multiple Etymology
<html><p>etyl 1</p><ol><li>setyl 1</li></ol><br/><ol><li>def 1</li><ol style="list-style-type:lower-alpha"><li>sdef 1</li></ol></ol></html>\


""" # noqa
FORMATTED_WORD_DICTFILE_NO_ETIMOLOGY = """\
@ Multiple Etymologies
: pron <i>gender</i>.
& Multiple Etymology
<html><ol><li>def 1</li><ol style="list-style-type:lower-alpha"><li>sdef 1</li></ol></ol></html>\


""" # noqa


@pytest.mark.parametrize(
"formatter, include_etymology, expected",
[
(convert.KoboFormat, True, FORMATTED_WORD_KOBO),
(convert.KoboFormat, False, FORMATTED_WORD_KOBO_NO_ETIMOLOGY),
(convert.DictFileFormat, True, FORMATTED_WORD_DICTFILE),
(convert.DictFileFormat, False, FORMATTED_WORD_DICTFILE_NO_ETIMOLOGY),
],
)
def test_word_rendering(
formatter: Type[convert.BaseFormat],
include_etymology: bool,
expected: str,
) -> None:
output_dir = Path(os.environ["CWD"]) / "data" / "fr"
cls = formatter(
"fr",
output_dir,
WORDS,
convert.make_variants(WORDS),
"20221212",
include_etymology=include_etymology,
)

kwargs = (
{"name": "mu", "words": WORDS} if isinstance(cls, convert.KoboFormat) else {}
)
content = next(
cls.handle_word("Multiple Etymologies", WORDS["Multiple Etymologies"], **kwargs)
)
assert content == expected
60 changes: 6 additions & 54 deletions tests/test_4_check_word.py
Expand Up @@ -57,15 +57,7 @@ def _craft_urls(locale: str, word: str, body: str) -> str:


@responses.activate
def test_simple(
craft_urls: Callable[
[
str,
str,
],
str,
]
) -> None:
def test_simple(craft_urls: Callable[[str, str], str]) -> None:
craft_urls("fr", "42")
assert check_word.main("fr", "42") == 0

Expand All @@ -75,57 +67,25 @@ def test_word_of_the_day() -> None:


@responses.activate
def test_etymology_list(
craft_urls: Callable[
[
str,
str,
],
str,
]
) -> None:
def test_etymology_list(craft_urls: Callable[[str, str], str]) -> None:
craft_urls("fr", "bath")
assert check_word.main("fr", "bath") == 0


@responses.activate
def test_sublist(
craft_urls: Callable[
[
str,
str,
],
str,
]
) -> None:
def test_sublist(craft_urls: Callable[[str, str], str]) -> None:
    craft_urls("fr", "éperon")
    assert check_word.main("fr", "éperon") == 0


@responses.activate
def test_subsublist(
craft_urls: Callable[
[
str,
str,
],
str,
]
) -> None:
def test_subsublist(craft_urls: Callable[[str, str], str]) -> None:
craft_urls("fr", "base")
assert check_word.main("fr", "base") == 0


@responses.activate
def test_error_and_lock(
craft_urls: Callable[
[
str,
str,
],
str,
]
) -> None:
def test_error_and_lock(craft_urls: Callable[[str, str], str]) -> None:
craft_urls("fr", "42")
with patch.object(check_word, "contains", return_value=False):
assert check_word.main("fr", "42") > 0
Expand All @@ -134,15 +94,7 @@ def test_error_and_lock(


@responses.activate
def test_no_definition_nor_etymology(
craft_urls: Callable[
[
str,
str,
],
str,
]
) -> None:
def test_no_definition_nor_etymology(craft_urls: Callable[[str, str], str]) -> None:
craft_urls("fr", "<vide>")
assert check_word.main("fr", "<vide>") == 0

Expand Down
2 changes: 1 addition & 1 deletion tests/test_4_check_words.py
Expand Up @@ -22,7 +22,7 @@ def test_no_json_file() -> None:
def test_all(tmp_path: Path) -> None:
file = tmp_path / "test.json"
file.write_text('{"base":""}')
with patch("render.get_latest_json_file", return_value=file):
with patch.object(render, "get_latest_json_file", return_value=file):
assert check_words.main("fr", -1, False, "", "") == 0


Expand Down
15 changes: 0 additions & 15 deletions wikidict/constants.py
Expand Up @@ -15,21 +15,6 @@
DOWNLOAD_URL_KOBO = f"{GH_REPOS}/releases/download/{{0}}/dicthtml-{{0}}-{{0}}.zip"
DOWNLOAD_URL_STARDICT = f"{GH_REPOS}/releases/download/{{0}}/dict-{{0}}-{{0}}.zip"

# HTML formatting for each word
# TODO: move that into the dict specific class
WORD_FORMAT = """
<w>
<p>
<a name="{word}"/><b>{current_word}</b>{pronunciation}{gender}
<br/>
<br/>
{etymology}
<ol>{definitions}</ol>
</p>
{var}
</w>
"""

# Inline CSS for inline images handling <math> tags.
IMG_CSS = ";".join(
[
Expand Down