Skip to content

Commit

Permalink
fix #1732: [DE] Support "Ref-behindthename" template, and others (#1747)
Browse files Browse the repository at this point in the history
  • Loading branch information
BoboTiG committed Feb 3, 2023
1 parent 995529d commit 87de4e4
Show file tree
Hide file tree
Showing 7 changed files with 92 additions and 9 deletions.
19 changes: 19 additions & 0 deletions tests/test_2_render.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,3 +86,22 @@ def test_find_section_definitions_and_es_replace_defs_list_with_numbered_lists()
"artículo de un diccionario, enciclopedia u obra de referencia.",
),
]


# Check render.adjust_wikicode() on German ("de") Wikicode snippets: each case
# gives the raw code and the exact text expected after adjustment.
@pytest.mark.parametrize(
"locale, code, expected",
[
# An empty ":[1] " item followed by a blank line and another section:
# both section templates become "=== ... ===" headings, the item becomes "# ",
# and the blank line and the ":[[...]]" line of the next section stay as they are.
(
"de",
"{{Bedeutungen}}\n:[1] \n\n{{Herkunft}}\n:[[Abkürzung]] von [[Sturmkanone]]",
"=== {{Bedeutungen}} ===\n# \n\n=== {{Herkunft}} ===\n:[[Abkürzung]] von [[Sturmkanone]]",
),
# "{{!}}" inside a template argument is expected to come out as a literal "|".
(
"de",
"{{Bedeutungen}}\n:[1] {{K|Handwerk|Architektur|ft=[[defektives Verb{{!}}defektiv]]}}",
"=== {{Bedeutungen}} ===\n# {{K|Handwerk|Architektur|ft=[[defektives Verb|defektiv]]}}",
),
],
)
def test_adjust_wikicode(locale: str, code: str, expected: str) -> None:
# Note the argument order: adjust_wikicode() takes the code first, then the locale.
assert render.adjust_wikicode(code, locale) == expected
6 changes: 6 additions & 0 deletions tests/test_4_check_word.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,12 @@ def test_no_definition_nor_etymology(craft_urls: Callable[[str, str], str]) -> N
'<small class="noprint" title="Luther 2017 bei www.bibleserver.com"></small>', # noqa
"",
],
# DE - lang link in {{Üxx5}}
[
"de",
'<a href="/w/index.php?title=grc:%E1%BC%80%CE%BD%CE%AE%CF%81&amp;action=edit&amp;redlink=1" class="new" title="grc:ἀνήρ (Seite nicht vorhanden)"><sup>→&nbsp;grc</sup>', # noqa
"",
],
# DE - other Wikis
[
"de",
Expand Down
12 changes: 12 additions & 0 deletions tests/test_de.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,18 @@ def test_parse_word(
"{{MZ|0|2|3|4|5|6|7|8|9|10|11}}",
"[0] 2<br/>3<br/>4<br/>5<br/>6<br/>7<br/>8<br/>9<br/>10<br/>11",
),
(
"{{Plainlink|1=http://de.wikipedia.org/wiki/Ludwig_XIV.|2=Ludwig XIV.}}",
"Ludwig XIV.",
),
(
"{{Plainlink|1=http://de.wikipedia.org/wiki/Ludwig_XIV.|Ludwig XIV.}}",
"Ludwig XIV.",
),
(
"{{Plainlink|http://de.wikipedia.org/wiki/Ludwig_XIV.|2=Ludwig XIV.}}",
"Ludwig XIV.",
),
("{{Ü|pl|dzień}}", "dzień"),
("{{übertr.}}", "<i>übertragen</i>"),
("{{übertr.|:}}", "<i>übertragen:</i>"),
Expand Down
4 changes: 4 additions & 0 deletions wikidict/check_word.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,10 @@ def filter_html(html: str, locale: str) -> str:
for a in bs.find_all("a", {"class": "external"}):
if "archive.org" in a["href"]:
a.decompose()
# Lang link in {{Üxx5}}
for a in bs.find_all("a"):
if (sup := a.find("sup")) and sup.text.startswith("→"):
a.decompose()
# Other Wikis
for a in bs.find_all("a", {"class": "extiw"}):
if (
Expand Down
19 changes: 18 additions & 1 deletion wikidict/lang/de/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,10 @@
etyl_section = ("{{Herkunft}}",)
sections = (
*etyl_section,
"{{Alte Schreibweise|",
"{{Aussprache}",
"{{Bedeutungen}",
"{{Grundformverweis ",
"{{Alte Schreibweise|",
)

# Variants
Expand All @@ -37,17 +37,22 @@
"Audio",
"Bpur",
"Fremdsprachige Beispiele",
"Herkunft fehlt",
"Herkunft unbelegt",
"Hintergrundfarbe",
"Hörbeispiele",
"IA",
"IPA",
"Lautschrift",
"QS Bedeutung",
"QS Bedeutungen",
"QS_Bedeutungen",
"QS Herkunft",
"QS_Herkunft",
"Ref-Adelung",
"Ref-Bibel",
"Ref-Duden",
"Ref-DWDS",
"Wikipedia",
)

Expand Down Expand Up @@ -102,10 +107,19 @@
"nf": "italic('n, f')",
# {{noredlink|diminutiv}}
"noredlink": "parts[-1]",
# {{Plainlink|1=http://de.wikipedia.org/wiki/Ludwig_XIV.|2=Ludwig XIV.}}
"Plainlink": "parts[-1].removeprefix('2=')",
# {{Polytonisch|(το)}}
"Polytonisch": "parts[-1]",
# {{Ref-behindthename|Alan}}
"Ref-behindthename": "f'behindthename.com „{word}“'",
"Ref-Grimm": "f'Jacob Grimm, Wilhelm Grimm: Deutsches Wörterbuch. 16 Bände in 32 Teilbänden. Leipzig 1854–1961 „{word}“'", # noqa
# {{Ü|pl|dzień}}
"Ü": "parts[-1]",
# {{Unicode|kɔ}}
"Unicode": "parts[-1]",
# {{vergleiche}}
"vergleiche": "italic('vergleiche:')",
# {{vgl.}}
"vgl.": "italic('vergleiche:')",
# {{W|Datenkompression|Datenkompressionen}}
Expand All @@ -115,16 +129,19 @@

# Templates that will be completed/replaced using custom style.
# Each key is a template name, each value the text associated with it.
# NOTE(review): the "DMG" and "İA" values carry embedded single quotes ("'DMG:'",
# "'İA:'") while every other value is plain text — confirm against the code that
# consumes this dict whether those quotes are intended.
# Some values ("Part.", "Prät.") end with a trailing space, the others do not.
templates_other = {
"(R)": "®",
"DMG": "'DMG:'",
"Gen.": "Genitiv:",
"İA": "'İA:'",
"ISO 9": "ISO 9:",
"NNBSP": "&nbsp;",
"Part.": "Partizip II: ",
"Pl.": "Plural:",
"Pl.1": "Plural 1:",
"Pl.2": "Plural 2:",
"Pl.3": "Plural 3:",
"Pl.4": "Plural 4:",
"Prät.": "Präteritum: ",
}

templates_markierung = {
Expand Down
16 changes: 15 additions & 1 deletion wikidict/lang/de/template_handlers.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from collections import defaultdict # noqa
from typing import Dict, List, Tuple

from ...user_functions import extract_keywords_from, italic
from ...user_functions import extract_keywords_from, italic, strong
from .abk import abk

bibel_names = {
Expand Down Expand Up @@ -299,10 +299,14 @@ def render_Uxx4(tpl: str, parts: List[str], data: Dict[str, str]) -> str:
'ܡܫܺܝܚܳܐ (ALA-LC: mšiḥāʾ) ‚Messias‘'
>>> render_Uxx4("Üxx4", ["fr", "ܡܫܝܚܐ"], defaultdict(str, {"v":"ܡܫܺܝܚܳܐ", "d":"mšiḥāʾ", "b":"Messias"}))
'ܡܫܺܝܚܳܐ (mšiḥāʾ) ‚Messias‘'
>>> render_Uxx4("Üxx4?", ["fr", "ܡܫܝܚܐ"], defaultdict(str, {"v":"ܡܫܺܝܚܳܐ", "d":"mšiḥāʾ", "b":"Messias"}))
'<b>?</b>&nbsp;ܡܫܺܝܚܳܐ (mšiḥāʾ) ‚Messias‘'
"""
language = parts.pop(0)
phrase = parts.pop(0) if parts else ""
phrase = data.get("v", data.get("2", phrase))
if tpl == "Üxx4?":
phrase = f"{strong('?')}&nbsp;{phrase}"
if "d" in data:
if language in ("ar", "fa", "ha", "ota", "pnb"):
phrase += f" (DMG: {data['d']})"
Expand All @@ -320,13 +324,23 @@ def render_Uxx4(tpl: str, parts: List[str], data: Dict[str, str]) -> str:
return phrase


def render_Uxx5(tpl: str, parts: List[str], data: Dict[str, str]) -> str:
    """Render the ``Üxx5`` template as ``<third part> (<second part>)``.

    Only the positional *parts* at index 2 and index 1 are used, in that
    order; *tpl*, *data* and any other entries of *parts* are ignored.
    In the example below, index 2 holds the original-script text and
    index 1 its romanization.

    >>> render_Uxx5("Üxx5", ["grc", "anḗr, andrós", "ἀνήρ, ἀνδρός", "ἀνήρ"], defaultdict(str))
    'ἀνήρ, ἀνδρός (anḗr, andrós)'
    """
    shown_text = parts[2]
    in_parentheses = parts[1]
    return f"{shown_text} ({in_parentheses})"


# Associates a template name (key) with the render function handling it (value).
# Presumably looked up by template name by the calling code — the lookup itself
# is not visible in this part of the file.
# The "?"-suffixed names ("Üt?", "Üxx4?") are registered to the same function as
# their base template; the function receives the name as its *tpl* argument.
template_mapping = {
"Bibel": render_bibel,
"K": render_K,
"Ref-dejure": render_ref_dejure,
"Üt": render_Ut,
"Üt?": render_Ut,
"Üxx4": render_Uxx4,
"Üxx4?": render_Uxx4,
"Üxx5": render_Uxx5,
}


Expand Down
25 changes: 18 additions & 7 deletions wikidict/render.py
Original file line number Diff line number Diff line change
Expand Up @@ -332,20 +332,21 @@ def add_potential_variant(
variants.append(variant)


def parse_word(word: str, code: str, locale: str, force: bool = False) -> Word:
"""Parse *code* Wikicode to find word details.
*force* can be set to True to force the pronunciation and gender guessing.
It is disabled by default to speed-up the overall process, but enabled when
called from get_and_parse_word().
"""
def adjust_wikicode(code: str, locale: str) -> str:
"""Sometimes we need to adapt the Wikicode."""
code = re.sub(r"(<!--.*?-->)", "", code, flags=re.DOTALL)

if locale == "de":
# {{Bedeutungen}} -> === {{Bedeutungen}} ===
code = re.sub(r"^\{\{(.+)\}\}", r"=== {{\1}} ===", code, flags=re.MULTILINE)

# Definition lists are not well supported by the parser, replace them by numbered lists
code = re.sub(r":\[\d+\]\s*", "# ", code)
# Note: using `[ ]*` rather than `\s*` to bypass issues when a section above another one
# contains an empty item.
code = re.sub(r":\[\d+\][ ]*", "# ", code)

# {{!}} -> "|"
code = code.replace("{{!}}", "|")

elif locale == "es":
# {{ES|xxx|núm=n}} -> == {{lengua|es}} ==
Expand All @@ -367,6 +368,16 @@ def parse_word(word: str, code: str, locale: str, force: bool = False) -> Word:
# {{-avv-}} -> === {{avv}} ===
code = re.sub(r"^\{\{-(\w+)-\}\}", r"=== {{\1}} ===", code, flags=re.MULTILINE)

return code


def parse_word(word: str, code: str, locale: str, force: bool = False) -> Word:
"""Parse *code* Wikicode to find word details.
*force* can be set to True to force the pronunciation and gender guessing.
It is disabled by default to speed-up the overall process, but enabled when
called from get_and_parse_word().
"""
code = adjust_wikicode(code, locale)
top_sections, parsed_sections = find_sections(code, locale)
prons = []
genders = []
Expand Down

0 comments on commit 87de4e4

Please sign in to comment.