Generate additional etymology-free dictionaries #1440

Merged: 7 commits, Dec 13, 2022
12 changes: 6 additions & 6 deletions .github/workflows/auto-updates.yml
@@ -91,28 +91,28 @@ jobs:
uses: svenstaro/upload-release-action@v2
with:
repo_token: ${{ secrets.TOKEN }}
file: data/${{ matrix.locale }}/dict-${{ matrix.locale }}-${{ matrix.locale }}.df.bz2
asset_name: dict-${{ matrix.locale }}-${{ matrix.locale }}.df.bz2
file: data/${{ matrix.locale }}/dict-${{ matrix.locale }}-${{ matrix.locale }}*.df.bz2
tag: ${{ matrix.locale }}
overwrite: true
file_glob: true

- name: Upload the dictionary (Kobo)
uses: svenstaro/upload-release-action@v2
with:
repo_token: ${{ secrets.TOKEN }}
file: data/${{ matrix.locale }}/dicthtml-${{ matrix.locale }}-${{ matrix.locale }}.zip
asset_name: dicthtml-${{ matrix.locale }}-${{ matrix.locale }}.zip
file: data/${{ matrix.locale }}/dicthtml-${{ matrix.locale }}-${{ matrix.locale }}*.zip
tag: ${{ matrix.locale }}
overwrite: true
file_glob: true

- name: Upload the dictionary (StarDict)
uses: svenstaro/upload-release-action@v2
with:
repo_token: ${{ secrets.TOKEN }}
file: data/${{ matrix.locale }}/dict-${{ matrix.locale }}-${{ matrix.locale }}.zip
asset_name: dict-${{ matrix.locale }}-${{ matrix.locale }}.zip
file: data/${{ matrix.locale }}/dict-${{ matrix.locale }}-${{ matrix.locale }}*.zip
tag: ${{ matrix.locale }}
overwrite: true
file_glob: true

- name: Update the release description
run: python -Wd -m wikidict ${{ matrix.locale }} --update-release
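As a side note, here is a minimal Python sketch of the filenames the new glob plus file_glob: true is expected to match; the -noetym suffix below is an assumed name for the etymology-free variant, not something taken from this diff:

import fnmatch

# Hypothetical artifact names for the "fr" locale; the etymology-free
# file name is an assumption made only for this example.
names = [
    "dict-fr-fr.df.bz2",
    "dict-fr-fr-noetym.df.bz2",
]

pattern = "dict-fr-fr*.df.bz2"
print([n for n in names if fnmatch.fnmatch(n, pattern)])
# prints both names, so a single upload step can cover both dictionaries
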
2 changes: 1 addition & 1 deletion check.sh
@@ -5,4 +5,4 @@
python -m isort wikidict tests scripts
python -m black wikidict tests scripts
python -m flake8 wikidict tests scripts
python -m mypy wikidict
python -m mypy wikidict scripts tests
1 change: 1 addition & 0 deletions requirements.txt
@@ -2,6 +2,7 @@ setuptools>=36.2.1
beautifulsoup4==4.11.1
cachetools==5.2.0
docopt==0.6.2
Jinja2==3.1.2
marisa-trie==0.7.8
mistune==2.0.4 # for DictFile reading
pillow==9.3.0
3 changes: 2 additions & 1 deletion scripts/__main__.py
@@ -50,7 +50,7 @@ def process_script(script: str, file: str) -> None:
print(f" !! Error processing {script}", flush=True)


def main():
def main() -> int:
"""Entry point."""
threads = []

@@ -63,6 +63,7 @@ def main():
th.join()

print("\nFriendly reminder: run ./check.sh")
return 0


if __name__ == "__main__":
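With main() now annotated to return an int, the conventional wiring is to hand that value to sys.exit(); a minimal sketch of the pattern (the call under if __name__ == "__main__": is collapsed in this view, so this is an assumption, not a quote of the file):

import sys

def main() -> int:
    ...  # run the per-locale scripts, join threads, etc.
    return 0  # a non-zero value would signal failure to the shell

if __name__ == "__main__":
    sys.exit(main())  # assumed wiring: the return value becomes the process exit code
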
4 changes: 2 additions & 2 deletions scripts/de-abk.py
@@ -3,7 +3,7 @@
from scripts_utils import get_soup


def cleanWiki(text):
def clean_wiki(text: str) -> str:
text = re.sub(r"\[\[([^||:\]]+)\]\]", "\\1", text) # [[a]] -> a
text = re.sub(r"\[\[[^|]+\|(.+?(?=\]\]))\]\]", "\\1", text) # [[a|b]] -> b
return text
@@ -19,7 +19,7 @@ def cleanWiki(text):
for line in text.split("\n"):
if not line.startswith("|"):
continue
line = cleanWiki(line)
line = clean_wiki(line)
keys = []
value = ""
sArray = line.split("|")
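As a quick illustration of what the two substitutions in clean_wiki do, a self-contained sketch (the sample strings are invented for the example):

import re

def clean_wiki(text: str) -> str:
    text = re.sub(r"\[\[([^||:\]]+)\]\]", "\\1", text)  # [[a]] -> a
    text = re.sub(r"\[\[[^|]+\|(.+?(?=\]\]))\]\]", "\\1", text)  # [[a|b]] -> b
    return text

print(clean_wiki("[[Abkürzung]]"))        # Abkürzung
print(clean_wiki("[[w:Beispiel|Bsp.]]"))  # Bsp.
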
6 changes: 4 additions & 2 deletions scripts/de-langs.py
@@ -1,3 +1,5 @@
from typing import Dict

from scripts_utils import get_soup

ROOT_URL = "https://de.wiktionary.org"
@@ -7,7 +9,7 @@
ALIAS_URL = "https://de.wiktionary.org/w/index.php?title=Spezial:Linkliste/{}&hidetrans=1&hidelinks=1"


def process_page(page_url, languages):
def process_page(page_url: str, languages: Dict[str, str]) -> str:
soup = get_soup(page_url)

nextpage = ""
@@ -40,7 +42,7 @@ def process_page(page_url, languages):


next_page_url = START_URL
languages = {}
languages: Dict[str, str] = {}

while next_page_url:
next_page_url = process_page(next_page_url, languages)
2 changes: 1 addition & 1 deletion scripts/el-langs.py
@@ -41,7 +41,7 @@
script += line

exec(script)
languages = {key: Languages[key]["name"] for key in Languages.keys()} # noqa
languages = {key: Languages[key]["name"] for key in Languages.keys()} # type: ignore # noqa

print("langs = {")
for key, value in sorted(languages.items()):
12 changes: 5 additions & 7 deletions scripts/en-form-of.py
@@ -1,14 +1,13 @@
from scripts_utils import get_soup


def get_text(url):
def get_text(url: str) -> str:
soup = get_soup(url)
div = soup.find("span", "form-of-definition")
if not div:
return ""
res = div.text.replace(" term", "")
res = res.replace(" [Term?]", "")
return res
res = str(div.text).replace(" term", "")
return res.replace(" [Term?]", "")


ROOT = "https://en.wiktionary.org"
@@ -21,13 +20,12 @@ def get_text(url):
body = tables[0].find("tbody")
trs = body.find_all("tr")
trs.pop(0) # remove header
alias_dict = {}
count = 0
print("form_of_templates = {")
for tr in trs:
tds_html = tr.find_all("td")
tds = [t.text.strip() for t in tds_html]
if tds := dict(zip(columns, tds)):
tds0 = [t.text.strip() for t in tds_html]
if tds := dict(zip(columns, tds0)):
link = tr.find("a")
url_template = ROOT + link["href"]
if text := get_text(url_template):
25 changes: 16 additions & 9 deletions scripts/en-labels.py
@@ -1,9 +1,10 @@
import re
from typing import Dict, Tuple

from scripts_utils import get_soup


def process_display(display):
def process_display(display: str) -> str:
if "[[" in display:
display = re.sub(
r"\[\[(?:w|wikipedia|Wiktionary):[^|]*\|(^\])*",
@@ -19,7 +20,13 @@ def process_display(display):
return display


def process_page(url, repl, stop_line, var_name, print_result=True):
def process_page(
url: str,
repl: Tuple[str, ...],
stop_line: str,
var_name: str,
print_result: bool = True,
) -> Dict[str, str]:
soup = get_soup(url)
div = soup.find("div", {"class": "mw-highlight-lines"})
text = div.text
@@ -44,13 +51,13 @@ def process_page(url, repl, stop_line, var_name, print_result=True):
code += line + "\n"

exec(code, globals())
results = {}
results: Dict[str, str] = {}

for k, v in labels.items(): # noqa
for k, v in labels.items(): # type: ignore # noqa
label_v = v
label_k = k
if isinstance(v, str):
label_v = labels.get(v, v) # noqa
label_v = labels.get(v, v) # type: ignore # noqa
if label_v != v:
label_k = v
if isinstance(label_v, str):
@@ -96,10 +103,10 @@ def process_page(url, repl, stop_line, var_name, print_result=True):
process_page(url, repl, stop_line, var_name)

syntaxes = {}
for k, v in labels.items(): # noqa
for k, v in labels.items(): # type: ignore # noqa
label_v = v
if isinstance(v, str):
label_v = labels.get(v) # noqa
label_v = labels.get(v) # type: ignore # noqa
if not label_v:
continue
omit_preComma = label_v.get("omit_preComma")
@@ -144,7 +151,7 @@ def process_page(url, repl, stop_line, var_name, print_result=True):
soup = get_soup(url)
div = soup.find("div", {"class": "mw-prefixindex-body"})
lis = div.findAll("li")
results = {}
results: Dict[str, str] = {}
for li in lis:
if not li.text.endswith("documentation"):
href = li.find("a")["href"]
@@ -154,6 +161,6 @@ def process_page(url, repl, stop_line, var_name, print_result=True):
results |= process_page(page_url, repl, stop_line, var_name, print_result=False)

print(f"{var_name} = {{")
for key, value in sorted(results.items()):
for key, value in sorted(results.items()): # type: ignore
print(f' "{key}": "{value}",')
print(f"}} # {len(results):,}")
46 changes: 19 additions & 27 deletions scripts/en-langs.py
@@ -1,13 +1,14 @@
import re
from typing import Dict, List

from scripts_utils import get_soup


def read_all_lines_etym(lines):
def read_all_lines_etym(lines: List[str]) -> Dict[str, Dict[str, str]]:
pattern = re.compile(r"(\w*)\s*=\s*([{|\"].*[}|\"])")
pattern2 = re.compile(r"(\w*)\s*=\s*{")

m = {} # noqa
m: Dict[str, Dict[str, str]] = {} # noqa
concat = ""
in_comment = False
for line in lines:
@@ -28,9 +29,9 @@ def read_all_lines_etym(lines):
matches = pattern.findall(line)
matches2 = pattern2.findall(line)
if matches:
result = '"' + matches[0][0].strip() + '": ' + matches[0][1] + ","
result = f'"{matches[0][0].strip()}": {matches[0][1]},'
elif matches2 and matches2[0]:
result = '"' + matches2[0].strip() + '" : {' + line[line.index("{") + 1 :]
result = f'"{matches2[0].strip()}' + '" : {' + line[line.index("{") + 1 :]
else:
result = line

@@ -43,9 +44,9 @@ def read_all_lines_etym(lines):
return m


def read_all_lines_lang(lines):
def read_all_lines_lang(lines: List[str]) -> Dict[str, str]:
code = ""
m = {}
m: Dict[str, str] = {}
pattern = re.compile(r"m\[\"(.*)\"\]\s+=\s+{")
for line in lines:
if code:
@@ -58,45 +59,36 @@ def read_all_lines_lang(lines):
return m


def get_content(url):
def get_content(url: str) -> List[str]:
soup = get_soup(url)
content_div = soup.find("div", "mw-parser-output")
content_div = content_div.findChild(
"div", {"class": "mw-highlight"}, recursive=False
)
return content_div.text.split("\n")
return str(content_div.text).split("\n")


def process_lang_page(url):
def process_lang_page(url: str) -> Dict[str, str]:
lines = get_content(url)
return read_all_lines_lang(lines)


# Etymology languages
url = "https://en.wiktionary.org/wiki/Module:etymology_languages/data"
lines = get_content(url)
m = read_all_lines_etym(lines)
languages = {key: m[key]["canonicalName"] for key in m.keys()}
lines = get_content("https://en.wiktionary.org/wiki/Module:etymology_languages/data")
m: Dict[str, Dict[str, str]] = read_all_lines_etym(lines)
languages = {key: val["canonicalName"] for key, val in m.items()}

# Families
url = "https://en.wiktionary.org/wiki/Module:families/data"
lines = get_content(url)
m = read_all_lines_etym(lines)
for key in m.keys():
languages[key] = m[key]["canonicalName"]
lines = get_content("https://en.wiktionary.org/wiki/Module:families/data")
for key, val in read_all_lines_etym(lines).items():
languages[key] = val["canonicalName"]

url = "https://en.wiktionary.org/wiki/Module:languages/data2"
m = process_lang_page(url)
languages |= m

url = "https://en.wiktionary.org/wiki/Module:languages/datax"
m = process_lang_page(url)
languages |= m
languages |= process_lang_page("https://en.wiktionary.org/wiki/Module:languages/data2")
languages |= process_lang_page("https://en.wiktionary.org/wiki/Module:languages/datax")

for letter in "abcdefghijklmnopqrstuvwxyz":
url = f"https://en.wiktionary.org/wiki/Module:languages/data3/{letter}"
m = process_lang_page(url)
languages.update(m)
languages.update(process_lang_page(url))

print("langs = {")
for key, value in sorted(languages.items()):
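The refactor also leans on the in-place dict union operator to accumulate the language mappings; a tiny sketch of how languages |= ... behaves (Python 3.9+, made-up entries):

languages = {"en": "English"}
languages |= {"de": "German", "fr": "French"}  # same effect as languages.update(...)
print(languages)  # {'en': 'English', 'de': 'German', 'fr': 'French'}
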
8 changes: 5 additions & 3 deletions scripts/es-campos-semanticos.py
@@ -1,3 +1,5 @@
from typing import Dict

from scripts_utils import get_soup

START_URL = (
@@ -8,7 +10,7 @@
NEXTPAGE_TEXT = "página siguiente"


def process_alias_page(model, template_text, results):
def process_alias_page(model: str, template_text: str, results: Dict[str, str]) -> None:
url = ALIAS_URL.format(model)
soup = get_soup(url)
ul = soup.find("ul", {"id": ["mw-whatlinkshere-list"]})
@@ -21,7 +23,7 @@ def process_alias_page(model, template_text, results):
results[alias] = template_text


def process_cs_page(url, results):
def process_cs_page(url: str, results: Dict[str, str]) -> str:
soup = get_soup(url)

nextpage = ""
@@ -48,7 +50,7 @@ def process_cs_page(url, results):
return nextpage


results = {}
results: Dict[str, str] = {}
next_page_url = START_URL
while next_page_url:
next_page_url = process_cs_page(next_page_url, results)
8 changes: 5 additions & 3 deletions scripts/fr-domain-templates.py
@@ -1,3 +1,5 @@
from typing import Dict

from scripts_utils import get_soup

ROOT = "https://fr.wiktionary.org"
@@ -8,7 +10,7 @@
ALIAS_URL = "https://fr.wiktionary.org/w/index.php?title=Sp%C3%A9cial:Pages_li%C3%A9es/Mod%C3%A8le:{}&limit=10&hidetrans=1&hidelinks=1" # noqa


def process_category_page(url, results):
def process_category_page(url: str, results: Dict[str, str]) -> str:
soup = get_soup(url)

nextpage = ""
@@ -31,7 +33,7 @@ def process_alias_page(key, value, results):
return nextpage


def process_alias_page(key, value, results):
def process_alias_page(key: str, value: str, results: Dict[str, str]) -> None:
url = ALIAS_URL.format(key)
soup = get_soup(url)
ul = soup.find("ul", {"id": ["mw-whatlinkshere-list"]})
@@ -45,7 +47,7 @@ def process_alias_page(key, value, results):


next_page_url = START_URL
results = {}
results: Dict[str, str] = {}

while next_page_url:
next_page_url = process_category_page(next_page_url, results)