From e0a1a79286748178ec74503686c2620859a9a1bb Mon Sep 17 00:00:00 2001 From: Ben Fernandes Date: Mon, 19 Oct 2020 22:00:34 +0100 Subject: [PATCH 01/11] ISSUE-241: Ignoring 'env' and '.idea' directories --- .gitignore | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index d69c9bce..718f2b7d 100644 --- a/.gitignore +++ b/.gitignore @@ -5,4 +5,7 @@ __pycache__/ *.egg-info/ *.log **/tars -**/freq_tsvs \ No newline at end of file +**/freq_tsvs +env/ + +.idea/ From 5c6a5046af2d8c77cdd10b520aa95e283fd01673 Mon Sep 17 00:00:00 2001 From: Ben Fernandes Date: Mon, 19 Oct 2020 22:01:00 +0100 Subject: [PATCH 02/11] ISSUE-241: Added 'mypy' to 'requirements.txt' --- data/requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/data/requirements.txt b/data/requirements.txt index f1a1ab28..b4b89509 100644 --- a/data/requirements.txt +++ b/data/requirements.txt @@ -2,3 +2,4 @@ regex>=2019.12.9 requests requests-html wikipron>=1.0.0 +mypy From 3fb46ed0ae3f37bbacb438212b8e372c12920cd0 Mon Sep 17 00:00:00 2001 From: Ben Fernandes Date: Mon, 19 Oct 2020 22:05:33 +0100 Subject: [PATCH 03/11] ISSUE-241: Added 'Type checking' step to CircleCI --- .circleci/config.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.circleci/config.yml b/.circleci/config.yml index e942ba88..f32d5985 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -31,6 +31,9 @@ jobs: - run: name: Show installed Python packages command: pip list + - run: + name: Type checking + command: mypy project/wikipron project/tests project/data - run: name: Lint working_directory: ~/ From eb946c32df0c9a14953547019cbbc6237b0c0181 Mon Sep 17 00:00:00 2001 From: Ben Fernandes Date: Mon, 19 Oct 2020 22:36:28 +0100 Subject: [PATCH 04/11] ISSUE-241: Fixed mypy issues --- .circleci/config.yml | 2 +- data/src/codes.py | 10 ++++++++-- mypy.ini | 3 +++ tests/test_data/__init__.py | 3 ++- tests/test_data/test_scrape.py | 5 +++-- tests/test_wikipron/__init__.py | 8 +++++--- tests/test_wikipron/test_extract.py | 5 ++--- wikipron/config.py | 8 ++++++-- wikipron/extract/cmn.py | 7 +++---- wikipron/extract/default.py | 7 +++---- wikipron/extract/jpn.py | 8 ++++---- wikipron/extract/khb.py | 4 ++-- wikipron/extract/khm.py | 5 ++--- wikipron/extract/lat.py | 15 ++++++--------- wikipron/extract/shn.py | 4 ++-- wikipron/extract/tha.py | 5 ++--- wikipron/extract/vie.py | 6 +++--- wikipron/scrape.py | 7 +++++-- 18 files changed, 62 insertions(+), 50 deletions(-) create mode 100644 mypy.ini diff --git a/.circleci/config.yml b/.circleci/config.yml index f32d5985..e1023341 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -33,7 +33,7 @@ jobs: command: pip list - run: name: Type checking - command: mypy project/wikipron project/tests project/data + command: mypy - run: name: Lint working_directory: ~/ diff --git a/data/src/codes.py b/data/src/codes.py index 3db81f37..6d343f39 100755 --- a/data/src/codes.py +++ b/data/src/codes.py @@ -91,9 +91,15 @@ def _get_language_sizes(categories: List[str]) -> Dict[str, int]: ).json() for page in data["query"]["pages"].values(): size = page["categoryinfo"]["size"] - language = re.search( + + language_search = re.search( r"Category:(.+?) terms with IPA pronunciation", page["title"] - ).group(1) + ) + + if not language_search: + continue + + language = language_search.group(1) language_sizes[language] = size return language_sizes diff --git a/mypy.ini b/mypy.ini new file mode 100644 index 00000000..2f97cb32 --- /dev/null +++ b/mypy.ini @@ -0,0 +1,3 @@ +[mypy] +files=wikipron,data,tests +ignore_missing_imports=true \ No newline at end of file diff --git a/tests/test_data/__init__.py b/tests/test_data/__init__.py index 7731788e..5e9aee4b 100644 --- a/tests/test_data/__init__.py +++ b/tests/test_data/__init__.py @@ -2,6 +2,7 @@ import shutil from contextlib import contextmanager +from typing import Iterator _TESTS_DIR = os.path.dirname(os.getcwd()) _TSV_PATH = f"{_TESTS_DIR}/tsv" @@ -21,7 +22,7 @@ def write_dummy_phones_files(key: str, dialect: str) -> None: @contextmanager -def handle_dummy_files(phones: bool, key: str, dialect: str) -> str: +def handle_dummy_files(phones: bool, key: str, dialect: str) -> Iterator[str]: """Creates and removes dummy directories for housing TSV and phones files.""" os.mkdir(_TSV_PATH) diff --git a/tests/test_data/test_scrape.py b/tests/test_data/test_scrape.py index f9bd5eba..03e1fa99 100644 --- a/tests/test_data/test_scrape.py +++ b/tests/test_data/test_scrape.py @@ -1,6 +1,6 @@ import os -from typing import List +from typing import List, Any, Dict import pytest @@ -46,7 +46,7 @@ ], ) def test_file_creation( - config_settings: object, + config_settings: Dict[str, Any], dialect_suffix: str, phones: bool, expected_file_name: List[str], @@ -55,6 +55,7 @@ def test_file_creation( file names based on presence or absence of dialect specification or .phones files for a given language. """ + dummy_tsv_path: str with handle_dummy_files( phones, config_settings["key"], dialect_suffix ) as dummy_tsv_path: diff --git a/tests/test_wikipron/__init__.py b/tests/test_wikipron/__init__.py index 5968896e..043cf4a5 100644 --- a/tests/test_wikipron/__init__.py +++ b/tests/test_wikipron/__init__.py @@ -1,12 +1,14 @@ +from typing import Dict + import requests -from wikipron.scrape import HTTP_HEADERS from wikipron.config import Config +from wikipron.scrape import HTTP_HEADERS def config_factory(**kwargs) -> Config: """Create a Config object for testing.""" - config_dict = {"key": "eng"} # The one default; may be overridden. + config_dict: Dict = {"key": "eng"} # The one default; may be overridden. config_dict.update(**kwargs) return Config(**config_dict) @@ -17,7 +19,7 @@ def can_connect_to_wiktionary() -> bool: requests.get( "https://en.wiktionary.org/wiki/linguistics", headers=HTTP_HEADERS ) - except (requests.ConnectionError, requests.ConnectTimeout): + except requests.ConnectionError: return False else: return True diff --git a/tests/test_wikipron/test_extract.py b/tests/test_wikipron/test_extract.py index ac0147bf..69129776 100644 --- a/tests/test_wikipron/test_extract.py +++ b/tests/test_wikipron/test_extract.py @@ -1,10 +1,9 @@ import pytest -import requests +import requests_html from wikipron.extract import EXTRACTION_FUNCTIONS from wikipron.extract.core import _skip_pron from wikipron.extract.default import extract_word_pron_default - from . import config_factory @@ -14,7 +13,7 @@ def test_extraction_functions_have_the_same_signature(func): expected_annotations = { "word": "Word", - "request": requests.Response, + "request": requests_html, "config": "Config", "return": "Iterator[WordPronPair]", } diff --git a/wikipron/config.py b/wikipron/config.py index 6686ea78..238a1534 100644 --- a/wikipron/config.py +++ b/wikipron/config.py @@ -3,7 +3,7 @@ import logging import re -from typing import Callable, Optional +from typing import Callable, Optional, cast import iso639 import segments @@ -121,7 +121,11 @@ def _get_cut_off_date(self, cut_off_date: Optional[str]) -> str: return cut_off_date def _get_casefold(self, casefold: bool) -> Callable[[Word], Word]: - return str.casefold if casefold else lambda word: word # noqa: E731 + default_func: Callable[[Word], Word] = lambda word: word # noqa: E731 + return self._casefold_word if casefold else default_func + + def _casefold_word(self, word: Word): + return cast(Word, str.casefold(word)) def _get_process_pron( self, diff --git a/wikipron/extract/cmn.py b/wikipron/extract/cmn.py index c55c65db..ee06ba33 100644 --- a/wikipron/extract/cmn.py +++ b/wikipron/extract/cmn.py @@ -3,11 +3,10 @@ import itertools import typing -import requests +import requests_html from wikipron.extract.default import yield_pron, IPA_XPATH_SELECTOR - if typing.TYPE_CHECKING: from wikipron.config import Config from wikipron.typing import Iterator, Word, Pron, WordPronPair @@ -22,14 +21,14 @@ def yield_cmn_pron( - request: requests.Response, config: "Config" + request: requests_html, config: "Config" ) -> "Iterator[Pron]": for li_container in request.html.xpath(_PRON_XPATH_TEMPLATE): yield from yield_pron(li_container, IPA_XPATH_SELECTOR, config) def extract_word_pron_cmn( - word: "Word", request: requests.Response, config: "Config" + word: "Word", request: requests_html, config: "Config" ) -> "Iterator[WordPronPair]": words = itertools.repeat(word) prons = yield_cmn_pron(request, config) diff --git a/wikipron/extract/default.py b/wikipron/extract/default.py index f9ef0289..79fd7d1c 100644 --- a/wikipron/extract/default.py +++ b/wikipron/extract/default.py @@ -3,11 +3,10 @@ import itertools import typing -import requests +import requests_html from wikipron.extract.core import yield_pron - if typing.TYPE_CHECKING: from wikipron.config import Config from wikipron.typing import Iterator, Pron, Word, WordPronPair @@ -17,14 +16,14 @@ def _yield_phn( - request: requests.Response, config: "Config" + request: requests_html, config: "Config" ) -> "Iterator[Pron]": for pron_element in request.html.xpath(config.pron_xpath_selector): yield from yield_pron(pron_element, IPA_XPATH_SELECTOR, config) def extract_word_pron_default( - word: "Word", request: requests.Response, config: "Config" + word: "Word", request: requests_html, config: "Config" ) -> "Iterator[WordPronPair]": words = itertools.repeat(word) prons = _yield_phn(request, config) diff --git a/wikipron/extract/jpn.py b/wikipron/extract/jpn.py index 2eb092ba..6cb5e232 100644 --- a/wikipron/extract/jpn.py +++ b/wikipron/extract/jpn.py @@ -16,7 +16,7 @@ import itertools import typing -import requests +import requests_html from wikipron.extract.default import yield_pron, IPA_XPATH_SELECTOR @@ -34,7 +34,7 @@ def yield_jpn_pron( - request: requests.Response, config: "Config" + request: requests_html, config: "Config" ) -> "Iterator[Pron]": # For simplicity, just want to grab the first transcription. # Will encounter words that have no transcription. @@ -44,7 +44,7 @@ def yield_jpn_pron( def yield_jpn_word( - word: "Word", request: requests.Response + word: "Word", request: requests_html ) -> "Iterator[Word]": # Again for simplicity, only grabbing first "sub"-word. word_element = request.html.xpath(_WORD_XPATH_SELECTOR, first=True) @@ -56,7 +56,7 @@ def yield_jpn_word( def extract_word_pron_jpn( - word: "Word", request: requests.Response, config: "Config" + word: "Word", request: requests_html, config: "Config" ) -> "Iterator[WordPronPair]": # If we can't find a kana alternative, then the headword # must itself be kana. diff --git a/wikipron/extract/khb.py b/wikipron/extract/khb.py index 9ebdfb81..eace7170 100644 --- a/wikipron/extract/khb.py +++ b/wikipron/extract/khb.py @@ -7,7 +7,7 @@ import itertools import typing -import requests +import requests_html from wikipron.extract.default import yield_pron @@ -29,7 +29,7 @@ def extract_word_pron_lu( - word: "Word", request: requests.Response, config: "Config" + word: "Word", request: requests_html, config: "Config" ) -> "Iterator[WordPronPair]": words = itertools.repeat(word) prons = yield_pron(request.html, _IPA_XPATH_SELECTOR, config) diff --git a/wikipron/extract/khm.py b/wikipron/extract/khm.py index 9ede84cf..93f14048 100644 --- a/wikipron/extract/khm.py +++ b/wikipron/extract/khm.py @@ -3,11 +3,10 @@ import itertools import typing -import requests +import requests_html from wikipron.extract.default import yield_pron - if typing.TYPE_CHECKING: from wikipron.config import Config from wikipron.typing import Iterator, Word, WordPronPair @@ -17,7 +16,7 @@ def extract_word_pron_khmer( - word: "Word", request: requests.Response, config: "Config" + word: "Word", request: requests_html, config: "Config" ) -> "Iterator[WordPronPair]": words = itertools.repeat(word) prons = yield_pron(request.html, _IPA_XPATH_SELECTOR, config) diff --git a/wikipron/extract/lat.py b/wikipron/extract/lat.py index 6a800749..d450ab3b 100644 --- a/wikipron/extract/lat.py +++ b/wikipron/extract/lat.py @@ -43,14 +43,12 @@ import itertools import typing +from typing import List -import requests +import requests_html from wikipron.extract.default import yield_pron, IPA_XPATH_SELECTOR -from typing import List - - if typing.TYPE_CHECKING: from wikipron.config import Config from wikipron.typing import Iterator, Pron, Word, WordPronPair @@ -87,7 +85,7 @@ """ -def _get_tags(request: requests.Response) -> List[str]: +def _get_tags(request: requests_html) -> List[str]: """Extract the Latin Etymology ID tags from the table of contents.""" tags = [] for a_element in request.html.xpath(_TOC_ETYMOLOGY_XPATH_SELECTOR): @@ -101,7 +99,7 @@ def _get_tags(request: requests.Response) -> List[str]: def _yield_latin_word( - request: requests.Response, tag: str + request: requests_html, tag: str ) -> "Iterator[Word]": heading = "h2" if tag == "Latin" else "h3" word_xpath_selector = _WORD_XPATH_TEMPLATE.format(heading=heading, tag=tag) @@ -119,7 +117,7 @@ def _yield_latin_word( def _yield_latin_pron( - request: requests.Response, config: "Config", tag: str + request: requests_html, config: "Config", tag: str ) -> "Iterator[Pron]": heading = "h2" if tag == "Latin" else "h3" if config.dialect: @@ -140,12 +138,11 @@ def _yield_latin_pron( def extract_word_pron_latin( - word: "Word", request: requests.Response, config: "Config" + word: "Word", request: requests_html, config: "Config" ) -> "Iterator[WordPronPair]": # For Latin, we don't use the title word from the Wiktionary page, # because it never has macrons (necessary for Latin vowel length). # We will get the word from each "Etymology" section within the page. - word = None # noqa: F841 tags = _get_tags(request) for tag in tags: # The words and prons are extracted from the same request response but diff --git a/wikipron/extract/shn.py b/wikipron/extract/shn.py index 48ed4534..bcaea2d3 100644 --- a/wikipron/extract/shn.py +++ b/wikipron/extract/shn.py @@ -3,7 +3,7 @@ import itertools import typing -import requests +import requests_html from wikipron.extract.default import yield_pron @@ -25,7 +25,7 @@ def extract_word_pron_shan( - word: "Word", request: requests.Response, config: "Config" + word: "Word", request: requests_html, config: "Config" ) -> "Iterator[WordPronPair]": words = itertools.repeat(word) prons = yield_pron(request.html, _IPA_XPATH_SELECTOR, config) diff --git a/wikipron/extract/tha.py b/wikipron/extract/tha.py index 3e6b0120..9b36b39c 100644 --- a/wikipron/extract/tha.py +++ b/wikipron/extract/tha.py @@ -3,18 +3,17 @@ import itertools import typing -import requests +import requests_html from wikipron.extract.default import yield_pron, IPA_XPATH_SELECTOR - if typing.TYPE_CHECKING: from wikipron.config import Config from wikipron.typing import Iterator, Word, WordPronPair def extract_word_pron_thai( - word: "Word", request: requests.Response, config: "Config" + word: "Word", request: requests_html, config: "Config" ) -> "Iterator[WordPronPair]": words = itertools.repeat(word) prons = yield_pron(request.html, IPA_XPATH_SELECTOR, config) diff --git a/wikipron/extract/vie.py b/wikipron/extract/vie.py index 223f66ce..a29d7c50 100644 --- a/wikipron/extract/vie.py +++ b/wikipron/extract/vie.py @@ -3,7 +3,7 @@ import itertools import typing -import requests +import requests_html from wikipron.extract.default import yield_pron, IPA_XPATH_SELECTOR @@ -32,14 +32,14 @@ def extract_pron( - request: requests.Response, selector: str, config: "Config" + request: requests_html, selector: str, config: "Config" ) -> "Iterator[Pron]": for pron_element in request.html.xpath(selector): yield from yield_pron(pron_element, IPA_XPATH_SELECTOR, config) def extract_word_pron_vie( - word: "Word", request: requests.Response, config: "Config" + word: "Word", request: requests_html, config: "Config" ) -> "Iterator[WordPronPair]": if config.dialect: dialect_selector = _DIALECT_XPATH_SELECTOR_TEMPLATE.format( diff --git a/wikipron/scrape.py b/wikipron/scrape.py index 1590fd31..7811d085 100644 --- a/wikipron/scrape.py +++ b/wikipron/scrape.py @@ -1,12 +1,14 @@ import re import unicodedata +from typing import cast + import pkg_resources import requests import requests_html from wikipron.config import Config -from wikipron.typing import Iterator, WordPronPair +from wikipron.typing import Iterator, WordPronPair, Word, Pron # Queries for the MediaWiki backend. # Documentation here: https://www.mediawiki.org/wiki/API:Categorymembers @@ -55,7 +57,8 @@ def _scrape_once(data, config: Config) -> Iterator[WordPronPair]: for word, pron in config.extract_word_pron(word, request, config): # Pronunciation processing is done in NFD-space; # we convert back to NFC aftewards. - yield word, unicodedata.normalize("NFC", pron) + normalized_pron = unicodedata.normalize("NFC", pron) + yield cast(Word, word), cast(Pron, normalized_pron) def scrape(config: Config) -> Iterator[WordPronPair]: From 20e5127c6cd301ef9e2529c47eeae5acbd9d9977 Mon Sep 17 00:00:00 2001 From: Ben Fernandes Date: Fri, 23 Oct 2020 16:28:19 +0100 Subject: [PATCH 05/11] ISSUE-241: Updated documentation --- CHANGELOG.md | 1 + CONTRIBUTING.md | 7 ++++--- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e15363cb..39aaccf3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -66,6 +66,7 @@ Unreleased - Added Shan (`shn`) with custom extraction. (\#229) - Split Latin (`lat`) into its dialects. (\#233) - Added support for python 3.9 (\236) +- Added MyPy coverage for `wikipron`, `data` and `tests` directories ### Changed diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index bd698463..ea523361 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -74,9 +74,9 @@ To work on a feature or bug fix, here are the development steps: The `wikipron` repo has continuous integration (CI) turned on, with autobuilds running pytest and flake8 for the test suite (in the [`tests/`](tests) directory) and code style checks, respectively. -If an autobuild at a pending pull request fails because of pytest or flake8 -errors, then the errors must be fixed by further commits pushed to the branch -by the author. +If an autobuild at a pending pull request fails because of `pytest`, `flake8` or +`mypy` errors, then the errors must be fixed by further commits pushed to the +branch by the author. If you would like to help avoid wasting free Internet resources (every push triggers a new CI autobuild), @@ -85,4 +85,5 @@ you can run pytest and flake8 checks locally before pushing commits: ```bash flake8 setup.py wikipron/ tests/ pytest -vv tests/ +mypy ``` From d5bd8f9c2e45918af4a5e7c9b43bfe0ba04c22fc Mon Sep 17 00:00:00 2001 From: Ben Fernandes Date: Fri, 23 Oct 2020 17:02:53 +0100 Subject: [PATCH 06/11] ISSUE-241: Added mypy to the correct 'requirements.txt' --- data/requirements.txt | 1 - requirements.txt | 3 ++- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/data/requirements.txt b/data/requirements.txt index b4b89509..f1a1ab28 100644 --- a/data/requirements.txt +++ b/data/requirements.txt @@ -2,4 +2,3 @@ regex>=2019.12.9 requests requests-html wikipron>=1.0.0 -mypy diff --git a/requirements.txt b/requirements.txt index a6bac7f8..9f206115 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,4 +5,5 @@ requests==2.24.0 requests-html==0.10.0 segments==2.1.3 setuptools==50.3.1 -black==20.8b1 \ No newline at end of file +black==20.8b1 +mypy==0.790 \ No newline at end of file From 8ae7400a4ad9ef5201ff245a76cc5a97c7b9f46b Mon Sep 17 00:00:00 2001 From: Ben Fernandes Date: Fri, 23 Oct 2020 17:11:49 +0100 Subject: [PATCH 07/11] ISSUE-241: Ran Black formatter Also updated the contribution guidelines to include this as a step --- CONTRIBUTING.md | 13 ++++++------- wikipron/extract/default.py | 4 +--- wikipron/extract/jpn.py | 4 +--- wikipron/extract/lat.py | 4 +--- 4 files changed, 9 insertions(+), 16 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index ea523361..22f67250 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -80,10 +80,9 @@ branch by the author. If you would like to help avoid wasting free Internet resources (every push triggers a new CI autobuild), -you can run pytest and flake8 checks locally before pushing commits: - -```bash -flake8 setup.py wikipron/ tests/ -pytest -vv tests/ -mypy -``` +you can run the following checks locally before pushing commits: +* `mypy` +* `flake8 setup.py wikipron/ tests/` +* `black --line-length=79 --check setup.py wikipron tests data` + * You can fix any errors by running the same command without `--check` +* `pytest tests/` diff --git a/wikipron/extract/default.py b/wikipron/extract/default.py index 79fd7d1c..2113344a 100644 --- a/wikipron/extract/default.py +++ b/wikipron/extract/default.py @@ -15,9 +15,7 @@ IPA_XPATH_SELECTOR = '//span[@class = "IPA"]' -def _yield_phn( - request: requests_html, config: "Config" -) -> "Iterator[Pron]": +def _yield_phn(request: requests_html, config: "Config") -> "Iterator[Pron]": for pron_element in request.html.xpath(config.pron_xpath_selector): yield from yield_pron(pron_element, IPA_XPATH_SELECTOR, config) diff --git a/wikipron/extract/jpn.py b/wikipron/extract/jpn.py index 6cb5e232..44bc3b59 100644 --- a/wikipron/extract/jpn.py +++ b/wikipron/extract/jpn.py @@ -43,9 +43,7 @@ def yield_jpn_pron( yield from yield_pron(pron_element, IPA_XPATH_SELECTOR, config) -def yield_jpn_word( - word: "Word", request: requests_html -) -> "Iterator[Word]": +def yield_jpn_word(word: "Word", request: requests_html) -> "Iterator[Word]": # Again for simplicity, only grabbing first "sub"-word. word_element = request.html.xpath(_WORD_XPATH_SELECTOR, first=True) if word_element: diff --git a/wikipron/extract/lat.py b/wikipron/extract/lat.py index d450ab3b..204d2898 100644 --- a/wikipron/extract/lat.py +++ b/wikipron/extract/lat.py @@ -98,9 +98,7 @@ def _get_tags(request: requests_html) -> List[str]: return tags -def _yield_latin_word( - request: requests_html, tag: str -) -> "Iterator[Word]": +def _yield_latin_word(request: requests_html, tag: str) -> "Iterator[Word]": heading = "h2" if tag == "Latin" else "h3" word_xpath_selector = _WORD_XPATH_TEMPLATE.format(heading=heading, tag=tag) try: From 4fd09d2df86a05883af104029b94ddfb54f3a42d Mon Sep 17 00:00:00 2001 From: Ben Fernandes Date: Fri, 23 Oct 2020 21:52:17 +0100 Subject: [PATCH 08/11] ISSUE-241: Markups ISSUE-241: Markup - Alphabetised 'requirements.txt' ISSUE-241: Markup - Log invalid page title ISSUE-241: Markup - Alphabetised 'test_scrape.py' imports ISSUE-241: Markup - Added explanatory comment ISSUE-241: Markup - Improved 'config_dict' typing ISSUE-241: Markup - Improved 'scrape.py' typing --- data/src/codes.py | 3 +++ requirements.txt | 4 ++-- tests/test_data/test_scrape.py | 2 +- tests/test_wikipron/__init__.py | 4 ++-- wikipron/config.py | 3 ++- wikipron/scrape.py | 20 +++++++++++--------- 6 files changed, 21 insertions(+), 15 deletions(-) diff --git a/data/src/codes.py b/data/src/codes.py index 6d343f39..91298416 100755 --- a/data/src/codes.py +++ b/data/src/codes.py @@ -97,6 +97,9 @@ def _get_language_sizes(categories: List[str]) -> Dict[str, int]: ) if not language_search: + logging.warning( + f"Could not extract language from title: {page['title']}" + ) continue language = language_search.group(1) diff --git a/requirements.txt b/requirements.txt index 9f206115..8be90fab 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,9 +1,9 @@ +black==20.8b1 flake8==3.8.4 iso639==0.1.4 +mypy==0.790 pytest==6.1.1 requests==2.24.0 requests-html==0.10.0 segments==2.1.3 setuptools==50.3.1 -black==20.8b1 -mypy==0.790 \ No newline at end of file diff --git a/tests/test_data/test_scrape.py b/tests/test_data/test_scrape.py index 03e1fa99..3edd59f8 100644 --- a/tests/test_data/test_scrape.py +++ b/tests/test_data/test_scrape.py @@ -1,6 +1,6 @@ import os -from typing import List, Any, Dict +from typing import Any, Dict, List import pytest diff --git a/tests/test_wikipron/__init__.py b/tests/test_wikipron/__init__.py index 043cf4a5..720b4a4f 100644 --- a/tests/test_wikipron/__init__.py +++ b/tests/test_wikipron/__init__.py @@ -1,4 +1,4 @@ -from typing import Dict +from typing import Any, Dict import requests @@ -8,7 +8,7 @@ def config_factory(**kwargs) -> Config: """Create a Config object for testing.""" - config_dict: Dict = {"key": "eng"} # The one default; may be overridden. + config_dict: Dict[str, Any] = {"key": "eng"} # Default; may be overridden. config_dict.update(**kwargs) return Config(**config_dict) diff --git a/wikipron/config.py b/wikipron/config.py index 0a7032f2..3e8adbf1 100644 --- a/wikipron/config.py +++ b/wikipron/config.py @@ -124,7 +124,8 @@ def _get_casefold(self, casefold: bool) -> Callable[[Word], Word]: default_func: Callable[[Word], Word] = lambda word: word # noqa: E731 return self._casefold_word if casefold else default_func - def _casefold_word(self, word: Word): + def _casefold_word(self, word: Word) -> Word: + # 'str.casefold' returns a 'str' so we need to cast it to a 'Word' return cast(Word, str.casefold(word)) def _get_process_pron( diff --git a/wikipron/scrape.py b/wikipron/scrape.py index 7811d085..bc373657 100644 --- a/wikipron/scrape.py +++ b/wikipron/scrape.py @@ -3,12 +3,11 @@ from typing import cast import pkg_resources - import requests import requests_html from wikipron.config import Config -from wikipron.typing import Iterator, WordPronPair, Word, Pron +from wikipron.typing import Iterator, WordPronPair, Pron # Queries for the MediaWiki backend. # Documentation here: https://www.mediawiki.org/wiki/API:Categorymembers @@ -45,20 +44,23 @@ def _skip_date(date_from_word: str, cut_off_date: str) -> bool: def _scrape_once(data, config: Config) -> Iterator[WordPronPair]: session = requests_html.HTMLSession() for member in data["query"]["categorymembers"]: - word = member["title"] - date = member["timestamp"] - if _skip_word(word, config.no_skip_spaces_word) or _skip_date( - date, config.cut_off_date + title = member["title"] + timestamp = member["timestamp"] + if _skip_word(title, config.no_skip_spaces_word) or _skip_date( + timestamp, config.cut_off_date ): continue request = session.get( - _PAGE_TEMPLATE.format(word=word), timeout=10, headers=HTTP_HEADERS + _PAGE_TEMPLATE.format(word=title), timeout=10, headers=HTTP_HEADERS ) - for word, pron in config.extract_word_pron(word, request, config): + + # word_prons = config.extract_word_pron(word, request, config) + for word, pron in config.extract_word_pron(title, request, config): # Pronunciation processing is done in NFD-space; # we convert back to NFC aftewards. normalized_pron = unicodedata.normalize("NFC", pron) - yield cast(Word, word), cast(Pron, normalized_pron) + # 'cast' is required 'normalize' doesn't return a 'Pron' + yield word, cast(Pron, normalized_pron) def scrape(config: Config) -> Iterator[WordPronPair]: From adf125e70454940cf2b9b0355d266b2c5c990e5c Mon Sep 17 00:00:00 2001 From: Ben Fernandes Date: Sat, 24 Oct 2020 00:25:52 +0100 Subject: [PATCH 09/11] ISSUE-241: Markup - Using logger interpolation --- data/src/codes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/src/codes.py b/data/src/codes.py index 91298416..a5ca2ab5 100755 --- a/data/src/codes.py +++ b/data/src/codes.py @@ -98,7 +98,7 @@ def _get_language_sizes(categories: List[str]) -> Dict[str, int]: if not language_search: logging.warning( - f"Could not extract language from title: {page['title']}" + "Could not extract language from title: %s", page["title"] ) continue From 1d3089abd14b58e42df71af6c93315dd6bf67403 Mon Sep 17 00:00:00 2001 From: Ben Fernandes Date: Sun, 25 Oct 2020 01:37:03 +0000 Subject: [PATCH 10/11] ISSUE-241: Markups --- .circleci/config.yml | 2 +- CHANGELOG.md | 3 +-- CONTRIBUTING.md | 4 ++-- mypy.ini | 3 --- tests/test_wikipron/__init__.py | 5 ++++- wikipron/scrape.py | 1 - 6 files changed, 8 insertions(+), 10 deletions(-) delete mode 100644 mypy.ini diff --git a/.circleci/config.yml b/.circleci/config.yml index e1023341..a860823b 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -33,7 +33,7 @@ jobs: command: pip list - run: name: Type checking - command: mypy + command: mypy --ignore-missing-imports project/wikipron project/tests project/data - run: name: Lint working_directory: ~/ diff --git a/CHANGELOG.md b/CHANGELOG.md index 8d638167..9992f56c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -63,8 +63,7 @@ Unreleased - Split `ban` into Latin and Balinese scripts. (\#214) - Split `kir` into Cyrillic and Arabic. (\#216) - Split Latin (`lat`) into its dialects. (\#233) -- Added support for python 3.9 (\236) -- Added MyPy coverage for `wikipron`, `data` and `tests` directories +- Added MyPy coverage for `wikipron`, `tests` and `data` directories. (\#247) #### Fixed diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 22f67250..a3cfe308 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -81,8 +81,8 @@ branch by the author. If you would like to help avoid wasting free Internet resources (every push triggers a new CI autobuild), you can run the following checks locally before pushing commits: -* `mypy` +* `mypy --ignore-missing-imports wikipron/ tests/ data/` * `flake8 setup.py wikipron/ tests/` -* `black --line-length=79 --check setup.py wikipron tests data` +* `black --line-length=79 --check setup.py wikipron/ tests/ data/` * You can fix any errors by running the same command without `--check` * `pytest tests/` diff --git a/mypy.ini b/mypy.ini deleted file mode 100644 index 2f97cb32..00000000 --- a/mypy.ini +++ /dev/null @@ -1,3 +0,0 @@ -[mypy] -files=wikipron,data,tests -ignore_missing_imports=true \ No newline at end of file diff --git a/tests/test_wikipron/__init__.py b/tests/test_wikipron/__init__.py index 720b4a4f..a4cdd90c 100644 --- a/tests/test_wikipron/__init__.py +++ b/tests/test_wikipron/__init__.py @@ -19,7 +19,10 @@ def can_connect_to_wiktionary() -> bool: requests.get( "https://en.wiktionary.org/wiki/linguistics", headers=HTTP_HEADERS ) - except requests.ConnectionError: + except ( + requests.exceptions.ConnectionError, + requests.exceptions.ConnectTimeout, + ): return False else: return True diff --git a/wikipron/scrape.py b/wikipron/scrape.py index bc373657..d209bf0e 100644 --- a/wikipron/scrape.py +++ b/wikipron/scrape.py @@ -54,7 +54,6 @@ def _scrape_once(data, config: Config) -> Iterator[WordPronPair]: _PAGE_TEMPLATE.format(word=title), timeout=10, headers=HTTP_HEADERS ) - # word_prons = config.extract_word_pron(word, request, config) for word, pron in config.extract_word_pron(title, request, config): # Pronunciation processing is done in NFD-space; # we convert back to NFC aftewards. From afb795c1a4e2ceccd8f24c406bc9fbd7d49748f9 Mon Sep 17 00:00:00 2001 From: Ben Fernandes Date: Sun, 25 Oct 2020 02:23:51 +0000 Subject: [PATCH 11/11] ISSUE-241: Markup - Added working dir to Circle CI config --- .circleci/config.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.circleci/config.yml b/.circleci/config.yml index a860823b..9ef16ce3 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -33,6 +33,7 @@ jobs: command: pip list - run: name: Type checking + working_directory: ~/ command: mypy --ignore-missing-imports project/wikipron project/tests project/data - run: name: Lint