Use mypy for type checking (#247)

* ISSUE-241: Ignoring 'env' and '.idea' directories
* ISSUE-241: Added 'mypy' to 'requirements.txt'
* ISSUE-241: Added 'Type checking' step to CircleCI
* ISSUE-241: Fixed mypy issues
* ISSUE-241: Updated documentation
* ISSUE-241: Added mypy to the correct 'requirements.txt'
* ISSUE-241: Ran Black formatter
  Also updated the contribution guidelines to include this as a step
* ISSUE-241: Markups
  ISSUE-241: Markup - Alphabetised 'requirements.txt'
  ISSUE-241: Markup - Log invalid page title
  ISSUE-241: Markup - Alphabetised 'test_scrape.py' imports
  ISSUE-241: Markup - Added explanatory comment
  ISSUE-241: Markup - Improved 'config_dict' typing
  ISSUE-241: Markup - Improved 'scrape.py' typing
* ISSUE-241: Markup - Using logger interpolation
* ISSUE-241: Markups
* ISSUE-241: Markup - Added working dir to Circle CI config
CUNY-CL · Oct 25, 2020 · 711873d · 711873d
1 parent baa00c1
commit 711873d
Show file tree

Hide file tree

Showing 21 changed files with 93 additions and 73 deletions.
diff --git a/.circleci/config.yml b/.circleci/config.yml
@@ -31,6 +31,10 @@ jobs:
       - run:
           name: Show installed Python packages
           command: pip list
+      - run:
+          name: Type checking
+          working_directory: ~/
+          command: mypy --ignore-missing-imports project/wikipron project/tests project/data
       - run:
           name: Lint
           working_directory: ~/

diff --git a/.gitignore b/.gitignore
@@ -5,4 +5,7 @@ __pycache__/
 *.egg-info/
 *.log
 **/tars
-**/freq_tsvs
+**/freq_tsvs
+env/
+
+.idea/
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -63,6 +63,7 @@ Unreleased
 -   Split `ban` into Latin and Balinese scripts. (\#214)
 -   Split `kir` into Cyrillic and Arabic. (\#216)
 -   Split Latin (`lat`) into its dialects. (\#233)
+-   Added MyPy coverage for `wikipron`, `tests` and `data` directories. (\#247)
 
 #### Fixed
 

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -74,15 +74,15 @@ To work on a feature or bug fix, here are the development steps:
 The `wikipron` repo has continuous integration (CI) turned on,
 with autobuilds running pytest and flake8 for the test suite
 (in the [`tests/`](tests) directory) and code style checks, respectively.
-If an autobuild at a pending pull request fails because of pytest or flake8
-errors, then the errors must be fixed by further commits pushed to the branch
-by the author.
+If an autobuild at a pending pull request fails because of `pytest`, `flake8` or
+`mypy` errors, then the errors must be fixed by further commits pushed to the
+branch by the author.
 
 If you would like to help avoid wasting free Internet resources
 (every push triggers a new CI autobuild),
-you can run pytest and flake8 checks locally before pushing commits:
-
-```bash
-flake8 setup.py wikipron/ tests/
-pytest -vv tests/
-```
+you can run the following checks locally before pushing commits:
+* `mypy --ignore-missing-imports wikipron/ tests/ data/`
+* `flake8 setup.py wikipron/ tests/`
+* `black --line-length=79 --check setup.py wikipron/ tests/ data/`
+    * You can fix any errors by running the same command without `--check`
+* `pytest tests/`
diff --git a/data/src/codes.py b/data/src/codes.py
@@ -91,9 +91,18 @@ def _get_language_sizes(categories: List[str]) -> Dict[str, int]:
         ).json()
         for page in data["query"]["pages"].values():
             size = page["categoryinfo"]["size"]
-            language = re.search(
+
+            language_search = re.search(
                 r"Category:(.+?) terms with IPA pronunciation", page["title"]
-            ).group(1)
+            )
+
+            if not language_search:
+                logging.warning(
+                    "Could not extract language from title: %s", page["title"]
+                )
+                continue
+
+            language = language_search.group(1)
             language_sizes[language] = size
     return language_sizes
 

diff --git a/requirements.txt b/requirements.txt
@@ -1,8 +1,9 @@
+black==20.8b1
 flake8==3.8.4
 iso639==0.1.4
+mypy==0.790
 pytest==6.1.1
 requests==2.24.0
 requests-html==0.10.0
 segments==2.1.3
 setuptools==50.3.1
-black==20.8b1
diff --git a/tests/test_data/__init__.py b/tests/test_data/__init__.py
@@ -2,6 +2,7 @@
 import shutil
 
 from contextlib import contextmanager
+from typing import Iterator
 
 _TESTS_DIR = os.path.dirname(os.getcwd())
 _TSV_PATH = f"{_TESTS_DIR}/tsv"
@@ -21,7 +22,7 @@ def write_dummy_phones_files(key: str, dialect: str) -> None:
 
 
 @contextmanager
-def handle_dummy_files(phones: bool, key: str, dialect: str) -> str:
+def handle_dummy_files(phones: bool, key: str, dialect: str) -> Iterator[str]:
     """Creates and removes dummy directories for housing
     TSV and phones files."""
     os.mkdir(_TSV_PATH)

diff --git a/tests/test_data/test_scrape.py b/tests/test_data/test_scrape.py
@@ -1,6 +1,6 @@
 import os
 
-from typing import List
+from typing import Any, Dict, List
 
 import pytest
 
@@ -46,7 +46,7 @@
     ],
 )
 def test_file_creation(
-    config_settings: object,
+    config_settings: Dict[str, Any],
     dialect_suffix: str,
     phones: bool,
     expected_file_name: List[str],
@@ -55,6 +55,7 @@ def test_file_creation(
     file names based on presence or absence of dialect specification
     or .phones files for a given language.
     """
+    dummy_tsv_path: str
     with handle_dummy_files(
         phones, config_settings["key"], dialect_suffix
     ) as dummy_tsv_path:

diff --git a/tests/test_wikipron/__init__.py b/tests/test_wikipron/__init__.py
@@ -1,12 +1,14 @@
+from typing import Any, Dict
+
 import requests
 
-from wikipron.scrape import HTTP_HEADERS
 from wikipron.config import Config
+from wikipron.scrape import HTTP_HEADERS
 
 
 def config_factory(**kwargs) -> Config:
     """Create a Config object for testing."""
-    config_dict = {"key": "eng"}  # The one default; may be overridden.
+    config_dict: Dict[str, Any] = {"key": "eng"}  # Default; may be overridden.
     config_dict.update(**kwargs)
     return Config(**config_dict)
 
@@ -17,7 +19,10 @@ def can_connect_to_wiktionary() -> bool:
         requests.get(
             "https://en.wiktionary.org/wiki/linguistics", headers=HTTP_HEADERS
         )
-    except (requests.ConnectionError, requests.ConnectTimeout):
+    except (
+        requests.exceptions.ConnectionError,
+        requests.exceptions.ConnectTimeout,
+    ):
         return False
     else:
         return True
diff --git a/tests/test_wikipron/test_extract.py b/tests/test_wikipron/test_extract.py
@@ -1,10 +1,9 @@
 import pytest
-import requests
+import requests_html
 
 from wikipron.extract import EXTRACTION_FUNCTIONS
 from wikipron.extract.core import _skip_pron
 from wikipron.extract.default import extract_word_pron_default
-
 from . import config_factory
 
 
@@ -14,7 +13,7 @@
 def test_extraction_functions_have_the_same_signature(func):
     expected_annotations = {
         "word": "Word",
-        "request": requests.Response,
+        "request": requests_html,
         "config": "Config",
         "return": "Iterator[WordPronPair]",
     }

diff --git a/wikipron/config.py b/wikipron/config.py
@@ -3,7 +3,7 @@
 import logging
 import re
 
-from typing import Callable, Optional
+from typing import Callable, Optional, cast
 
 import iso639
 import segments
@@ -121,7 +121,12 @@ def _get_cut_off_date(self, cut_off_date: Optional[str]) -> str:
         return cut_off_date
 
     def _get_casefold(self, casefold: bool) -> Callable[[Word], Word]:
-        return str.casefold if casefold else lambda word: word  # noqa: E731
+        default_func: Callable[[Word], Word] = lambda word: word  # noqa: E731
+        return self._casefold_word if casefold else default_func
+
+    def _casefold_word(self, word: Word) -> Word:
+        # 'str.casefold' returns a 'str' so we need to cast it to a 'Word'
+        return cast(Word, str.casefold(word))
 
     def _get_process_pron(
         self,

diff --git a/wikipron/extract/cmn.py b/wikipron/extract/cmn.py
@@ -3,11 +3,10 @@
 import itertools
 import typing
 
-import requests
+import requests_html
 
 from wikipron.extract.default import yield_pron, IPA_XPATH_SELECTOR
 
-
 if typing.TYPE_CHECKING:
     from wikipron.config import Config
     from wikipron.typing import Iterator, Word, Pron, WordPronPair
@@ -22,14 +21,14 @@
 
 
 def yield_cmn_pron(
-    request: requests.Response, config: "Config"
+    request: requests_html, config: "Config"
 ) -> "Iterator[Pron]":
     for li_container in request.html.xpath(_PRON_XPATH_TEMPLATE):
         yield from yield_pron(li_container, IPA_XPATH_SELECTOR, config)
 
 
 def extract_word_pron_cmn(
-    word: "Word", request: requests.Response, config: "Config"
+    word: "Word", request: requests_html, config: "Config"
 ) -> "Iterator[WordPronPair]":
     words = itertools.repeat(word)
     prons = yield_cmn_pron(request, config)

diff --git a/wikipron/extract/default.py b/wikipron/extract/default.py
@@ -3,11 +3,10 @@
 import itertools
 import typing
 
-import requests
+import requests_html
 
 from wikipron.extract.core import yield_pron
 
-
 if typing.TYPE_CHECKING:
     from wikipron.config import Config
     from wikipron.typing import Iterator, Pron, Word, WordPronPair
@@ -16,15 +15,13 @@
 IPA_XPATH_SELECTOR = '//span[@class = "IPA"]'
 
 
-def _yield_phn(
-    request: requests.Response, config: "Config"
-) -> "Iterator[Pron]":
+def _yield_phn(request: requests_html, config: "Config") -> "Iterator[Pron]":
     for pron_element in request.html.xpath(config.pron_xpath_selector):
         yield from yield_pron(pron_element, IPA_XPATH_SELECTOR, config)
 
 
 def extract_word_pron_default(
-    word: "Word", request: requests.Response, config: "Config"
+    word: "Word", request: requests_html, config: "Config"
 ) -> "Iterator[WordPronPair]":
     words = itertools.repeat(word)
     prons = _yield_phn(request, config)

diff --git a/wikipron/extract/jpn.py b/wikipron/extract/jpn.py
@@ -16,7 +16,7 @@
 import itertools
 import typing
 
-import requests
+import requests_html
 
 from wikipron.extract.default import yield_pron, IPA_XPATH_SELECTOR
 
@@ -34,7 +34,7 @@
 
 
 def yield_jpn_pron(
-    request: requests.Response, config: "Config"
+    request: requests_html, config: "Config"
 ) -> "Iterator[Pron]":
     # For simplicity, just want to grab the first transcription.
     # Will encounter words that have no transcription.
@@ -43,9 +43,7 @@ def yield_jpn_pron(
         yield from yield_pron(pron_element, IPA_XPATH_SELECTOR, config)
 
 
-def yield_jpn_word(
-    word: "Word", request: requests.Response
-) -> "Iterator[Word]":
+def yield_jpn_word(word: "Word", request: requests_html) -> "Iterator[Word]":
     # Again for simplicity, only grabbing first "sub"-word.
     word_element = request.html.xpath(_WORD_XPATH_SELECTOR, first=True)
     if word_element:
@@ -56,7 +54,7 @@ def yield_jpn_word(
 
 
 def extract_word_pron_jpn(
-    word: "Word", request: requests.Response, config: "Config"
+    word: "Word", request: requests_html, config: "Config"
 ) -> "Iterator[WordPronPair]":
     # If we can't find a kana alternative, then the headword
     # must itself be kana.

diff --git a/wikipron/extract/khb.py b/wikipron/extract/khb.py
@@ -7,7 +7,7 @@
 import itertools
 import typing
 
-import requests
+import requests_html
 
 from wikipron.extract.default import yield_pron
 
@@ -29,7 +29,7 @@
 
 
 def extract_word_pron_lu(
-    word: "Word", request: requests.Response, config: "Config"
+    word: "Word", request: requests_html, config: "Config"
 ) -> "Iterator[WordPronPair]":
     words = itertools.repeat(word)
     prons = yield_pron(request.html, _IPA_XPATH_SELECTOR, config)

diff --git a/wikipron/extract/khm.py b/wikipron/extract/khm.py
@@ -3,11 +3,10 @@
 import itertools
 import typing
 
-import requests
+import requests_html
 
 from wikipron.extract.default import yield_pron
 
-
 if typing.TYPE_CHECKING:
     from wikipron.config import Config
     from wikipron.typing import Iterator, Word, WordPronPair
@@ -17,7 +16,7 @@
 
 
 def extract_word_pron_khmer(
-    word: "Word", request: requests.Response, config: "Config"
+    word: "Word", request: requests_html, config: "Config"
 ) -> "Iterator[WordPronPair]":
     words = itertools.repeat(word)
     prons = yield_pron(request.html, _IPA_XPATH_SELECTOR, config)