Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ISSUE-241: Use mypy for type checking #247

Merged
merged 12 commits into from
Oct 25, 2020
Merged
3 changes: 3 additions & 0 deletions .circleci/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,9 @@ jobs:
- run:
name: Show installed Python packages
command: pip list
- run:
name: Type checking
command: mypy
ben-fernandes-sw marked this conversation as resolved.
Show resolved Hide resolved
- run:
name: Lint
working_directory: ~/
Expand Down
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,7 @@ __pycache__/
*.egg-info/
*.log
**/tars
**/freq_tsvs
**/freq_tsvs
env/

.idea/
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ Unreleased
- Added Shan (`shn`) with custom extraction. (\#229)
- Split Latin (`lat`) into its dialects. (\#233)
- Added support for Python 3.9 (\#236)
ben-fernandes-sw marked this conversation as resolved.
Show resolved Hide resolved
- Added MyPy coverage for `wikipron`, `data` and `tests` directories
ben-fernandes-sw marked this conversation as resolved.
Show resolved Hide resolved

### Changed

Expand Down
7 changes: 4 additions & 3 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -74,9 +74,9 @@ To work on a feature or bug fix, here are the development steps:
The `wikipron` repo has continuous integration (CI) turned on,
with autobuilds running pytest and flake8 for the test suite
(in the [`tests/`](tests) directory) and code style checks, respectively.
If an autobuild at a pending pull request fails because of pytest or flake8
errors, then the errors must be fixed by further commits pushed to the branch
by the author.
If an autobuild at a pending pull request fails because of `pytest`, `flake8` or
`mypy` errors, then the errors must be fixed by further commits pushed to the
branch by the author.
ben-fernandes-sw marked this conversation as resolved.
Show resolved Hide resolved

If you would like to help avoid wasting free Internet resources
(every push triggers a new CI autobuild),
Expand All @@ -85,4 +85,5 @@ you can run pytest and flake8 checks locally before pushing commits:
```bash
flake8 setup.py wikipron/ tests/
pytest -vv tests/
mypy
```
1 change: 1 addition & 0 deletions data/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,4 @@ regex>=2019.12.9
requests
requests-html
wikipron>=1.0.0
mypy
ben-fernandes-sw marked this conversation as resolved.
Show resolved Hide resolved
10 changes: 8 additions & 2 deletions data/src/codes.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,9 +91,15 @@ def _get_language_sizes(categories: List[str]) -> Dict[str, int]:
).json()
for page in data["query"]["pages"].values():
size = page["categoryinfo"]["size"]
language = re.search(

language_search = re.search(
r"Category:(.+?) terms with IPA pronunciation", page["title"]
).group(1)
)

if not language_search:
continue
ben-fernandes-sw marked this conversation as resolved.
Show resolved Hide resolved

language = language_search.group(1)
language_sizes[language] = size
return language_sizes

Expand Down
3 changes: 3 additions & 0 deletions mypy.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[mypy]
files=wikipron,data,tests
ignore_missing_imports=true
ben-fernandes-sw marked this conversation as resolved.
Show resolved Hide resolved
3 changes: 2 additions & 1 deletion tests/test_data/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import shutil

from contextlib import contextmanager
from typing import Iterator

_TESTS_DIR = os.path.dirname(os.getcwd())
_TSV_PATH = f"{_TESTS_DIR}/tsv"
Expand All @@ -21,7 +22,7 @@ def write_dummy_phones_files(key: str, dialect: str) -> None:


@contextmanager
def handle_dummy_files(phones: bool, key: str, dialect: str) -> str:
def handle_dummy_files(phones: bool, key: str, dialect: str) -> Iterator[str]:
"""Creates and removes dummy directories for housing
TSV and phones files."""
os.mkdir(_TSV_PATH)
Expand Down
5 changes: 3 additions & 2 deletions tests/test_data/test_scrape.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import os

from typing import List
from typing import List, Any, Dict
ben-fernandes-sw marked this conversation as resolved.
Show resolved Hide resolved

import pytest

Expand Down Expand Up @@ -46,7 +46,7 @@
],
)
def test_file_creation(
config_settings: object,
config_settings: Dict[str, Any],
dialect_suffix: str,
phones: bool,
expected_file_name: List[str],
Expand All @@ -55,6 +55,7 @@ def test_file_creation(
file names based on presence or absence of dialect specification
or .phones files for a given language.
"""
dummy_tsv_path: str
with handle_dummy_files(
phones, config_settings["key"], dialect_suffix
) as dummy_tsv_path:
Expand Down
8 changes: 5 additions & 3 deletions tests/test_wikipron/__init__.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
from typing import Dict

import requests

from wikipron.scrape import HTTP_HEADERS
from wikipron.config import Config
from wikipron.scrape import HTTP_HEADERS


def config_factory(**kwargs) -> Config:
"""Create a Config object for testing."""
config_dict = {"key": "eng"} # The one default; may be overridden.
config_dict: Dict = {"key": "eng"} # The one default; may be overridden.
ben-fernandes-sw marked this conversation as resolved.
Show resolved Hide resolved
config_dict.update(**kwargs)
return Config(**config_dict)

Expand All @@ -17,7 +19,7 @@ def can_connect_to_wiktionary() -> bool:
requests.get(
"https://en.wiktionary.org/wiki/linguistics", headers=HTTP_HEADERS
)
except (requests.ConnectionError, requests.ConnectTimeout):
except requests.ConnectionError:
ben-fernandes-sw marked this conversation as resolved.
Show resolved Hide resolved
return False
else:
return True
5 changes: 2 additions & 3 deletions tests/test_wikipron/test_extract.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,9 @@
import pytest
import requests
import requests_html

from wikipron.extract import EXTRACTION_FUNCTIONS
from wikipron.extract.core import _skip_pron
from wikipron.extract.default import extract_word_pron_default

from . import config_factory


Expand All @@ -14,7 +13,7 @@
def test_extraction_functions_have_the_same_signature(func):
expected_annotations = {
"word": "Word",
"request": requests.Response,
"request": requests_html,
"config": "Config",
"return": "Iterator[WordPronPair]",
}
Expand Down
8 changes: 6 additions & 2 deletions wikipron/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import logging
import re

from typing import Callable, Optional
from typing import Callable, Optional, cast

import iso639
import segments
Expand Down Expand Up @@ -121,7 +121,11 @@ def _get_cut_off_date(self, cut_off_date: Optional[str]) -> str:
return cut_off_date

def _get_casefold(self, casefold: bool) -> Callable[[Word], Word]:
return str.casefold if casefold else lambda word: word # noqa: E731
default_func: Callable[[Word], Word] = lambda word: word # noqa: E731
return self._casefold_word if casefold else default_func

def _casefold_word(self, word: Word):
ben-fernandes-sw marked this conversation as resolved.
Show resolved Hide resolved
return cast(Word, str.casefold(word))

def _get_process_pron(
self,
Expand Down
7 changes: 3 additions & 4 deletions wikipron/extract/cmn.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,10 @@
import itertools
import typing

import requests
import requests_html

from wikipron.extract.default import yield_pron, IPA_XPATH_SELECTOR


if typing.TYPE_CHECKING:
from wikipron.config import Config
from wikipron.typing import Iterator, Word, Pron, WordPronPair
Expand All @@ -22,14 +21,14 @@


def yield_cmn_pron(
request: requests.Response, config: "Config"
request: requests_html, config: "Config"
) -> "Iterator[Pron]":
for li_container in request.html.xpath(_PRON_XPATH_TEMPLATE):
yield from yield_pron(li_container, IPA_XPATH_SELECTOR, config)


def extract_word_pron_cmn(
word: "Word", request: requests.Response, config: "Config"
word: "Word", request: requests_html, config: "Config"
) -> "Iterator[WordPronPair]":
words = itertools.repeat(word)
prons = yield_cmn_pron(request, config)
Expand Down
7 changes: 3 additions & 4 deletions wikipron/extract/default.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,10 @@
import itertools
import typing

import requests
import requests_html

from wikipron.extract.core import yield_pron


if typing.TYPE_CHECKING:
from wikipron.config import Config
from wikipron.typing import Iterator, Pron, Word, WordPronPair
Expand All @@ -17,14 +16,14 @@


def _yield_phn(
request: requests.Response, config: "Config"
request: requests_html, config: "Config"
) -> "Iterator[Pron]":
for pron_element in request.html.xpath(config.pron_xpath_selector):
yield from yield_pron(pron_element, IPA_XPATH_SELECTOR, config)


def extract_word_pron_default(
word: "Word", request: requests.Response, config: "Config"
word: "Word", request: requests_html, config: "Config"
) -> "Iterator[WordPronPair]":
words = itertools.repeat(word)
prons = _yield_phn(request, config)
Expand Down
8 changes: 4 additions & 4 deletions wikipron/extract/jpn.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
import itertools
import typing

import requests
import requests_html

from wikipron.extract.default import yield_pron, IPA_XPATH_SELECTOR

Expand All @@ -34,7 +34,7 @@


def yield_jpn_pron(
request: requests.Response, config: "Config"
request: requests_html, config: "Config"
) -> "Iterator[Pron]":
# For simplicity, just want to grab the first transcription.
# Will encounter words that have no transcription.
Expand All @@ -44,7 +44,7 @@ def yield_jpn_pron(


def yield_jpn_word(
word: "Word", request: requests.Response
word: "Word", request: requests_html
) -> "Iterator[Word]":
# Again for simplicity, only grabbing first "sub"-word.
word_element = request.html.xpath(_WORD_XPATH_SELECTOR, first=True)
Expand All @@ -56,7 +56,7 @@ def yield_jpn_word(


def extract_word_pron_jpn(
word: "Word", request: requests.Response, config: "Config"
word: "Word", request: requests_html, config: "Config"
) -> "Iterator[WordPronPair]":
# If we can't find a kana alternative, then the headword
# must itself be kana.
Expand Down
4 changes: 2 additions & 2 deletions wikipron/extract/khb.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import itertools
import typing

import requests
import requests_html

from wikipron.extract.default import yield_pron

Expand All @@ -29,7 +29,7 @@


def extract_word_pron_lu(
word: "Word", request: requests.Response, config: "Config"
word: "Word", request: requests_html, config: "Config"
) -> "Iterator[WordPronPair]":
words = itertools.repeat(word)
prons = yield_pron(request.html, _IPA_XPATH_SELECTOR, config)
Expand Down
5 changes: 2 additions & 3 deletions wikipron/extract/khm.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,10 @@
import itertools
import typing

import requests
import requests_html

from wikipron.extract.default import yield_pron


if typing.TYPE_CHECKING:
from wikipron.config import Config
from wikipron.typing import Iterator, Word, WordPronPair
Expand All @@ -17,7 +16,7 @@


def extract_word_pron_khmer(
word: "Word", request: requests.Response, config: "Config"
word: "Word", request: requests_html, config: "Config"
) -> "Iterator[WordPronPair]":
words = itertools.repeat(word)
prons = yield_pron(request.html, _IPA_XPATH_SELECTOR, config)
Expand Down
15 changes: 6 additions & 9 deletions wikipron/extract/lat.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,14 +43,12 @@

import itertools
import typing
from typing import List

import requests
import requests_html

from wikipron.extract.default import yield_pron, IPA_XPATH_SELECTOR

from typing import List


if typing.TYPE_CHECKING:
from wikipron.config import Config
from wikipron.typing import Iterator, Pron, Word, WordPronPair
Expand Down Expand Up @@ -87,7 +85,7 @@
"""


def _get_tags(request: requests.Response) -> List[str]:
def _get_tags(request: requests_html) -> List[str]:
"""Extract the Latin Etymology ID tags from the table of contents."""
tags = []
for a_element in request.html.xpath(_TOC_ETYMOLOGY_XPATH_SELECTOR):
Expand All @@ -101,7 +99,7 @@ def _get_tags(request: requests.Response) -> List[str]:


def _yield_latin_word(
request: requests.Response, tag: str
request: requests_html, tag: str
) -> "Iterator[Word]":
heading = "h2" if tag == "Latin" else "h3"
word_xpath_selector = _WORD_XPATH_TEMPLATE.format(heading=heading, tag=tag)
Expand All @@ -119,7 +117,7 @@ def _yield_latin_word(


def _yield_latin_pron(
request: requests.Response, config: "Config", tag: str
request: requests_html, config: "Config", tag: str
) -> "Iterator[Pron]":
heading = "h2" if tag == "Latin" else "h3"
if config.dialect:
Expand All @@ -140,12 +138,11 @@ def _yield_latin_pron(


def extract_word_pron_latin(
word: "Word", request: requests.Response, config: "Config"
word: "Word", request: requests_html, config: "Config"
) -> "Iterator[WordPronPair]":
# For Latin, we don't use the title word from the Wiktionary page,
# because it never has macrons (necessary for Latin vowel length).
# We will get the word from each "Etymology" section within the page.
word = None # noqa: F841
tags = _get_tags(request)
for tag in tags:
# The words and prons are extracted from the same request response but
Expand Down
4 changes: 2 additions & 2 deletions wikipron/extract/shn.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import itertools
import typing

import requests
import requests_html

from wikipron.extract.default import yield_pron

Expand All @@ -25,7 +25,7 @@


def extract_word_pron_shan(
word: "Word", request: requests.Response, config: "Config"
word: "Word", request: requests_html, config: "Config"
) -> "Iterator[WordPronPair]":
words = itertools.repeat(word)
prons = yield_pron(request.html, _IPA_XPATH_SELECTOR, config)
Expand Down
Loading