Commit

ISSUE-241: Markups
ISSUE-241: Markup - Alphabetised 'requirements.txt'
ISSUE-241: Markup - Log invalid page title
ISSUE-241: Markup - Alphabetised 'test_scrape.py' imports
ISSUE-241: Markup - Added explanatory comment
ISSUE-241: Markup - Improved 'config_dict' typing
ISSUE-241: Markup - Improved 'scrape.py' typing
ben-fernandes committed Oct 23, 2020
1 parent 8ae7400 commit 4fd09d2
Showing 6 changed files with 21 additions and 15 deletions.
3 changes: 3 additions & 0 deletions data/src/codes.py
@@ -97,6 +97,9 @@ def _get_language_sizes(categories: List[str]) -> Dict[str, int]:
         )

         if not language_search:
+            logging.warning(
+                f"Could not extract language from title: {page['title']}"
+            )
             continue

         language = language_search.group(1)

@kylebgorman (Collaborator) commented on Oct 23, 2020:

Suggest doing:

    logging.warning("Could not extract language from title: %s", page['title'])

It's usually a good idea to let the logger do string interpolation for you, because if loglevel > warning, it doesn't even bother to do the interpolation.
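
To illustrate the point about lazy interpolation, here is a standalone sketch (not part of the commit; the Expensive class is hypothetical):

import logging

logging.basicConfig(level=logging.ERROR)  # WARNING records are filtered out


class Expensive:
    """Hypothetical object whose string conversion is costly."""

    def __str__(self) -> str:
        print("interpolation ran")
        return "value"


# f-string: the message is built eagerly, so __str__ runs even though
# the record is then discarded by the level check.
logging.warning(f"title: {Expensive()}")

# %-style: the logger defers formatting until it knows the record will
# actually be emitted, so __str__ never runs here.
logging.warning("title: %s", Expensive())
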
4 changes: 2 additions & 2 deletions requirements.txt
@@ -1,9 +1,9 @@
+black==20.8b1
 flake8==3.8.4
 iso639==0.1.4
+mypy==0.790
 pytest==6.1.1
 requests==2.24.0
 requests-html==0.10.0
 segments==2.1.3
 setuptools==50.3.1
-black==20.8b1
-mypy==0.790
2 changes: 1 addition & 1 deletion tests/test_data/test_scrape.py
@@ -1,6 +1,6 @@
 import os

-from typing import List, Any, Dict
+from typing import Any, Dict, List

 import pytest
4 changes: 2 additions & 2 deletions tests/test_wikipron/__init__.py
@@ -1,4 +1,4 @@
-from typing import Dict
+from typing import Any, Dict

 import requests

@@ -8,7 +8,7 @@

 def config_factory(**kwargs) -> Config:
     """Create a Config object for testing."""
-    config_dict: Dict = {"key": "eng"}  # The one default; may be overridden.
+    config_dict: Dict[str, Any] = {"key": "eng"}  # Default; may be overridden.
     config_dict.update(**kwargs)
     return Config(**config_dict)
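
On the typing change above: a bare Dict is implicitly Dict[Any, Any], so parametrizing it documents the intent and lets mypy check the keys (a minimal sketch, not repository code):

from typing import Any, Dict

config_dict: Dict[str, Any] = {"key": "eng"}
config_dict["casefold"] = True  # fine: values are typed as Any
# config_dict[0] = "x"          # mypy error: keys must be str
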
3 changes: 2 additions & 1 deletion wikipron/config.py
@@ -124,7 +124,8 @@ def _get_casefold(self, casefold: bool) -> Callable[[Word], Word]:
         default_func: Callable[[Word], Word] = lambda word: word  # noqa: E731
         return self._casefold_word if casefold else default_func

-    def _casefold_word(self, word: Word):
+    def _casefold_word(self, word: Word) -> Word:
+        # 'str.casefold' returns a 'str' so we need to cast it to a 'Word'
         return cast(Word, str.casefold(word))

     def _get_process_pron(
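
A minimal sketch of why the cast is needed, assuming Word is a NewType over str as in wikipron.typing (casefold_word here is a hypothetical standalone recreation of the method):

from typing import NewType, cast

Word = NewType("Word", str)  # as assumed from wikipron.typing


def casefold_word(word: Word) -> Word:
    # word.casefold() returns a plain str; mypy rejects returning that
    # as a Word directly, so cast() re-tags it (no runtime effect).
    return cast(Word, word.casefold())


print(casefold_word(Word("Straße")))  # strasse
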
20 changes: 11 additions & 9 deletions wikipron/scrape.py
@@ -3,12 +3,11 @@
 from typing import cast

 import pkg_resources
-
 import requests
 import requests_html

 from wikipron.config import Config
-from wikipron.typing import Iterator, WordPronPair, Word, Pron
+from wikipron.typing import Iterator, WordPronPair, Pron

 # Queries for the MediaWiki backend.
 # Documentation here: https://www.mediawiki.org/wiki/API:Categorymembers
@@ -45,20 +44,23 @@ def _skip_date(date_from_word: str, cut_off_date: str) -> bool:
 def _scrape_once(data, config: Config) -> Iterator[WordPronPair]:
     session = requests_html.HTMLSession()
     for member in data["query"]["categorymembers"]:
-        word = member["title"]
-        date = member["timestamp"]
-        if _skip_word(word, config.no_skip_spaces_word) or _skip_date(
-            date, config.cut_off_date
+        title = member["title"]
+        timestamp = member["timestamp"]
+        if _skip_word(title, config.no_skip_spaces_word) or _skip_date(
+            timestamp, config.cut_off_date
         ):
             continue
         request = session.get(
-            _PAGE_TEMPLATE.format(word=word), timeout=10, headers=HTTP_HEADERS
+            _PAGE_TEMPLATE.format(word=title), timeout=10, headers=HTTP_HEADERS
         )
-        for word, pron in config.extract_word_pron(word, request, config):
+
+        # word_prons = config.extract_word_pron(word, request, config)
+        for word, pron in config.extract_word_pron(title, request, config):
             # Pronunciation processing is done in NFD-space;
             # we convert back to NFC afterwards.
             normalized_pron = unicodedata.normalize("NFC", pron)
-            yield cast(Word, word), cast(Pron, normalized_pron)
+            # 'cast' is required as 'normalize' doesn't return a 'Pron'
+            yield word, cast(Pron, normalized_pron)


 def scrape(config: Config) -> Iterator[WordPronPair]:
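
For context on the NFD/NFC comment, a standalone sketch (not repository code) of the normalization round-trip:

import unicodedata

# In NFD-space, "é" decomposes into "e" + U+0301 (combining acute accent).
pron_nfd = unicodedata.normalize("NFD", "kafé")
assert len(pron_nfd) == 5  # assuming the literal above is NFC (4 code points)

# Recomposing to NFC restores the single precomposed code point.
pron_nfc = unicodedata.normalize("NFC", pron_nfd)
assert len(pron_nfc) == 4
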
