Basic mypy support #613

Merged · 6 commits · Feb 26, 2024
Changes from 2 commits
10 changes: 6 additions & 4 deletions .github/workflows/pipeline.yml
@@ -29,14 +29,16 @@ jobs: # jobs. We will have two jobs (test and publish) with multiple steps.
        python -m pip install --upgrade pip
        poetry config virtualenvs.create false --local
        poetry install
-        pip install pytest pylint coverage mypy coveralls
+        pip install pytest pylint coverage coveralls
        python -m nltk.downloader punkt stopwords
    - name: Pylint # Run pylint static analysis
      run: |
        poetry run pylint newspaper --fail-under=8.0
-    # - name: mypy # Run mypy static analysis
-    #   run: |
-    #     poetry run mypy -p newspaper
+    - name: mypy # Run mypy static analysis
+      run: |
+        poetry run mypy -p newspaper
+      env:
+        MYPYPATH: stubs
    - name: Pytest # Run pytest
      run: |
        poetry run coverage run -m --source=newspaper pytest tests
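The new `env: MYPYPATH: stubs` setting points mypy at a repo-local `stubs/` directory for dependencies that ship no type information. As a rough, hypothetical illustration (the actual stub files added by this PR are not shown in this diff), a minimal stub for a dependency such as tldextract might look like:

```python
# stubs/tldextract/__init__.pyi -- hypothetical stub for illustration only;
# the real stubs/ directory in this PR may cover different modules and names.
from typing import NamedTuple

class ExtractResult(NamedTuple):
    subdomain: str
    domain: str
    suffix: str

def extract(url: str) -> ExtractResult: ...
```

mypy consults MYPYPATH when resolving imports, so a stub like this is used instead of the package being reported as missing type hints.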
4 changes: 2 additions & 2 deletions CONTRIBUTING.md
@@ -126,8 +126,8 @@ source venv/bin/activate
# Upgrade pip (very important!)
pip install --upgrade pip

-# Install Newspaper3k in editable mode
-pip install -e '.[dev]'
+# Install Newspaper4k in editable mode
+pip install -e '.'
```

Last, install the pre-commit hooks with:
2 changes: 1 addition & 1 deletion newspaper/__init__.py
@@ -34,7 +34,7 @@
logging.getLogger(__name__).addHandler(NullHandler())


-def article(url: str, language: Optional[str] = "en", **kwargs) -> Article:
+def article(url: str, language: str = "en", **kwargs) -> Article:
    """Shortcut function to fetch and parse a newspaper article from a URL.

    Args:
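Dropping `Optional` here tightens the contract: `Optional[str] = "en"` tells mypy that callers may pass `None`, which the function body never handles. A minimal sketch (not the real function) of what the change buys:

```python
from typing import Optional

def lang_code_old(language: Optional[str] = "en") -> str:
    # mypy accepts lang_code_old(None), then flags the body:
    # error: Item "None" of "Optional[str]" has no attribute "upper"
    return language.upper()

def lang_code_new(language: str = "en") -> str:
    # lang_code_new(None) is now rejected at the call site instead,
    # which matches what the implementation actually supports
    return language.upper()
```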
27 changes: 14 additions & 13 deletions newspaper/article.py
@@ -140,11 +140,11 @@ class Article:
    def __init__(
        self,
        url: str,
-        title: Optional[str] = "",
-        source_url: Optional[str] = "",
-        read_more_link: Optional[str] = "",
+        title: str = "",
+        source_url: str = "",
+        read_more_link: str = "",
        config: Optional[Configuration] = None,
-        **kwargs: Dict[str, Any],
+        **kwargs: Any,
    ):
"""Constructs the article class. Will not download or parse the article

Expand All @@ -166,8 +166,8 @@ def __init__(
use the default settingsDefaults to None.

Keyword Args:
**kwargs: Any Configuration class propriety can be overwritten
through init keyword params.
**kwargs: Any Configuration class property can be overwritten
through init keyword params.
Additionally, you can specify any of the following
requests parameters:
headers, cookies, auth, timeout, allow_redirects,
@@ -448,7 +448,8 @@ def download(
                break

        self.html = html
-        self.title = title
+        if title is not None:
+            self.title = title

        return self

@@ -745,14 +746,14 @@ def to_json(self, as_string: Optional[bool] = True) -> Union[str, Dict]:

        self.throw_if_not_parsed_verbose()

-        article_dict = {}
+        article_dict: Dict[str, Any] = {}

        for metadata in settings.article_json_fields:
-            article_dict[metadata] = getattr(
-                self, metadata, getattr(self.config, metadata, None)
-            )
-            if isinstance(article_dict[metadata], datetime):
-                article_dict[metadata] = article_dict[metadata].isoformat()
+            value = getattr(self, metadata, getattr(self.config, metadata, None))
+            if isinstance(value, datetime):
+                article_dict[metadata] = value.isoformat()
+            else:
+                article_dict[metadata] = value
        if as_string:
            return json.dumps(article_dict, indent=4, ensure_ascii=False)
        else:
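The added `Dict[str, Any]` annotation is there because mypy generally cannot infer an element type for a bare `{}` literal and reports "Need type annotation". A minimal sketch:

```python
from typing import Any, Dict

d = {}  # mypy: Need type annotation for "d"
article_dict: Dict[str, Any] = {}  # explicit annotation resolves it
```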
2 changes: 2 additions & 0 deletions newspaper/cli.py
@@ -221,10 +221,12 @@ def csv_string(article_dict: dict) -> str:

        output = article.to_json(as_string=args.output_format == "json")
        if args.output_format == "json":
+            assert isinstance(output, str)
Owner: Do you think that maybe a test case for article.to_json would be better than an assert here? My thoughts: this would only trigger if to_json is buggy, and for the end user an assertion error here would not mean a lot.

Contributor Author: I've added some overload bits to to_json (see https://stackoverflow.com/a/76302414/320546 for an explanation) to give it a more explicit link between the params and the output, and so have removed these asserts. (A sketch of the pattern follows this file's diff.)

            if idx < len(urls) - 1:
                output += ","
            write_output(output)
        elif args.output_format == "csv":
+            assert isinstance(output, Dict)
            if idx == 0:
                write_output(csv_header())
            write_output(csv_string(output))
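For reference, the `@overload` pattern the author mentions ties the `as_string` argument to the return type, which removes the need for `assert isinstance(...)` at call sites. A sketch of how that usually looks (the exact signatures landed in a later commit and may differ):

```python
from typing import Any, Dict, Literal, Union, overload

class Article:
    @overload
    def to_json(self, as_string: Literal[True] = ...) -> str: ...
    @overload
    def to_json(self, as_string: Literal[False]) -> Dict[str, Any]: ...

    def to_json(self, as_string: bool = True) -> Union[str, Dict[str, Any]]:
        # single runtime implementation; the overloads above only change
        # the static type that callers see for each literal argument
        ...
```

With this in place, `article.to_json(as_string=False)` is statically a `Dict[str, Any]`, so the CSV branch needs no assert.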
1 change: 1 addition & 0 deletions newspaper/extractors/articlebody_extractor.py
@@ -229,6 +229,7 @@ def is_tag_match(node, tag_dict):
        scores = []
        for tag in defines.ARTICLE_BODY_TAGS:
            if is_tag_match(node, tag):
+                assert isinstance(tag["score_boost"], int)
Owner: Not needed in my opinion, since tag comes from the constant ARTICLE_BODY_TAGS, which is statically defined and should never hold anything other than an int here.

Contributor Author: ARTICLE_BODY_TAGS was typed as a dict with str | int values, which meant score_boost could be a str from the typing perspective even though the actual data forbids it. I've rewritten ARTICLE_BODY_TAGS with an explicit TypedDict type to solve this (see the sketch after this diff).

                scores.append(tag["score_boost"])
        if scores:
            return max(scores)
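The rewrite described above replaces the loose `Dict[str, Union[str, int]]` value type with a TypedDict that gives each key its own type. A hypothetical sketch (the real field set lives in newspaper/extractors/defines.py and may differ):

```python
from typing import List
from typing_extensions import TypedDict

class ArticleBodyTag(TypedDict):
    # hypothetical fields; the actual ARTICLE_BODY_TAGS entries may carry more
    tag: str
    score_boost: int

ARTICLE_BODY_TAGS: List[ArticleBodyTag] = [
    {"tag": "article", "score_boost": 25},
]

# mypy now knows tag["score_boost"] is an int, which makes the
# assert isinstance(...) above redundant.
```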
9 changes: 4 additions & 5 deletions newspaper/extractors/categories_extractor.py
@@ -1,7 +1,7 @@
import re
import lxml
import tldextract
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any, Dict, Iterator, List, Optional, Tuple
from newspaper import urls
from newspaper.configuration import Configuration
from newspaper.extractors.defines import url_stopwords, category_url_prefixes
@@ -57,10 +57,9 @@ def parse(self, source_url: str, doc: lxml.html.Element) -> List[str]:
            )

        if len(_valid_categories) == 0:
-            other_links_in_doc = self._get_other_links(
-                doc, filter_tld=domain_tld.domain
+            other_links_in_doc = set(
+                self._get_other_links(doc, filter_tld=domain_tld.domain)
            )
-            other_links_in_doc = set(other_links_in_doc)
            for p_url in other_links_in_doc:
                ok, parsed_url = self.is_valid_link(p_url, domain_tld.domain)
                if ok:
@@ -91,7 +90,7 @@ def parse(self, source_url: str, doc: lxml.html.Element) -> List[str]:

    def _get_other_links(
        self, doc: lxml.html.Element, filter_tld: Optional[str] = None
-    ) -> List[str]:
+    ) -> Iterator[str]:
        """Return all links that are not as <a> tags. These can be
        links in javascript tags, json objects, etc.
        """
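The `Iterator[str]` return type suggests `_get_other_links` is a generator (it yields values instead of building a list), which would also explain the `set(...)` refactor above: a generator can be consumed only once, so it is materialized in a single step. A small illustration under that assumption:

```python
from typing import Iterator

def links() -> Iterator[str]:
    # a generator function is annotated Iterator[...] (or Generator[...]);
    # annotating it as List[str] is a mypy error
    yield "https://example.com/politics"
    yield "https://example.com/politics"

unique = set(links())  # materializes and deduplicates in one pass
print(unique)          # {'https://example.com/politics'}
```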
13 changes: 12 additions & 1 deletion newspaper/extractors/defines.py
@@ -1,4 +1,5 @@
from typing import Dict, List, Union
+from typing_extensions import TypedDict

MOTLEY_REPLACEMENT = ("&#65533;", "")
TITLE_REPLACEMENTS = ("&raquo;", "»")
@@ -120,7 +121,17 @@
"score_boost": 15,
},
]
META_IMAGE_TAGS: List[Dict[str, Union[str, int]]] = [


class MetaImageDict(TypedDict):
tag: str
attr: str
value: str
content: str
score: int


META_IMAGE_TAGS: List[MetaImageDict] = [
{
"tag": "meta",
"attr": "property",
2 changes: 1 addition & 1 deletion newspaper/extractors/image_extractor.py
@@ -58,7 +58,7 @@ def _get_favicon(self, doc: lxml.html.Element) -> str:
            )
            if meta:
                favicon = parsers.get_attribute(meta[0], "href")
-                return favicon
+                return favicon if favicon is not None else ""
Owner: I think this works too:

return favicon or ""

Contributor Author: It does. I tend toward more verbose options and dislike relying on truthy values versus explicit tests, but I've changed it to the form you suggested.

return ""

def _get_meta_image(self, doc: lxml.html.Element) -> str:
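The two forms discussed above agree whenever the value is `None` or a string, but `or` is broader: it replaces every falsy value, not only `None`. A quick demonstration of the general difference the author alludes to:

```python
value = 0
print(value if value is not None else "default")  # 0 (only None is replaced)
print(value or "default")  # 'default' (any falsy value is replaced)
```

For favicon, which is either `None` or a string, both spellings return the same result.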
2 changes: 1 addition & 1 deletion newspaper/extractors/metadata_extractor.py
@@ -48,7 +48,7 @@ def _get_meta_language(self, doc: lxml.html.Element) -> Optional[str]:
        determined.
        """

-        def get_if_valid(s: str) -> Optional[str]:
+        def get_if_valid(s: Optional[str]) -> Optional[str]:
            if s is None or len(s) < 2:
                return None
            s = s[:2]
10 changes: 6 additions & 4 deletions newspaper/extractors/pubdate_extractor.py
@@ -39,8 +39,8 @@ def parse_date_str(date_str):
        date_matches = []
        date_match = re.search(urls.STRICT_DATE_REGEX, article_url)
        if date_match:
-            date_str = date_match.group(0)
-            datetime_obj = parse_date_str(date_str)
+            date_match_str = date_match.group(0)
+            datetime_obj = parse_date_str(date_match_str)
            if datetime_obj:
                date_matches.append((datetime_obj, 10))  # date and matchscore

@@ -54,6 +54,8 @@ def parse_date_str(date_str):
                if not isinstance(item, dict):
                    continue
                date_str = item.get("datePublished")
+                if date_str is None:
+                    continue
                datetime_obj = parse_date_str(date_str)
                if datetime_obj:
                    date_matches.append((datetime_obj, 10))
@@ -79,8 +81,8 @@ def parse_date_str(date_str):
                    date_matches.append((datetime_obj, 5))
        candidates = []

-        for known_meta_tag in PUBLISH_DATE_META_INFO:
-            candidates.extend(parsers.get_metatags(doc, value=known_meta_tag))
+        for known_meta_info in PUBLISH_DATE_META_INFO:
+            candidates.extend(parsers.get_metatags(doc, value=known_meta_info))
        candidates = [(x, "content") for x in candidates]  # property that contains
                                                           # the date
                                                           # is always 'content'
3 changes: 2 additions & 1 deletion newspaper/extractors/title_extractor.py
@@ -1,4 +1,5 @@
import re
+from typing import Optional
import lxml

from newspaper.configuration import Configuration
@@ -121,7 +122,7 @@ def get_fb_title():

        return self.title

-    def _split_title(self, title: str, delimiter: str, hint: str = None):
+    def _split_title(self, title: str, delimiter: str, hint: Optional[str] = None):
        """Split the title to best part possible"""
        large_text_length = 0
        large_text_index = 0
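This hunk fixes an implicit-Optional annotation: `hint: str = None` was historically tolerated, but mypy's no_implicit_optional behavior (the default in recent releases) rejects a `None` default on a plain `str` parameter, so the `Optional` must be spelled out:

```python
from typing import Optional

# Rejected under mypy's default no_implicit_optional behavior:
# def _split_title(title: str, delimiter: str, hint: str = None): ...
#     error: Incompatible default for argument "hint"

def _split_title(title: str, delimiter: str, hint: Optional[str] = None) -> None:
    ...  # the explicit Optional matches the None default
```

The same fix appears in outputformatters.py below.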
8 changes: 4 additions & 4 deletions newspaper/network.py
@@ -6,7 +6,7 @@
"""
from concurrent.futures import ThreadPoolExecutor
import logging
-from typing import Callable, List, Tuple
+from typing import Callable, List, Tuple, Union
import requests
from requests import RequestException
from requests import Response
@@ -101,7 +101,7 @@ def is_binary_url(url: str) -> bool:
            allow_redirects=True,
        )

-        content = resp.content
+        content: Union[str, bytes] = resp.content
        if isinstance(content, bytes):
            try:
                content = content.decode("utf-8", errors="replace")
@@ -119,8 +119,8 @@ def is_binary_url(url: str) -> bool:
        chars = len(
            [
                char
-                for char in content
-                if 31 < ord(char) < 128 or ord(char) in [9, 10, 13]
+                for char in [ord(c) if isinstance(c, str) else c for c in content]
Owner: Why the precaution? We ensured at line 105 that content is not bytes. In my opinion it makes the code less readable like this. Am I missing something?

Contributor Author: So, that's not quite true. The except UnicodeDecodeError: pass code means it could still be bytes.

Owner: Yep, I see; the mistake is that decode with errors="replace" should never raise an error. I will fix that in the merge, and I think then it's safe to leave the ord(c) without any checking. Thanks for pointing that out 👍 (No need to change anything, I am in the merging process.)

+                if 31 < char < 128 or char in [9, 10, 13]
            ]
        )
        if chars / len(content) < 0.6:  # 40% of the content is binary
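The owner's observation checks out: `bytes.decode("utf-8", errors="replace")` substitutes U+FFFD for invalid sequences instead of raising, so the `except UnicodeDecodeError` branch is unreachable and `content` is guaranteed to be `str` after the decode:

```python
data = b"\xff\xfeabc"  # not valid UTF-8

print(data.decode("utf-8", errors="replace"))  # '\ufffd\ufffdabc' -- never raises

try:
    data.decode("utf-8")  # only strict (default) decoding raises
except UnicodeDecodeError as exc:
    print("strict decode raised:", exc.reason)  # e.g. 'invalid start byte'
```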
4 changes: 2 additions & 2 deletions newspaper/nlp.py
@@ -162,7 +162,7 @@ def split_sentences(text: str) -> List[str]:
        List[str]: a list of sentences
    """
    try:
-        tokenizer = split_sentences._tokenizer
+        tokenizer = split_sentences._tokenizer  # type: ignore[attr-defined]
    except AttributeError:
        import nltk

@@ -176,7 +176,7 @@ def split_sentences(text: str) -> List[str]:

        # TODO: load a language specific tokenizer
        tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")
-        split_sentences._tokenizer = tokenizer
+        split_sentences._tokenizer = tokenizer  # type: ignore[attr-defined]

    sentences = tokenizer.tokenize(text)
    sentences = [re.sub("[\n ]+", " ", x) for x in sentences if len(x) > 10]
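The `# type: ignore[attr-defined]` comments are needed because mypy does not track ad-hoc attributes stashed on function objects. A typed alternative (a sketch, not what this PR does) is to move the cache into `functools.lru_cache`:

```python
import functools

@functools.lru_cache(maxsize=1)
def _get_tokenizer():
    # loads the punkt tokenizer once; later calls return the cached object
    import nltk

    return nltk.data.load("tokenizers/punkt/english.pickle")

def split_sentences(text: str):
    return _get_tokenizer().tokenize(text)
```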
6 changes: 3 additions & 3 deletions newspaper/outputformatters.py
@@ -10,7 +10,7 @@
import logging
import re
from statistics import mean, stdev
-from typing import Any, Dict, Tuple
+from typing import Any, Dict, Optional, Tuple

import lxml
from newspaper import parsers
@@ -36,7 +36,7 @@ def __init__(self, config=None):
        self.config = config or Configuration()

    def get_formatted(
-        self, top_node: lxml.html.HtmlElement, article_title: str = None
+        self, top_node: lxml.html.HtmlElement, article_title: Optional[str] = None
    ) -> Tuple[str, str]:
        """Returns the body text of an article, and also the cleaned html body
        article of the article.
@@ -79,7 +79,7 @@ def get_formatted(
        return (text, html)

    def _convert_to_text(
-        self, top_node: lxml.html.HtmlElement, article_title: str = None
+        self, top_node: lxml.html.HtmlElement, article_title: Optional[str] = None
    ) -> str:
        article_cleaner = lxml.html.clean.Cleaner()
        article_cleaner.javascript = True
2 changes: 1 addition & 1 deletion newspaper/parsers.py
@@ -17,7 +17,7 @@
import lxml.html
import lxml.html.clean

-from bs4 import UnicodeDammit
+from bs4.dammit import UnicodeDammit

from . import text as txt

2 changes: 1 addition & 1 deletion newspaper/source.py
@@ -90,7 +90,7 @@ class Source:
    def __init__(
        self,
        url: str,
-        read_more_link: Optional[str] = None,
+        read_more_link: str = "",
        config: Optional[Configuration] = None,
        **kwargs
    ):