Basic mypy support #613

Merged
merged 6 commits on Feb 26, 2024
10 changes: 6 additions & 4 deletions .github/workflows/pipeline.yml
@@ -29,14 +29,16 @@ jobs: # jobs. We will have two jobs (test and publish) with multiple steps.
python -m pip install --upgrade pip
poetry config virtualenvs.create false --local
poetry install
pip install pytest pylint coverage mypy coveralls
pip install pytest pylint coverage coveralls
python -m nltk.downloader punkt stopwords
- name: Pylint # Run pylint static analysis
run: |
poetry run pylint newspaper --fail-under=8.0
# - name: mypy # Run mypy static analysis
# run: |
# poetry run mypy -p newspaper
- name: mypy # Run mypy static analysis
run: |
poetry run mypy -p newspaper
env:
MYPYPATH: stubs
- name: Pytest # Run pytest
run: |
poetry run coverage run -m --source=newspaper pytest tests
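For context on the new mypy step: `MYPYPATH: stubs` points mypy at a local directory of type stubs for dependencies that ship without annotations. A minimal sketch of what a stub package in such a directory can look like; the module name and signatures below are placeholders, not the repository's actual stubs:

```python
# stubs/somedep/__init__.pyi  -- "somedep" is a hypothetical untyped dependency
from typing import Optional

class FetchResult:
    status: int
    text: str

def fetch(url: str, timeout: Optional[float] = ...) -> FetchResult: ...
```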
4 changes: 2 additions & 2 deletions CONTRIBUTING.md
@@ -126,8 +126,8 @@ source venv/bin/activate
# Upgrade pip (very important!)
pip install --upgrade pip

# Install Newspaper3k in editable mode
pip install -e '.[dev]'
# Install Newspaper4k in editable mode
pip install -e '.'
```

Last, install the pre-commit hooks with:
2 changes: 1 addition & 1 deletion newspaper/__init__.py
@@ -34,7 +34,7 @@
logging.getLogger(__name__).addHandler(NullHandler())


def article(url: str, language: Optional[str] = "en", **kwargs) -> Article:
def article(url: str, language: str = "en", **kwargs) -> Article:
"""Shortcut function to fetch and parse a newspaper article from a URL.

Args:
37 changes: 23 additions & 14 deletions newspaper/article.py
@@ -10,7 +10,7 @@
import json
import logging
import copy
from typing import Any, Dict, List, Optional, Set, Union
from typing import Any, Dict, List, Literal, Optional, Set, Union, overload
from urllib.parse import urlparse
import lxml

@@ -140,11 +140,11 @@ class Article:
def __init__(
self,
url: str,
title: Optional[str] = "",
source_url: Optional[str] = "",
read_more_link: Optional[str] = "",
title: str = "",
source_url: str = "",
read_more_link: str = "",
config: Optional[Configuration] = None,
**kwargs: Dict[str, Any],
**kwargs: Any,
):
"""Constructs the article class. Will not download or parse the article

@@ -166,8 +166,8 @@ def __init__(
use the default settings. Defaults to None.

Keyword Args:
**kwargs: Any Configuration class propriety can be overwritten
through init keyword params.
**kwargs: Any Configuration class property can be overwritten
through init keyword params.
Additionally, you can specify any of the following
requests parameters:
headers, cookies, auth, timeout, allow_redirects,
@@ -448,7 +448,8 @@ def download(
break

self.html = html
self.title = title
if title is not None:
self.title = title

return self

@@ -730,6 +731,14 @@ def throw_if_not_parsed_verbose(self):
if not self.is_parsed:
raise ArticleException("You must `parse()` an article first!")

@overload
def to_json(self, as_string: Literal[True]) -> str:
pass

@overload
def to_json(self, as_string: Literal[False]) -> Dict:
pass

def to_json(self, as_string: Optional[bool] = True) -> Union[str, Dict]:
"""Create a json string from the article data. It will include the most
important attributes such as title, text, authors, publish_date, etc.
@@ -745,14 +754,14 @@ def to_json(self, as_string: Optional[bool] = True) -> Union[str, Dict]:

self.throw_if_not_parsed_verbose()

article_dict = {}
article_dict: Dict[str, Any] = {}

for metadata in settings.article_json_fields:
article_dict[metadata] = getattr(
self, metadata, getattr(self.config, metadata, None)
)
if isinstance(article_dict[metadata], datetime):
article_dict[metadata] = article_dict[metadata].isoformat()
value = getattr(self, metadata, getattr(self.config, metadata, None))
if isinstance(value, datetime):
article_dict[metadata] = value.isoformat()
else:
article_dict[metadata] = value
if as_string:
return json.dumps(article_dict, indent=4, ensure_ascii=False)
else:
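The `@overload` declarations added to `to_json` above let mypy infer the concrete return type from the `as_string` argument. A minimal, self-contained sketch of the same pattern (toy class, not the project's `Article`):

```python
import json
from typing import Dict, Literal, Union, overload

class Report:
    """Toy stand-in used only to illustrate the overload pattern."""

    @overload
    def to_json(self, as_string: Literal[True]) -> str: ...
    @overload
    def to_json(self, as_string: Literal[False]) -> Dict: ...

    def to_json(self, as_string: bool = True) -> Union[str, Dict]:
        data: Dict[str, str] = {"title": "example"}
        return json.dumps(data) if as_string else data

as_text = Report().to_json(True)    # mypy infers str
as_dict = Report().to_json(False)   # mypy infers Dict
```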
9 changes: 4 additions & 5 deletions newspaper/extractors/categories_extractor.py
@@ -1,7 +1,7 @@
import re
import lxml
import tldextract
from typing import Any, Dict, List, Optional, Tuple
from typing import Any, Dict, Iterator, List, Optional, Tuple
from newspaper import urls
from newspaper.configuration import Configuration
from newspaper.extractors.defines import url_stopwords, category_url_prefixes
@@ -57,10 +57,9 @@ def parse(self, source_url: str, doc: lxml.html.Element) -> List[str]:
)

if len(_valid_categories) == 0:
other_links_in_doc = self._get_other_links(
doc, filter_tld=domain_tld.domain
other_links_in_doc = set(
self._get_other_links(doc, filter_tld=domain_tld.domain)
)
other_links_in_doc = set(other_links_in_doc)
for p_url in other_links_in_doc:
ok, parsed_url = self.is_valid_link(p_url, domain_tld.domain)
if ok:
@@ -91,7 +90,7 @@ def parse(self, source_url: str, doc: lxml.html.Element) -> List[str]:

def _get_other_links(
self, doc: lxml.html.Element, filter_tld: Optional[str] = None
) -> List[str]:
) -> Iterator[str]:
"""Return all links that are not as <a> tags. These can be
links in javascript tags, json objects, etc.
"""
30 changes: 27 additions & 3 deletions newspaper/extractors/defines.py
@@ -1,4 +1,5 @@
from typing import Dict, List, Union
from typing import Dict, List
from typing_extensions import TypedDict, NotRequired

MOTLEY_REPLACEMENT = ("&#65533;", "")
TITLE_REPLACEMENTS = ("&raquo;", "»")
@@ -105,7 +106,20 @@
{"attribute": "pubdate", "value": "pubdate", "content": "datetime"},
{"attribute": "class", "value": "entry-date", "content": "datetime"},
]
ARTICLE_BODY_TAGS: List[Dict[str, Union[str, int]]] = [

ArticleBodyTag = TypedDict(
"ArticleBodyTag",
{
"score_boost": int,
"role": NotRequired[str],
"tag": NotRequired[str],
"class": NotRequired[str],
"itemprop": NotRequired[str],
"itemtype": NotRequired[str],
},
)

ARTICLE_BODY_TAGS: List[ArticleBodyTag] = [
{"tag": "article", "role": "article", "score_boost": 25},
{"itemprop": "articleBody", "score_boost": 100},
{"itemprop": "articleText", "score_boost": 40},
@@ -120,7 +134,17 @@
"score_boost": 15,
},
]
META_IMAGE_TAGS: List[Dict[str, Union[str, int]]] = [


class MetaImageDict(TypedDict):
tag: str
attr: str
value: str
content: str
score: int


META_IMAGE_TAGS: List[MetaImageDict] = [
{
"tag": "meta",
"attr": "property",
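The `TypedDict` definitions above replace the loose `Dict[str, Union[str, int]]` annotation with per-key types. A small sketch of how the `NotRequired` keys behave under mypy (illustrative values only):

```python
from typing import List
from typing_extensions import NotRequired, TypedDict

# Functional syntax is needed because "class" is a Python keyword.
Tag = TypedDict(
    "Tag",
    {
        "score_boost": int,          # required key
        "tag": NotRequired[str],     # may be omitted
        "class": NotRequired[str],   # may be omitted
    },
)

tags: List[Tag] = [
    {"tag": "article", "score_boost": 25},  # OK
    {"score_boost": 100},                   # OK: optional keys left out
    # {"tag": "div"}                        # mypy error: "score_boost" missing
]
```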
2 changes: 1 addition & 1 deletion newspaper/extractors/image_extractor.py
@@ -58,7 +58,7 @@ def _get_favicon(self, doc: lxml.html.Element) -> str:
)
if meta:
favicon = parsers.get_attribute(meta[0], "href")
return favicon
return favicon or ""
return ""

def _get_meta_image(self, doc: lxml.html.Element) -> str:
2 changes: 1 addition & 1 deletion newspaper/extractors/metadata_extractor.py
@@ -48,7 +48,7 @@ def _get_meta_language(self, doc: lxml.html.Element) -> Optional[str]:
determined.
"""

def get_if_valid(s: str) -> Optional[str]:
def get_if_valid(s: Optional[str]) -> Optional[str]:
if s is None or len(s) < 2:
return None
s = s[:2]
10 changes: 6 additions & 4 deletions newspaper/extractors/pubdate_extractor.py
@@ -39,8 +39,8 @@ def parse_date_str(date_str):
date_matches = []
date_match = re.search(urls.STRICT_DATE_REGEX, article_url)
if date_match:
date_str = date_match.group(0)
datetime_obj = parse_date_str(date_str)
date_match_str = date_match.group(0)
datetime_obj = parse_date_str(date_match_str)
if datetime_obj:
date_matches.append((datetime_obj, 10)) # date and matchscore

@@ -54,6 +54,8 @@ def parse_date_str(date_str):
if not isinstance(item, dict):
continue
date_str = item.get("datePublished")
if date_str is None:
continue
datetime_obj = parse_date_str(date_str)
if datetime_obj:
date_matches.append((datetime_obj, 10))
@@ -79,8 +81,8 @@ def parse_date_str(date_str):
date_matches.append((datetime_obj, 5))
candidates = []

for known_meta_tag in PUBLISH_DATE_META_INFO:
candidates.extend(parsers.get_metatags(doc, value=known_meta_tag))
for known_meta_info in PUBLISH_DATE_META_INFO:
candidates.extend(parsers.get_metatags(doc, value=known_meta_info))
candidates = [(x, "content") for x in candidates] # property that contains
# the date
# is always 'content'
3 changes: 2 additions & 1 deletion newspaper/extractors/title_extractor.py
@@ -1,4 +1,5 @@
import re
from typing import Optional
import lxml

from newspaper.configuration import Configuration
@@ -121,7 +122,7 @@ def get_fb_title():

return self.title

def _split_title(self, title: str, delimiter: str, hint: str = None):
def _split_title(self, title: str, delimiter: str, hint: Optional[str] = None):
"""Split the title to best part possible"""
large_text_length = 0
large_text_index = 0
8 changes: 4 additions & 4 deletions newspaper/network.py
@@ -6,7 +6,7 @@
"""
from concurrent.futures import ThreadPoolExecutor
import logging
from typing import Callable, List, Tuple
from typing import Callable, List, Tuple, Union
import requests
from requests import RequestException
from requests import Response
@@ -101,7 +101,7 @@ def is_binary_url(url: str) -> bool:
allow_redirects=True,
)

content = resp.content
content: Union[str, bytes] = resp.content
if isinstance(content, bytes):
try:
content = content.decode("utf-8", errors="replace")
@@ -119,8 +119,8 @@ def is_binary_url(url: str) -> bool:
chars = len(
[
char
for char in content
if 31 < ord(char) < 128 or ord(char) in [9, 10, 13]
for char in [ord(c) if isinstance(c, str) else c for c in content]
Owner:
why the precaution? we ensured at line 105 that content is not bytes.
In my opinion it makes the code less readable like this. Am I missing something?

Contributor Author:
So, that's not quite true. The `except UnicodeDecodeError: pass` code means it could still be bytes.

Owner:
yep, I see, the mistake is that decode with errors=replace should never raise an error.
I will fix that in the merge. I think then it's safe to leave the ord(c) without any checking.
Thanks for pointing that out 👍
(no need to change anything, I am in the merging process)

if 31 < char < 128 or char in [9, 10, 13]
]
)
if chars / len(content) < 0.6: # 40% of the content is binary
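As a side note to the review thread above: `bytes.decode` with `errors="replace"` substitutes undecodable bytes with U+FFFD instead of raising, so once the decode has run, `content` is guaranteed to be a `str`. A standalone illustration, not project code:

```python
raw = b"\x89PNG\r\n\x1a\n binary-ish payload \xff\xfe"

# errors="replace" never raises UnicodeDecodeError; invalid bytes become U+FFFD.
text = raw.decode("utf-8", errors="replace")

assert isinstance(text, str)
print(text.count("\ufffd"))  # number of replacement characters inserted
```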
4 changes: 2 additions & 2 deletions newspaper/nlp.py
@@ -162,7 +162,7 @@ def split_sentences(text: str) -> List[str]:
List[str]: a list of sentences
"""
try:
tokenizer = split_sentences._tokenizer
tokenizer = split_sentences._tokenizer # type: ignore[attr-defined]
except AttributeError:
import nltk

@@ -176,7 +176,7 @@

# TODO: load a language specific tokenizer
tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")
split_sentences._tokenizer = tokenizer
split_sentences._tokenizer = tokenizer # type: ignore[attr-defined]

sentences = tokenizer.tokenize(text)
sentences = [re.sub("[\n ]+", " ", x) for x in sentences if len(x) > 10]
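The two `# type: ignore[attr-defined]` comments above exist because the NLTK tokenizer is cached as an attribute on the `split_sentences` function object, and mypy does not track ad-hoc attributes on functions. A simplified sketch of that caching pattern, with the tokenizer replaced by a trivial placeholder:

```python
from typing import List

def split_sentences(text: str) -> List[str]:
    # Build the splitter once and cache it on the function object itself.
    try:
        splitter = split_sentences._splitter  # type: ignore[attr-defined]
    except AttributeError:
        def splitter(t: str) -> List[str]:
            return [s.strip() for s in t.split(".") if s.strip()]
        split_sentences._splitter = splitter  # type: ignore[attr-defined]
    return splitter(text)

print(split_sentences("First sentence. Second sentence."))
# ['First sentence', 'Second sentence']
```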
6 changes: 3 additions & 3 deletions newspaper/outputformatters.py
@@ -10,7 +10,7 @@
import logging
import re
from statistics import mean, stdev
from typing import Any, Dict, Tuple
from typing import Any, Dict, Optional, Tuple

import lxml
from newspaper import parsers
@@ -36,7 +36,7 @@ def __init__(self, config=None):
self.config = config or Configuration()

def get_formatted(
self, top_node: lxml.html.HtmlElement, article_title: str = None
self, top_node: lxml.html.HtmlElement, article_title: Optional[str] = None
) -> Tuple[str, str]:
"""Returns the body text of an article, and also the cleaned html body
article of the article.
@@ -79,7 +79,7 @@ def get_formatted(
return (text, html)

def _convert_to_text(
self, top_node: lxml.html.HtmlElement, article_title: str = None
self, top_node: lxml.html.HtmlElement, article_title: Optional[str] = None
) -> str:
article_cleaner = lxml.html.clean.Cleaner()
article_cleaner.javascript = True
2 changes: 1 addition & 1 deletion newspaper/parsers.py
@@ -17,7 +17,7 @@
import lxml.html
import lxml.html.clean

from bs4 import UnicodeDammit
from bs4.dammit import UnicodeDammit

from . import text as txt

2 changes: 1 addition & 1 deletion newspaper/source.py
@@ -90,7 +90,7 @@ class Source:
def __init__(
self,
url: str,
read_more_link: Optional[str] = None,
read_more_link: str = "",
config: Optional[Configuration] = None,
**kwargs
):