Basic mypy support #613

Merged
merged 6 commits on Feb 26, 2024
10 changes: 6 additions & 4 deletions .github/workflows/pipeline.yml
@@ -29,14 +29,16 @@ jobs: # jobs. We will have two jobs (test and publish) with multiple steps.
python -m pip install --upgrade pip
poetry config virtualenvs.create false --local
poetry install
pip install pytest pylint coverage mypy coveralls
pip install pytest pylint coverage coveralls
python -m nltk.downloader punkt stopwords
- name: Pylint # Run pylint static analysis
run: |
poetry run pylint newspaper --fail-under=8.0
# - name: mypy # Run mypy static analysis
# run: |
# poetry run mypy -p newspaper
- name: mypy # Run mypy static analysis
run: |
poetry run mypy -p newspaper
env:
MYPYPATH: stubs
- name: Pytest # Run pytest
run: |
poetry run coverage run -m --source=newspaper pytest tests
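For context on the new mypy step: `MYPYPATH: stubs` points mypy at a local directory of type stubs for dependencies that ship without annotations. A minimal sketch of what a stub package in such a directory can look like; the module name and signatures below are placeholders, not the repository's actual stubs:

```python
# stubs/somedep/__init__.pyi  -- "somedep" is a hypothetical untyped dependency
from typing import Optional

class FetchResult:
    status: int
    text: str

def fetch(url: str, timeout: Optional[float] = ...) -> FetchResult: ...
```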
4 changes: 2 additions & 2 deletions CONTRIBUTING.md
@@ -126,8 +126,8 @@ source venv/bin/activate
# Upgrade pip (very important!)
pip install --upgrade pip

# Install Newspaper3k in editable mode
pip install -e '.[dev]'
# Install Newspaper4k in editable mode
pip install -e '.'
```

Last, install the pre-commit hooks with:
2 changes: 1 addition & 1 deletion newspaper/__init__.py
@@ -34,7 +34,7 @@
logging.getLogger(__name__).addHandler(NullHandler())


def article(url: str, language: Optional[str] = "en", **kwargs) -> Article:
def article(url: str, language: str = "en", **kwargs) -> Article:
"""Shortcut function to fetch and parse a newspaper article from a URL.

Args:
37 changes: 23 additions & 14 deletions newspaper/article.py
@@ -10,7 +10,7 @@
import json
import logging
import copy
from typing import Any, Dict, List, Optional, Set, Union
from typing import Any, Dict, List, Literal, Optional, Set, Union, overload
from urllib.parse import urlparse
import lxml

@@ -140,11 +140,11 @@ class Article:
def __init__(
self,
url: str,
title: Optional[str] = "",
source_url: Optional[str] = "",
read_more_link: Optional[str] = "",
title: str = "",
source_url: str = "",
read_more_link: str = "",
config: Optional[Configuration] = None,
**kwargs: Dict[str, Any],
**kwargs: Any,
):
"""Constructs the article class. Will not download or parse the article

@@ -166,8 +166,8 @@ def __init__(
use the default settings. Defaults to None.

Keyword Args:
**kwargs: Any Configuration class propriety can be overwritten
through init keyword params.
**kwargs: Any Configuration class property can be overwritten
through init keyword params.
Additionally, you can specify any of the following
requests parameters:
headers, cookies, auth, timeout, allow_redirects,
@@ -448,7 +448,8 @@ def download(
break

self.html = html
self.title = title
if title is not None:
self.title = title

return self

@@ -730,6 +731,14 @@ def throw_if_not_parsed_verbose(self):
if not self.is_parsed:
raise ArticleException("You must `parse()` an article first!")

@overload
def to_json(self, as_string: Literal[True]) -> str:
pass

@overload
def to_json(self, as_string: Literal[False]) -> Dict:
pass

def to_json(self, as_string: Optional[bool] = True) -> Union[str, Dict]:
"""Create a json string from the article data. It will include the most
important attributes such as title, text, authors, publish_date, etc.
@@ -745,14 +754,14 @@ def to_json(self, as_string: Optional[bool] = True) -> Union[str, Dict]:

self.throw_if_not_parsed_verbose()

article_dict = {}
article_dict: Dict[str, Any] = {}

for metadata in settings.article_json_fields:
article_dict[metadata] = getattr(
self, metadata, getattr(self.config, metadata, None)
)
if isinstance(article_dict[metadata], datetime):
article_dict[metadata] = article_dict[metadata].isoformat()
value = getattr(self, metadata, getattr(self.config, metadata, None))
if isinstance(value, datetime):
article_dict[metadata] = value.isoformat()
else:
article_dict[metadata] = value
if as_string:
return json.dumps(article_dict, indent=4, ensure_ascii=False)
else:
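The `@overload` declarations added to `to_json` above let mypy infer the concrete return type from the `as_string` argument. A minimal, self-contained sketch of the same pattern (toy class, not the project's `Article`):

```python
import json
from typing import Dict, Literal, Union, overload

class Report:
    """Toy stand-in used only to illustrate the overload pattern."""

    @overload
    def to_json(self, as_string: Literal[True]) -> str: ...
    @overload
    def to_json(self, as_string: Literal[False]) -> Dict: ...

    def to_json(self, as_string: bool = True) -> Union[str, Dict]:
        data: Dict[str, str] = {"title": "example"}
        return json.dumps(data) if as_string else data

as_text = Report().to_json(True)    # mypy infers str
as_dict = Report().to_json(False)   # mypy infers Dict
```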
9 changes: 4 additions & 5 deletions newspaper/extractors/categories_extractor.py
@@ -1,7 +1,7 @@
import re
import lxml
import tldextract
from typing import Any, Dict, List, Optional, Tuple
from typing import Any, Dict, Iterator, List, Optional, Tuple
from newspaper import urls
from newspaper.configuration import Configuration
from newspaper.extractors.defines import url_stopwords, category_url_prefixes
@@ -57,10 +57,9 @@ def parse(self, source_url: str, doc: lxml.html.Element) -> List[str]:
)

if len(_valid_categories) == 0:
other_links_in_doc = self._get_other_links(
doc, filter_tld=domain_tld.domain
other_links_in_doc = set(
self._get_other_links(doc, filter_tld=domain_tld.domain)
)
other_links_in_doc = set(other_links_in_doc)
for p_url in other_links_in_doc:
ok, parsed_url = self.is_valid_link(p_url, domain_tld.domain)
if ok:
@@ -91,7 +90,7 @@ def parse(self, source_url: str, doc: lxml.html.Element) -> List[str]:

def _get_other_links(
self, doc: lxml.html.Element, filter_tld: Optional[str] = None
) -> List[str]:
) -> Iterator[str]:
"""Return all links that are not as <a> tags. These can be
links in javascript tags, json objects, etc.
"""
30 changes: 27 additions & 3 deletions newspaper/extractors/defines.py
@@ -1,4 +1,5 @@
from typing import Dict, List, Union
from typing import Dict, List
from typing_extensions import TypedDict, NotRequired

MOTLEY_REPLACEMENT = ("&#65533;", "")
TITLE_REPLACEMENTS = ("&raquo;", "»")
@@ -105,7 +106,20 @@
{"attribute": "pubdate", "value": "pubdate", "content": "datetime"},
{"attribute": "class", "value": "entry-date", "content": "datetime"},
]
ARTICLE_BODY_TAGS: List[Dict[str, Union[str, int]]] = [

ArticleBodyTag = TypedDict(
"ArticleBodyTag",
{
"score_boost": int,
"role": NotRequired[str],
"tag": NotRequired[str],
"class": NotRequired[str],
"itemprop": NotRequired[str],
"itemtype": NotRequired[str],
},
)

ARTICLE_BODY_TAGS: List[ArticleBodyTag] = [
{"tag": "article", "role": "article", "score_boost": 25},
{"itemprop": "articleBody", "score_boost": 100},
{"itemprop": "articleText", "score_boost": 40},
@@ -120,7 +134,17 @@
"score_boost": 15,
},
]
META_IMAGE_TAGS: List[Dict[str, Union[str, int]]] = [


class MetaImageDict(TypedDict):
tag: str
attr: str
value: str
content: str
score: int


META_IMAGE_TAGS: List[MetaImageDict] = [
{
"tag": "meta",
"attr": "property",
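The `TypedDict` definitions above replace the loose `Dict[str, Union[str, int]]` annotation with per-key types. A small sketch of how the `NotRequired` keys behave under mypy (illustrative values only):

```python
from typing import List
from typing_extensions import NotRequired, TypedDict

# Functional syntax is needed because "class" is a Python keyword.
Tag = TypedDict(
    "Tag",
    {
        "score_boost": int,          # required key
        "tag": NotRequired[str],     # may be omitted
        "class": NotRequired[str],   # may be omitted
    },
)

tags: List[Tag] = [
    {"tag": "article", "score_boost": 25},  # OK
    {"score_boost": 100},                   # OK: optional keys left out
    # {"tag": "div"}                        # mypy error: "score_boost" missing
]
```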
2 changes: 1 addition & 1 deletion newspaper/extractors/image_extractor.py
@@ -58,7 +58,7 @@ def _get_favicon(self, doc: lxml.html.Element) -> str:
)
if meta:
favicon = parsers.get_attribute(meta[0], "href")
return favicon
return favicon or ""
return ""

def _get_meta_image(self, doc: lxml.html.Element) -> str:
2 changes: 1 addition & 1 deletion newspaper/extractors/metadata_extractor.py
@@ -48,7 +48,7 @@ def _get_meta_language(self, doc: lxml.html.Element) -> Optional[str]:
determined.
"""

def get_if_valid(s: str) -> Optional[str]:
def get_if_valid(s: Optional[str]) -> Optional[str]:
if s is None or len(s) < 2:
return None
s = s[:2]
10 changes: 6 additions & 4 deletions newspaper/extractors/pubdate_extractor.py
@@ -39,8 +39,8 @@ def parse_date_str(date_str):
date_matches = []
date_match = re.search(urls.STRICT_DATE_REGEX, article_url)
if date_match:
date_str = date_match.group(0)
datetime_obj = parse_date_str(date_str)
date_match_str = date_match.group(0)
datetime_obj = parse_date_str(date_match_str)
if datetime_obj:
date_matches.append((datetime_obj, 10)) # date and matchscore

@@ -54,6 +54,8 @@ def parse_date_str(date_str):
if not isinstance(item, dict):
continue
date_str = item.get("datePublished")
if date_str is None:
continue
datetime_obj = parse_date_str(date_str)
if datetime_obj:
date_matches.append((datetime_obj, 10))
@@ -79,8 +81,8 @@ def parse_date_str(date_str):
date_matches.append((datetime_obj, 5))
candidates = []

for known_meta_tag in PUBLISH_DATE_META_INFO:
candidates.extend(parsers.get_metatags(doc, value=known_meta_tag))
for known_meta_info in PUBLISH_DATE_META_INFO:
candidates.extend(parsers.get_metatags(doc, value=known_meta_info))
candidates = [(x, "content") for x in candidates] # property that contains
# the date
# is always 'content'
3 changes: 2 additions & 1 deletion newspaper/extractors/title_extractor.py
@@ -1,4 +1,5 @@
import re
from typing import Optional
import lxml

from newspaper.configuration import Configuration
@@ -121,7 +122,7 @@ def get_fb_title():

return self.title

def _split_title(self, title: str, delimiter: str, hint: str = None):
def _split_title(self, title: str, delimiter: str, hint: Optional[str] = None):
"""Split the title to best part possible"""
large_text_length = 0
large_text_index = 0
8 changes: 4 additions & 4 deletions newspaper/network.py
@@ -6,7 +6,7 @@
"""
from concurrent.futures import ThreadPoolExecutor
import logging
from typing import Callable, List, Tuple
from typing import Callable, List, Tuple, Union
import requests
from requests import RequestException
from requests import Response
@@ -101,7 +101,7 @@ def is_binary_url(url: str) -> bool:
allow_redirects=True,
)

content = resp.content
content: Union[str, bytes] = resp.content
if isinstance(content, bytes):
try:
content = content.decode("utf-8", errors="replace")
@@ -119,8 +119,8 @@ def is_binary_url(url: str) -> bool:
chars = len(
[
char
for char in content
if 31 < ord(char) < 128 or ord(char) in [9, 10, 13]
for char in [ord(c) if isinstance(c, str) else c for c in content]
Owner:
why the precaution? we ensured at line 105 that content is not bytes.
In my opinion it makes the code less readable like this. Am I missing something?

Contributor Author:
So, that's not quite true. The `except UnicodeDecodeError: pass` code means it could still be bytes.

Owner:
yep, I see, the mistake is that decode with errors=replace should never raise an error.
I will fix that in the merge. I think then it's safe to leave the ord(c) without any checking.
Thanks for pointing that out 👍
(no need to change anything, I am in the merging process)

if 31 < char < 128 or char in [9, 10, 13]
]
)
if chars / len(content) < 0.6: # 40% of the content is binary
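As a side note to the review thread above: `bytes.decode` with `errors="replace"` substitutes undecodable bytes with U+FFFD instead of raising, so once the decode has run, `content` is guaranteed to be a `str`. A standalone illustration, not project code:

```python
raw = b"\x89PNG\r\n\x1a\n binary-ish payload \xff\xfe"

# errors="replace" never raises UnicodeDecodeError; invalid bytes become U+FFFD.
text = raw.decode("utf-8", errors="replace")

assert isinstance(text, str)
print(text.count("\ufffd"))  # number of replacement characters inserted
```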
4 changes: 2 additions & 2 deletions newspaper/nlp.py
@@ -162,7 +162,7 @@ def split_sentences(text: str) -> List[str]:
List[str]: a list of sentences
"""
try:
tokenizer = split_sentences._tokenizer
tokenizer = split_sentences._tokenizer # type: ignore[attr-defined]
except AttributeError:
import nltk

@@ -176,7 +176,7 @@

# TODO: load a language specific tokenizer
tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")
split_sentences._tokenizer = tokenizer
split_sentences._tokenizer = tokenizer # type: ignore[attr-defined]

sentences = tokenizer.tokenize(text)
sentences = [re.sub("[\n ]+", " ", x) for x in sentences if len(x) > 10]
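The two `# type: ignore[attr-defined]` comments above exist because the NLTK tokenizer is cached as an attribute on the `split_sentences` function object, and mypy does not track ad-hoc attributes on functions. A simplified sketch of that caching pattern, with the tokenizer replaced by a trivial placeholder:

```python
from typing import List

def split_sentences(text: str) -> List[str]:
    # Build the splitter once and cache it on the function object itself.
    try:
        splitter = split_sentences._splitter  # type: ignore[attr-defined]
    except AttributeError:
        def splitter(t: str) -> List[str]:
            return [s.strip() for s in t.split(".") if s.strip()]
        split_sentences._splitter = splitter  # type: ignore[attr-defined]
    return splitter(text)

print(split_sentences("First sentence. Second sentence."))
# ['First sentence', 'Second sentence']
```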
6 changes: 3 additions & 3 deletions newspaper/outputformatters.py
@@ -10,7 +10,7 @@
import logging
import re
from statistics import mean, stdev
from typing import Any, Dict, Tuple
from typing import Any, Dict, Optional, Tuple

import lxml
from newspaper import parsers
@@ -36,7 +36,7 @@ def __init__(self, config=None):
self.config = config or Configuration()

def get_formatted(
self, top_node: lxml.html.HtmlElement, article_title: str = None
self, top_node: lxml.html.HtmlElement, article_title: Optional[str] = None
) -> Tuple[str, str]:
"""Returns the body text of an article, and also the cleaned html body
article of the article.
@@ -79,7 +79,7 @@ def get_formatted(
return (text, html)

def _convert_to_text(
self, top_node: lxml.html.HtmlElement, article_title: str = None
self, top_node: lxml.html.HtmlElement, article_title: Optional[str] = None
) -> str:
article_cleaner = lxml.html.clean.Cleaner()
article_cleaner.javascript = True
2 changes: 1 addition & 1 deletion newspaper/parsers.py
@@ -17,7 +17,7 @@
import lxml.html
import lxml.html.clean

from bs4 import UnicodeDammit
from bs4.dammit import UnicodeDammit

from . import text as txt

2 changes: 1 addition & 1 deletion newspaper/source.py
@@ -90,7 +90,7 @@ class Source:
def __init__(
self,
url: str,
read_more_link: Optional[str] = None,
read_more_link: str = "",
config: Optional[Configuration] = None,
**kwargs
):