Fix 3.12 build (#619)
* Upgrade numpy for Python >= 3.9

* Upgrade pandas for Python >= 3.9

* Upgrade regex

* Upgrade virtualenv to avoid breaking pre-commit

* Enable distutils for everyone

* Fix the mypy issues
palfrey committed Mar 9, 2024
1 parent 1b69cf3 commit 9c9e886
Showing 13 changed files with 278 additions and 139 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/pipeline.yml
@@ -30,10 +30,10 @@ jobs: # jobs. We will have two jobs (test and publish) with multiple steps.
python -m pip install --upgrade pip
poetry config virtualenvs.create false --local
poetry install --all-extras
pip install pytest pylint coverage mypy coveralls
pip install pylint coveralls
# python -m nltk.downloader punkt stopwords
env:
SETUPTOOLS_USE_DISTUTILS: stdlib
SETUPTOOLS_USE_DISTUTILS: local
- name: Pylint # Run pylint static analysis
run: |
poetry run pylint newspaper --fail-under=8.0
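Note on the env change: Python 3.12 removes the stdlib distutils module (PEP 632), so SETUPTOOLS_USE_DISTUTILS: stdlib cannot work there; the local setting tells setuptools (>= 60) to supply its vendored copy on every interpreter. A minimal sketch, assuming setuptools >= 60 is present in the environment, to confirm which distutils an interpreter actually picks up:

# Minimal check (assumes setuptools >= 60 is installed): with
# SETUPTOOLS_USE_DISTUTILS=local, "import distutils" resolves to setuptools'
# vendored copy rather than the stdlib module removed in Python 3.12.
import os

os.environ.setdefault("SETUPTOOLS_USE_DISTUTILS", "local")

import distutils  # noqa: E402  -- imported after the env var is set on purpose

print(distutils.__file__)  # expected: .../setuptools/_distutils/__init__.py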
3 changes: 2 additions & 1 deletion newspaper/extractors/articlebody_extractor.py
@@ -4,6 +4,7 @@
from statistics import mean
from typing import Optional
import lxml
from newspaper.configuration import Configuration
import newspaper.extractors.defines as defines
import newspaper.parsers as parsers
from newspaper.text import StopWords
@@ -25,7 +26,7 @@


class ArticleBodyExtractor:
def __init__(self, config):
def __init__(self, config: Configuration):
self.config = config
self.top_node = None
self.top_node_complemented = None
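For context, a hypothetical call site showing what the new annotation gives mypy to check (assuming Configuration can be built with its defaults; this sketch is not part of the diff):

# Hypothetical usage sketch, not part of the diff.
from newspaper.configuration import Configuration
from newspaper.extractors.articlebody_extractor import ArticleBodyExtractor

config = Configuration()                  # assumed default constructor
extractor = ArticleBodyExtractor(config)  # type-checks cleanly
# ArticleBodyExtractor("not a config")    # would now be flagged by mypy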
21 changes: 10 additions & 11 deletions newspaper/nlp.py
@@ -31,32 +31,31 @@ def keywords(text: str, stopwords: StopWords, max_keywords: Optional[int] = None
Returns:
dict: The top 10 keywords and their frequency scores.
"""
text = list(stopwords.tokenizer(text))
tokenised_text = list(stopwords.tokenizer(text))
if not text:
return dict()
# of words before removing blacklist words
num_words = len(text) or 1
text = filter(lambda x: x not in stopwords.stop_words, text)
num_words = len(tokenised_text) or 1
tokenised_text = list(
filter(lambda x: x not in stopwords.stop_words, tokenised_text)
)

freq = Counter(text)
freq = Counter(tokenised_text)

keywords_ = freq.most_common(max_keywords)
keywords_dict = {k: v * 1.5 / num_words + 1 for k, v in keywords_}

keywords_ = {k: v * 1.5 / num_words + 1 for k, v in keywords_}

return keywords_
return keywords_dict
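The rename keeps the raw string and the token list as distinct, consistently typed variables (reassigning text from str to list to filter is what mypy objected to). A hypothetical usage sketch of the refactored function, assuming StopWords accepts a language code as it appears to elsewhere in the codebase:

# Hypothetical usage sketch; the example score values are illustrative only.
from newspaper.nlp import keywords
from newspaper.text import StopWords

scores = keywords(
    "Python 3.12 removed distutils, so the build tooling had to change.",
    StopWords("en"),  # assumption: StopWords takes a language code
    max_keywords=5,
)
# scores maps keyword -> weighted frequency, e.g. {"distutils": 1.3, ...}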


def summarize(
title: str, text: str, stopwords: StopWords, max_sents: Optional[int] = 5
):
def summarize(title: str, text: str, stopwords: StopWords, max_sents: int = 5):
"""Summarize an article into the most relevant sentences in the article.
Args:
title (str): the article title
text (str): article contents
stopwords (StopWords): stopwords object for the language of the text
max_sents (Optional[int], optional):maximum number of sentences to
max_sents (int, optional):maximum number of sentences to
return in the summary. Sentences are weighted by their relevance
using the following criteria: sentence position, frequency of
keywords, title words found in the sentence, and sentence length.
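A matching sketch for the tightened summarize signature: max_sents is now a plain int defaulting to 5, so neither callers nor the annotation deal with None (StopWords usage assumed as above):

# Hypothetical usage sketch.
from newspaper.nlp import summarize
from newspaper.text import StopWords

sentences = summarize(
    title="Fix 3.12 build",
    text="Python 3.12 removed distutils. The build now relies on setuptools' copy.",
    stopwords=StopWords("en"),
    max_sents=2,
)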
2 changes: 1 addition & 1 deletion newspaper/parsers.py
@@ -239,7 +239,7 @@ def create_element(tag, text=None, tail=None):

def remove(
nodes: Union[lxml.html.HtmlElement, List[lxml.html.HtmlElement]],
keep_tags: List[str] = None,
keep_tags: Optional[List[str]] = None,
):
"""Remove the node(s) from the tree
Arguments:
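This is mypy's no-implicit-optional rule, on by default since mypy 0.990: a parameter whose default is None must be annotated Optional explicitly. A standalone illustration, separate from the project code:

# Standalone illustration of the rule the annotation change satisfies.
from typing import List, Optional


def remove_nodes(keep_tags: List[str] = None):  # mypy rejects the implicit Optional default
    ...


def remove_nodes_ok(keep_tags: Optional[List[str]] = None):  # accepted
    return keep_tags or []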
8 changes: 4 additions & 4 deletions newspaper/text.py
@@ -15,14 +15,14 @@

from newspaper import settings

punctuation = {
punctuation_set = {
c for i in range(sys.maxunicode + 1) if category(c := chr(i)).startswith("P")
}
punctuation.update(string.punctuation)
punctuation_set.update(string.punctuation)
# remove characters used in contractions
contraction_separators = set("-'`ʹʻʼʽʾʿˈˊ‘’‛′‵Ꞌꞌ")
punctuation -= contraction_separators
punctuation: str = "".join(list(punctuation))
punctuation_set -= contraction_separators
punctuation: str = "".join(list(punctuation_set))
whitespace_tokenizer = WhitespaceTokenizer()
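The renames exist because the module previously re-bound punctuation from a set to a str, which mypy reports as an incompatible redefinition; keeping punctuation_set and the final punctuation: str as separate names resolves it. Purely as an illustration of how such a punctuation string is typically consumed (not code from this module):

# Illustrative only: strip punctuation from a token while preserving
# contraction separators such as the apostrophe, mirroring the sets above.
import string

contraction_separators = set("-'`")
punct = "".join(c for c in string.punctuation if c not in contraction_separators)


def strip_punct(token: str) -> str:
    return token.translate(str.maketrans("", "", punct))


print(strip_punct("doesn't,"))  # -> doesn't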


353 changes: 237 additions & 116 deletions poetry.lock

Large diffs are not rendered by default.

13 changes: 9 additions & 4 deletions pyproject.toml
@@ -37,7 +37,14 @@ requests = ">=2.26.0"
feedparser = ">=6.0.0"
tldextract = ">=2.0.1"
python-dateutil = ">=2.6.1"
setuptools = "<60" # As per numpy recommendations at https://numpy.org/doc/stable/reference/distutils_status_migration.html#numpy-setuptools-interaction
numpy = [
{ version = ">=1.26", python = ">=3.9", optional = true},
{ version = "^1.24", python = ">=3.8, <3.9", optional = true}
]
pandas = [
{version = ">=2.1.0", optional = true, python = ">=3.9"},
{version = ">=2", optional = true, python = ">=3.8, <3.9"}
]

# Language specific dependencies
tinysegmenter = {version = ">=0.4", optional = true}
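The two-entry lists are Poetry's multiple-constraints syntax: numpy 1.26 is the first series with Python 3.12 support (and itself requires Python >= 3.9), while the 1.24 line keeps Python 3.8 working; the old setuptools "<60" pin, needed only for numpy's distutils-based builds, goes away with it. A rough PEP 508 rendering of what the constraints amount to (illustrative only; Poetry generates the real markers at build time):

# Rough PEP 508 equivalents of the entries above (illustrative only).
REQUIREMENTS = [
    "numpy>=1.26; python_version >= '3.9'",
    "numpy>=1.24,<2.0; python_version < '3.9'",  # Poetry caret constraint ^1.24
    "pandas>=2.1.0; python_version >= '3.9'",
    "pandas>=2; python_version < '3.9'",
]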
@@ -54,9 +61,6 @@ hi = ["indic-nlp-library"]
np = ["indic-nlp-library"]
ta = ["indic-nlp-library"]




[tool.poetry.group.dev.dependencies]
coverage = {version = ">=7.3.2", python = "^3.8"}
pre-commit = {version = ">=3.5.0", python = "^3.8"}
@@ -69,6 +73,7 @@ types-pillow = {version = "^10.2.0.20240213", python = "^3.8"}
types-python-dateutil = {version = "^2.8.19.20240106", python = "^3.8"}
types-requests = "^2.27.1"
types-beautifulsoup4 = {version = "^4.12.0.20240106", python = "^3.8"}
virtualenv = {version = ">=20.25.1"}

[tool.poetry.group.docs.dependencies]
sphinx = {version = ">=7.0.0", python = "^3.8"}
1 change: 1 addition & 0 deletions stubs/indicnlp/tokenize/indic_tokenize.pyi
@@ -0,0 +1 @@
def trivial_tokenize(text: str, lang: str = ...): ...
3 changes: 3 additions & 0 deletions stubs/jieba.pyi
@@ -0,0 +1,3 @@
from typing import Union

def cut(sentence: Union[bytes, str], cut_all: bool = ...): ...
1 change: 1 addition & 0 deletions stubs/nltk/__init__.pyi
@@ -2,3 +2,4 @@ from typing import Optional
from . import data # noqa: F401

def download(info_or_id: Optional[str] = None): ...
def word_tokenize(text, language: str = ...): ...
1 change: 1 addition & 0 deletions stubs/nltk/tokenize.pyi
@@ -0,0 +1 @@
class WhitespaceTokenizer: ...
5 changes: 5 additions & 0 deletions stubs/pythainlp.pyi
@@ -0,0 +1,5 @@
from typing import List

def word_tokenize(
text: str,
) -> List[str]: ...
2 changes: 2 additions & 0 deletions stubs/tinysegmenter.pyi
@@ -0,0 +1,2 @@
class TinySegmenter:
def tokenize(self, text: str): ...
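These one- and two-line .pyi files give mypy signatures for the otherwise untyped tokenizer libraries newspaper imports (indic-nlp, jieba, nltk, pythainlp, tinysegmenter). They only take effect if the stubs directory is on mypy's search path (e.g. mypy_path = "stubs" in the mypy configuration; an assumption here, not shown in this diff). With the jieba stub in place, for example, the call below is checked against the declared signature instead of being typed as Any:

# Checked against stubs/jieba.pyi once the stubs directory is on mypy's path.
import jieba

tokens = list(jieba.cut("自然语言处理", cut_all=False))
print(tokens)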
