Fix 3.12 build (#619)
* Upgrade numpy for Python >= 3.9

* Upgrade pandas for Python >= 3.9

* Upgrade regex

* Upgrade virtualenv to avoid breaking pre-commit

* Enable distutils for everyone

* Fix the mypy issues
palfrey committed Mar 9, 2024
1 parent 1b69cf3 commit 9c9e886
Showing 13 changed files with 278 additions and 139 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/pipeline.yml
@@ -30,10 +30,10 @@ jobs: # jobs. We will have two jobs (test and publish) with multiple steps.
python -m pip install --upgrade pip
poetry config virtualenvs.create false --local
poetry install --all-extras
pip install pytest pylint coverage mypy coveralls
pip install pylint coveralls
# python -m nltk.downloader punkt stopwords
env:
SETUPTOOLS_USE_DISTUTILS: stdlib
SETUPTOOLS_USE_DISTUTILS: local
- name: Pylint # Run pylint static analysis
run: |
poetry run pylint newspaper --fail-under=8.0
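Note on the env change: Python 3.12 removes the stdlib distutils module (PEP 632), so SETUPTOOLS_USE_DISTUTILS: stdlib cannot work there; the local setting tells setuptools (>= 60) to supply its vendored copy on every interpreter. A minimal sketch, assuming setuptools >= 60 is present in the environment, to confirm which distutils an interpreter actually picks up:

# Minimal check (assumes setuptools >= 60 is installed): with
# SETUPTOOLS_USE_DISTUTILS=local, "import distutils" resolves to setuptools'
# vendored copy rather than the stdlib module removed in Python 3.12.
import os

os.environ.setdefault("SETUPTOOLS_USE_DISTUTILS", "local")

import distutils  # noqa: E402  -- imported after the env var is set on purpose

print(distutils.__file__)  # expected: .../setuptools/_distutils/__init__.py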
3 changes: 2 additions & 1 deletion newspaper/extractors/articlebody_extractor.py
@@ -4,6 +4,7 @@
from statistics import mean
from typing import Optional
import lxml
from newspaper.configuration import Configuration
import newspaper.extractors.defines as defines
import newspaper.parsers as parsers
from newspaper.text import StopWords
@@ -25,7 +26,7 @@


class ArticleBodyExtractor:
def __init__(self, config):
def __init__(self, config: Configuration):
self.config = config
self.top_node = None
self.top_node_complemented = None
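For context, a hypothetical call site showing what the new annotation gives mypy to check (assuming Configuration can be built with its defaults; this sketch is not part of the diff):

# Hypothetical usage sketch, not part of the diff.
from newspaper.configuration import Configuration
from newspaper.extractors.articlebody_extractor import ArticleBodyExtractor

config = Configuration()                  # assumed default constructor
extractor = ArticleBodyExtractor(config)  # type-checks cleanly
# ArticleBodyExtractor("not a config")    # would now be flagged by mypy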
21 changes: 10 additions & 11 deletions newspaper/nlp.py
@@ -31,32 +31,31 @@ def keywords(text: str, stopwords: StopWords, max_keywords: Optional[int] = None
Returns:
dict: The top 10 keywords and their frequency scores.
"""
text = list(stopwords.tokenizer(text))
tokenised_text = list(stopwords.tokenizer(text))
if not text:
return dict()
# of words before removing blacklist words
num_words = len(text) or 1
text = filter(lambda x: x not in stopwords.stop_words, text)
num_words = len(tokenised_text) or 1
tokenised_text = list(
filter(lambda x: x not in stopwords.stop_words, tokenised_text)
)

freq = Counter(text)
freq = Counter(tokenised_text)

keywords_ = freq.most_common(max_keywords)
keywords_dict = {k: v * 1.5 / num_words + 1 for k, v in keywords_}

keywords_ = {k: v * 1.5 / num_words + 1 for k, v in keywords_}

return keywords_
return keywords_dict
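The rename keeps the raw string and the token list as distinct, consistently typed variables (reassigning text from str to list to filter is what mypy objected to). A hypothetical usage sketch of the refactored function, assuming StopWords accepts a language code as it appears to elsewhere in the codebase:

# Hypothetical usage sketch; the example score values are illustrative only.
from newspaper.nlp import keywords
from newspaper.text import StopWords

scores = keywords(
    "Python 3.12 removed distutils, so the build tooling had to change.",
    StopWords("en"),  # assumption: StopWords takes a language code
    max_keywords=5,
)
# scores maps keyword -> weighted frequency, e.g. {"distutils": 1.3, ...}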


def summarize(
title: str, text: str, stopwords: StopWords, max_sents: Optional[int] = 5
):
def summarize(title: str, text: str, stopwords: StopWords, max_sents: int = 5):
"""Summarize an article into the most relevant sentences in the article.
Args:
title (str): the article title
text (str): article contents
stopwords (StopWords): stopwords object for the language of the text
max_sents (Optional[int], optional):maximum number of sentences to
max_sents (int, optional):maximum number of sentences to
return in the summary. Sentences are weighted by their relevance
using the following criteria: sentence position, frequency of
keywords, title words found in the sentence, and sentence length.
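A matching sketch for the tightened summarize signature: max_sents is now a plain int defaulting to 5, so neither callers nor the annotation deal with None (StopWords usage assumed as above):

# Hypothetical usage sketch.
from newspaper.nlp import summarize
from newspaper.text import StopWords

sentences = summarize(
    title="Fix 3.12 build",
    text="Python 3.12 removed distutils. The build now relies on setuptools' copy.",
    stopwords=StopWords("en"),
    max_sents=2,
)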
2 changes: 1 addition & 1 deletion newspaper/parsers.py
@@ -239,7 +239,7 @@ def create_element(tag, text=None, tail=None):

def remove(
nodes: Union[lxml.html.HtmlElement, List[lxml.html.HtmlElement]],
keep_tags: List[str] = None,
keep_tags: Optional[List[str]] = None,
):
"""Remove the node(s) from the tree
Arguments:
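This is mypy's no-implicit-optional rule, on by default since mypy 0.990: a parameter whose default is None must be annotated Optional explicitly. A standalone illustration, separate from the project code:

# Standalone illustration of the rule the annotation change satisfies.
from typing import List, Optional


def remove_nodes(keep_tags: List[str] = None):  # mypy rejects the implicit Optional default
    ...


def remove_nodes_ok(keep_tags: Optional[List[str]] = None):  # accepted
    return keep_tags or []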
8 changes: 4 additions & 4 deletions newspaper/text.py
@@ -15,14 +15,14 @@

from newspaper import settings

punctuation = {
punctuation_set = {
c for i in range(sys.maxunicode + 1) if category(c := chr(i)).startswith("P")
}
punctuation.update(string.punctuation)
punctuation_set.update(string.punctuation)
# remove characters used in contractions
contraction_separators = set("-'`ʹʻʼʽʾʿˈˊ‘’‛′‵Ꞌꞌ")
punctuation -= contraction_separators
punctuation: str = "".join(list(punctuation))
punctuation_set -= contraction_separators
punctuation: str = "".join(list(punctuation_set))
whitespace_tokenizer = WhitespaceTokenizer()
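The renames exist because the module previously re-bound punctuation from a set to a str, which mypy reports as an incompatible redefinition; keeping punctuation_set and the final punctuation: str as separate names resolves it. Purely as an illustration of how such a punctuation string is typically consumed (not code from this module):

# Illustrative only: strip punctuation from a token while preserving
# contraction separators such as the apostrophe, mirroring the sets above.
import string

contraction_separators = set("-'`")
punct = "".join(c for c in string.punctuation if c not in contraction_separators)


def strip_punct(token: str) -> str:
    return token.translate(str.maketrans("", "", punct))


print(strip_punct("doesn't,"))  # -> doesn't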


353 changes: 237 additions & 116 deletions poetry.lock

Large diffs are not rendered by default.

13 changes: 9 additions & 4 deletions pyproject.toml
@@ -37,7 +37,14 @@ requests = ">=2.26.0"
feedparser = ">=6.0.0"
tldextract = ">=2.0.1"
python-dateutil = ">=2.6.1"
setuptools = "<60" # As per numpy recommendations at https://numpy.org/doc/stable/reference/distutils_status_migration.html#numpy-setuptools-interaction
numpy = [
{ version = ">=1.26", python = ">=3.9", optional = true},
{ version = "^1.24", python = ">=3.8, <3.9", optional = true}
]
pandas = [
{version = ">=2.1.0", optional = true, python = ">=3.9"},
{version = ">=2", optional = true, python = ">=3.8, <3.9"}
]

# Language specific dependencies
tinysegmenter = {version = ">=0.4", optional = true}
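The two-entry lists are Poetry's multiple-constraints syntax: numpy 1.26 is the first series with Python 3.12 support (and itself requires Python >= 3.9), while the 1.24 line keeps Python 3.8 working; the old setuptools "<60" pin, needed only for numpy's distutils-based builds, goes away with it. A rough PEP 508 rendering of what the constraints amount to (illustrative only; Poetry generates the real markers at build time):

# Rough PEP 508 equivalents of the entries above (illustrative only).
REQUIREMENTS = [
    "numpy>=1.26; python_version >= '3.9'",
    "numpy>=1.24,<2.0; python_version < '3.9'",  # Poetry caret constraint ^1.24
    "pandas>=2.1.0; python_version >= '3.9'",
    "pandas>=2; python_version < '3.9'",
]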
@@ -54,9 +61,6 @@ hi = ["indic-nlp-library"]
np = ["indic-nlp-library"]
ta = ["indic-nlp-library"]




[tool.poetry.group.dev.dependencies]
coverage = {version = ">=7.3.2", python = "^3.8"}
pre-commit = {version = ">=3.5.0", python = "^3.8"}
@@ -69,6 +73,7 @@ types-pillow = {version = "^10.2.0.20240213", python = "^3.8"}
types-python-dateutil = {version = "^2.8.19.20240106", python = "^3.8"}
types-requests = "^2.27.1"
types-beautifulsoup4 = {version = "^4.12.0.20240106", python = "^3.8"}
virtualenv = {version = ">=20.25.1"}

[tool.poetry.group.docs.dependencies]
sphinx = {version = ">=7.0.0", python = "^3.8"}
1 change: 1 addition & 0 deletions stubs/indicnlp/tokenize/indic_tokenize.pyi
@@ -0,0 +1 @@
def trivial_tokenize(text: str, lang: str = ...): ...
3 changes: 3 additions & 0 deletions stubs/jieba.pyi
@@ -0,0 +1,3 @@
from typing import Union

def cut(sentence: Union[bytes, str], cut_all: bool = ...): ...
1 change: 1 addition & 0 deletions stubs/nltk/__init__.pyi
@@ -2,3 +2,4 @@ from typing import Optional
from . import data # noqa: F401

def download(info_or_id: Optional[str] = None): ...
def word_tokenize(text, language: str = ...): ...
1 change: 1 addition & 0 deletions stubs/nltk/tokenize.pyi
@@ -0,0 +1 @@
class WhitespaceTokenizer: ...
5 changes: 5 additions & 0 deletions stubs/pythainlp.pyi
@@ -0,0 +1,5 @@
from typing import List

def word_tokenize(
text: str,
) -> List[str]: ...
2 changes: 2 additions & 0 deletions stubs/tinysegmenter.pyi
@@ -0,0 +1,2 @@
class TinySegmenter:
def tokenize(self, text: str): ...
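These one- and two-line .pyi files give mypy signatures for the otherwise untyped tokenizer libraries newspaper imports (indic-nlp, jieba, nltk, pythainlp, tinysegmenter). They only take effect if the stubs directory is on mypy's search path (e.g. mypy_path = "stubs" in the mypy configuration; an assumption here, not shown in this diff). With the jieba stub in place, for example, the call below is checked against the declared signature instead of being typed as Any:

# Checked against stubs/jieba.pyi once the stubs directory is on mypy's path.
import jieba

tokens = list(jieba.cut("自然语言处理", cut_all=False))
print(tokens)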
