diff --git a/.gitignore b/.gitignore index c3eeb5b..5435d56 100644 --- a/.gitignore +++ b/.gitignore @@ -14,3 +14,4 @@ Pip* .env venv .idea +*.ignore.py diff --git a/README.md b/README.md index 31a3484..db01f84 100644 --- a/README.md +++ b/README.md @@ -156,9 +156,29 @@ twitter: consumer_secret: CONSUMER_SECRET ``` +### Skip entry + +You can also skip an entry if it matches a regular expression pattern. This is useful for avoiding "subscribe now" pages. +This is configured per feed like so: + +```yaml +- name: The Globe and Mail - Report on Business + skip_pattern: "you have access to only \\d+ articles" + twitter: + access_token: ACCESS_TOKEN + access_token_secret: ACCESS_TOKEN_SECRET + url: http://www.theglobeandmail.com/report-on-business/?service=rss +``` + +In this example, if the entry's title or summary contains the text "you have access to only 10 articles", the entry will be skipped. The same happens for any other number of articles, as it's a regular expression. +The `skip_pattern` performs a `re.search` operation and uses the flags for `case insensitive` and `multiline`. + +See the docs for [more information about Regular Expressions and the search operation.](https://docs.python.org/3/library/re.html#search-vs-match) + + ### Tweet content -By default, the tweeted diff will include the article's title and the archive diff url, [like this](https://twitter.com/mp_diff/status/1255973684994625539). +By default, the tweeted diff will include the article's title and the archive diff url, [like this.](https://twitter.com/ld_diff/status/1267989297048817672) You change this by tweeting what's changed: the url, the title and/or the summary. 
For doing so, you need to specify **all** the following `lang` keys: diff --git a/diffengine/__init__.py b/diffengine/__init__.py index 05927f7..a575d2c 100755 --- a/diffengine/__init__.py +++ b/diffengine/__init__.py @@ -27,8 +27,8 @@ from diffengine.exceptions.webdriver import UnknownWebdriverError from diffengine.exceptions.sendgrid import SendgridConfigNotFoundError, SendgridError from diffengine.exceptions.twitter import TwitterConfigNotFoundError, TwitterError -from diffengine.text import to_utf8 from diffengine.sendgrid import SendgridHandler +from diffengine.text import to_utf8, matches from diffengine.twitter import TwitterHandler from envyaml import EnvYAML from peewee import ( @@ -145,7 +145,7 @@ def stale(self): logging.debug("%s not stale (r=%f)", self.url, r) return False - def get_latest(self): + def get_latest(self, skip_pattern=None): """ get_latest is the heart of the application. It will get the current version on the web, extract its summary with readability and compare @@ -179,6 +179,16 @@ def get_latest(self): summary = bleach.clean(summary, tags=["p"], strip=True) summary = _normal(summary) + # if the title or the summay contains the skipping pattern, + # then return none as I don't want to report this change + if skip_pattern and ( + matches(skip_pattern, title) or matches(skip_pattern, summary) + ): + logging.info( + "Skipped page. It matches the skip_pattern prop defined for this feed." 
+ ) + return None + # in case there was a redirect, and remove utm style marketing canonical_url = _remove_utm(resp.url) @@ -622,14 +632,15 @@ def main(): browser.quit() -def process_entry(entry, feed_config, twitter=None, sendgrid=None, lang={}): +def process_entry(entry, feed_config={}, twitter=None, sendgrid=None, lang={}): result = {"skipped": 0, "checked": 0, "new": 0} if not entry.stale: result["skipped"] = 1 else: result["checked"] = 1 try: - version = entry.get_latest() + skip_pattern = feed_config.get("skip_pattern") + version = entry.get_latest(skip_pattern) if version: result["new"] = 1 if version.diff: diff --git a/diffengine/text.py b/diffengine/text.py index f064642..8c23000 100644 --- a/diffengine/text.py +++ b/diffengine/text.py @@ -1,4 +1,6 @@ import logging +import re +import unicodedata def build_text(diff, lang={}): @@ -72,3 +74,9 @@ def to_utf8(text): return text return result + + +def matches(pattern, text): + nfkd_form = unicodedata.normalize("NFKD", text.upper()) + normalized = u"".join([c for c in nfkd_form if not unicodedata.combining(c)]) + return re.search(pattern, normalized, re.I | re.M) is not None diff --git a/test_diffengine.py b/test_diffengine.py index b166e24..f9b9bb5 100644 --- a/test_diffengine.py +++ b/test_diffengine.py @@ -1,7 +1,6 @@ import logging import os import re -import yaml from envyaml import EnvYAML import setup @@ -29,7 +28,7 @@ SendgridHandler, _fingerprint, ) -from diffengine.text import build_text, to_utf8 +from diffengine.text import build_text, to_utf8, matches from diffengine.utils import generate_config from diffengine.exceptions.sendgrid import ( SendgridConfigNotFoundError, @@ -261,7 +260,7 @@ def test_stale_is_skipped(self): type(entry).stale = PropertyMock(return_value=False) # Test - result = process_entry(entry, None, None) + result = process_entry(entry, {}, None) # Assert assert result["skipped"] == 1 @@ -273,7 +272,7 @@ def test_raise_if_entry_retrieve_fails(self): entry.get_latest = 
MagicMock(side_effect=Exception("TEST")) # Test - result = process_entry(entry, None, None) + result = process_entry(entry, {}, None) # Assert entry.get_latest.assert_called_once() @@ -290,7 +289,7 @@ def test_get_none_if_no_new_version(self): entry.get_latest = MagicMock(return_value=None) # Test - result = process_entry(entry, None, twitter) + result = process_entry(entry, {}, twitter) # Assert entry.get_latest.assert_called_once() @@ -311,7 +310,7 @@ def test_do_not_tweet_if_entry_has_no_diff(self): entry.get_latest = MagicMock(return_value=version) # Test - result = process_entry(entry, None, twitter) + result = process_entry(entry, {}, twitter) # Assert entry.get_latest.assert_called_once() @@ -332,7 +331,7 @@ def test_do_not_tweet_if_feed_has_no_token(self): entry.get_latest = MagicMock(return_value=version) # Test - result = process_entry(entry, None, twitter) + result = process_entry(entry, {}, twitter) # Assert entry.get_latest.assert_called_once() @@ -817,3 +816,37 @@ def test_latin1_to_utf8(self): text_utf8 = "Me preocupa más la parte futbolística" result = to_utf8(text_latin) self.assertEquals(result, text_utf8) + + +class MatchesTest(TestCase): + skip_pattern = "subscribe.*\\d{2} articles" + + def test_matches_does_not_match(self): + result = matches( + self.skip_pattern, "Hey! You need to subscribe to 1 article to continue" + ) + self.assertFalse(result) + + def test_matches_does_match(self): + result = matches( + self.skip_pattern, "Hey! 
You need to subscribe to 10 articles to continue" + ) + self.assertTrue(result) + + def test_matches_with_multiline(self): + result = matches( + self.skip_pattern, "Hey!\nYou need to subscribe to 10 articles\nto continue" + ) + self.assertTrue(result) + + def test_matches_is_case_insensitive(self): + result = matches( + self.skip_pattern, "Hey!\nYou need to SubsCribe to 10 ARTicles\nto continue" + ) + self.assertTrue(result) + + def test_matches_is_accent_insensitive(self): + result = matches( + self.skip_pattern, "Hey!\nYou need to SubsCribé to 10 ARTiclès\nto continue" + ) + self.assertTrue(result)