Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
adde6ac
Missing diff props to be able to tweet
nahuelhds May 25, 2020
ed0b4bd
Thread creation changed to be based on the default value set by @andr…
nahuelhds May 25, 2020
2bc4053
Travis specific chromium-chromedriver version to work with Chrome 81
nahuelhds May 25, 2020
16b4c33
Travis specific chromium-chromedriver version to work with Chrome 83
nahuelhds May 25, 2020
4750989
Travis specific chromium-chromedriver version to work with Chrome 83
nahuelhds May 25, 2020
b0c0d28
wget for chromedriver 83
nahuelhds May 25, 2020
ab0772c
Merge pull request #10 from nahuelhds/feature/tweetting-fix
nahuelhds May 26, 2020
a15b052
Merge pull request #9 from nahuelhds/feature/configurable-database
nahuelhds May 26, 2020
acaee8e
Auto detects when the text is latin1 or ascii and decode it as UTF-8 …
nahuelhds Jun 2, 2020
d5081cc
Mocks adaptations
nahuelhds Jun 2, 2020
4841daf
Merge pull request #11 from nahuelhds/feature/encoding-auto-detection
nahuelhds Jun 2, 2020
37dda37
Merge branch 'master' of github.com:DocNow/diffengine
nahuelhds Jun 3, 2020
6184107
Merge branch 'master' of github.com:DocNow/diffengine
nahuelhds Jun 3, 2020
6e5b580
Readme for this fork
nahuelhds Jun 3, 2020
5e6919d
Ability to skip version if contains some text. Useful for "subscribe …
nahuelhds Jun 3, 2020
c822d8b
Merge branch 'diffengine/master' into feature/skip_regex
nahuelhds Jun 3, 2020
3b36491
Merge branch 'diffengine/master' into feature/skip_regex
nahuelhds Jun 3, 2020
07d8eda
Readme
nahuelhds Jun 3, 2020
ebf3c9c
Log the skipping action
nahuelhds Jun 3, 2020
bdb346a
no message
nahuelhds Jun 3, 2020
5fed5a0
Merge branch 'master' into feature/skip_regex
nahuelhds Jun 3, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,4 @@ Pip*
.env
venv
.idea
*.ignore.py
22 changes: 21 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -156,9 +156,29 @@ twitter:
consumer_secret: CONSUMER_SECRET
```

### Skip entry

You can also skip an entry if it matches a regular expression pattern. This is useful for avoiding "subscribe now" pages.
This is configured per feed like so:

```yaml
- name: The Globe and Mail - Report on Business
skip_pattern: "you have access to only \\d+ articles"
twitter:
access_token: ACCESS_TOKEN
access_token_secret: ACCESS_TOKEN_SECRET
url: http://www.theglobeandmail.com/report-on-business/?service=rss
```

In this example, if the page's title or summary contains the text "you have access to only 10 articles", the entry will be skipped. The same happens for any other number of articles, since the pattern is a regular expression.
The `skip_pattern` performs a `re.search` operation and uses the flags for `case insensitive` and `multiline`.

See the Python docs for [more information about regular expressions and the search operation](https://docs.python.org/3/library/re.html#search-vs-match).


### Tweet content

By default, the tweeted diff will include the article's title and the archive diff url, [like this](https://twitter.com/mp_diff/status/1255973684994625539).
By default, the tweeted diff will include the article's title and the archive diff url, [like this.](https://twitter.com/ld_diff/status/1267989297048817672)

You can change this to tweet what has changed: the url, the title and/or the summary. To do so, you need to specify **all** the following `lang` keys:

Expand Down
19 changes: 15 additions & 4 deletions diffengine/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,8 @@
from diffengine.exceptions.webdriver import UnknownWebdriverError
from diffengine.exceptions.sendgrid import SendgridConfigNotFoundError, SendgridError
from diffengine.exceptions.twitter import TwitterConfigNotFoundError, TwitterError
from diffengine.text import to_utf8
from diffengine.sendgrid import SendgridHandler
from diffengine.text import to_utf8, matches
from diffengine.twitter import TwitterHandler
from envyaml import EnvYAML
from peewee import (
Expand Down Expand Up @@ -145,7 +145,7 @@ def stale(self):
logging.debug("%s not stale (r=%f)", self.url, r)
return False

def get_latest(self):
def get_latest(self, skip_pattern=None):
"""
get_latest is the heart of the application. It will get the current
version on the web, extract its summary with readability and compare
Expand Down Expand Up @@ -179,6 +179,16 @@ def get_latest(self):
summary = bleach.clean(summary, tags=["p"], strip=True)
summary = _normal(summary)

# if the title or the summary contains the skipping pattern,
# then return None as I don't want to report this change
if skip_pattern and (
matches(skip_pattern, title) or matches(skip_pattern, summary)
):
logging.info(
"Skipped page. It matches the skip_pattern prop defined for this feed."
)
return None

# in case there was a redirect, and remove utm style marketing
canonical_url = _remove_utm(resp.url)

Expand Down Expand Up @@ -622,14 +632,15 @@ def main():
browser.quit()


def process_entry(entry, feed_config, twitter=None, sendgrid=None, lang={}):
def process_entry(entry, feed_config={}, twitter=None, sendgrid=None, lang={}):
result = {"skipped": 0, "checked": 0, "new": 0}
if not entry.stale:
result["skipped"] = 1
else:
result["checked"] = 1
try:
version = entry.get_latest()
skip_pattern = feed_config.get("skip_pattern")
version = entry.get_latest(skip_pattern)
if version:
result["new"] = 1
if version.diff:
Expand Down
8 changes: 8 additions & 0 deletions diffengine/text.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
import logging
import re
import unicodedata


def build_text(diff, lang={}):
Expand Down Expand Up @@ -72,3 +74,9 @@ def to_utf8(text):
return text

return result


def _strip_accents(value):
    """
    Return `value` NFKD-normalized with every combining mark removed.
    """
    decomposed = unicodedata.normalize("NFKD", value)
    return "".join(c for c in decomposed if not unicodedata.combining(c))


def matches(pattern, text):
    """
    Tell whether the regular expression `pattern` is found anywhere in `text`.

    The search is case insensitive, multiline and accent insensitive: `text`
    is upper-cased and stripped of combining marks before `re.search` runs
    with the `re.I | re.M` flags.

    The pattern goes through the same accent stripping. Otherwise a pattern
    written with accents (e.g. "suscríbete") could never match, because the
    accents it looks for have already been removed from the text. Note that
    NFKD also applies compatibility mappings, to the pattern as well as to
    the text.

    Returns True when the pattern matches, False otherwise. A `text` of None
    never matches.
    """
    if text is None:
        return False

    normalized = _strip_accents(text.upper())
    # The pattern is deliberately NOT upper-cased: that would turn escapes
    # such as \d into \D. Case is handled by the re.I flag instead.
    return re.search(_strip_accents(pattern), normalized, re.I | re.M) is not None
47 changes: 40 additions & 7 deletions test_diffengine.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import logging
import os
import re
import yaml
from envyaml import EnvYAML

import setup
Expand Down Expand Up @@ -29,7 +28,7 @@
SendgridHandler,
_fingerprint,
)
from diffengine.text import build_text, to_utf8
from diffengine.text import build_text, to_utf8, matches
from diffengine.utils import generate_config
from diffengine.exceptions.sendgrid import (
SendgridConfigNotFoundError,
Expand Down Expand Up @@ -261,7 +260,7 @@ def test_stale_is_skipped(self):
type(entry).stale = PropertyMock(return_value=False)

# Test
result = process_entry(entry, None, None)
result = process_entry(entry, {}, None)

# Assert
assert result["skipped"] == 1
Expand All @@ -273,7 +272,7 @@ def test_raise_if_entry_retrieve_fails(self):
entry.get_latest = MagicMock(side_effect=Exception("TEST"))

# Test
result = process_entry(entry, None, None)
result = process_entry(entry, {}, None)

# Assert
entry.get_latest.assert_called_once()
Expand All @@ -290,7 +289,7 @@ def test_get_none_if_no_new_version(self):
entry.get_latest = MagicMock(return_value=None)

# Test
result = process_entry(entry, None, twitter)
result = process_entry(entry, {}, twitter)

# Assert
entry.get_latest.assert_called_once()
Expand All @@ -311,7 +310,7 @@ def test_do_not_tweet_if_entry_has_no_diff(self):
entry.get_latest = MagicMock(return_value=version)

# Test
result = process_entry(entry, None, twitter)
result = process_entry(entry, {}, twitter)

# Assert
entry.get_latest.assert_called_once()
Expand All @@ -332,7 +331,7 @@ def test_do_not_tweet_if_feed_has_no_token(self):
entry.get_latest = MagicMock(return_value=version)

# Test
result = process_entry(entry, None, twitter)
result = process_entry(entry, {}, twitter)

# Assert
entry.get_latest.assert_called_once()
Expand Down Expand Up @@ -817,3 +816,37 @@ def test_latin1_to_utf8(self):
text_utf8 = "Me preocupa más la parte futbolística"
result = to_utf8(text_latin)
self.assertEquals(result, text_utf8)


class MatchesTest(TestCase):
    """
    Checks `matches` against one shared pattern: the word "subscribe"
    followed, later on the same line, by a two-digit number and " articles".
    """

    skip_pattern = "subscribe.*\\d{2} articles"

    def test_matches_does_not_match(self):
        # a single digit and the singular "article" cannot satisfy the pattern
        text = "Hey! You need to subscribe to 1 article to continue"
        self.assertFalse(matches(self.skip_pattern, text))

    def test_matches_does_match(self):
        text = "Hey! You need to subscribe to 10 articles to continue"
        self.assertTrue(matches(self.skip_pattern, text))

    def test_matches_with_multiline(self):
        # the matching sentence sits on the middle line of the text
        text = "Hey!\nYou need to subscribe to 10 articles\nto continue"
        self.assertTrue(matches(self.skip_pattern, text))

    def test_matches_is_case_insensitive(self):
        text = "Hey!\nYou need to SubsCribe to 10 ARTicles\nto continue"
        self.assertTrue(matches(self.skip_pattern, text))

    def test_matches_is_accent_insensitive(self):
        # accented letters in the text must be treated like their plain form
        text = "Hey!\nYou need to SubsCribé to 10 ARTiclès\nto continue"
        self.assertTrue(matches(self.skip_pattern, text))