From adde6ac2c21e22d1ab300dc33e585182bafa024c Mon Sep 17 00:00:00 2001 From: Nahue Date: Mon, 25 May 2020 19:39:07 -0300 Subject: [PATCH 01/13] Missing diff props to be able to tweet (cherry picked from commit df8256ed22a779e0292af583d9f1411bae39f8e0) --- diffengine/__init__.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/diffengine/__init__.py b/diffengine/__init__.py index 8936e65..5e62eec 100755 --- a/diffengine/__init__.py +++ b/diffengine/__init__.py @@ -298,6 +298,18 @@ class Diff(BaseModel): emailed = DateTimeField(null=True) blogged = DateTimeField(null=True) + @property + def url_changed(self): + return self.old.url != self.new.url + + @property + def title_changed(self): + return self.old.title != self.new.title + + @property + def summary_changed(self): + return self.old.summary != self.new.summary + @property def html_path(self): # use prime number to spread across directories From ed0b4bdce3adcb6ff30a068986bdf2465ced5d65 Mon Sep 17 00:00:00 2001 From: Nahue Date: Mon, 25 May 2020 19:57:42 -0300 Subject: [PATCH 02/13] Thread creation changed to be based on the default value set by @andresfib at #77, which is not `None` anymore but `''` (cherry picked from commit 69fa1932b6f61f1eba847a0313eacb2ac4d8762c) --- diffengine/twitter.py | 2 +- test_diffengine.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/diffengine/twitter.py b/diffengine/twitter.py index 6c171ff..544ce3d 100644 --- a/diffengine/twitter.py +++ b/diffengine/twitter.py @@ -66,7 +66,7 @@ def tweet_diff(self, diff, token=None, lang={}): # Check if the thread exists thread_status_id_str = None - if diff.old.entry.tweet_status_id_str is None: + if diff.old.entry.tweet_status_id_str == "": try: thread_status_id_str = self.create_thread( diff.old.entry, diff.old, token diff --git a/test_diffengine.py b/test_diffengine.py index e455059..0aeb55e 100644 --- a/test_diffengine.py +++ b/test_diffengine.py @@ -423,7 +423,7 @@ def 
test_create_thread_if_old_entry_has_no_related_tweet( ): entry = MagicMock() - type(entry).tweet_status_id_str = PropertyMock(return_value=None) + type(entry).tweet_status_id_str = PropertyMock(return_value="") diff = get_mocked_diff() type(diff.old).entry = entry From 2bc4053221ca59ac75130592f1bafddf34e189e9 Mon Sep 17 00:00:00 2001 From: Nahue Date: Mon, 25 May 2020 20:18:01 -0300 Subject: [PATCH 03/13] Travis specific chromium-chromedriver version to work with Chrome 81 --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 30d011b..0b68eea 100644 --- a/.travis.yml +++ b/.travis.yml @@ -11,7 +11,7 @@ script: py.test before_install: - sudo apt-get -y update - sudo apt-get install firefox-geckodriver - - sudo apt-get install --upgrade chromium-chromedriver + - sudo apt-get install chromium-chromedriver=81.0.4044.138 after_failure: cat test/diffengine.log notifications: slack: From 16b4c33c37d696a091788dde4f4e7b0eb61f0eef Mon Sep 17 00:00:00 2001 From: Nahue Date: Mon, 25 May 2020 20:20:13 -0300 Subject: [PATCH 04/13] Travis specific chromium-chromedriver version to work with Chrome 83 --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 0b68eea..00f8e23 100644 --- a/.travis.yml +++ b/.travis.yml @@ -11,7 +11,7 @@ script: py.test before_install: - sudo apt-get -y update - sudo apt-get install firefox-geckodriver - - sudo apt-get install chromium-chromedriver=81.0.4044.138 + - sudo apt-get install chromium-chromedriver=83 after_failure: cat test/diffengine.log notifications: slack: From 475098901638a348754eecf3c0e12f7b1e62f248 Mon Sep 17 00:00:00 2001 From: Nahue Date: Mon, 25 May 2020 20:22:55 -0300 Subject: [PATCH 05/13] Travis specific chromium-chromedriver version to work with Chrome 83 --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 00f8e23..7854d06 100644 --- a/.travis.yml +++ 
b/.travis.yml @@ -11,7 +11,7 @@ script: py.test before_install: - sudo apt-get -y update - sudo apt-get install firefox-geckodriver - - sudo apt-get install chromium-chromedriver=83 + - sudo apt-get install chromium-chromedriver after_failure: cat test/diffengine.log notifications: slack: From b0c0d2870308680aa7819041b721e7789f48a056 Mon Sep 17 00:00:00 2001 From: Nahue Date: Mon, 25 May 2020 20:38:30 -0300 Subject: [PATCH 06/13] wget for chromedriver 83 --- .travis.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 7854d06..ef8f266 100644 --- a/.travis.yml +++ b/.travis.yml @@ -11,7 +11,10 @@ script: py.test before_install: - sudo apt-get -y update - sudo apt-get install firefox-geckodriver - - sudo apt-get install chromium-chromedriver +before_script: + - wget https://chromedriver.storage.googleapis.com/83.0.4103.39/chromedriver_linux64.zip + - unzip chromedriver_linux64.zip -d /home/travis/virtualenv/python3.7.1/bin/ + - export CHROME_BIN=chromium-browser after_failure: cat test/diffengine.log notifications: slack: From acaee8e9893fb00bd7ca6e00a4e3dc6377047ebd Mon Sep 17 00:00:00 2001 From: Nahue Date: Tue, 2 Jun 2020 20:27:59 -0300 Subject: [PATCH 07/13] Auto detects when the text is latin1 or ascii and decode it as UTF-8 so no strange chars are stored or used for the entry version comparisons --- .gitignore | 1 + diffengine/__init__.py | 4 ++-- diffengine/{text_builder.py => text.py} | 14 ++++++++++++++ diffengine/twitter.py | 2 +- test_diffengine.py | 15 ++++++++++++++- 5 files changed, 32 insertions(+), 4 deletions(-) rename diffengine/{text_builder.py => text.py} (83%) diff --git a/.gitignore b/.gitignore index c3eeb5b..5435d56 100644 --- a/.gitignore +++ b/.gitignore @@ -14,3 +14,4 @@ Pip* .env venv .idea +*.ignore.py diff --git a/diffengine/__init__.py b/diffengine/__init__.py index 28508de..5089386 100755 --- a/diffengine/__init__.py +++ b/diffengine/__init__.py @@ -24,6 +24,7 @@ import unicodedata 
from diffengine.sendgrid import SendgridHandler +from diffengine.text import to_utf8 from diffengine.twitter import TwitterHandler from exceptions.webdriver import UnknownWebdriverError @@ -173,8 +174,7 @@ def get_latest(self): if resp.status_code != 200: logging.warn("Got %s when fetching %s", resp.status_code, self.url) return None - - doc = readability.Document(resp.text) + doc = readability.Document(to_utf8(resp.text)) title = doc.title() summary = doc.summary(html_partial=True) summary = bleach.clean(summary, tags=["p"], strip=True) diff --git a/diffengine/text_builder.py b/diffengine/text.py similarity index 83% rename from diffengine/text_builder.py rename to diffengine/text.py index 5f1bda1..f064642 100644 --- a/diffengine/text_builder.py +++ b/diffengine/text.py @@ -58,3 +58,17 @@ def build_with_default_content(diff): text = text[0:225] + "…" text += " " + diff.url return text + + +def to_utf8(text): + for encoding in ["latin1", "ascii"]: + try: + result = text.encode(encoding).decode("utf8", "strict") + break + except (UnicodeEncodeError, UnicodeDecodeError): + result = None + + if result is None: + return text + + return result diff --git a/diffengine/twitter.py b/diffengine/twitter.py index e886aff..2b16c49 100644 --- a/diffengine/twitter.py +++ b/diffengine/twitter.py @@ -3,7 +3,7 @@ from datetime import datetime -from diffengine.text_builder import build_text +from diffengine.text import build_text from exceptions.twitter import ( AlreadyTweetedError, TwitterConfigNotFoundError, diff --git a/test_diffengine.py b/test_diffengine.py index 4662811..16d85a5 100644 --- a/test_diffengine.py +++ b/test_diffengine.py @@ -29,7 +29,7 @@ SendgridHandler, _fingerprint, ) -from diffengine.text_builder import build_text +from diffengine.text import build_text, to_utf8 from diffengine.utils import generate_config from exceptions.sendgrid import ( SendgridConfigNotFoundError, @@ -804,3 +804,16 @@ def test_lang_content_text(self): self.assertEqual( text, "change in 
the URL, the title and the summary\n%s" % diff.url ) + + +class EncodingTest(TestCase): + def test_utf8_do_nothingg(self): + text_utf8 = "Me preocupa más la parte futbolística" + result = to_utf8(text_utf8) + self.assertEquals(result, text_utf8) + + def test_latin1_to_utf8(self): + text_latin = "Me preocupa más la parte futbolística" + text_utf8 = "Me preocupa más la parte futbolística" + result = to_utf8(text_latin) + self.assertEquals(result, text_utf8) From d5081ccf8beaf22f012309bc05f51e66ef39c133 Mon Sep 17 00:00:00 2001 From: Nahue Date: Tue, 2 Jun 2020 20:40:53 -0300 Subject: [PATCH 08/13] Mocks adaptations --- test_diffengine.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/test_diffengine.py b/test_diffengine.py index 16d85a5..3843d1e 100644 --- a/test_diffengine.py +++ b/test_diffengine.py @@ -663,8 +663,8 @@ def get_mocked_diff(with_archive_urls=True): class TextBuilderTest(TestCase): @patch("logging.warning") - @patch("diffengine.text_builder.build_with_lang") - @patch("diffengine.text_builder.build_with_default_content") + @patch("diffengine.text.build_with_lang") + @patch("diffengine.text.build_with_default_content") def test_build_with_default_content_when_no_lang_given( self, mocked_build_with_default_content, mocked_build_from_lang, mocked_warning ): @@ -679,8 +679,8 @@ def test_build_with_default_content_when_no_lang_given( mocked_build_from_lang.assert_not_called() @patch("logging.warning") - @patch("diffengine.text_builder.build_with_lang") - @patch("diffengine.text_builder.build_with_default_content") + @patch("diffengine.text.build_with_lang") + @patch("diffengine.text.build_with_default_content") def test_build_with_default_content_when_lang_is_incomplete( self, mocked_build_with_default_content, mocked_build_from_lang, mocked_warning ): @@ -700,8 +700,8 @@ def test_build_with_default_content_when_lang_is_incomplete( mocked_build_from_lang.assert_not_called() @patch("logging.warning") - 
@patch("diffengine.text_builder.build_with_lang") - @patch("diffengine.text_builder.build_with_default_content") + @patch("diffengine.text.build_with_lang") + @patch("diffengine.text.build_with_default_content") def test_build_with_lang_when_lang_given( self, mocked_build_with_default_content, mocked_build_from_lang, mocked_warning ): @@ -722,7 +722,7 @@ def test_build_with_lang_when_lang_given( mocked_build_with_default_content.assert_not_called() mocked_build_from_lang.assert_called_once() - @patch("diffengine.text_builder.build_with_lang") + @patch("diffengine.text.build_with_lang") def test_default_content_text(self, mocked_build_from_lang): diff = get_mocked_diff() type(diff.new).title = "Test" From 6e5b580850fb4c3feff67bd6fce51dd6e4f2bbcd Mon Sep 17 00:00:00 2001 From: Nahue Date: Tue, 2 Jun 2020 22:23:35 -0300 Subject: [PATCH 09/13] Readme for this fork --- README.md | 347 +++--------------------------------------------------- 1 file changed, 15 insertions(+), 332 deletions(-) diff --git a/README.md b/README.md index 31a3484..baa6efb 100644 --- a/README.md +++ b/README.md @@ -1,338 +1,21 @@ -
- -
- +# Diffengine for Heroku -diffengine is a utility for watching RSS feeds to see when story content -changes. When new content is found a snapshot is saved at the Internet Archive, -and a diff is generated for sending to social media. The hope is that it can -help draw attention to the way news is being shaped on the web. It also creates -a database of changes over time that can be useful for research purposes. +This is a fork of [DocNow/diffengine](https://github.com/DocNow/diffengine) focused on making the engine work on Heroku. -diffengine draws heavily on the inspiration of [NYTDiff] and [NewsDiffs] which -*almost* did what we wanted. [NYTdiff] is able to create presentable diff images -and tweet them, but was designed to work specifically with the NYTimes API. -NewsDiffs provides a comprehensive framework for watching changes on multiple -sites (Washington Post, New York Times, CNN, BBC, etc) but you need to be a -programmer to add a [parser -module](https://github.com/ecprice/newsdiffs/tree/master/parsers) for a website -that you want to monitor. It is also a full-on website which involves some -commitment to install and run. +## Contributions made -With the help of [feedparser], diffengine takes a different approach by working -with any site that publishes an RSS feed of changes. This covers many news -organizations, but also personal blogs and organizational websites that put out -regular updates. And with the [readability] module, diffengine is able to -automatically extract the primary content of pages, without requiring special -parsing to remove boilerplate material. And like NYTDiff, instead of creating -another website for people to watch, diffengine pushes updates out to social -media where people are already, while also building a local database of diffs -that can be used for research purposes. 
+- Environment vars in `config.yaml` file [#67](https://github.com/DocNow/diffengine/pull/67) +- Black formatter integration [#68](https://github.com/DocNow/diffengine/pull/68) +- CLI command for generating tokens for extra twitter accounts [#69](https://github.com/DocNow/diffengine/pull/69) +- Configurable webdriver (gecko by default but now chrome can be chosen) [#70](https://github.com/DocNow/diffengine/pull/70) +- Twitter integration [#71](https://github.com/DocNow/diffengine/pull/71) +- Configurable loggers (file and console) [#72](https://github.com/DocNow/diffengine/pull/72) +- Configurable tweet text [#74](https://github.com/DocNow/diffengine/pull/74) +- Configurable database [#83](https://github.com/DocNow/diffengine/pull/83) +- Encoding autodetection [#88](https://github.com/DocNow/diffengine/pull/88) -## Install -1. install [GeckoDriver] -1. install [Python 3] -1. `pip3 install diffengine` +## Working example -## Run - -In order to run diffengine you need to pick a directory location where you can -store the diffengine configuration, database and diffs. For example I have a -directory in my home directory, but you can use whatever location you want, you -just need to be able to write to it. - -The first time you run diffengine it will prompt you to enter an RSS or Atom -feed URL to monitor and will authenticate with Twitter. - -```console -% diffengine /home/ed/.diffengine - -What RSS/Atom feed would you like to monitor? https://inkdroid.org/feed.xml - -Would you like to set up tweeting edits? [Y/n] Y - -Go to https://apps.twitter.com and create an application. - -What is the consumer key? - -What is the consumer secret? - -Log in to https://twitter.com as the user you want to tweet as and hit enter. - -Visit https://api.twitter.com/oauth/authorize?oauth_token=NRW9BQAAAAAAyqBnAAXXYYlCL8g - -What is your PIN: 8675309 - -Saved your configuration in /home/ed/.diffengine/config.yaml - -Fetching initial set of entries. - -Done! 
-``` - -After that you just need to put diffengine in your crontab to have it run -regularly, or you can run it manually at your own intervals if you want. Here's -my crontab to run every 30 minutes to look for new content. - - 0,30 * * * * /usr/local/bin/diffengine /home/ed/.diffengine - -You can examine your config file at any time and add/remove feeds as needed. It -is the `config.yaml` file that is stored relative to the storage directory you -chose, so in my case `/home/ed/.diffengine/config.yaml`. - -Logs can be found in `diffengine.log` in the storage directory, for example -`/home/ed/.diffengine/diffengine.log`. - -## Examples - -Checkout [Ryan Baumann's "diffengine" Twitter list] for a list of known -diffengine Twitter accounts that are out there. - -## Config options - -### Database engine - -By default the database is configured for Sqlite and the file `./diffengine.db` through the `db` config prop - -```yaml -db: sqlite:///diffengine.db -``` - -This value responds to the [database URL connection string format](http://docs.peewee-orm.com/en/latest/peewee/playhouse.html#database-url). - -For instance, you can co˚nnect to your postgresql database using something like this. - -```yaml -db: postgresql://postgres:my_password@localhost:5432/my_database -``` - -In case you store your database url connection into an environment var, like in Heroku. You can simply do as follows. - -```yaml -db: "${DATABASE_URL}" -``` - -### Multiple Accounts & Feed Implementation Example - -If you are setting multiple accounts, and multiple feeds if may be helpful to setup a -directory for each account. 
For example: - -- Toronto Sun `/home/nruest/.torontosun` -- Toronto Star `/home/nruest/.torontostar` -- Globe & Mail `/home/nruest/.globemail` -- Canadaland `/home/nruest/.canadaland` -- CBC `/home/nruest/.cbc` - -Then you will configure a cron entry for each account: - -``` -0,15,30,45 * * * * /usr/bin/flock -xn /tmp/globemail.lock -c "/usr/local/bin/diffengine /home/nruest/.globemail" -0,15,30,45 * * * * /usr/bin/flock -xn /tmp/torontosun.lock -c "/usr/local/bin/diffengine /home/nruest/.torontosun" -0,15,30,45 * * * * /usr/bin/flock -xn /tmp/cbc.lock -c "/usr/local/bin/diffengine /home/nruest/.cbc" -0,15,30,45 * * * * /usr/bin/flock -xn /tmp/lapresse.lock -c "/usr/local/bin/diffengine /home/nruest/.lapresse" -0,15,30,45 * * * * /usr/bin/flock -xn /tmp/calgaryherald.lock -c "/usr/local/bin/diffengine /home/nruest/.calgaryherald" -``` - -If there are multiple feeds for an account, you can setup the `config.yml` like so: - -```yaml -- name: The Globe and Mail - Report on Business - twitter: - access_token: ACCESS_TOKEN - access_token_secret: ACCESS_TOKEN_SECRET - url: http://www.theglobeandmail.com/report-on-business/?service=rss -- name: The Globe and Mail - Opinion - twitter: - access_token: ACCESS_TOKEN - access_token_secret: ACCESS_TOKEN_SECRET - url: http://www.theglobeandmail.com/opinion/?service=rss -- name: The Globe and Mail - News - twitter: - access_token: ACCESS_TOKEN - access_token_secret: ACCESS_TOKEN_SECRET - url: http://www.theglobeandmail.com/news/?service=rss -twitter: - consumer_key: CONSUMER_KEY - consumer_secret: CONSUMER_SECRET -``` - -### Tweet content - -By default, the tweeted diff will include the article's title and the archive diff url, [like this](https://twitter.com/mp_diff/status/1255973684994625539). - -You change this by tweeting what's changed: the url, the title and/or the summary. 
For doing so, you need to specify **all** the following `lang` keys: - -```yaml -lang: - change_in: "Change in" - the_url: "the URL" - the_title: "the title" - and: "and" - the_summary: "the summary" -``` - -Only if all the keys are defined, the tweet will include what's changed on its content, followed by the `diff.url`. Some examples: - -- "Change in the title" -- "Change in the summary" -- "Change in the title and the summary" - -And so on with all the possible combinations between url, title and summary - -### Support for environment vars - -The configuration file has support for [environment variables](https://medium.com/chingu/an-introduction-to-environment-variables-and-how-to-use-them-f602f66d15fa). This is useful if you want to keeping your credentials secure when deploying to Heroku, Vercel (former ZEIT Now), AWS, Azure, Google Cloud or any other similar services. The environment variables are defined on the app of the platform you use or directly in a [dotenv file](https://12factor.net/config), which is the usual case when coding locally. - -For instance, say you want to keep your Twitter credentials safe. You'd keep a reference to it in the `config.yaml` this way: - -```yaml -twitter: - consumer_key: "${MY_CONSUMER_KEY_ENV_VAR}" - consumer_secret: "${MY_CONSUMER_SECRET_ENV_VAR}" -``` - -Then you would define your environment variables `MY_CONSUMER_KEY_ENV_VAR` and `MY_CONSUMER_SECRET_ENV_VAR` in your `.env` file: - -```dotenv -MY_CONSUMER_KEY_ENV_VAR="CONSUMER_KEY" -MY_CONSUMER_SECRET_ENV_VAR="CONSUMER_SECRET" -``` - -Done! You can use diffengine as usual and keep your credentials safe. - -### Adding a Twitter account when the configuration file is already created - -You can use the following command for adding Twitter accounts to the config file. - -```shell -$ diffengine --add - -Log in to https://twitter.com as the user you want to tweet as and hit enter. 
-Visit https://api.twitter.com/oauth/authorize?oauth_token=QKGAqgAAAAABDsonAAABcbfQfFw in your browser and hit enter. -What is your PIN: 1234567 - -These are your access token and secret. -DO NOT SHARE THEM WITH ANYONE! - -ACCESS_TOKEN -xxxxxxxxxxx-yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy - -ACCESS_TOKEN_SECRET -zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz -``` - -Then you would use the `ACCESS_TOKEN` and the `ACCESS_TOKEN_SECRET` inside the config like this - -```yaml -feeds: -- name: My new feed - url: http://www.mynewfeed.com/feed/ - twitter: - access_token: "${ACCESS_TOKEN}" - access_token_secret: "${ACCESS_TOKEN_SECRET}" -``` - -### Avaiable webdriver engines - -Diffengine has support for `geckodriver` and `chromedriver`. - -You can configure this in the `config.yaml`. The keys are the following ones. -```yaml -webdriver: - engine: - executable_path: - binary_location: -``` - -#### Configuring geckodriver - -The `geckodriver` is properly defined by default. In case you need to configure it, then: - -```yaml -webdriver: - engine: "geckodriver" - executable_path: null (this config has no use with geckodriver) - binary_location: null (the same as above with this one) -``` - -#### Configuring chromedriver - -If you want to use `chromedriver` locally, then you should leave the config this way: - -```yaml -webdriver: - engine: "chromedriver" - executable_path: null ("chromedriver" by default) - binary_location: null ("" by default) -``` - -##### Using chromedriver in Heroku - -If you use Heroku, then you have to add the [Heroku chromedriver buildpack](https://github.com/heroku/heroku-buildpack-chromedriver). -And then use the environment vars provided automatically by it. - -```yaml -webdriver: - engine: "chromedriver" - executable_path: "${CHROMEDRIVER_PATH}" - binary_location: "${GOOGLE_CHROME_BIN}" -``` - -### Configuring the loggers - -By default, the script will log everyhintg to `./diffengine.log`. 
-Anyway, you can disable the file logger and/or enable the console logger as well. -You can modify the log filename, too. - -If no present, the default values will be the following ones. -```yaml -log: diffengine.log -logger: - file: true - console : false -``` - -Logging to the console could be useful to see what's happening if the app lives in services like Heroku. - -## Develop - -[![Build Status](https://travis-ci.org/DocNow/diffengine.svg)](http://travis-ci.org/DocNow/diffengine) - -Here's how to get started hacking on diffengine with [pipenv]: - -```console -% git clone https://github.com/docnow/diffengine -% cd diffengine -% pipenv install -% pytest -============================= test session starts ============================== -platform linux -- Python 3.5.2, pytest-3.0.5, py-1.4.32, pluggy-0.4.0 -rootdir: /home/ed/Projects/diffengine, inifile: -collected 5 items - -test_diffengine.py ..... - -=========================== 5 passed in 8.09 seconds =========================== -``` - -Last, you need to install the pre-commit hooks to be run before any commit - -``` -pre-commit install -``` - -This way, [Black](https://black.readthedocs.io/en/stable/) formatter will be executed every time. - -We recommend you to [to configure it in your own IDE here.](https://black.readthedocs.io/en/stable/editor_integration.html) - - -[nyt_diff]: https://twitter.com/nyt_diff -[NYTDiff]: https://github.com/j-e-d/NYTdiff -[NewsDiffs]: http://newsdiffs.org/ -[feedparser]: https://pythonhosted.org/feedparser/ -[readability]: https://github.com/buriy/python-readability -[GeckoDriver]: https://github.com/mozilla/geckodriver -[Python 3]: https://python.org -[create an issue]: https://github.com/DocNow/diffengine/issues -[pipenv]: https://pipenv.readthedocs.io/en/latest/ -[Ryan Baumann's "diffengine" Twitter list]: https://twitter.com/ryanfb/lists/diffengine +## Working example + +All of these integrations are currently working under this fork and most of them are already integrated in the upstream branch. 
+A working implementation of this particular branch can be seen at the [diffbots project.](https://github.com/nahuelhds/diffbots) From 5e6919d0b17d8e50e5c12f3f1d9fab667fedbbce Mon Sep 17 00:00:00 2001 From: Nahue Date: Tue, 2 Jun 2020 23:56:19 -0300 Subject: [PATCH 10/13] Ability to skip version if contains some text. Useful for "subscribe now" kind of pages --- diffengine/__init__.py | 16 ++++++++++---- diffengine/text.py | 8 +++++++ test_diffengine.py | 48 ++++++++++++++++++++++++++++++++++++------ 3 files changed, 61 insertions(+), 11 deletions(-) diff --git a/diffengine/__init__.py b/diffengine/__init__.py index c0c155b..e5a5fb0 100755 --- a/diffengine/__init__.py +++ b/diffengine/__init__.py @@ -37,7 +37,7 @@ from diffengine.exceptions.webdriver import UnknownWebdriverError from diffengine.exceptions.twitter import ConfigNotFoundError, TwitterError -from diffengine.text import to_utf8 +from diffengine.text import to_utf8, matches from diffengine.twitter import TwitterHandler from diffengine.exceptions.sendgrid import ( ConfigNotFoundError as SGConfigNotFoundError, @@ -142,7 +142,7 @@ def stale(self): logging.debug("%s not stale (r=%f)", self.url, r) return False - def get_latest(self): + def get_latest(self, skip_pattern=None): """ get_latest is the heart of the application. 
It will get the current version on the web, extract its summary with readability and compare @@ -176,6 +176,13 @@ def get_latest(self): summary = bleach.clean(summary, tags=["p"], strip=True) summary = _normal(summary) + # if the title or the summay contains the skipping pattern, + # then return none as I don't want to report this change + if skip_pattern and ( + matches(skip_pattern, title) or matches(skip_pattern, summary) + ): + return None + # in case there was a redirect, and remove utm style marketing canonical_url = _remove_utm(resp.url) @@ -604,14 +611,15 @@ def main(): browser.quit() -def process_entry(entry, feed_config, twitter=None, sendgrid=None, lang={}): +def process_entry(entry, feed_config={}, twitter=None, sendgrid=None, lang={}): result = {"skipped": 0, "checked": 0, "new": 0} if not entry.stale: result["skipped"] = 1 else: result["checked"] = 1 try: - version = entry.get_latest() + skip_pattern = feed_config.get("skip_pattern") + version = entry.get_latest(skip_pattern) if version: result["new"] = 1 if version.diff: diff --git a/diffengine/text.py b/diffengine/text.py index f064642..8c23000 100644 --- a/diffengine/text.py +++ b/diffengine/text.py @@ -1,4 +1,6 @@ import logging +import re +import unicodedata def build_text(diff, lang={}): @@ -72,3 +74,9 @@ def to_utf8(text): return text return result + + +def matches(pattern, text): + nfkd_form = unicodedata.normalize("NFKD", text.upper()) + normalized = u"".join([c for c in nfkd_form if not unicodedata.combining(c)]) + return re.search(pattern, normalized, re.I | re.M) is not None diff --git a/test_diffengine.py b/test_diffengine.py index e01b1a5..6eba9f6 100644 --- a/test_diffengine.py +++ b/test_diffengine.py @@ -33,7 +33,7 @@ AlreadyEmailedError as SGAlreadyEmailedError, ArchiveUrlNotFoundError as SGArchiveNotFoundError, ) -from diffengine.text import build_text +from diffengine.text import build_text, to_utf8, matches from diffengine.exceptions.twitter import ( ConfigNotFoundError, 
TokenNotFoundError, @@ -95,7 +95,7 @@ def test_diff(): assert os.path.isfile(diff.thumbnail_path) # check that the url for the internet archive diff is working - assert re.match("^https://web.archive.org/web/diff/\d+/\d+/https.+$", diff.url) + assert re.match("^https://web.archive.org/web/diff/\\d+/\\d+/https.+$", diff.url) def test_html_diff(): @@ -231,7 +231,7 @@ def test_stale_is_skipped(self): type(entry).stale = PropertyMock(return_value=False) # Test - result = process_entry(entry, None, None) + result = process_entry(entry, {}, None) # Assert assert result["skipped"] == 1 @@ -243,7 +243,7 @@ def test_raise_if_entry_retrieve_fails(self): entry.get_latest = MagicMock(side_effect=Exception("TEST")) # Test - result = process_entry(entry, None, None) + result = process_entry(entry, {}, None) # Assert entry.get_latest.assert_called_once() @@ -260,7 +260,7 @@ def test_get_none_if_no_new_version(self): entry.get_latest = MagicMock(return_value=None) # Test - result = process_entry(entry, None, twitter) + result = process_entry(entry, {}, twitter) # Assert entry.get_latest.assert_called_once() @@ -281,7 +281,7 @@ def test_do_not_tweet_if_entry_has_no_diff(self): entry.get_latest = MagicMock(return_value=version) # Test - result = process_entry(entry, None, twitter) + result = process_entry(entry, {}, twitter) # Assert entry.get_latest.assert_called_once() @@ -302,7 +302,7 @@ def test_do_not_tweet_if_feed_has_no_token(self): entry.get_latest = MagicMock(return_value=version) # Test - result = process_entry(entry, None, twitter) + result = process_entry(entry, {}, twitter) # Assert entry.get_latest.assert_called_once() @@ -773,3 +773,37 @@ def test_latin1_to_utf8(self): text_utf8 = "Me preocupa más la parte futbolística" result = to_utf8(text_latin) self.assertEquals(result, text_utf8) + + +class MatchesTest(TestCase): + skip_pattern = "subscribe.*\\d{2} articles" + + def test_matches_does_not_match(self): + result = matches( + self.skip_pattern, "Hey! 
You need to subscribe to 1 article to continue" + ) + self.assertFalse(result) + + def test_matches_does_match(self): + result = matches( + self.skip_pattern, "Hey! You need to subscribe to 10 articles to continue" + ) + self.assertTrue(result) + + def test_matches_with_multiline(self): + result = matches( + self.skip_pattern, "Hey!\nYou need to subscribe to 10 articles\nto continue" + ) + self.assertTrue(result) + + def test_matches_is_case_insensitive(self): + result = matches( + self.skip_pattern, "Hey!\nYou need to SubsCribe to 10 ARTicles\nto continue" + ) + self.assertTrue(result) + + def test_matches_is_accent_insensitive(self): + result = matches( + self.skip_pattern, "Hey!\nYou need to SubsCribé to 10 ARTiclès\nto continue" + ) + self.assertTrue(result) From 3b364917332ff95bd1de5e9063bf699e9eeec6a6 Mon Sep 17 00:00:00 2001 From: Nahue Date: Wed, 3 Jun 2020 00:00:40 -0300 Subject: [PATCH 11/13] Merge branch 'diffengine/master' into feature/skip_regex # Conflicts: # diffengine/__init__.py # test_diffengine.py --- .travis.yml | 5 +- README.md | 56 ++++-- config-test.yaml | 7 + diffengine/__init__.py | 102 ++++++---- diffengine/exceptions/sendgrid.py | 4 +- diffengine/exceptions/twitter.py | 4 +- diffengine/sendgrid.py | 8 +- diffengine/twitter.py | 15 +- diffengine/utils.py | 11 ++ requirements.txt | 1 + test_diffengine.py | 306 +++++++++++++++++------------- 11 files changed, 316 insertions(+), 203 deletions(-) create mode 100644 config-test.yaml create mode 100644 diffengine/utils.py diff --git a/.travis.yml b/.travis.yml index 30d011b..ef8f266 100644 --- a/.travis.yml +++ b/.travis.yml @@ -11,7 +11,10 @@ script: py.test before_install: - sudo apt-get -y update - sudo apt-get install firefox-geckodriver - - sudo apt-get install --upgrade chromium-chromedriver +before_script: + - wget https://chromedriver.storage.googleapis.com/83.0.4103.39/chromedriver_linux64.zip + - unzip chromedriver_linux64.zip -d /home/travis/virtualenv/python3.7.1/bin/ + - export 
CHROME_BIN=chromium-browser after_failure: cat test/diffengine.log notifications: slack: diff --git a/README.md b/README.md index 9470b9c..31a3484 100644 --- a/README.md +++ b/README.md @@ -88,30 +88,31 @@ Logs can be found in `diffengine.log` in the storage directory, for example Checkout [Ryan Baumann's "diffengine" Twitter list] for a list of known diffengine Twitter accounts that are out there. -## Tweeting text options +## Config options -By default, the tweeted diff will include the article's title and the archive diff url, [like this](https://twitter.com/mp_diff/status/1255973684994625539). +### Database engine -You change this by tweeting what's changed: the url, the title and/or the summary. For doing so, you need to specify **all** the following `lang` keys: +By default the database is configured for Sqlite and the file `./diffengine.db` through the `db` config prop ```yaml -lang: - change_in: "Change in" - the_url: "the URL" - the_title: "the title" - and: "and" - the_summary: "the summary" +db: sqlite:///diffengine.db ``` -Only if all the keys are defined, the tweet will include what's changed on its content, followed by the `diff.url`. Some examples: +This value responds to the [database URL connection string format](http://docs.peewee-orm.com/en/latest/peewee/playhouse.html#database-url). -- "Change in the title" -- "Change in the summary" -- "Change in the title and the summary" +For instance, you can co˚nnect to your postgresql database using something like this. -And so on with all the possible combinations between url, title and summary +```yaml +db: postgresql://postgres:my_password@localhost:5432/my_database +``` + +In case you store your database url connection into an environment var, like in Heroku. You can simply do as follows. 
-## Multiple Accounts & Feed Implementation Example +```yaml +db: "${DATABASE_URL}" +``` + +### Multiple Accounts & Feed Implementation Example If you are setting multiple accounts, and multiple feeds if may be helpful to setup a directory for each account. For example: @@ -155,6 +156,29 @@ twitter: consumer_secret: CONSUMER_SECRET ``` +### Tweet content + +By default, the tweeted diff will include the article's title and the archive diff url, [like this](https://twitter.com/mp_diff/status/1255973684994625539). + +You change this by tweeting what's changed: the url, the title and/or the summary. For doing so, you need to specify **all** the following `lang` keys: + +```yaml +lang: + change_in: "Change in" + the_url: "the URL" + the_title: "the title" + and: "and" + the_summary: "the summary" +``` + +Only if all the keys are defined, the tweet will include what's changed on its content, followed by the `diff.url`. Some examples: + +- "Change in the title" +- "Change in the summary" +- "Change in the title and the summary" + +And so on with all the possible combinations between url, title and summary + ### Support for environment vars The configuration file has support for [environment variables](https://medium.com/chingu/an-introduction-to-environment-variables-and-how-to-use-them-f602f66d15fa). This is useful if you want to keeping your credentials secure when deploying to Heroku, Vercel (former ZEIT Now), AWS, Azure, Google Cloud or any other similar services. The environment variables are defined on the app of the platform you use or directly in a [dotenv file](https://12factor.net/config), which is the usual case when coding locally. @@ -176,7 +200,7 @@ MY_CONSUMER_SECRET_ENV_VAR="CONSUMER_SECRET" Done! You can use diffengine as usual and keep your credentials safe. 
-## Adding a Twitter account when the configuration file is already created +### Adding a Twitter account when the configuration file is already created You can use the following command for adding Twitter accounts to the config file. diff --git a/config-test.yaml b/config-test.yaml new file mode 100644 index 0000000..3fefc3b --- /dev/null +++ b/config-test.yaml @@ -0,0 +1,7 @@ +db: ${TEST_DATABASE_URL} +twitter: + consumer_key: ${TEST_CONSUMER_KEY} + consumer_secret: ${TEST_CONSUMER_SECRET} + token: + access_token: ${TEST_ACCESS_TOKEN} + access_token_secret: ${TEST_ACCESS_TOKEN_SECRET} diff --git a/diffengine/__init__.py b/diffengine/__init__.py index e5a5fb0..3166959 100755 --- a/diffengine/__init__.py +++ b/diffengine/__init__.py @@ -8,7 +8,6 @@ import os import re import sys -import json import time import yaml import bleach @@ -19,46 +18,50 @@ import logging import argparse import requests -import selenium import htmldiff2 import feedparser -import subprocess import readability import unicodedata -from peewee import * -from playhouse.migrate import SqliteMigrator, migrate from datetime import datetime +from diffengine.exceptions.webdriver import UnknownWebdriverError +from diffengine.exceptions.sendgrid import SendgridConfigNotFoundError, SendgridError +from diffengine.exceptions.twitter import TwitterConfigNotFoundError, TwitterError +from diffengine.text import to_utf8 +from diffengine.sendgrid import SendgridHandler +from diffengine.twitter import TwitterHandler +from envyaml import EnvYAML +from peewee import ( + DatabaseProxy, + CharField, + DateTimeField, + OperationalError, + ForeignKeyField, + Model, + SqliteDatabase, + TextField, +) +from playhouse.db_url import connect +from playhouse.migrate import SqliteMigrator, migrate from selenium import webdriver from selenium.webdriver.chrome.options import Options as ChromeOptions from selenium.webdriver.firefox.options import Options as FirefoxOptions from urllib.parse import urlparse, urlunparse, parse_qs, 
urlencode -from envyaml import EnvYAML - -from diffengine.exceptions.webdriver import UnknownWebdriverError -from diffengine.exceptions.twitter import ConfigNotFoundError, TwitterError -from diffengine.text import to_utf8, matches -from diffengine.twitter import TwitterHandler -from diffengine.exceptions.sendgrid import ( - ConfigNotFoundError as SGConfigNotFoundError, - SendgridError, -) -from diffengine.sendgrid import SendgridHandler home = None config = {} -db = SqliteDatabase(None) +database = DatabaseProxy() browser = None class BaseModel(Model): class Meta: - database = db + database = database class Feed(BaseModel): - url = CharField(primary_key=True) - name = CharField() + url = TextField(primary_key=True) + name = TextField() created = DateTimeField(default=datetime.utcnow) @property @@ -102,7 +105,7 @@ def get_latest(self): class Entry(BaseModel): - url = CharField() + url = TextField() created = DateTimeField(default=datetime.utcnow) checked = DateTimeField(default=datetime.utcnow) tweet_status_id_str = CharField(null=False, default="") @@ -154,9 +157,9 @@ def get_latest(self, skip_pattern=None): be returned. """ - # make sure we don't go too fast - # TODO: can we remove this? Why is this here? 
- time.sleep(1) + time_sleep = config.get("time_sleep", 0) + if time_sleep > 0: + time.sleep(time_sleep) # fetch the current readability-ized content for the page logging.info("checking %s", self.url) @@ -238,11 +241,11 @@ class FeedEntry(BaseModel): class EntryVersion(BaseModel): - title = CharField() - url = CharField(index=True) - summary = CharField() + title = TextField() + url = TextField(index=True) + summary = TextField() created = DateTimeField(default=datetime.utcnow) - archive_url = CharField(null=True) + archive_url = TextField(null=True) entry = ForeignKeyField(Entry, backref="versions") tweet_status_id_str = CharField(null=False, default="") @@ -306,6 +309,18 @@ class Diff(BaseModel): emailed = DateTimeField(null=True) blogged = DateTimeField(null=True) + @property + def url_changed(self): + return self.old.url != self.new.url + + @property + def title_changed(self): + return self.old.title != self.new.title + + @property + def summary_changed(self): + return self.old.summary != self.new.summary + @property def html_path(self): # use prime number to spread across directories @@ -492,17 +507,20 @@ def home_path(rel_path): def setup_db(): - global db - db_file = config.get("db", home_path("diffengine.db")) - logging.debug("connecting to db %s", db_file) - db.init(db_file) - db.connect() - db.create_tables([Feed, Entry, FeedEntry, EntryVersion, Diff], safe=True) - try: - migrator = SqliteMigrator(db) - migrate(migrator.add_index("entryversion", ("url",), False)) - except OperationalError as e: - logging.debug(e) + global home, database + database_url = config.get("db", "sqlite:///diffengine.db") + logging.debug("connecting to db %s", database_url) + database_handler = connect(database_url) + database.initialize(database_handler) + database.connect() + database.create_tables([Feed, Entry, FeedEntry, EntryVersion, Diff], safe=True) + + if isinstance(database_handler, SqliteDatabase): + try: + migrator = SqliteMigrator(database_handler) + 
migrate(migrator.add_index("entryversion", ("url",), False)) + except OperationalError as e: + logging.debug(e) def chromedriver_browser(executable_path, binary_location): @@ -539,7 +557,7 @@ def setup_browser(engine="geckodriver", executable_path=None, binary_location="" def init(new_home, prompt=True): - global home, browser + global home, config, browser home = new_home load_config(prompt) try: @@ -572,7 +590,7 @@ def main(): twitter_handler = TwitterHandler( twitter_config["consumer_key"], twitter_config["consumer_secret"] ) - except ConfigNotFoundError as e: + except TwitterConfigNotFoundError as e: twitter_handler = None logging.warning("error when creating Twitter Handler. Reason", str(e)) except KeyError as e: @@ -637,7 +655,7 @@ def process_entry(entry, feed_config={}, twitter=None, sendgrid=None, lang={}): version.diff, feed_config.get("sendgrid", {}) ) - except SGConfigNotFoundError as e: + except SendgridConfigNotFoundError as e: logging.error( "Missing configuration values for publishing entry %s", entry.url, diff --git a/diffengine/exceptions/sendgrid.py b/diffengine/exceptions/sendgrid.py index 1b95567..a7118e3 100644 --- a/diffengine/exceptions/sendgrid.py +++ b/diffengine/exceptions/sendgrid.py @@ -2,7 +2,7 @@ class SendgridError(RuntimeError): pass -class ConfigNotFoundError(SendgridError): +class SendgridConfigNotFoundError(SendgridError): """Exception raised if the Sendgrid instance has not the API key""" def __init__(self): @@ -14,6 +14,6 @@ def __init__(self, diff_id): self.message = "diff %s was already emailed with sendgrid " % diff_id -class ArchiveUrlNotFoundError(SendgridError): +class SendgridArchiveUrlNotFoundError(SendgridError): def __init__(self): self.message = "not publishing without archive urls" diff --git a/diffengine/exceptions/twitter.py b/diffengine/exceptions/twitter.py index 81f71c8..112c006 100644 --- a/diffengine/exceptions/twitter.py +++ b/diffengine/exceptions/twitter.py @@ -2,7 +2,7 @@ class TwitterError(RuntimeError): 
pass -class ConfigNotFoundError(TwitterError): +class TwitterConfigNotFoundError(TwitterError): """Exception raised if the Twitter instance has not the required key and secret""" def __init__(self): @@ -21,7 +21,7 @@ def __init__(self, diff): self.message = "diff %s has already been tweeted" % diff.id -class AchiveUrlNotFoundError(TwitterError): +class TwitterAchiveUrlNotFoundError(TwitterError): def __init__(self, diff): self.message = "not tweeting without archive urls for diff %s" % diff.id diff --git a/diffengine/sendgrid.py b/diffengine/sendgrid.py index c382f68..640a180 100644 --- a/diffengine/sendgrid.py +++ b/diffengine/sendgrid.py @@ -5,8 +5,8 @@ from diffengine.exceptions.sendgrid import ( AlreadyEmailedError, - ConfigNotFoundError, - ArchiveUrlNotFoundError, + SendgridConfigNotFoundError, + SendgridArchiveUrlNotFoundError, ) @@ -43,13 +43,13 @@ def publish_diff(self, diff, feed_config): if diff.emailed: raise AlreadyEmailedError(diff.id) elif not (diff.old.archive_url and diff.new.archive_url): - raise ArchiveUrlNotFoundError() + raise SendgridArchiveUrlNotFoundError() api_token = feed_config.get("api_token", self.api_token) sender = feed_config.get("sender", self.sender) receivers = feed_config.get("receivers", self.receivers) if not all([api_token, sender, receivers]): - raise ConfigNotFoundError + raise SendgridConfigNotFoundError subject = self.build_subject(diff) message = Mail( diff --git a/diffengine/twitter.py b/diffengine/twitter.py index 702a8ee..5447a4f 100644 --- a/diffengine/twitter.py +++ b/diffengine/twitter.py @@ -6,9 +6,9 @@ from diffengine.text import build_text from diffengine.exceptions.twitter import ( AlreadyTweetedError, - ConfigNotFoundError, + TwitterConfigNotFoundError, TokenNotFoundError, - AchiveUrlNotFoundError, + TwitterAchiveUrlNotFoundError, UpdateStatusError, ) @@ -19,7 +19,7 @@ class TwitterHandler: def __init__(self, consumer_key, consumer_secret): if not consumer_key or not consumer_secret: - raise 
ConfigNotFoundError() + raise TwitterConfigNotFoundError() self.consumer_key = consumer_key self.consumer_secret = consumer_secret @@ -59,14 +59,14 @@ def tweet_diff(self, diff, token=None, lang={}): elif diff.tweeted: raise AlreadyTweetedError(diff) elif not (diff.old.archive_url and diff.new.archive_url): - raise AchiveUrlNotFoundError(diff) + raise TwitterAchiveUrlNotFoundError(diff) twitter = self.api(token) text = build_text(diff, lang) # Check if the thread exists thread_status_id_str = None - if diff.old.entry.tweet_status_id_str is None: + if diff.old.entry.tweet_status_id_str == "": try: thread_status_id_str = self.create_thread( diff.old.entry, diff.old, token @@ -98,3 +98,8 @@ def tweet_diff(self, diff, token=None, lang={}): diff.save() except Exception as e: logging.error("unable to tweet: %s", e) + + def delete_diff(self, diff, token=None): + twitter = self.api(token) + twitter.destroy_status(diff.old.tweet_status_id_str) + twitter.destroy_status(diff.new.tweet_status_id_str) diff --git a/diffengine/utils.py b/diffengine/utils.py new file mode 100644 index 0000000..59372d3 --- /dev/null +++ b/diffengine/utils.py @@ -0,0 +1,11 @@ +import os +import yaml + + +def generate_config(home, content): + config_file = os.path.join(home, "config.yaml") + + if not os.path.isdir(home): + os.makedirs(home) + + yaml.dump(content, open(config_file, "w"), default_flow_style=False) diff --git a/requirements.txt b/requirements.txt index 7d1cd07..9cced30 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,3 +14,4 @@ readability-lxml envyaml>=0.1912 pre-commit==2.3.0 sendgrid +psycopg2-binary==2.8.5 diff --git a/test_diffengine.py b/test_diffengine.py index 6eba9f6..4cd42ed 100644 --- a/test_diffengine.py +++ b/test_diffengine.py @@ -1,14 +1,14 @@ import logging import os import re - import yaml -from selenium import webdriver +from envyaml import EnvYAML import setup import pytest import shutil +from selenium import webdriver from unittest import TestCase from 
unittest.mock import MagicMock, patch from unittest.mock import PropertyMock @@ -27,147 +27,178 @@ UA, TwitterHandler, SendgridHandler, + _fingerprint, ) +from diffengine.text import build_text, to_utf8, matches +from diffengine.utils import generate_config from diffengine.exceptions.sendgrid import ( - ConfigNotFoundError as SGConfigNotFoundError, - AlreadyEmailedError as SGAlreadyEmailedError, - ArchiveUrlNotFoundError as SGArchiveNotFoundError, + SendgridConfigNotFoundError, + AlreadyEmailedError, + SendgridArchiveUrlNotFoundError, ) -from diffengine.text import build_text, to_utf8, matches from diffengine.exceptions.twitter import ( - ConfigNotFoundError, + TwitterConfigNotFoundError, TokenNotFoundError, AlreadyTweetedError, - AchiveUrlNotFoundError, + TwitterAchiveUrlNotFoundError, UpdateStatusError, ) -if os.path.isdir("test"): - shutil.rmtree("test") - -# set things up but disable prompting for initial feed -init("test", prompt=False) +test_home = "test" +test_env_file = ".env" +test_config = EnvYAML( + "config-test.yaml", + env_file=test_env_file if os.path.isfile(test_env_file) else None, +) -# the sequence of these tests is significant +if os.path.isdir(test_home): + shutil.rmtree(test_home) def test_version(): assert setup.version in UA -def test_feed(): - f = Feed.create(name="Test", url="https://inkdroid.org/feed.xml") - f.get_latest() - assert f.created - assert len(f.entries) == 10 - - -def test_entry(): - f = Feed.get(Feed.url == "https://inkdroid.org/feed.xml") - e = f.entries[0] - v = e.get_latest() - assert type(v) == EntryVersion - assert len(e.versions) == 1 - - -def test_diff(): - f = Feed.get(Feed.url == "https://inkdroid.org/feed.xml") - e = f.entries[0] - v1 = e.versions[0] - - # remove some characters from the version - v1.summary = v1.summary[0:-20] - v1.save() - - v2 = e.get_latest() - assert type(v2) == EntryVersion - assert v2.diff - assert v2.archive_url is not None - assert ( - re.match("^https://web.archive.org/web/[0-9]+/.+$", 
v2.archive_url) is not None - ) - - diff = v2.diff - assert diff.old == v1 - assert diff.new == v2 - assert os.path.isfile(diff.html_path) - assert os.path.isfile(diff.screenshot_path) - assert os.path.isfile(diff.thumbnail_path) - - # check that the url for the internet archive diff is working - assert re.match("^https://web.archive.org/web/diff/\\d+/\\d+/https.+$", diff.url) - - -def test_html_diff(): - f = Feed.get(Feed.url == "https://inkdroid.org/feed.xml") - e = f.entries[0] - - # add a change to the summary that htmldiff ignores - v1 = e.versions[-1] - parts = v1.summary.split() - parts.insert(2, "
\n") - v1.summary = " ".join(parts) - v1.save() - - v2 = e.get_latest() - assert v2 is None - - -def test_many_to_many(): - - # these two feeds share this entry, we want diffengine to support - # multiple feeds for the same content, which is fairly common at - # large media organizations with multiple topical feeds - url = "https://www.washingtonpost.com/classic-apps/how-a-week-of-tweets-by-trump-stoked-anxiety-moved-markets-and-altered-plans/2017/01/07/38be8e64-d436-11e6-9cb0-54ab630851e8_story.html" - - f1 = Feed.create( - name="feed1", - url="https://raw.githubusercontent.com/DocNow/diffengine/master/test-data/feed1.xml", - ) - f1.get_latest() +def test_fingerprint(): + assert _fingerprint("foo bar") == "foobar" + assert _fingerprint("foo bar\nbaz") == "foobarbaz" + assert _fingerprint("foo
bar") == "foobar" + assert _fingerprint("foo'bar") == "foobar" + assert _fingerprint("foo’bar") == "foobar" - f2 = Feed.create( - name="feed2", - url="https://raw.githubusercontent.com/DocNow/diffengine/master/test-data/feed2.xml", - ) - f2.get_latest() - assert f1.entries.where(Entry.url == url).count() == 1 - assert f2.entries.where(Entry.url == url).count() == 1 +class FeedTest(TestCase): + feed = None + entry = None + version = None - e = Entry.get(Entry.url == url) - assert FeedEntry.select().where(FeedEntry.entry == e).count() == 2 + def setUp(self) -> None: + generate_config(test_home, {"db": test_config.get("db", "sqlite:///:memory:")}) + # set things up but disable prompting for initial feed + init(test_home, prompt=False) + self.feed = Feed.create(name="Test", url="https://inkdroid.org/feed.xml") + self.feed.get_latest() + self.entry = self.feed.entries[0] + self.version = self.entry.get_latest() + + def test_feed(self): + assert self.feed.created + assert len(self.feed.entries) == 10 + + def test_entry(self): + assert type(self.version) == EntryVersion + assert len(self.entry.versions) == 1 + + def test_diff(self): + e = self.entry + v1 = e.versions[0] + + # remove some characters from the version + v1.summary = v1.summary[0:-20] + v1.save() + + v2 = e.get_latest() + assert type(v2) == EntryVersion + assert v2.diff + assert v2.archive_url is not None + assert ( + re.match("^https://web.archive.org/web/[0-9]+/.+$", v2.archive_url) + is not None + ) + diff = v2.diff + assert diff.old == v1 + assert diff.new == v2 + assert os.path.isfile(diff.html_path) + assert os.path.isfile(diff.screenshot_path) + assert os.path.isfile(diff.thumbnail_path) -def test_bad_feed_url(): - # bad feed url shouldn't cause a fatal exception - f = Feed.create(name="feed1", url="http://example.org/feedfeed.xml") - f.get_latest() - assert True + # check that the url for the internet archive diff is working + assert re.match( + "^https://web.archive.org/web/diff/\\d+/\\d+/https.+$", 
diff.url + ) + def test_html_diff(self): + e = self.entry -def test_whitespace(): - f = Feed.get(url="https://inkdroid.org/feed.xml") - e = f.entries[0] - v1 = e.versions[-1] + # add a change to the summary that htmldiff ignores + v1 = e.versions[-1] + parts = v1.summary.split() + parts.insert(2, "
\n") + v1.summary = " ".join(parts) + v1.save() - # add some whitespace - v1.summary = v1.summary + "\n\n " - v1.save() + v2 = e.get_latest() + assert v2 is None - # whitespace should not count when diffing - v2 = e.get_latest() - assert v2 == None + def test_many_to_many(self): + # these two feeds share this entry, we want diffengine to support + # multiple feeds for the same content, which is fairly common at + # large media organizations with multiple topical feeds + url = "https://www.washingtonpost.com/classic-apps/how-a-week-of-tweets-by-trump-stoked-anxiety-moved-markets-and-altered-plans/2017/01/07/38be8e64-d436-11e6-9cb0-54ab630851e8_story.html" -def test_fingerprint(): - from diffengine import _fingerprint + f1 = Feed.create( + name="feed1", + url="https://raw.githubusercontent.com/DocNow/diffengine/master/test-data/feed1.xml", + ) + f1.get_latest() - assert _fingerprint("foo bar") == "foobar" - assert _fingerprint("foo bar\nbaz") == "foobarbaz" - assert _fingerprint("foo
bar") == "foobar" - assert _fingerprint("foo'bar") == "foobar" - assert _fingerprint("foo’bar") == "foobar" + f2 = Feed.create( + name="feed2", + url="https://raw.githubusercontent.com/DocNow/diffengine/master/test-data/feed2.xml", + ) + f2.get_latest() + + assert f1.entries.where(Entry.url == url).count() == 1 + assert f2.entries.where(Entry.url == url).count() == 1 + + e = Entry.get(Entry.url == url) + assert FeedEntry.select().where(FeedEntry.entry == e).count() == 2 + + def test_bad_feed_url(self): + # bad feed url shouldn't cause a fatal exception + f = Feed.create(name="feed1", url="http://example.org/feedfeed.xml") + f.get_latest() + assert True + + def test_whitespace(self): + e = self.feed.entries[0] + v1 = e.versions[-1] + + # add some whitespace + v1.summary = v1.summary + "\n\n " + v1.save() + + # whitespace should not count when diffing + v2 = e.get_latest() + assert v2 == None + + # This one is only for tweeting purposes only + # If no .env var is set, this one will success anyway :) + def test_tweet_diff(self): + e = self.entry + v1 = e.versions[0] + + # remove some characters from the version + v1.summary = v1.summary[0:-20] + v1.save() + + v2 = e.get_latest() + + # run this alone for checking correct tweeting behavior + if v2 is not None: + diff = v2.diff + try: + token = test_config.get("twitter.token") + twitter_handler = TwitterHandler( + test_config.get("twitter.consumer_key"), + test_config.get("twitter.consumer_secret"), + ) + twitter_handler.tweet_diff(diff, token) + twitter_handler.delete_diff(diff, token) + except Exception: + logging.debug("no tweet configured for test. Doing nothing") class EnvVarsTest(TestCase): @@ -186,8 +217,7 @@ def test_config_file_integration(self): test_config = { "example": {"private_value": private_yaml_key, "public_value": public_value} } - config_file = home_path("config.yaml") - yaml.dump(test_config, open(config_file, "w"), default_flow_style=False) + generate_config(test_home, test_config) # test! 
new_config = load_config() @@ -367,13 +397,17 @@ def tearDown(self) -> None: logging.disable(logging.NOTSET) def test_raises_if_no_config_set(self): - self.assertRaises(ConfigNotFoundError, TwitterHandler, None, None) - self.assertRaises(ConfigNotFoundError, TwitterHandler, "myConsumerKey", None) - self.assertRaises(ConfigNotFoundError, TwitterHandler, None, "myConsumerSecret") + self.assertRaises(TwitterConfigNotFoundError, TwitterHandler, None, None) + self.assertRaises( + TwitterConfigNotFoundError, TwitterHandler, "myConsumerKey", None + ) + self.assertRaises( + TwitterConfigNotFoundError, TwitterHandler, None, "myConsumerSecret" + ) try: TwitterHandler("myConsumerKey", "myConsumerSecret") - except ConfigNotFoundError: + except TwitterConfigNotFoundError: self.fail("Twitter.__init__ raised ConfigNotFoundError unexpectedly!") def test_raises_if_no_token_provided(self): @@ -401,15 +435,19 @@ def test_raises_if_not_all_archive_urls_are_present(self): } twitter = TwitterHandler("myConsumerKey", "myConsumerSecret") - self.assertRaises(AchiveUrlNotFoundError, twitter.tweet_diff, diff, token) + self.assertRaises( + TwitterAchiveUrlNotFoundError, twitter.tweet_diff, diff, token + ) type(diff.old).archive_url = PropertyMock(return_value="http://test.url/old") - self.assertRaises(AchiveUrlNotFoundError, twitter.tweet_diff, diff, token) + self.assertRaises( + TwitterAchiveUrlNotFoundError, twitter.tweet_diff, diff, token + ) type(diff.new).archive_url = PropertyMock(return_value="http://test.url/new") try: twitter.tweet_diff(diff, token) - except AchiveUrlNotFoundError: + except TwitterAchiveUrlNotFoundError: self.fail("twitter.tweet_diff raised AchiveUrlNotFoundError unexpectedly!") class MockedStatus(MagicMock): @@ -423,7 +461,7 @@ def test_create_thread_if_old_entry_has_no_related_tweet( ): entry = MagicMock() - type(entry).tweet_status_id_str = PropertyMock(return_value=None) + type(entry).tweet_status_id_str = PropertyMock(return_value="") diff = get_mocked_diff() 
type(diff.old).entry = entry @@ -560,10 +598,10 @@ def test_raises_if_no_config_set(self): type(diff).emailed = PropertyMock(return_value=False) sendgrid = SendgridHandler({}) - self.assertRaises(SGConfigNotFoundError, sendgrid.publish_diff, diff, {}) + self.assertRaises(SendgridConfigNotFoundError, sendgrid.publish_diff, diff, {}) try: sendgrid.publish_diff(diff, self.config["sendgrid"]) - except SGConfigNotFoundError: + except SendgridConfigNotFoundError: self.fail("sendgrid.publish_diff raised ConfigNotFoundError unexpectedly!") def test_raises_if_already_emailed(self): @@ -572,7 +610,7 @@ def test_raises_if_already_emailed(self): sendgrid = SendgridHandler(self.config["sendgrid"]) self.assertRaises( - SGAlreadyEmailedError, sendgrid.publish_diff, diff, self.config["sendgrid"] + AlreadyEmailedError, sendgrid.publish_diff, diff, self.config["sendgrid"] ) def test_raises_if_not_all_archive_urls_are_present(self): @@ -580,18 +618,24 @@ def test_raises_if_not_all_archive_urls_are_present(self): sendgrid = SendgridHandler(self.config["sendgrid"]) self.assertRaises( - SGArchiveNotFoundError, sendgrid.publish_diff, diff, self.config["sendgrid"] + SendgridArchiveUrlNotFoundError, + sendgrid.publish_diff, + diff, + self.config["sendgrid"], ) type(diff.old).archive_url = PropertyMock(return_value="http://test.url/old") self.assertRaises( - SGArchiveNotFoundError, sendgrid.publish_diff, diff, self.config["sendgrid"] + SendgridArchiveUrlNotFoundError, + sendgrid.publish_diff, + diff, + self.config["sendgrid"], ) type(diff.new).archive_url = PropertyMock(return_value="http://test.url/new") try: sendgrid.publish_diff(diff, self.config["sendgrid"]) - except SGArchiveNotFoundError: + except SendgridArchiveUrlNotFoundError: self.fail( "sendgrid.publish_diff raised AchiveUrlNotFoundError unexpectedly!" 
) From 07d8edaff562e580d0318e47ab2dc72860f14295 Mon Sep 17 00:00:00 2001 From: Nahue Date: Wed, 3 Jun 2020 00:08:47 -0300 Subject: [PATCH 12/13] Readme --- README.md | 22 +++++++++++++++++++++- diffengine/__init__.py | 2 +- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 31a3484..db01f84 100644 --- a/README.md +++ b/README.md @@ -156,6 +156,29 @@ twitter: consumer_secret: CONSUMER_SECRET ``` +### Skip entry + +You can also skip an entry if its content matches a regular expression pattern. This is useful for avoiding the "subscribe now" pages. +This is configured per feed like so: + +```yaml +- name: The Globe and Mail - Report on Business + skip_pattern: "you have access to only \\d+ articles" + twitter: + access_token: ACCESS_TOKEN + access_token_secret: ACCESS_TOKEN_SECRET + url: http://www.theglobeandmail.com/report-on-business/?service=rss +``` + +In this example, a page that contains the text "you have access to only 10 articles" will be skipped. The same applies to any number of articles, as the pattern is a regular expression. +The `skip_pattern` performs a `re.search` operation and uses the flags for `case insensitive` and `multiline`. + +Look for the docs for [more information about Regular Expressions and the search operation.](https://docs.python.org/3/library/re.html#search-vs-match) + + ### Tweet content -By default, the tweeted diff will include the article's title and the archive diff url, [like this](https://twitter.com/mp_diff/status/1255973684994625539). +By default, the tweeted diff will include the article's title and the archive diff url, [like this.](https://twitter.com/ld_diff/status/1267989297048817672) You change this by tweeting what's changed: the url, the title and/or the summary.
For doing so, you need to specify **all** the following `lang` keys: diff --git a/diffengine/__init__.py b/diffengine/__init__.py index 3166959..e958550 100755 --- a/diffengine/__init__.py +++ b/diffengine/__init__.py @@ -27,7 +27,7 @@ from diffengine.exceptions.webdriver import UnknownWebdriverError from diffengine.exceptions.sendgrid import SendgridConfigNotFoundError, SendgridError from diffengine.exceptions.twitter import TwitterConfigNotFoundError, TwitterError -from diffengine.text import to_utf8 +from diffengine.text import to_utf8, matches from diffengine.sendgrid import SendgridHandler from diffengine.twitter import TwitterHandler from envyaml import EnvYAML From ebf3c9cee9f055478478dffe23885c8ec8657d7e Mon Sep 17 00:00:00 2001 From: Nahue Date: Wed, 3 Jun 2020 00:12:28 -0300 Subject: [PATCH 13/13] Log the skipping action --- diffengine/__init__.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/diffengine/__init__.py b/diffengine/__init__.py index e958550..4f34178 100755 --- a/diffengine/__init__.py +++ b/diffengine/__init__.py @@ -184,6 +184,9 @@ def get_latest(self, skip_pattern=None): if skip_pattern and ( matches(skip_pattern, title) or matches(skip_pattern, summary) ): + logging.info( + "Skipped page. It matches the skip_pattern prop defined for this feed." + ) return None # in case there was a redirect, and remove utm style marketing