From 40e95b06dfcdd062282e470fe01ef780c6fd9e7d Mon Sep 17 00:00:00 2001 From: Nahue Date: Mon, 25 May 2020 09:03:42 -0300 Subject: [PATCH 01/17] Database is now connected through proxy and inferred by its url defined at config `db` --- diffengine/__init__.py | 67 +++++++++++++++++++++++------------------- diffengine/sendgrid.py | 8 ++--- diffengine/twitter.py | 8 ++--- diffengine/utils.py | 11 +++++++ exceptions/sendgrid.py | 4 +-- exceptions/twitter.py | 4 +-- test_diffengine.py | 61 +++++++++++++++++++++++--------------- 7 files changed, 97 insertions(+), 66 deletions(-) create mode 100644 diffengine/utils.py diff --git a/diffengine/__init__.py b/diffengine/__init__.py index 8936e65..52652e4 100755 --- a/diffengine/__init__.py +++ b/diffengine/__init__.py @@ -8,7 +8,6 @@ import os import re import sys -import json import time import yaml import bleach @@ -19,40 +18,45 @@ import logging import argparse import requests -import selenium import htmldiff2 import feedparser -import subprocess import readability import unicodedata -from peewee import * -from playhouse.migrate import SqliteMigrator, migrate +from diffengine.sendgrid import SendgridHandler +from diffengine.twitter import TwitterHandler + +from exceptions.webdriver import UnknownWebdriverError +from exceptions.sendgrid import SendgridConfigNotFoundError, SendgridError +from exceptions.twitter import TwitterConfigNotFoundError, TwitterError + from datetime import datetime +from peewee import ( + DatabaseProxy, + CharField, + DateTimeField, + OperationalError, + ForeignKeyField, + Model, + SqliteDatabase, +) +from playhouse.db_url import connect +from playhouse.migrate import SqliteMigrator, migrate from selenium import webdriver from selenium.webdriver.chrome.options import Options as ChromeOptions from selenium.webdriver.firefox.options import Options as FirefoxOptions from urllib.parse import urlparse, urlunparse, parse_qs, urlencode from envyaml import EnvYAML -from exceptions.webdriver import 
UnknownWebdriverError -from exceptions.twitter import ConfigNotFoundError, TwitterError -from diffengine.twitter import TwitterHandler -from exceptions.sendgrid import ( - ConfigNotFoundError as SGConfigNotFoundError, - SendgridError, -) -from diffengine.sendgrid import SendgridHandler - home = None config = {} -db = SqliteDatabase(None) +database = DatabaseProxy() browser = None class BaseModel(Model): class Meta: - database = db + database = database class Feed(BaseModel): @@ -484,17 +488,20 @@ def home_path(rel_path): def setup_db(): - global db - db_file = config.get("db", home_path("diffengine.db")) - logging.debug("connecting to db %s", db_file) - db.init(db_file) - db.connect() - db.create_tables([Feed, Entry, FeedEntry, EntryVersion, Diff], safe=True) - try: - migrator = SqliteMigrator(db) - migrate(migrator.add_index("entryversion", ("url",), False)) - except OperationalError as e: - logging.debug(e) + global home, database + database_url = config.get("db", "sqlite:///diffengine.db") + logging.debug("connecting to db %s", database_url) + database_handler = connect(database_url) + database.initialize(database_handler) + database.connect() + database.create_tables([Feed, Entry, FeedEntry, EntryVersion, Diff], safe=True) + + if isinstance(database_handler, SqliteDatabase): + try: + migrator = SqliteMigrator(database) + migrate(migrator.add_index("entryversion", ("url",), False)) + except OperationalError as e: + logging.debug(e) def chromedriver_browser(executable_path, binary_location): @@ -531,7 +538,7 @@ def setup_browser(engine="geckodriver", executable_path=None, binary_location="" def init(new_home, prompt=True): - global home, browser + global home, config, browser home = new_home load_config(prompt) try: @@ -564,7 +571,7 @@ def main(): twitter_handler = TwitterHandler( twitter_config["consumer_key"], twitter_config["consumer_secret"] ) - except ConfigNotFoundError as e: + except TwitterConfigNotFoundError as e: twitter_handler = None 
logging.warning("error when creating Twitter Handler. Reason", str(e)) except KeyError as e: @@ -628,7 +635,7 @@ def process_entry(entry, feed_config, twitter=None, sendgrid=None, lang={}): version.diff, feed_config.get("sendgrid", {}) ) - except SGConfigNotFoundError as e: + except SendgridConfigNotFoundError as e: logging.error( "Missing configuration values for publishing entry %s", entry.url, diff --git a/diffengine/sendgrid.py b/diffengine/sendgrid.py index 96ba09f..c969880 100644 --- a/diffengine/sendgrid.py +++ b/diffengine/sendgrid.py @@ -5,8 +5,8 @@ from exceptions.sendgrid import ( AlreadyEmailedError, - ConfigNotFoundError, - ArchiveUrlNotFoundError, + SendgridConfigNotFoundError, + SendgridArchiveUrlNotFoundError, ) @@ -43,13 +43,13 @@ def publish_diff(self, diff, feed_config): if diff.emailed: raise AlreadyEmailedError(diff.id) elif not (diff.old.archive_url and diff.new.archive_url): - raise ArchiveUrlNotFoundError() + raise SendgridArchiveUrlNotFoundError() api_token = feed_config.get("api_token", self.api_token) sender = feed_config.get("sender", self.sender) receivers = feed_config.get("receivers", self.receivers) if not all([api_token, sender, receivers]): - raise ConfigNotFoundError + raise SendgridConfigNotFoundError subject = self.build_subject(diff) message = Mail( diff --git a/diffengine/twitter.py b/diffengine/twitter.py index 6c171ff..02d3631 100644 --- a/diffengine/twitter.py +++ b/diffengine/twitter.py @@ -6,9 +6,9 @@ from diffengine.text_builder import build_text from exceptions.twitter import ( AlreadyTweetedError, - ConfigNotFoundError, + TwitterConfigNotFoundError, TokenNotFoundError, - AchiveUrlNotFoundError, + TwitterAchiveUrlNotFoundError, UpdateStatusError, ) @@ -19,7 +19,7 @@ class TwitterHandler: def __init__(self, consumer_key, consumer_secret): if not consumer_key or not consumer_secret: - raise ConfigNotFoundError() + raise TwitterConfigNotFoundError() self.consumer_key = consumer_key self.consumer_secret = consumer_secret @@ 
-59,7 +59,7 @@ def tweet_diff(self, diff, token=None, lang={}): elif diff.tweeted: raise AlreadyTweetedError(diff) elif not (diff.old.archive_url and diff.new.archive_url): - raise AchiveUrlNotFoundError(diff) + raise TwitterAchiveUrlNotFoundError(diff) twitter = self.api(token) text = build_text(diff, lang) diff --git a/diffengine/utils.py b/diffengine/utils.py new file mode 100644 index 0000000..59372d3 --- /dev/null +++ b/diffengine/utils.py @@ -0,0 +1,11 @@ +import os +import yaml + + +def generate_config(home, content): + config_file = os.path.join(home, "config.yaml") + + if not os.path.isdir(home): + os.makedirs(home) + + yaml.dump(content, open(config_file, "w"), default_flow_style=False) diff --git a/exceptions/sendgrid.py b/exceptions/sendgrid.py index 1b95567..a7118e3 100644 --- a/exceptions/sendgrid.py +++ b/exceptions/sendgrid.py @@ -2,7 +2,7 @@ class SendgridError(RuntimeError): pass -class ConfigNotFoundError(SendgridError): +class SendgridConfigNotFoundError(SendgridError): """Exception raised if the Sendgrid instance has not the API key""" def __init__(self): @@ -14,6 +14,6 @@ def __init__(self, diff_id): self.message = "diff %s was already emailed with sendgrid " % diff_id -class ArchiveUrlNotFoundError(SendgridError): +class SendgridArchiveUrlNotFoundError(SendgridError): def __init__(self): self.message = "not publishing without archive urls" diff --git a/exceptions/twitter.py b/exceptions/twitter.py index 81f71c8..112c006 100644 --- a/exceptions/twitter.py +++ b/exceptions/twitter.py @@ -2,7 +2,7 @@ class TwitterError(RuntimeError): pass -class ConfigNotFoundError(TwitterError): +class TwitterConfigNotFoundError(TwitterError): """Exception raised if the Twitter instance has not the required key and secret""" def __init__(self): @@ -21,7 +21,7 @@ def __init__(self, diff): self.message = "diff %s has already been tweeted" % diff.id -class AchiveUrlNotFoundError(TwitterError): +class TwitterAchiveUrlNotFoundError(TwitterError): def __init__(self, 
diff): self.message = "not tweeting without archive urls for diff %s" % diff.id diff --git a/test_diffengine.py b/test_diffengine.py index e455059..a1d42f2 100644 --- a/test_diffengine.py +++ b/test_diffengine.py @@ -1,14 +1,12 @@ import logging import os import re - import yaml -from selenium import webdriver - import setup import pytest import shutil +from selenium import webdriver from unittest import TestCase from unittest.mock import MagicMock, patch from unittest.mock import PropertyMock @@ -28,23 +26,25 @@ TwitterHandler, SendgridHandler, ) +from diffengine.text_builder import build_text +from diffengine.utils import generate_config from exceptions.sendgrid import ( - ConfigNotFoundError as SGConfigNotFoundError, - AlreadyEmailedError as SGAlreadyEmailedError, - ArchiveUrlNotFoundError as SGArchiveNotFoundError, + SendgridConfigNotFoundError, + AlreadyEmailedError, + SendgridArchiveUrlNotFoundError, ) -from diffengine.text_builder import build_text from exceptions.twitter import ( - ConfigNotFoundError, + TwitterConfigNotFoundError, TokenNotFoundError, AlreadyTweetedError, - AchiveUrlNotFoundError, + TwitterAchiveUrlNotFoundError, UpdateStatusError, ) if os.path.isdir("test"): shutil.rmtree("test") +generate_config("test", {"db": "sqlite:///:memory:"}) # set things up but disable prompting for initial feed init("test", prompt=False) @@ -186,8 +186,7 @@ def test_config_file_integration(self): test_config = { "example": {"private_value": private_yaml_key, "public_value": public_value} } - config_file = home_path("config.yaml") - yaml.dump(test_config, open(config_file, "w"), default_flow_style=False) + generate_config("test", test_config) # test! 
new_config = load_config() @@ -367,13 +366,17 @@ def tearDown(self) -> None: logging.disable(logging.NOTSET) def test_raises_if_no_config_set(self): - self.assertRaises(ConfigNotFoundError, TwitterHandler, None, None) - self.assertRaises(ConfigNotFoundError, TwitterHandler, "myConsumerKey", None) - self.assertRaises(ConfigNotFoundError, TwitterHandler, None, "myConsumerSecret") + self.assertRaises(TwitterConfigNotFoundError, TwitterHandler, None, None) + self.assertRaises( + TwitterConfigNotFoundError, TwitterHandler, "myConsumerKey", None + ) + self.assertRaises( + TwitterConfigNotFoundError, TwitterHandler, None, "myConsumerSecret" + ) try: TwitterHandler("myConsumerKey", "myConsumerSecret") - except ConfigNotFoundError: + except TwitterConfigNotFoundError: self.fail("Twitter.__init__ raised ConfigNotFoundError unexpectedly!") def test_raises_if_no_token_provided(self): @@ -401,15 +404,19 @@ def test_raises_if_not_all_archive_urls_are_present(self): } twitter = TwitterHandler("myConsumerKey", "myConsumerSecret") - self.assertRaises(AchiveUrlNotFoundError, twitter.tweet_diff, diff, token) + self.assertRaises( + TwitterAchiveUrlNotFoundError, twitter.tweet_diff, diff, token + ) type(diff.old).archive_url = PropertyMock(return_value="http://test.url/old") - self.assertRaises(AchiveUrlNotFoundError, twitter.tweet_diff, diff, token) + self.assertRaises( + TwitterAchiveUrlNotFoundError, twitter.tweet_diff, diff, token + ) type(diff.new).archive_url = PropertyMock(return_value="http://test.url/new") try: twitter.tweet_diff(diff, token) - except AchiveUrlNotFoundError: + except TwitterAchiveUrlNotFoundError: self.fail("twitter.tweet_diff raised AchiveUrlNotFoundError unexpectedly!") class MockedStatus(MagicMock): @@ -560,10 +567,10 @@ def test_raises_if_no_config_set(self): type(diff).emailed = PropertyMock(return_value=False) sendgrid = SendgridHandler({}) - self.assertRaises(SGConfigNotFoundError, sendgrid.publish_diff, diff, {}) + 
self.assertRaises(SendgridConfigNotFoundError, sendgrid.publish_diff, diff, {}) try: sendgrid.publish_diff(diff, self.config["sendgrid"]) - except SGConfigNotFoundError: + except SendgridConfigNotFoundError: self.fail("sendgrid.publish_diff raised ConfigNotFoundError unexpectedly!") def test_raises_if_already_emailed(self): @@ -572,7 +579,7 @@ def test_raises_if_already_emailed(self): sendgrid = SendgridHandler(self.config["sendgrid"]) self.assertRaises( - SGAlreadyEmailedError, sendgrid.publish_diff, diff, self.config["sendgrid"] + AlreadyEmailedError, sendgrid.publish_diff, diff, self.config["sendgrid"] ) def test_raises_if_not_all_archive_urls_are_present(self): @@ -580,18 +587,24 @@ def test_raises_if_not_all_archive_urls_are_present(self): sendgrid = SendgridHandler(self.config["sendgrid"]) self.assertRaises( - SGArchiveNotFoundError, sendgrid.publish_diff, diff, self.config["sendgrid"] + SendgridArchiveUrlNotFoundError, + sendgrid.publish_diff, + diff, + self.config["sendgrid"], ) type(diff.old).archive_url = PropertyMock(return_value="http://test.url/old") self.assertRaises( - SGArchiveNotFoundError, sendgrid.publish_diff, diff, self.config["sendgrid"] + SendgridArchiveUrlNotFoundError, + sendgrid.publish_diff, + diff, + self.config["sendgrid"], ) type(diff.new).archive_url = PropertyMock(return_value="http://test.url/new") try: sendgrid.publish_diff(diff, self.config["sendgrid"]) - except SGArchiveNotFoundError: + except SendgridArchiveUrlNotFoundError: self.fail( "sendgrid.publish_diff raised AchiveUrlNotFoundError unexpectedly!" 
) From 47f4892de782cb6cfe9d9e56e0a50b7229655c2c Mon Sep 17 00:00:00 2001 From: Nahue Date: Mon, 25 May 2020 13:55:47 -0300 Subject: [PATCH 02/17] psycog2-binary packge for postgresql to work --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 7d1cd07..d62c7b6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,3 +14,4 @@ readability-lxml envyaml>=0.1912 pre-commit==2.3.0 sendgrid +psycopg2-binary==2.8.3 From 6429461c6dc0ce3353f610d54b38f4113b8d789b Mon Sep 17 00:00:00 2001 From: Nahue Date: Mon, 25 May 2020 09:31:38 -0300 Subject: [PATCH 03/17] Test feed as a suite (cherry picked from commit 9489515ac7f7664687047576f3054ffb942323f9) # Conflicts: # test_diffengine.py --- diffengine/__init__.py | 2 +- test_diffengine.py | 202 ++++++++++++++++++++--------------------- 2 files changed, 102 insertions(+), 102 deletions(-) diff --git a/diffengine/__init__.py b/diffengine/__init__.py index 52652e4..76cee0a 100755 --- a/diffengine/__init__.py +++ b/diffengine/__init__.py @@ -498,7 +498,7 @@ def setup_db(): if isinstance(database_handler, SqliteDatabase): try: - migrator = SqliteMigrator(database) + migrator = SqliteMigrator(database_handler) migrate(migrator.add_index("entryversion", ("url",), False)) except OperationalError as e: logging.debug(e) diff --git a/test_diffengine.py b/test_diffengine.py index a1d42f2..e99cb79 100644 --- a/test_diffengine.py +++ b/test_diffengine.py @@ -25,6 +25,7 @@ UA, TwitterHandler, SendgridHandler, + _fingerprint, ) from diffengine.text_builder import build_text from diffengine.utils import generate_config @@ -41,12 +42,10 @@ UpdateStatusError, ) -if os.path.isdir("test"): - shutil.rmtree("test") +test_home = "test" -generate_config("test", {"db": "sqlite:///:memory:"}) -# set things up but disable prompting for initial feed -init("test", prompt=False) +if os.path.isdir(test_home): + shutil.rmtree(test_home) # the sequence of these tests is significant @@ -55,119 +54,120 
@@ def test_version(): assert setup.version in UA -def test_feed(): - f = Feed.create(name="Test", url="https://inkdroid.org/feed.xml") - f.get_latest() - assert f.created - assert len(f.entries) == 10 - - -def test_entry(): - f = Feed.get(Feed.url == "https://inkdroid.org/feed.xml") - e = f.entries[0] - v = e.get_latest() - assert type(v) == EntryVersion - assert len(e.versions) == 1 - - -def test_diff(): - f = Feed.get(Feed.url == "https://inkdroid.org/feed.xml") - e = f.entries[0] - v1 = e.versions[0] - - # remove some characters from the version - v1.summary = v1.summary[0:-20] - v1.save() - - v2 = e.get_latest() - assert type(v2) == EntryVersion - assert v2.diff - assert v2.archive_url is not None - assert ( - re.match("^https://web.archive.org/web/[0-9]+/.+$", v2.archive_url) is not None - ) - - diff = v2.diff - assert diff.old == v1 - assert diff.new == v2 - assert os.path.isfile(diff.html_path) - assert os.path.isfile(diff.screenshot_path) - assert os.path.isfile(diff.thumbnail_path) - - # check that the url for the internet archive diff is working - assert re.match("^https://web.archive.org/web/diff/\d+/\d+/https.+$", diff.url) - - -def test_html_diff(): - f = Feed.get(Feed.url == "https://inkdroid.org/feed.xml") - e = f.entries[0] +def test_fingerprint(): + assert _fingerprint("foo bar") == "foobar" + assert _fingerprint("foo bar\nbaz") == "foobarbaz" + assert _fingerprint("foo
bar") == "foobar" + assert _fingerprint("foo'bar") == "foobar" + assert _fingerprint("foo’bar") == "foobar" - # add a change to the summary that htmldiff ignores - v1 = e.versions[-1] - parts = v1.summary.split() - parts.insert(2, "
\n") - v1.summary = " ".join(parts) - v1.save() - v2 = e.get_latest() - assert v2 is None +class FeedTest(TestCase): + feed = None + entry = None + version = None + def setUp(self) -> None: + generate_config(test_home, {"db": "sqlite:///:memory:"}) + # set things up but disable prompting for initial feed + init(test_home, prompt=False) + self.feed = Feed.create(name="Test", url="https://inkdroid.org/feed.xml") + self.feed.get_latest() + self.entry = self.feed.entries[0] + self.version = self.entry.get_latest() + + def test_feed(self): + assert self.feed.created + assert len(self.feed.entries) == 10 + + def test_entry(self): + assert type(self.version) == EntryVersion + assert len(self.entry.versions) == 1 + + def test_diff(self): + e = self.entry + v1 = e.versions[0] + + # remove some characters from the version + v1.summary = v1.summary[0:-20] + v1.save() + + v2 = e.get_latest() + assert type(v2) == EntryVersion + assert v2.diff + assert v2.archive_url is not None + assert ( + re.match("^https://web.archive.org/web/[0-9]+/.+$", v2.archive_url) + is not None + ) -def test_many_to_many(): + diff = v2.diff + assert diff.old == v1 + assert diff.new == v2 + assert os.path.isfile(diff.html_path) + assert os.path.isfile(diff.screenshot_path) + assert os.path.isfile(diff.thumbnail_path) - # these two feeds share this entry, we want diffengine to support - # multiple feeds for the same content, which is fairly common at - # large media organizations with multiple topical feeds - url = "https://www.washingtonpost.com/classic-apps/how-a-week-of-tweets-by-trump-stoked-anxiety-moved-markets-and-altered-plans/2017/01/07/38be8e64-d436-11e6-9cb0-54ab630851e8_story.html" + # check that the url for the internet archive diff is working + assert re.match( + "^https://web.archive.org/web/diff/\\d+/\\d+/https.+$", diff.url + ) - f1 = Feed.create( - name="feed1", - url="https://raw.githubusercontent.com/DocNow/diffengine/master/test-data/feed1.xml", - ) - f1.get_latest() + def 
test_html_diff(self): + e = self.entry - f2 = Feed.create( - name="feed2", - url="https://raw.githubusercontent.com/DocNow/diffengine/master/test-data/feed2.xml", - ) - f2.get_latest() + # add a change to the summary that htmldiff ignores + v1 = e.versions[-1] + parts = v1.summary.split() + parts.insert(2, "
\n") + v1.summary = " ".join(parts) + v1.save() - assert f1.entries.where(Entry.url == url).count() == 1 - assert f2.entries.where(Entry.url == url).count() == 1 + v2 = e.get_latest() + assert v2 is None - e = Entry.get(Entry.url == url) - assert FeedEntry.select().where(FeedEntry.entry == e).count() == 2 + def test_many_to_many(self): + # these two feeds share this entry, we want diffengine to support + # multiple feeds for the same content, which is fairly common at + # large media organizations with multiple topical feeds + url = "https://www.washingtonpost.com/classic-apps/how-a-week-of-tweets-by-trump-stoked-anxiety-moved-markets-and-altered-plans/2017/01/07/38be8e64-d436-11e6-9cb0-54ab630851e8_story.html" -def test_bad_feed_url(): - # bad feed url shouldn't cause a fatal exception - f = Feed.create(name="feed1", url="http://example.org/feedfeed.xml") - f.get_latest() - assert True + f1 = Feed.create( + name="feed1", + url="https://raw.githubusercontent.com/DocNow/diffengine/master/test-data/feed1.xml", + ) + f1.get_latest() + f2 = Feed.create( + name="feed2", + url="https://raw.githubusercontent.com/DocNow/diffengine/master/test-data/feed2.xml", + ) + f2.get_latest() -def test_whitespace(): - f = Feed.get(url="https://inkdroid.org/feed.xml") - e = f.entries[0] - v1 = e.versions[-1] + assert f1.entries.where(Entry.url == url).count() == 1 + assert f2.entries.where(Entry.url == url).count() == 1 - # add some whitespace - v1.summary = v1.summary + "\n\n " - v1.save() + e = Entry.get(Entry.url == url) + assert FeedEntry.select().where(FeedEntry.entry == e).count() == 2 - # whitespace should not count when diffing - v2 = e.get_latest() - assert v2 == None + def test_bad_feed_url(self): + # bad feed url shouldn't cause a fatal exception + f = Feed.create(name="feed1", url="http://example.org/feedfeed.xml") + f.get_latest() + assert True + def test_whitespace(self): + e = self.feed.entries[0] + v1 = e.versions[-1] -def test_fingerprint(): - from diffengine import 
_fingerprint + # add some whitespace + v1.summary = v1.summary + "\n\n " + v1.save() - assert _fingerprint("foo bar") == "foobar" - assert _fingerprint("foo bar\nbaz") == "foobarbaz" - assert _fingerprint("foo
bar") == "foobar" - assert _fingerprint("foo'bar") == "foobar" - assert _fingerprint("foo’bar") == "foobar" + # whitespace should not count when diffing + v2 = e.get_latest() + assert v2 == None class EnvVarsTest(TestCase): @@ -186,7 +186,7 @@ def test_config_file_integration(self): test_config = { "example": {"private_value": private_yaml_key, "public_value": public_value} } - generate_config("test", test_config) + generate_config(test_home, test_config) # test! new_config = load_config() From ff785c7f1f1cc6cd8322e50e6de46c274be7b4e7 Mon Sep 17 00:00:00 2001 From: Nahue Date: Mon, 25 May 2020 14:04:40 -0300 Subject: [PATCH 04/17] Useless comment now all the test suite is covered --- test_diffengine.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/test_diffengine.py b/test_diffengine.py index e99cb79..0499f85 100644 --- a/test_diffengine.py +++ b/test_diffengine.py @@ -47,8 +47,6 @@ if os.path.isdir(test_home): shutil.rmtree(test_home) -# the sequence of these tests is significant - def test_version(): assert setup.version in UA From 680e510623732a339652788efd7e876cc63f6ed7 Mon Sep 17 00:00:00 2001 From: Nahue Date: Mon, 25 May 2020 09:45:48 -0300 Subject: [PATCH 05/17] Readme (cherry picked from commit 913bf8eaf60f2c99e825e35f19d828443d1390ae) --- README.md | 56 +++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 40 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index 9470b9c..31a3484 100644 --- a/README.md +++ b/README.md @@ -88,30 +88,31 @@ Logs can be found in `diffengine.log` in the storage directory, for example Checkout [Ryan Baumann's "diffengine" Twitter list] for a list of known diffengine Twitter accounts that are out there. -## Tweeting text options +## Config options -By default, the tweeted diff will include the article's title and the archive diff url, [like this](https://twitter.com/mp_diff/status/1255973684994625539). 
+### Database engine -You change this by tweeting what's changed: the url, the title and/or the summary. For doing so, you need to specify **all** the following `lang` keys: +By default the database is configured for Sqlite and the file `./diffengine.db` through the `db` config prop ```yaml -lang: - change_in: "Change in" - the_url: "the URL" - the_title: "the title" - and: "and" - the_summary: "the summary" +db: sqlite:///diffengine.db ``` -Only if all the keys are defined, the tweet will include what's changed on its content, followed by the `diff.url`. Some examples: +This value responds to the [database URL connection string format](http://docs.peewee-orm.com/en/latest/peewee/playhouse.html#database-url). -- "Change in the title" -- "Change in the summary" -- "Change in the title and the summary" +For instance, you can connect to your postgresql database using something like this. -And so on with all the possible combinations between url, title and summary +```yaml +db: postgresql://postgres:my_password@localhost:5432/my_database +``` + +If you store your database connection URL in an environment variable, as on Heroku, you can simply do as follows. -## Multiple Accounts & Feed Implementation Example +```yaml +db: "${DATABASE_URL}" +``` + +### Multiple Accounts & Feed Implementation Example If you are setting multiple accounts, and multiple feeds if may be helpful to setup a directory for each account. For example: @@ -155,6 +156,29 @@ twitter: consumer_secret: CONSUMER_SECRET ``` +### Tweet content + +By default, the tweeted diff will include the article's title and the archive diff url, [like this](https://twitter.com/mp_diff/status/1255973684994625539). + +You change this by tweeting what's changed: the url, the title and/or the summary. 
For doing so, you need to specify **all** the following `lang` keys: + +```yaml +lang: + change_in: "Change in" + the_url: "the URL" + the_title: "the title" + and: "and" + the_summary: "the summary" +``` + +Only if all the keys are defined, the tweet will include what's changed on its content, followed by the `diff.url`. Some examples: + +- "Change in the title" +- "Change in the summary" +- "Change in the title and the summary" + +And so on with all the possible combinations between url, title and summary + ### Support for environment vars The configuration file has support for [environment variables](https://medium.com/chingu/an-introduction-to-environment-variables-and-how-to-use-them-f602f66d15fa). This is useful if you want to keeping your credentials secure when deploying to Heroku, Vercel (former ZEIT Now), AWS, Azure, Google Cloud or any other similar services. The environment variables are defined on the app of the platform you use or directly in a [dotenv file](https://12factor.net/config), which is the usual case when coding locally. @@ -176,7 +200,7 @@ MY_CONSUMER_SECRET_ENV_VAR="CONSUMER_SECRET" Done! You can use diffengine as usual and keep your credentials safe. -## Adding a Twitter account when the configuration file is already created +### Adding a Twitter account when the configuration file is already created You can use the following command for adding Twitter accounts to the config file. From 777351c798d20e198d0456824e7d3e049dc41c40 Mon Sep 17 00:00:00 2001 From: Nahue Date: Mon, 25 May 2020 14:15:34 -0300 Subject: [PATCH 06/17] Configurable time sleep. Default to 0. Closes #76 --- diffengine/__init__.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/diffengine/__init__.py b/diffengine/__init__.py index 76cee0a..81f0c32 100755 --- a/diffengine/__init__.py +++ b/diffengine/__init__.py @@ -159,7 +159,9 @@ def get_latest(self): # make sure we don't go too fast # TODO: can we remove this? Why is this here? 
- time.sleep(1) + time_sleep = config.get("time_sleep", 0) + if time_sleep > 0: + time.sleep(time_sleep) # fetch the current readability-ized content for the page logging.info("checking %s", self.url) From 6ebfcd5b6028d1214af67c7f660055971fb931b3 Mon Sep 17 00:00:00 2001 From: Nahue Date: Mon, 25 May 2020 14:19:21 -0300 Subject: [PATCH 07/17] Binary update --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index d62c7b6..9cced30 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,4 +14,4 @@ readability-lxml envyaml>=0.1912 pre-commit==2.3.0 sendgrid -psycopg2-binary==2.8.3 +psycopg2-binary==2.8.5 From df8256ed22a779e0292af583d9f1411bae39f8e0 Mon Sep 17 00:00:00 2001 From: Nahue Date: Mon, 25 May 2020 19:39:07 -0300 Subject: [PATCH 08/17] Missing diff props to be able to tweet --- diffengine/__init__.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/diffengine/__init__.py b/diffengine/__init__.py index 81f0c32..61be628 100755 --- a/diffengine/__init__.py +++ b/diffengine/__init__.py @@ -304,6 +304,18 @@ class Diff(BaseModel): emailed = DateTimeField(null=True) blogged = DateTimeField(null=True) + @property + def url_changed(self): + return self.old.url != self.new.url + + @property + def title_changed(self): + return self.old.title != self.new.title + + @property + def summary_changed(self): + return self.old.summary != self.new.summary + @property def html_path(self): # use prime number to spread across directories From 69fa1932b6f61f1eba847a0313eacb2ac4d8762c Mon Sep 17 00:00:00 2001 From: Nahue Date: Mon, 25 May 2020 19:57:42 -0300 Subject: [PATCH 09/17] Thread creation changed to be based on the default value set by @andresfib at #77, which is not `None` anymore but `''` --- diffengine/twitter.py | 2 +- test_diffengine.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/diffengine/twitter.py b/diffengine/twitter.py index 02d3631..1e29674 100644 
--- a/diffengine/twitter.py +++ b/diffengine/twitter.py @@ -66,7 +66,7 @@ def tweet_diff(self, diff, token=None, lang={}): # Check if the thread exists thread_status_id_str = None - if diff.old.entry.tweet_status_id_str is None: + if diff.old.entry.tweet_status_id_str == "": try: thread_status_id_str = self.create_thread( diff.old.entry, diff.old, token diff --git a/test_diffengine.py b/test_diffengine.py index 0499f85..6947e04 100644 --- a/test_diffengine.py +++ b/test_diffengine.py @@ -428,7 +428,7 @@ def test_create_thread_if_old_entry_has_no_related_tweet( ): entry = MagicMock() - type(entry).tweet_status_id_str = PropertyMock(return_value=None) + type(entry).tweet_status_id_str = PropertyMock(return_value="") diff = get_mocked_diff() type(diff.old).entry = entry From 032ad34a0280ce9db7c04ef165a08dd50a15d12f Mon Sep 17 00:00:00 2001 From: Nahue Date: Mon, 25 May 2020 20:38:30 -0300 Subject: [PATCH 10/17] Travis specific chromium-chromedriver version to work with Chrome 83 --- .travis.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 30d011b..ef8f266 100644 --- a/.travis.yml +++ b/.travis.yml @@ -11,7 +11,10 @@ script: py.test before_install: - sudo apt-get -y update - sudo apt-get install firefox-geckodriver - - sudo apt-get install --upgrade chromium-chromedriver +before_script: + - wget https://chromedriver.storage.googleapis.com/83.0.4103.39/chromedriver_linux64.zip + - unzip chromedriver_linux64.zip -d /home/travis/virtualenv/python3.7.1/bin/ + - export CHROME_BIN=chromium-browser after_failure: cat test/diffengine.log notifications: slack: From 6cdc5b6381ce56eb635da7563b1318931345a9b0 Mon Sep 17 00:00:00 2001 From: Nahue Date: Mon, 25 May 2020 20:56:08 -0300 Subject: [PATCH 11/17] field type changes to work with postgresql --- diffengine/__init__.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/diffengine/__init__.py b/diffengine/__init__.py index 61be628..8a5fde3 
100755 --- a/diffengine/__init__.py +++ b/diffengine/__init__.py @@ -39,6 +39,7 @@ ForeignKeyField, Model, SqliteDatabase, + TextField, ) from playhouse.db_url import connect from playhouse.migrate import SqliteMigrator, migrate @@ -60,8 +61,8 @@ class Meta: class Feed(BaseModel): - url = CharField(primary_key=True) - name = CharField() + url = TextField(primary_key=True) + name = TextField() created = DateTimeField(default=datetime.utcnow) @property @@ -105,7 +106,7 @@ def get_latest(self): class Entry(BaseModel): - url = CharField() + url = TextField() created = DateTimeField(default=datetime.utcnow) checked = DateTimeField(default=datetime.utcnow) tweet_status_id_str = CharField(null=False, default="") @@ -236,11 +237,11 @@ class FeedEntry(BaseModel): class EntryVersion(BaseModel): - title = CharField() - url = CharField(index=True) - summary = CharField() + title = TextField() + url = TextField(index=True) + summary = TextField() created = DateTimeField(default=datetime.utcnow) - archive_url = CharField(null=True) + archive_url = TextField(null=True) entry = ForeignKeyField(Entry, backref="versions") tweet_status_id_str = CharField(null=False, default="") From 7f168c8e2faaa224c2ef800cbbb5fefe763dfa65 Mon Sep 17 00:00:00 2001 From: Nahue Date: Mon, 25 May 2020 21:47:58 -0300 Subject: [PATCH 12/17] Remove this TODO as it's resolved --- diffengine/__init__.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/diffengine/__init__.py b/diffengine/__init__.py index 8a5fde3..28508de 100755 --- a/diffengine/__init__.py +++ b/diffengine/__init__.py @@ -158,8 +158,6 @@ def get_latest(self): be returned. """ - # make sure we don't go too fast - # TODO: can we remove this? Why is this here? 
time_sleep = config.get("time_sleep", 0) if time_sleep > 0: time.sleep(time_sleep) From 55adaf560605dcbf7a06b85d548829340887e450 Mon Sep 17 00:00:00 2001 From: Nahue Date: Mon, 25 May 2020 21:56:11 -0300 Subject: [PATCH 13/17] Hacky test for checking correct tweeting directly from the project --- .env.sample | 5 +++++ config-test.yaml | 7 +++++++ diffengine/twitter.py | 5 +++++ test_diffengine.py | 30 +++++++++++++++++++++++++++++- 4 files changed, 46 insertions(+), 1 deletion(-) create mode 100644 .env.sample create mode 100644 config-test.yaml diff --git a/.env.sample b/.env.sample new file mode 100644 index 0000000..31b8633 --- /dev/null +++ b/.env.sample @@ -0,0 +1,5 @@ +TEST_DATABASE_URL= +TEST_CONSUMER_KEY= +TEST_CONSUMER_SECRET= +TEST_ACCESS_TOKEN= +TEST_ACCESS_TOKEN_SECRET= diff --git a/config-test.yaml b/config-test.yaml new file mode 100644 index 0000000..9b963ae --- /dev/null +++ b/config-test.yaml @@ -0,0 +1,7 @@ +db: "${TEST_DATABASE_URL}" +twitter: + consumer_key: "${TEST_CONSUMER_KEY}" + consumer_secret: "${TEST_CONSUMER_SECRET}" + token: + access_token: "${TEST_ACCESS_TOKEN}" + access_token_secret: "${TEST_ACCESS_TOKEN_SECRET}" diff --git a/diffengine/twitter.py b/diffengine/twitter.py index 1e29674..e886aff 100644 --- a/diffengine/twitter.py +++ b/diffengine/twitter.py @@ -98,3 +98,8 @@ def tweet_diff(self, diff, token=None, lang={}): diff.save() except Exception as e: logging.error("unable to tweet: %s", e) + + def delete_diff(self, diff, token=None): + twitter = self.api(token) + twitter.destroy_status(diff.old.tweet_status_id_str) + twitter.destroy_status(diff.new.tweet_status_id_str) diff --git a/test_diffengine.py b/test_diffengine.py index 6947e04..3da01e9 100644 --- a/test_diffengine.py +++ b/test_diffengine.py @@ -2,6 +2,8 @@ import os import re import yaml +from envyaml import EnvYAML + import setup import pytest import shutil @@ -43,6 +45,7 @@ ) test_home = "test" +test_config = EnvYAML("config-test.yaml", env_file=".env") if 
os.path.isdir(test_home): shutil.rmtree(test_home) @@ -66,7 +69,7 @@ class FeedTest(TestCase): version = None def setUp(self) -> None: - generate_config(test_home, {"db": "sqlite:///:memory:"}) + generate_config(test_home, {"db": test_config.get("db", "sqlite:///:memory:")}) # set things up but disable prompting for initial feed init(test_home, prompt=False) self.feed = Feed.create(name="Test", url="https://inkdroid.org/feed.xml") @@ -111,6 +114,31 @@ def test_diff(self): "^https://web.archive.org/web/diff/\\d+/\\d+/https.+$", diff.url ) + def test_tweet_diff(self): + e = self.entry + v1 = e.versions[0] + + # remove some characters from the version + v1.summary = v1.summary[0:-20] + v1.save() + + v2 = e.get_latest() + + # Actual tweeting purposes only + # run this alone for checking correct tweeting behavior + if v2 is not None: + diff = v2.diff + try: + token = test_config.get("twitter.token") + twitter_handler = TwitterHandler( + test_config.get("twitter.consumer_key"), + test_config.get("twitter.consumer_secret"), + ) + twitter_handler.tweet_diff(diff, token) + twitter_handler.delete_diff(diff, token) + except Exception: + logging.debug("no tweet configured for test. 
Doing nothing") + def test_html_diff(self): e = self.entry From 11ccbd36194c561426a1af5710c83eed33a29c9f Mon Sep 17 00:00:00 2001 From: Nahue Date: Mon, 25 May 2020 22:33:25 -0300 Subject: [PATCH 14/17] Hacky test for checking correct tweeting directly from the project --- test_diffengine.py | 50 +++++++++++++++++++++++----------------------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/test_diffengine.py b/test_diffengine.py index 3da01e9..507a607 100644 --- a/test_diffengine.py +++ b/test_diffengine.py @@ -114,31 +114,6 @@ def test_diff(self): "^https://web.archive.org/web/diff/\\d+/\\d+/https.+$", diff.url ) - def test_tweet_diff(self): - e = self.entry - v1 = e.versions[0] - - # remove some characters from the version - v1.summary = v1.summary[0:-20] - v1.save() - - v2 = e.get_latest() - - # Actual tweeting purposes only - # run this alone for checking correct tweeting behavior - if v2 is not None: - diff = v2.diff - try: - token = test_config.get("twitter.token") - twitter_handler = TwitterHandler( - test_config.get("twitter.consumer_key"), - test_config.get("twitter.consumer_secret"), - ) - twitter_handler.tweet_diff(diff, token) - twitter_handler.delete_diff(diff, token) - except Exception: - logging.debug("no tweet configured for test. 
Doing nothing") - def test_html_diff(self): e = self.entry @@ -195,6 +170,31 @@ def test_whitespace(self): v2 = e.get_latest() assert v2 == None + # This one is for tweeting purposes only + def test_tweet_diff(self): + e = self.entry + v1 = e.versions[0] + + # remove some characters from the version + v1.summary = v1.summary[0:-20] + v1.save() + + v2 = e.get_latest() + + # run this alone for checking correct tweeting behavior + if v2 is not None: + diff = v2.diff + try: + token = test_config.get("twitter.token") + twitter_handler = TwitterHandler( + test_config.get("twitter.consumer_key"), + test_config.get("twitter.consumer_secret"), + ) + twitter_handler.tweet_diff(diff, token) + twitter_handler.delete_diff(diff, token) + except Exception: + logging.debug("no tweet configured for test. Doing nothing") + class EnvVarsTest(TestCase): def test_config_file_integration(self): From bbcd3c4747afe16bf9d2f92de1b4e70ce742c43a Mon Sep 17 00:00:00 2001 From: Nahue Date: Mon, 25 May 2020 22:34:04 -0300 Subject: [PATCH 15/17] Note for developers --- test_diffengine.py | 1 + 1 file changed, 1 insertion(+) diff --git a/test_diffengine.py b/test_diffengine.py index 507a607..d5e363b 100644 --- a/test_diffengine.py +++ b/test_diffengine.py @@ -171,6 +171,7 @@ def test_whitespace(self): assert v2 == None # This one is for tweeting purposes only + # If no .env var is set, this one will succeed anyway :) def test_tweet_diff(self): e = self.entry v1 = e.versions[0] From a0a8ef13edbc7ced6395806a45000f5f6b91914f Mon Sep 17 00:00:00 2001 From: Nahue Date: Mon, 25 May 2020 22:41:40 -0300 Subject: [PATCH 16/17] .env file fallback for Travis CI --- config-test.yaml | 10 +++++----- test_diffengine.py | 6 +++++- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/config-test.yaml b/config-test.yaml index 9b963ae..3fefc3b 100644 --- a/config-test.yaml +++ b/config-test.yaml @@ -1,7 +1,7 @@ -db: "${TEST_DATABASE_URL}" +db: ${TEST_DATABASE_URL} twitter: - consumer_key:
"${TEST_CONSUMER_KEY}" - consumer_secret: "${TEST_CONSUMER_SECRET}" + consumer_key: ${TEST_CONSUMER_KEY} + consumer_secret: ${TEST_CONSUMER_SECRET} token: - access_token: "${TEST_ACCESS_TOKEN}" - access_token_secret: "${TEST_ACCESS_TOKEN_SECRET}" + access_token: ${TEST_ACCESS_TOKEN} + access_token_secret: ${TEST_ACCESS_TOKEN_SECRET} diff --git a/test_diffengine.py b/test_diffengine.py index d5e363b..4662811 100644 --- a/test_diffengine.py +++ b/test_diffengine.py @@ -45,7 +45,11 @@ ) test_home = "test" -test_config = EnvYAML("config-test.yaml", env_file=".env") +test_env_file = ".env" +test_config = EnvYAML( + "config-test.yaml", + env_file=test_env_file if os.path.isfile(test_env_file) else None, +) if os.path.isdir(test_home): shutil.rmtree(test_home) From 0bca6b45470ea258004eb0c4ec96d7683f1ba11c Mon Sep 17 00:00:00 2001 From: Nahue Date: Tue, 2 Jun 2020 21:13:20 -0300 Subject: [PATCH 17/17] Test conflicts resolution --- test_diffengine.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test_diffengine.py b/test_diffengine.py index 4662811..5d53ed3 100644 --- a/test_diffengine.py +++ b/test_diffengine.py @@ -31,12 +31,12 @@ ) from diffengine.text_builder import build_text from diffengine.utils import generate_config -from exceptions.sendgrid import ( +from diffengine.exceptions.sendgrid import ( SendgridConfigNotFoundError, AlreadyEmailedError, SendgridArchiveUrlNotFoundError, ) -from exceptions.twitter import ( +from diffengine.exceptions.twitter import ( TwitterConfigNotFoundError, TokenNotFoundError, AlreadyTweetedError,