Skip to content

Commit

Permalink
Merge branch 'master' of https://github.com/DocNow/diffengine into requests.exceptions.MissingSchema
Browse files Browse the repository at this point in the history
  • Loading branch information
ruebot committed Jan 16, 2017
2 parents f72e895 + 8a90763 commit 6eec9a6
Show file tree
Hide file tree
Showing 3 changed files with 25 additions and 15 deletions.
36 changes: 23 additions & 13 deletions diffengine/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,18 +109,18 @@ def stale(self):
logging.debug("%s not stale (r=%f)", self.url, r)
return False

def get_latest(self, force=True):
def get_latest(self):
"""
get_latest is the heart of the application. If the entry is stale
it will get the current version on the web, extract its summary with
readability and compare it against a previous version. If a difference
is found it will compute the diff, save it as html and png files, and
tell Internet Archive to create a snapshot.
get_latest is the heart of the application. It will get the current
version on the web, extract its summary with readability and compare
it against a previous version. If a difference is found it will
compute the diff, save it as html and png files, and tell Internet
Archive to create a snapshot.
If a new version was found it will be returned, otherwise None will
be returned.
"""

if not self.stale and not force:
return

# make sure we don't go too fast
time.sleep(1)

Expand All @@ -129,7 +129,7 @@ def get_latest(self, force=True):
resp = requests.get(self.url, headers={"User-Agent": UA})
if resp.status_code != 200:
logging.warn("Got %s when fetching %s", resp.status_code, self.url)
return
return None

doc = readability.Document(resp.text)
title = doc.title()
Expand Down Expand Up @@ -175,6 +175,7 @@ def get_latest(self, force=True):

self.checked = datetime.utcnow()
self.save()

return new


Expand Down Expand Up @@ -402,7 +403,7 @@ def tweet_diff(diff, token):
if len(status) >= 85:
status = status[0:85] + "…"

status += " " + diff.old.archive_url + " -> " + diff.new.archive_url
status += " " + diff.old.archive_url + " " + diff.new.archive_url

try:
twitter.update_with_media(diff.thumbnail_path, status)
Expand Down Expand Up @@ -431,9 +432,10 @@ def main():
start_time = datetime.utcnow()
logging.info("starting up with home=%s", home)

checked = skipped = new = 0

try:
for f in config.get('feeds', []):

feed, created = Feed.create_or_get(url=f['url'], name=f['name'])
if created:
logging.debug("created new feed for %s", f['url'])
Expand All @@ -443,13 +445,21 @@ def main():

# get latest content for each entry
for entry in feed.entries:
if not entry.stale:
skipped += 1
continue
checked += 1
version = entry.get_latest()
if version:
new_count += 1
if version and version.diff and 'twitter' in f:
tweet_diff(version.diff, f['twitter'])
except Exception as e:
logging.error("unable to access: %s due to %s", f['url'], e)

logging.info("shutting down: %s", (datetime.utcnow() - start_time))
elapsed = datetime.utcnow() - start_time
logging.info("shutting down: new=%s checked=%s skipped=%s elapsed=%s",
new, checked, skipped, elapsed)

def _dt(d):
return d.strftime("%Y-%m-%d %H:%M:%S")
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@

setup(
name="diffengine",
version="0.0.21",
version="0.0.22",
author="Ed Summers",
author_email="ehs@pobox.com",
packages=find_packages(exclude=['test_diffengine']),
Expand Down
2 changes: 1 addition & 1 deletion test_diffengine.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def test_diff():
v1.summary = v1.summary[0:-20]
v1.save()

v2 = e.get_latest(force=True)
v2 = e.get_latest()
assert type(v2) == EntryVersion
assert v2.diff
assert re.match("^https://wayback.archive.org/web/[0-9]+/.+$",
Expand Down

0 comments on commit 6eec9a6

Please sign in to comment.