diff --git a/pipeline.py b/pipeline.py index 3a29489..114638d 100644 --- a/pipeline.py +++ b/pipeline.py @@ -58,7 +58,8 @@ # # Update this each time you make a non-cosmetic change. # It will be added to the WARC files and reported to the tracker. -VERSION = '20181211.01' + +VERSION = '20181212.02' USER_AGENT = 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html; ArchiveTeam)' TRACKER_ID = 'tumblr' TRACKER_HOST = 'tracker.archiveteam.org' diff --git a/tumblr.lua b/tumblr.lua index 1499b53..86ada48 100644 --- a/tumblr.lua +++ b/tumblr.lua @@ -62,7 +62,8 @@ allowed = function(url, parenturl) or string.match(url, "^https?://" .. item_value .. "%.tumblr%.com/.*/amp$") or string.match(url, "^https?://" .. item_value .. "%.tumblr%.com/rss$") or string.match(url, "^https?://" .. item_value .. "%.tumblr%.com/reblog") - or string.match(url, "^https?://" .. item_value .. "%.tumblr%.com/.*%?route=") then + or string.match(url, "^https?://" .. item_value .. "%.tumblr%.com/.*%?route=") + or string.match(url, "^https?://" .. item_value .. "%.tumblr%.com/[^/]+%%") then return false end @@ -117,10 +118,21 @@ wget.callbacks.download_child_p = function(urlpos, parent, depth, start_url_pars local url = urlpos["url"]["url"] local html = urlpos["link_expect_html"] + if string.find(url, "code%.jquery%.com") then + -- Ignore code.jquery.com + return false + end + + if string.find(url, "fonts%.googleapis%.com") then + -- Ignore fonts.googleapis.com + return false + end + if string.find(url, "px.srvcs.tumblr.com") then -- Ignore px.srvcs.tumblr.com tracking domain return false end + if string.match(url, "^https?://www.tumblr.com/oembed/1.0") or string.match(url, "^https?://[0-9]+%.media%.tumblr%.com/avatar_[a-zA-Z0-9]+_64%.pnj") or string.match(url, "^https?://[0-9]+%.media%.tumblr%.com/avatar_[a-zA-Z0-9]+_64%.gif")