Skip to content

Commit

Permalink
Merge pull request #24 from marked/patch-7
Browse files (browse the repository at this point in the history)
Block more URLs: oembed, amp, rss
  • Loading branch information
kiska3 committed Dec 12, 2018
2 parents b7adcfc + 8f2a3a4 commit 9093c09
Show file tree
Hide file tree
Showing 2 changed files with 5 additions and 3 deletions.
2 changes: 1 addition & 1 deletion pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@
#
# Update this each time you make a non-cosmetic change.
# It will be added to the WARC files and reported to the tracker.
VERSION = '20181210.05'
VERSION = '20181211.01'
USER_AGENT = 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html; ArchiveTeam)'
TRACKER_ID = 'tumblr'
TRACKER_HOST = 'tracker.archiveteam.org'
Expand Down
6 changes: 4 additions & 2 deletions tumblr.lua
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,8 @@ allowed = function(url, parenturl)
or string.match(url, "^https?://assets%.tumblr%.com/client")
or string.match(url, "^https?://static%.tumblr%.com/[%u%p%l]+")
or string.match(url, "ios%-app://")
or string.match(url, "^https?://" .. item_value .. "%.tumblr%.com/.*/amp$")
or string.match(url, "^https?://" .. item_value .. "%.tumblr%.com/rss$")
or string.match(url, "^https?://" .. item_value .. "%.tumblr%.com/reblog")
or string.match(url, "^https?://" .. item_value .. "%.tumblr%.com/.*%?route=") then
return false
Expand Down Expand Up @@ -119,8 +121,8 @@ wget.callbacks.download_child_p = function(urlpos, parent, depth, start_url_pars
-- Ignore px.srvcs.tumblr.com tracking domain
return false
end

if string.match(url, "^https?://[0-9]+%.media%.tumblr%.com/avatar_[a-zA-Z0-9]+_64%.pnj")
if string.match(url, "^https?://www.tumblr.com/oembed/1.0")
or string.match(url, "^https?://[0-9]+%.media%.tumblr%.com/avatar_[a-zA-Z0-9]+_64%.pnj")
or string.match(url, "^https?://[0-9]+%.media%.tumblr%.com/avatar_[a-zA-Z0-9]+_64%.gif")
or string.match(url, "^https?://[0-9]+%.media%.tumblr%.com/avatar_[a-zA-Z0-9]+_16%.pnj")
or string.match(url, "^https?://[0-9]+%.media%.tumblr%.com/avatar_[a-zA-Z0-9]+_16%.gif") then
Expand Down

0 comments on commit 9093c09

Please sign in to comment.