From e926de30fd2f134a94c54282a78bdf6adf1f12ad Mon Sep 17 00:00:00 2001
From: arkiver
Date: Wed, 17 Apr 2024 17:31:04 +0200
Subject: [PATCH] Version 20240417.03. Handle new form of old spam loop.

---
 pipeline.py |  2 +-
 urls.lua    | 11 ++++++++---
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/pipeline.py b/pipeline.py
index 06195fd..fb17781 100644
--- a/pipeline.py
+++ b/pipeline.py
@@ -83,7 +83,7 @@ def search(self, text):
 #
 # Update this each time you make a non-cosmetic change.
 # It will be added to the WARC files and reported to the tracker.
-VERSION = '20240417.02'
+VERSION = '20240417.03'
 #USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36'
 TRACKER_ID = 'urls'
 TRACKER_HOST = 'legacy-api.arpa.li'
diff --git a/urls.lua b/urls.lua
index ecc0b93..d6eadd4 100644
--- a/urls.lua
+++ b/urls.lua
@@ -247,11 +247,16 @@ local filter_pattern_sets = {
   --["flashplayer"]="^https?://www%.macromedia%.com/go/getflashplayer$",
   ["tupian"]={
     "^https?://[^/]+%.[^%./]+%.[a-z]+/tupian_1/[^%./]+%.jpg$",
-    "^https?://[^/]+%.[^%./]+%.[a-z]+/templates/moban",
-    "^https?://[^/]+%.[^%./]+%.[a-z]+/templates/[^/]+/moban"
+    "^https?://[^/]+%.[^%./]+%.[a-z]+/templates/moban[0-9]*/",
+    "^https?://[^/]+%.[^%./]+%.[a-z]+/templates/[^/]+/moban[0-9]*/"
   },
   ["slash"]="^https?://[^/]+%.[^%./]+%.[a-z]+/[^/]+/$",
-  ["tk88"]="tk88",
+  ["other1"]={
+    "tk88",
+    "^https?://[^/]+%.[^%./]+%.[a-z]+/list_[a-z]+/$",
+    "^https?://[^/]+%.[^%./]+%.[a-z]+/news/[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][^0-9]",
+    "^https?://[^/]+%.[^%./]+%.[a-z]+/static/logo%.jpg"
+  },
   ["other"]={
     "%.xlsx?$",
     "%.pptx?$",