Skip to content

Commit

Permalink
Merge pull request #262 from JustAnotherArchivist/ignoracle-regex-cache
Browse files Browse the repository at this point in the history
Cache the ignoracle patterns while primary_url and primary_netloc stay constant
  • Loading branch information
hannahwhy committed Sep 20, 2018
2 parents 89a9d91 + dea91b3 commit fc99354
Showing 1 changed file with 29 additions and 14 deletions.
43 changes: 29 additions & 14 deletions pipeline/archivebot/wpull/ignoracle.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,10 @@ class Ignoracle(object):

patterns = []

def __init__(self):
    '''
    Creates an Ignoracle with an empty compiled-pattern cache.
    '''
    # List of (pattern string, compiled regex) pairs filled in by ignores().
    self._compiled = []
    # The (primary_url, primary_netloc) pair the cache was built for;
    # None means nothing has been compiled yet.
    self._primary = None

def set_patterns(self, strings):
'''
Given a list of strings, replaces this Ignoracle's pattern state with
Expand All @@ -70,6 +74,9 @@ def set_patterns(self, strings):

self.patterns.append(string)

self._primary = None
self._compiled = []

def ignores(self, url_record: wpull.pipeline.item.URLRecord):
    '''
    If an ignore pattern matches the given URL, returns that pattern as a
    string.  Otherwise, returns False.

    Each pattern is expanded ({primary_url} and {primary_netloc} are
    replaced with the regex-escaped values for this record) and compiled.
    The compiled patterns are cached and reused for as long as the primary
    URL and primary netloc stay the same.  A pattern that fails to compile
    is reported on stderr and left out of the cache.
    '''

    params = parameterize_record_info(url_record)

    primary_url = params.get('primary_url') or ''
    primary_netloc = params.get('primary_netloc') or ''
    primary = (primary_url, primary_netloc)

    if self._primary != primary:
        # The placeholders expand differently now, so rebuild the cache.
        escaped_url = re.escape(primary_url)
        escaped_netloc = re.escape(primary_netloc)
        compiled = []

        for pattern in self.patterns:
            expanded = pattern.replace('{primary_url}', escaped_url)
            expanded = expanded.replace('{primary_netloc}', escaped_netloc)

            try:
                regex = re.compile(expanded)
            except re.error as error:
                print('Pattern %s is invalid (error: %s). Ignored.'
                      % (pattern, str(error)), file=sys.stderr)
            else:
                # Only cache patterns that compiled; appending after a
                # failure would reuse a stale regex or hit an unbound name.
                compiled.append((pattern, regex))

        self._compiled = compiled
        self._primary = primary

    for pattern, regex in self._compiled:
        if regex.search(url_record.url):
            return pattern

    return False

Expand Down

0 comments on commit fc99354

Please sign in to comment.