Skip to content

Commit 465e111

Browse files
committed
multi_query error temporarily closes down worker
multi_query raises errors in case of severe problems only. In those cases, continued querying causes more problems. With this commit, an error that is raised out of multi_query will cause the worker to temporarily be closed. Further calls to multi_query will be logged as warnings and raise a MultiQueryException. This allows the scoop main process to terminate early and handle the exception. Waiting times are now configurable and in general increased to 30s as they are more direct now. This for example allows the endpoint to restart in case of buuuuugs.
1 parent 6dd4cf7 commit 465e111

File tree

4 files changed

+78
-27
lines changed

4 files changed

+78
-27
lines changed

config/defaults.py

+1
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
PAUSE_FILE = 'pause.lck'
2020

2121
ERROR_RETRIES = 5 # in case an unexpected error occurs retry? (real world!)
22+
ERROR_WAIT = 30 # seconds to wait in case of error before retry
2223

2324
NRUNS = 64 # number of whole coverage runs of the evolutionary algorithm
2425
NRUNS_NO_IMPROVEMENT = 5 # stop if no more coverage patterns found in n runs

gp_learner.py

+7-3
Original file line numberDiff line numberDiff line change
@@ -1447,7 +1447,7 @@ def find_graph_pattern_coverage(
14471447
raise
14481448
except Exception as e:
14491449
error_count += 1
1450-
logger.error('uncaught exception in run %d' % run)
1450+
logger.error('uncaught exception in run %d', run)
14511451
log_wrapped_exception(logger, e)
14521452
if error_count > error_retries:
14531453
logger.error(
@@ -1456,9 +1456,13 @@ def find_graph_pattern_coverage(
14561456
)
14571457
raise
14581458
else:
1459-
logger.error('will retry in 15s despite error...')
1459+
logger.error(
1460+
'this was uncaught exception number %d, will retry in %ds '
1461+
'despite error...',
1462+
error_count, config.ERROR_WAIT
1463+
)
14601464
logging_config.save_error_logs()
1461-
sleep(15)
1465+
sleep(config.ERROR_WAIT)
14621466

14631467
# sort patterns by fitness, run and then pattern
14641468
patterns = sorted(

gp_query.py

+67-22
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,13 @@
77
from collections import defaultdict
88
from collections import Counter
99
from collections import Sequence
10+
from datetime import datetime
11+
from datetime import timedelta
12+
from functools import wraps
1013
import logging
1114
import re
1215
import socket
16+
import sys
1317
from time import sleep
1418

1519
from cachetools import LRUCache
@@ -18,6 +22,7 @@
1822
from rdflib.term import Identifier
1923
import SPARQLWrapper
2024
from SPARQLWrapper.SPARQLExceptions import EndPointNotFound
25+
from SPARQLWrapper.SPARQLExceptions import QueryBadFormed
2126
from SPARQLWrapper.SPARQLExceptions import SPARQLWrapperException
2227
from xml.sax.expatreader import SAXParseException
2328
# noinspection PyUnresolvedReferences
@@ -40,13 +45,15 @@
4045
logger = logging.getLogger(__name__)
4146

4247

48+
class QueryException(Exception):
49+
pass
4350

4451

45-
class EvalException(Exception):
52+
class IncompleteQueryException(Exception):
4653
pass
4754

4855

49-
class QueryException(EvalException):
56+
class MultiQueryException(Exception):
5057
pass
5158

5259

@@ -221,7 +228,7 @@ def _get_vars_values_mapping(graph_pattern, source_target_pairs):
221228
_values = [(t,) for t in sorted(set(targets))]
222229
_val_idx = 1
223230
else:
224-
raise QueryException(
231+
raise IncompleteQueryException(
225232
"tried to run a query on a graph pattern without "
226233
"%s and %s vars:\n%s" % (SOURCE_VAR, TARGET_VAR, graph_pattern)
227234
)
@@ -276,7 +283,42 @@ def _ask_chunk_result_extractor(q_res, _vars, _ret_val_mapping):
276283
return chunk_res
277284

278285

279-
# noinspection PyBroadException
286+
def _exception_closes_worker_guard(func):
287+
"""Temporarily closes _multi_query for current worker.
288+
289+
This is a workaround for SCOOP's map otherwise having already dispatched
290+
further work to this worker, despite an exception of a previous _multi_query
291+
not being handled in origin yet.
292+
293+
An exception being raised out of _multi_query would normally cause origin to
294+
back-off for config.ERROR_WAIT and retry. This "quick fails" all remaining
295+
work in the time frame.
296+
"""
297+
closed = []
298+
wait = timedelta(
299+
seconds=config.ERROR_WAIT * .75 # rather don't close too long
300+
)
301+
302+
@wraps(func)
303+
def _multi_query_wrapper(*args, **kwds):
304+
if closed:
305+
if datetime.utcnow() - closed[0] < wait:
306+
logger.warning(
307+
'_multi_query temporarily closed for worker due to '
308+
'previous exception'
309+
)
310+
raise MultiQueryException('closed for worker')
311+
else:
312+
closed.pop()
313+
try:
314+
return func(*args, **kwds)
315+
except:
316+
closed.append(datetime.utcnow())
317+
raise
318+
return _multi_query_wrapper
319+
320+
321+
@_exception_closes_worker_guard
280322
def _multi_query(
281323
sparql, timeout, graph_pattern, source_target_pairs,
282324
batch_size,
@@ -303,15 +345,16 @@ def _multi_query(
303345
t, q_res = _query(sparql, timeout, q, **kwds)
304346
chunk_res = _chunk_res(
305347
q_res, _vars, _ret_val_mapping, **kwds)
306-
except EndPointNotFound:
348+
except EndPointNotFound as e:
307349
# happens if the endpoint reports a 404...
308350
# as virtuoso in rare cases seems to report a 404 let's
309351
# retry after some time but then cancel
310352
if retry:
311353
logger.info(
312-
'SPARQL endpoint reports a 404, will retry once in 10s'
354+
'SPARQL endpoint reports a 404, will retry in %ds',
355+
config.ERROR_WAIT
313356
)
314-
sleep(10)
357+
sleep(config.ERROR_WAIT)
315358
continue
316359
else:
317360
logger.exception(
@@ -320,7 +363,7 @@ def _multi_query(
320363
'could not perform query:\n%s for %s\nException:',
321364
q, val_chunk,
322365
)
323-
raise
366+
six.reraise(MultiQueryException, e, sys.exc_info()[2])
324367
except (SPARQLWrapperException, SAXParseException, URLError) as e:
325368
if (isinstance(e, SPARQLWrapperException) and
326369
re.search(
@@ -346,45 +389,47 @@ def _multi_query(
346389
# error. It is very likely that the endpoint is dead...
347390
if retry:
348391
logger.warning(
349-
'could not perform query, retry in 10s:\n'
392+
'URLError, seems we cannot reach SPARQL endpoint, '
393+
'retry in %ds. Tried to perform query:\n'
350394
'%s for %s\nException:',
351-
q, val_chunk,
395+
config.ERROR_WAIT, q, val_chunk,
352396
exc_info=1, # appends exception to message
353397
)
354-
sleep(10)
398+
sleep(config.ERROR_WAIT)
355399
continue
356400
else:
357401
logger.exception(
358-
'could not perform query:\n%s for %s\nException:',
402+
'URLError, seems we cannot reach SPARQL endpoint, '
403+
'giving up after 3 retries. Tried to perform query:'
404+
'\n%s for %s\nException:',
359405
q, val_chunk,
360-
exc_info=1, # appends exception to message
361406
)
362-
raise
407+
six.reraise(MultiQueryException, e, sys.exc_info()[2])
363408
else:
364409
logger.warning(
365-
'could not perform query:\n%s for %s\nException:',
410+
'could not perform query, replacing with 0 result:\n'
411+
'%s for %s\nException:',
366412
q, val_chunk,
367413
exc_info=1, # appends exception to message
368414
)
369415
t, chunk_res = timer() - _start_time, {}
370-
except Exception:
416+
except Exception as e:
371417
if retry:
372418
logger.warning(
373-
'unhandled exception, retry in 10s:\n'
419+
'unhandled exception, retry in %ds:\n'
374420
'Query:\n%s\nChunk:%r\nException:',
375-
q, val_chunk,
421+
config.ERROR_WAIT, q, val_chunk,
376422
exc_info=1, # appends exception to message
377423
)
378-
sleep(10)
424+
sleep(config.ERROR_WAIT)
379425
continue
380426
else:
381427
logger.exception(
382-
'unhandled exception:\n'
428+
'unhandled exception, giving up after 3 retries:\n'
383429
'Query:\n%s\nChunk:%r\nException:',
384430
q, val_chunk,
385-
exc_info=1, # appends exception to message
386431
)
387-
t, chunk_res = timer() - _start_time, {}
432+
six.reraise(MultiQueryException, e, sys.exc_info()[2])
388433
break
389434
_res_update(res, chunk_res, **kwds)
390435
total_time += t

utils.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -169,17 +169,18 @@ def exception_stack_wrapper(*args, **kwds):
169169
str(e)
170170
except UnicodeEncodeError:
171171
scoop.logger.warning(
172-
're-packing exception for scoop, see'
172+
're-packing exception for scoop, see '
173173
'https://github.com/soravux/scoop/pull/24'
174174
)
175-
e_msg = repr(e.message)
175+
e_msg = repr(e)
176176
six.reraise(type(e), e_msg, exc_info[2])
177177
else:
178178
raise
179179
except BaseException as err:
180180
# append the stack as field to the re-raised exception
181181
err._exc_fmt = 'error in worker:\n%s' % (
182182
''.join(traceback.format_exception(*exc_info)))
183+
six.reraise(type(err), err, exc_info[2])
183184
raise
184185
return exception_stack_wrapper
185186

0 commit comments

Comments
 (0)