"""Base classes for storage backends, frontends, and savers in strax.
Please see the developer documentation for more details
on strax's storage hierarchy.
"""
from ast import literal_eval
from concurrent.futures import wait
import logging
from packaging import version
import time
import typing
import warnings
import numpy as np
import strax
export, __all__ = strax.exporter()
@export
class DataKey:
"""Request for data to a storage registry
Instances of this class uniquely identify a single piece of strax data
abstractly -- that is, it describes the full history of algorithms that
have to be run to reproduce it.
It is used for communication between the main Context class and storage
frontends.
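
    Example (illustrative; the run_id and lineage values here are
    hypothetical, real lineages are built by the Context):

    >>> key = DataKey(run_id='180423_1021',
    ...               data_type='raw_records',
    ...               lineage={'raw_records': ('DAQReader', '0.0.1', {})})
    >>> key.lineage_hash  # a deterministic hash string  # doctest: +SKIP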
"""
run_id: str
data_type: str
lineage: dict
    # Do NOT use directly; use the lineage_hash property instead
_lineage_hash = ''
def __init__(self, run_id, data_type, lineage):
self.run_id = run_id
self.data_type = data_type
self.lineage = lineage
def __repr__(self):
return '-'.join([self.run_id, self.data_type, self.lineage_hash])
@property
def lineage_hash(self):
"""Deterministic hash of the lineage"""
# We cache the hash computation to benefit tight loops calling
# this property
if self._lineage_hash == '':
self._lineage_hash = strax.deterministic_hash(self.lineage)
return self._lineage_hash
@export
class DataNotAvailable(Exception):
"""Raised when requested data is not available"""
pass
@export
class EmptyDataWarning(UserWarning):
pass
@export
class DataExistsError(Exception):
"""Raised when attempting to write a piece of data
that is already written"""
def __init__(self, at, message=''):
super().__init__(message)
self.at = at
@export
class DataCorrupted(Exception):
pass
@export
class RunMetadataNotAvailable(Exception):
pass
@export
class StorageFrontend:
"""Interface to something that knows data-locations and run-level metadata.
For example, a runs database, or a data directory on the file system.
"""
backends: list
can_define_runs = False
provide_run_metadata = False
def __init__(self,
readonly=False,
provide_run_metadata=None,
overwrite='if_broken',
take_only=tuple(),
exclude=tuple()):
"""
:param readonly: If True, throws CannotWriteData whenever saving is
attempted.
:param overwrite: When to overwrite data that already exists.
- 'never': Never overwrite any data.
- 'if_broken': Only overwrites data if it is incomplete or broken.
- 'always': Always overwrite data. Use with caution!
:param take_only: Provide/accept only these data types.
:param exclude: Do NOT provide/accept these data types.
        :param provide_run_metadata: Whether to provide run-level metadata
            (run docs). If None, use the class-specific default.
        If take_only and exclude are both omitted, provide all data types.
        If a data type is listed in both, it will not be provided.
        Attempting to read/write unwanted data types raises DataNotAvailable.
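
        Example (illustrative sketch; DataDirectory is one concrete
        frontend shipped with strax, the path here is hypothetical):

        >>> sf = strax.DataDirectory('./strax_data',
        ...                          take_only=('peaks',),
        ...                          readonly=True)  # doctest: +SKIP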
"""
if overwrite not in 'never if_broken always'.split():
raise RuntimeError(f"Invalid 'overwrite' setting {overwrite}. ")
self.take_only = strax.to_str_tuple(take_only)
self.exclude = strax.to_str_tuple(exclude)
self.overwrite = overwrite
if provide_run_metadata is not None:
self.provide_run_metadata = provide_run_metadata
self.readonly = readonly
self.log = logging.getLogger(self.__class__.__name__)
def loader(self, key: DataKey,
time_range=None,
allow_incomplete=False,
fuzzy_for=tuple(),
fuzzy_for_options=tuple(),
chunk_number=None,
executor=None):
"""Return loader for data described by DataKey.
:param key: DataKey describing data
        :param time_range: 2-length arraylike of (start, exclusive end)
            of the time range to load. Default is None, which means load
            the entire run.
:param allow_incomplete: Allow loading of data which has not been
completely written to disk yet.
:param fuzzy_for: list/tuple of plugin names for which no
plugin name, version, or option check is performed.
:param fuzzy_for_options: list/tuple of configuration options for which
no check is performed.
:param chunk_number: Chunk number to load exclusively.
:param executor: Executor for pushing load computation to
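
        Example (illustrative; ``sf`` is a StorageFrontend, ``key`` a
        DataKey, and ``process`` a hypothetical consumer):

        >>> for chunk in sf.loader(key):  # doctest: +SKIP
        ...     process(chunk)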
"""
backend, backend_key = self.find(key,
write=False,
allow_incomplete=allow_incomplete,
fuzzy_for=fuzzy_for,
fuzzy_for_options=fuzzy_for_options)
return self._get_backend(backend).loader(
backend_key,
time_range=time_range,
executor=executor,
chunk_number=chunk_number)
def saver(self, key, metadata):
"""Return saver for data described by DataKey."""
backend, backend_key = self.find(key, write=True)
return self._get_backend(backend).saver(backend_key,
metadata)
def get_metadata(self, key,
allow_incomplete=False,
fuzzy_for=tuple(),
fuzzy_for_options=tuple()):
"""Retrieve data-level metadata for the specified key.
Other parameters are the same as for .find
"""
backend, backend_key = self.find(key,
write=False,
check_broken=False,
allow_incomplete=allow_incomplete,
fuzzy_for=fuzzy_for,
fuzzy_for_options=fuzzy_for_options)
return self._get_backend(backend).get_metadata(backend_key)
    def _we_take(self, data_type):
        """Return whether data_type can be provided by this frontend"""
return not (data_type in self.exclude
or self.take_only and data_type not in self.take_only)
def find(self, key: DataKey,
write=False,
check_broken=True,
allow_incomplete=False,
             fuzzy_for=tuple(), fuzzy_for_options=tuple()):
        """Return a (backend class name, backend-specific key) tuple
        for getting at / writing the data, or raise an exception.
        :param key: DataKey of data to load; its lineage has the form
            {data_type: (plugin_name, version, {config_option: value, ...}), ...}
:param write: Set to True if writing new data. The data is immediately
registered, so you must follow up on the write!
        :param check_broken: If True, raise DataNotAvailable if data has not
            been completely written, or writing terminated with an exception.
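
        Example (illustrative; ``sf`` and ``key`` as in loader above):

        >>> backend_name, backend_key = sf.find(key)  # doctest: +SKIP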
"""
        message = (
            f"\nRequested lineage: {key.lineage}."
            f"\nIgnoring plugin lineage for: {fuzzy_for}."
            f"\nIgnoring config options: {fuzzy_for_options}.")
if not self._we_take(key.data_type):
raise DataNotAvailable(
f"{self} does not accept or provide data type {key.data_type}")
if write:
if self.readonly:
                raise DataNotAvailable(f"{self} cannot write any data: "
                                       "it is readonly")
try:
at = self.find(key, write=False,
allow_incomplete=allow_incomplete,
fuzzy_for=fuzzy_for,
fuzzy_for_options=fuzzy_for_options)
raise DataExistsError(
at=at,
message=(f"Data already exists at {at}.\n"
+ message))
except DataNotAvailable:
pass
try:
backend_name, backend_key = self._find(
key=key,
write=write,
allow_incomplete=allow_incomplete,
fuzzy_for=fuzzy_for,
fuzzy_for_options=fuzzy_for_options)
except DataNotAvailable:
raise DataNotAvailable(
f"{key.data_type} for {key.run_id} not available." + message)
if not write and check_broken:
# Get the metadata to check if the data is broken
meta = self._get_backend(backend_name).get_metadata(backend_key)
if 'exception' in meta:
exc = meta['exception']
raise DataNotAvailable(
f"Data in {backend_name} {backend_key} corrupted due to "
f"exception during writing: {exc}.")
            if 'writing_ended' not in meta and not allow_incomplete:
                raise DataNotAvailable(
                    f"Data in {backend_name} {backend_key} is incomplete: "
                    f"no writing_ended field present!")
return backend_name, backend_key
def _get_backend(self, backend):
for b in self.backends:
if b.__class__.__name__ == backend:
return b
raise KeyError(f"Unknown storage backend {backend} specified")
    def _matches(self, lineage: dict, desired_lineage: dict,
                 fuzzy_for: tuple, fuzzy_for_options: tuple):
        """Return whether lineage matches desired_lineage,
        given the fuzzy ignore options"""
if not (fuzzy_for or fuzzy_for_options):
return lineage == desired_lineage
args = [fuzzy_for, fuzzy_for_options]
return (
self._filter_lineage(lineage, *args)
== self._filter_lineage(desired_lineage, *args))
@staticmethod
def _filter_lineage(lineage, fuzzy_for, fuzzy_for_options):
"""Return lineage without parts to be ignored in matching"""
return {data_type: (v[0],
v[1],
{option_name: b
for option_name, b in v[2].items()
if option_name not in fuzzy_for_options})
for data_type, v in lineage.items()
if data_type not in fuzzy_for}
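    # For illustration (hypothetical lineage): with fuzzy_for=('records',)
    # and fuzzy_for_options=('gain',), the lineage
    #   {'peaks':   ('Peaks',   '0.1.0', {'gain': 1}),
    #    'records': ('Records', '0.2.0', {})}
    # filters down to
    #   {'peaks': ('Peaks', '0.1.0', {})}
    # so two lineages differing only in 'records' or 'gain' still match.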
def _can_overwrite(self, key: DataKey):
if self.overwrite == 'always':
return True
if self.overwrite == 'if_broken':
metadata = self.get_metadata(key)
return not ('writing_ended' in metadata
and 'exception' not in metadata)
return False
    def find_several(self, keys, **kwargs):
        """Return a list with a find() result, or False if the data
        is not available, for each of several data keys.
        Options are as for find().
        """
# You can override this if the backend has a smarter way
# of checking availability (e.g. a single DB query)
result = []
for key in keys:
try:
r = self.find(key, **kwargs)
except (strax.DataNotAvailable,
strax.DataCorrupted):
r = False
result.append(r)
return result
def define_run(self, name, sub_run_spec, **metadata):
self.write_run_metadata(name, dict(
sub_run_spec=sub_run_spec,
**metadata))
##
# Abstract methods (to override in child)
##
def _scan_runs(self, store_fields):
"""Iterable of run document / metadata dictionaries
"""
        yield from ()
def _find(self, key: DataKey,
write, allow_incomplete, fuzzy_for, fuzzy_for_options):
"""Return backend key (e.g. for filename) for data identified by key,
raise DataNotAvailable, or DataExistsError
Parameters are as for find.
"""
# Use the self._matches attribute to compare lineages according to
# the fuzzy options
raise NotImplementedError
def run_metadata(self, run_id, projection=None):
"""Return run metadata dictionary, or raise RunMetadataNotAvailable"""
raise NotImplementedError
def write_run_metadata(self, run_id, metadata):
"""Stores metadata for run_id. Silently overwrites any previously
stored run-level metadata."""
raise NotImplementedError
def remove(self, key):
"""Removes a registration. Does not delete any actual data"""
raise NotImplementedError
@export
class StorageBackend:
"""Storage backend for strax data.
This is a 'dumb' interface to data. Each bit of data stored is described
by backend-specific keys (e.g. directory names).
Finding and assigning backend keys is the responsibility of the
StorageFrontend.
The backend class name + backend_key must together uniquely identify a
piece of data. So don't make __init__ take options like 'path' or 'host',
these have to be hardcoded (or made part of the key).
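
    A minimal concrete backend (sketch; the class name and method bodies
    here are hypothetical) implements the three abstract methods below:

    >>> class MyBackend(StorageBackend):  # doctest: +SKIP
    ...     def get_metadata(self, backend_key):
    ...         ...
    ...     def _read_chunk(self, backend_key, chunk_info, dtype, compressor):
    ...         ...
    ...     def _saver(self, key, metadata):
    ...         ...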
"""
def loader(self,
backend_key,
time_range=None,
chunk_number=None,
executor=None):
"""Iterates over strax data in backend_key
:param time_range: 2-length arraylike of (start, exclusive end)
of desired data. Will return all data that partially overlaps with
the range.
        Default is None, which means load the entire run.
:param chunk_number: Chunk number to get exclusively
:param executor: Executor to push load/decompress operations to
"""
metadata = self.get_metadata(backend_key)
if 'strax_version' in metadata:
v_old = metadata['strax_version']
if version.parse(v_old) < version.parse('0.9.0'):
raise strax.DataNotAvailable(
f"Cannot load data at {backend_key}: "
f"it was created with strax {v_old}, "
f"but you have strax {strax.__version__}. ")
else:
warnings.warn(f"Data at {backend_key} does not say what strax "
"version it was generated with. This means it is "
"corrupted, or very, very old. Probably "
"we cannot load this.")
        # 'start' and 'end' are not required, to support the
        # allow_incomplete option
required_fields = (
'run_id data_type data_kind dtype compressor').split()
missing_fields = [x for x in required_fields if x not in metadata]
if len(missing_fields):
raise strax.DataNotAvailable(
f"Cannot load data at {backend_key}: metadata is "
f"missing the required fields {missing_fields}. ")
if not len(metadata['chunks']):
raise ValueError(
f"Cannot load data at {backend_key}, it has no chunks!")
dtype = literal_eval(metadata['dtype'])
# Common arguments for chunk construction, not stored with chunk-level
# metadata
chunk_kwargs = dict(
data_type=metadata['data_type'],
data_kind=metadata['data_kind'],
dtype=dtype,
target_size_mb=metadata.get('chunk_target_size_mb',
strax.default_chunk_size_mb))
required_chunk_metadata_fields = 'start end run_id'.split()
for i, chunk_info in enumerate(strax.iter_chunk_meta(metadata)):
missing_fields = [x for x in required_chunk_metadata_fields
if x not in chunk_info]
if len(missing_fields):
                raise ValueError(
                    f"Error reading chunk {i} of {metadata['data_type']} "
                    f"of {metadata['run_id']} from {backend_key}: "
                    f"chunk metadata is missing fields {missing_fields}")
# Chunk number constraint
if chunk_number is not None:
if i != chunk_number:
continue
# Time constraint
if time_range:
if (chunk_info['end'] <= time_range[0]
or time_range[1] <= chunk_info['start']):
# Chunk does not cover any part of range
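                    # (e.g. chunk [10, 20) is skipped for time_range
                    # (20, 30), but kept for (15, 30): any partial
                    # overlap is enough to load the chunk)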
continue
read_chunk_kwargs = dict(
backend_key=backend_key,
dtype=dtype,
metadata=metadata,
chunk_info=chunk_info,
time_range=time_range,
chunk_construction_kwargs=chunk_kwargs)
if executor is None:
yield self._read_and_format_chunk(**read_chunk_kwargs)
else:
yield executor.submit(self._read_and_format_chunk,
**read_chunk_kwargs)
def _read_and_format_chunk(self,
*,
backend_key,
dtype,
metadata,
chunk_info,
time_range,
chunk_construction_kwargs) -> strax.Chunk:
if chunk_info['n'] == 0:
# No data, no need to load
data = np.empty(0, dtype=dtype)
else:
data = self._read_chunk(backend_key,
chunk_info=chunk_info,
dtype=dtype,
compressor=metadata['compressor'])
result = strax.Chunk(
start=chunk_info['start'],
end=chunk_info['end'],
run_id=chunk_info['run_id'],
data=data,
**chunk_construction_kwargs)
if time_range:
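            # Trim the chunk to the requested range; the end-side split
            # may fail (strax.CannotSplit) if entries straddle the
            # boundary, in which case slightly more data is returned.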
if result.start < time_range[0]:
_, result = result.split(t=time_range[0],
allow_early_split=True)
if result.end > time_range[1]:
try:
result, _ = result.split(t=time_range[1],
allow_early_split=False)
except strax.CannotSplit:
pass
return result
def saver(self, key, metadata):
"""Return saver for data described by key"""
metadata.setdefault('compressor', 'blosc') # TODO wrong place?
metadata['strax_version'] = strax.__version__
if 'dtype' in metadata:
            metadata['dtype'] = repr(metadata['dtype'].descr)
return self._saver(key, metadata)
##
# Abstract methods (to override in child)
##
def get_metadata(self, backend_key):
"""Return metadata of data described by key.
"""
raise NotImplementedError
def _read_chunk(self, backend_key, chunk_info, dtype, compressor):
"""Return a single data chunk"""
raise NotImplementedError
def _saver(self, key, metadata):
raise NotImplementedError
@export # Needed for type hints elsewhere
class Saver:
"""Interface for saving a data type
Must work even if forked.
Do NOT add unpickleable things as attributes (such as loggers)!
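
    Typical use (sketch; ``backend``, ``backend_key``, ``metadata`` and
    ``source`` are hypothetical stand-ins for a StorageBackend, its key,
    the data-level metadata, and a chunk iterator):

    >>> saver = backend.saver(backend_key, metadata)  # doctest: +SKIP
    >>> saver.save_from(source)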
"""
closed = False
allow_rechunk = True # If False, do not rechunk even if plugin allows it
allow_fork = True # If False, cannot be inlined / forked
# This is set if the saver is operating in multiple processes at once
# Do not set it yourself
is_forked = False
got_exception = None
def __init__(self, metadata):
self.md = metadata
self.md['writing_started'] = time.time()
self.md['chunks'] = []
    def save_from(self, source: typing.Generator, rechunk=True, executor=None):
        """Iterate over source and save the results under this saver's key,
        along with metadata
"""
pending = []
exhausted = False
chunk_i = 0
try:
while not exhausted:
chunk = None
try:
if rechunk and self.allow_rechunk:
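                        # Accumulate successive chunks until the target
                        # (uncompressed) size is reached, so we don't
                        # write many tiny chunks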
while (chunk is None or
chunk.data.nbytes < chunk.target_size_mb*1e6):
chunk = strax.Chunk.concatenate(
[chunk, next(source)])
else:
chunk = next(source)
except StopIteration:
exhausted = True
if chunk is None:
break
new_f = self.save(chunk=chunk,
chunk_i=chunk_i, executor=executor)
pending = [f for f in pending if not f.done()]
if new_f is not None:
pending += [new_f]
chunk_i += 1
except strax.MailboxKilled:
# Write exception (with close), but exit gracefully.
# One traceback on screen is enough
            self.close(wait_for=pending)
except Exception as e:
# log exception for the final check
self.got_exception = e
# Throw the exception back into the mailbox
# (hoping that it is still listening...)
source.throw(e)
raise e
finally:
if not self.closed:
self.close(wait_for=pending)
def save(self, chunk: strax.Chunk, chunk_i: int, executor=None):
"""Save a chunk, returning future to wait on or None"""
if self.closed:
            raise RuntimeError(f"Attempt to save to {self.md} saver, "
                               f"which is already closed!")
chunk_info = dict(chunk_i=chunk_i,
n=len(chunk),
start=chunk.start,
end=chunk.end,
run_id=chunk.run_id,
nbytes=chunk.nbytes)
if len(chunk) != 0 and 'time' in chunk.dtype.names:
for desc, i in (('first', 0), ('last', -1)):
chunk_info[f'{desc}_time'] = \
int(chunk.data[i]['time'])
chunk_info[f'{desc}_endtime'] = \
int(strax.endtime(chunk.data[i]))
if len(chunk):
bonus_info, future = self._save_chunk(
chunk.data,
chunk_info,
executor=None if self.is_forked else executor)
chunk_info.update(bonus_info)
else:
# No need to create an empty file for an empty chunk;
# the annotation in the metadata is sufficient.
future = None
self._save_chunk_metadata(chunk_info)
return future
def close(self,
wait_for: typing.Union[list, tuple] = tuple(),
timeout=300):
if self.closed:
raise RuntimeError(f"{self.md} saver already closed")
if wait_for:
done, not_done = wait(wait_for, timeout=timeout)
if len(not_done):
                raise RuntimeError(
                    f"{len(not_done)} futures of {self.md} did not "
                    "complete in time!")
self.closed = True
exc_info = strax.formatted_exception()
if exc_info:
self.md['exception'] = exc_info
if self.md['chunks']:
# Update to precise start and end values
self.md['start'] = self.md['chunks'][0]['start']
self.md['end'] = self.md['chunks'][-1]['end']
# If there were no chunks, we are certainly crashing.
# Don't throw another exception
self.md['writing_ended'] = time.time()
self._close()
##
# Abstract methods (to override in child)
##
def _save_chunk(self, data, chunk_info, executor=None):
"""Save a chunk to file. Return (
dict with extra info for metadata,
future to wait on or None)
"""
raise NotImplementedError
def _save_chunk_metadata(self, chunk_info):
raise NotImplementedError
def _close(self):
raise NotImplementedError
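# For reference, a concrete Saver (sketch; names hypothetical) would
# implement the abstract methods above along these lines:
#
#     class MySaver(Saver):
#         def _save_chunk(self, data, chunk_info, executor=None):
#             ...                  # write the data somewhere
#             return dict(), None  # (extra metadata, future or None)
#
#         def _save_chunk_metadata(self, chunk_info):
#             ...                  # record chunk_info in an index
#
#         def _close(self):
#             ...                  # finalize / flush self.md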