From ba8ffd738abffc1e6ea827817d759208d419c321 Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Fri, 27 Jun 2014 16:31:10 +0200 Subject: [PATCH] Code reformatted through PyCharm. --- bcolz/arrayprint.py | 114 +- bcolz/bcolz_ext.pyx | 4102 ++++++++++++++++++++-------------------- bench/arange.py | 11 +- bench/concat.py | 21 +- bench/ctable-query.py | 72 +- bench/eval-profile.py | 22 +- bench/eval.py | 38 +- bench/expression.py | 20 +- bench/fill.py | 18 +- bench/fromiter.py | 37 +- bench/getitem.py | 23 +- bench/iter.py | 13 +- bench/iterator.py | 27 +- bench/large_carray.py | 6 +- bench/query.py | 27 +- bench/serialization.py | 29 +- bench/sum.py | 17 +- bench/zeros.py | 11 +- doc/conf.py | 38 +- doc/index.rst | 20 +- pavement.py | 99 +- persistence.rst | 34 +- setup.py | 86 +- 23 files changed, 2527 insertions(+), 2358 deletions(-) diff --git a/bcolz/arrayprint.py b/bcolz/arrayprint.py index 97205ce8..66dd5b78 100644 --- a/bcolz/arrayprint.py +++ b/bcolz/arrayprint.py @@ -14,22 +14,25 @@ # adapted by Francesc Alted 20012-8-18 for bcolz import sys + import numpy as np from numpy.core import numerictypes as _nt from numpy import maximum, minimum, absolute, not_equal, isnan, isinf from numpy.core.multiarray import format_longfloat from numpy.core.fromnumeric import ravel + try: from numpy.core.multiarray import datetime_as_string, datetime_data except ImportError: pass -def product(x, y): return x*y +def product(x, y): return x * y + -_summaryEdgeItems = 3 # repr N leading and trailing items of each dimension -_summaryThreshold = 1000 # total items > triggers array summarization +_summaryEdgeItems = 3 # repr N leading and trailing items of each dimension +_summaryThreshold = 1000 # total items > triggers array summarization _float_output_precision = 8 _float_output_suppress_small = False @@ -41,6 +44,7 @@ def product(x, y): return x*y if sys.version_info[0] >= 3: from functools import reduce + def set_printoptions(precision=None, threshold=None, edgeitems=None, linewidth=None, suppress=None, nanstr=None, infstr=None, @@ -147,8 +151,8 @@ def set_printoptions(precision=None, threshold=None, edgeitems=None, """ global _summaryThreshold, _summaryEdgeItems, _float_output_precision, \ - _line_width, _float_output_suppress_small, _nan_str, _inf_str, \ - _formatter + _line_width, _float_output_suppress_small, _nan_str, _inf_str, \ + _formatter if linewidth is not None: _line_width = linewidth if threshold is not None: @@ -165,6 +169,7 @@ def set_printoptions(precision=None, threshold=None, edgeitems=None, _inf_str = infstr _formatter = formatter + def get_printoptions(): """ Return the current print options. 
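As an aside for readers of this hunk: these options mirror NumPy's print-options machinery, adapted for bcolz. A minimal usage sketch, assuming the module is importable as bcolz.arrayprint (the function names come straight from the code above):

    from bcolz import arrayprint

    # Summarize arrays with more than 500 items and print floats
    # with 4 significant digits.
    arrayprint.set_printoptions(threshold=500, precision=4)

    opts = arrayprint.get_printoptions()
    print(opts['threshold'], opts['edgeitems'])   # -> 500 3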
@@ -200,25 +205,28 @@ def get_printoptions(): formatter=_formatter) return d + def _leading_trailing(a): import numpy.core.numeric as _nc + if a.ndim == 1: - if len(a) > 2*_summaryEdgeItems: + if len(a) > 2 * _summaryEdgeItems: b = _nc.concatenate((a[:_summaryEdgeItems], - a[-_summaryEdgeItems:])) + a[-_summaryEdgeItems:])) else: b = a else: - if len(a) > 2*_summaryEdgeItems: + if len(a) > 2 * _summaryEdgeItems: l = [_leading_trailing(a[i]) for i in range( min(len(a), _summaryEdgeItems))] l.extend([_leading_trailing(a[-i]) for i in range( - min(len(a), _summaryEdgeItems),0,-1)]) + min(len(a), _summaryEdgeItems), 0, -1)]) else: l = [_leading_trailing(a[i]) for i in range(0, len(a))] b = _nc.concatenate(tuple(l)) return b + def _boolFormatter(x): if x: return ' True' @@ -229,9 +237,9 @@ def _boolFormatter(x): def repr_format(x): return repr(x) + def _array2string(a, max_line_width, precision, suppress_small, separator=' ', prefix="", formatter=None): - if max_line_width is None: max_line_width = _line_width @@ -251,17 +259,17 @@ def _array2string(a, max_line_width, precision, suppress_small, separator=' ', summary_insert = "" data = ravel(a) - formatdict = {'bool' : _boolFormatter, - 'int' : IntegerFormat(data), - 'float' : FloatFormat(data, precision, suppress_small), - 'longfloat' : LongFloatFormat(precision), - 'complexfloat' : ComplexFormat(data, precision, - suppress_small), - 'longcomplexfloat' : LongComplexFormat(precision), - 'datetime' : DatetimeFormat(data), - 'timedelta' : TimedeltaFormat(data), - 'numpystr' : repr_format, - 'str' : str} + formatdict = {'bool': _boolFormatter, + 'int': IntegerFormat(data), + 'float': FloatFormat(data, precision, suppress_small), + 'longfloat': LongFloatFormat(precision), + 'complexfloat': ComplexFormat(data, precision, + suppress_small), + 'longcomplexfloat': LongComplexFormat(precision), + 'datetime': DatetimeFormat(data), + 'timedelta': TimedeltaFormat(data), + 'numpystr': repr_format, + 'str': str} if formatter is not None: fkeys = [k for k in formatter.keys() if formatter[k] is not None] @@ -289,6 +297,7 @@ def _array2string(a, max_line_width, precision, suppress_small, separator=' ', msg = "The `_format` attribute is deprecated in Numpy 2.0 and " \ "will be removed in 2.1. Use the `formatter` kw instead." 
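The _leading_trailing helper above implements the "edge items" summarization: once an array exceeds _summaryThreshold items, only _summaryEdgeItems leading and trailing items per dimension are kept around a summary insert. A sketch of the resulting behavior, shown here with NumPy's equivalent defaults of 3 edge items and a threshold of 1000 (exact output varies slightly across NumPy versions):

    import numpy as np

    a = np.arange(2000)
    # Above the threshold, only 3 leading and 3 trailing items survive:
    print(np.array2string(a))
    # -> [   0    1    2 ... 1997 1998 1999]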
import warnings + warnings.warn(msg, DeprecationWarning) except AttributeError: # find the right formatting function for the array @@ -297,7 +306,7 @@ def _array2string(a, max_line_width, precision, suppress_small, separator=' ', format_function = formatdict['bool'] elif issubclass(dtypeobj, _nt.integer): if (hasattr(_nt, "timedelta64") and - issubclass(dtypeobj, _nt.timedelta64)): + issubclass(dtypeobj, _nt.timedelta64)): format_function = formatdict['timedelta'] else: format_function = formatdict['int'] @@ -313,8 +322,8 @@ def _array2string(a, max_line_width, precision, suppress_small, separator=' ', format_function = formatdict['complexfloat'] elif issubclass(dtypeobj, (_nt.unicode_, _nt.string_)): format_function = formatdict['numpystr'] - elif(hasattr(_nt, "datetime64") and - issubclass(dtypeobj, _nt.datetime64)): + elif (hasattr(_nt, "datetime64") and + issubclass(dtypeobj, _nt.datetime64)): format_function = formatdict['datetime'] else: format_function = formatdict['str'] @@ -322,15 +331,17 @@ def _array2string(a, max_line_width, precision, suppress_small, separator=' ', # skip over "[" next_line_prefix = " " # skip over array( - next_line_prefix += " "*len(prefix) + next_line_prefix += " " * len(prefix) lst = _formatArray(a, format_function, len(a.shape), max_line_width, next_line_prefix, separator, _summaryEdgeItems, summary_insert)[:-1] return lst + def _convert_arrays(obj): import numpy.core.numeric as _nc + newtup = [] for k in obj: if isinstance(k, _nc.ndarray): @@ -441,6 +452,7 @@ def array2string(a, max_line_width=None, precision=None, "2.0 and will be removed in 2.1. Use the " \ "`formatter` kw instead." import warnings + warnings.warn(msg, DeprecationWarning) except AttributeError: if isinstance(x, tuple): @@ -454,6 +466,7 @@ def array2string(a, max_line_width=None, precision=None, separator, prefix, formatter=formatter) return lst + def _extendLine(s, line, word, max_line_len, next_line_prefix): if len(line.rstrip()) + len(word.rstrip()) >= max_line_len: s += line.rstrip() + "\n" @@ -477,9 +490,9 @@ def _formatArray(a, format_function, rank, max_line_len, obj = _convert_arrays(obj) return str(obj) - if summary_insert and 2*edge_items < len(a): + if summary_insert and 2 * edge_items < len(a): leading_items, trailing_items, summary_insert1 = \ - edge_items, edge_items, summary_insert + edge_items, edge_items, summary_insert else: leading_items, trailing_items, summary_insert1 = 0, len(a), "" @@ -488,14 +501,17 @@ def _formatArray(a, format_function, rank, max_line_len, line = next_line_prefix for i in xrange(leading_items): word = format_function(a[i]) + separator - s, line = _extendLine(s, line, word, max_line_len, next_line_prefix) + s, line = _extendLine(s, line, word, max_line_len, + next_line_prefix) if summary_insert1: - s, line = _extendLine(s, line, summary_insert1, max_line_len, next_line_prefix) + s, line = _extendLine(s, line, summary_insert1, max_line_len, + next_line_prefix) for i in xrange(trailing_items, 1, -1): word = format_function(a[-i]) + separator - s, line = _extendLine(s, line, word, max_line_len, next_line_prefix) + s, line = _extendLine(s, line, word, max_line_len, + next_line_prefix) word = format_function(a[-1]) s, line = _extendLine(s, line, word, max_line_len, next_line_prefix) @@ -507,10 +523,10 @@ def _formatArray(a, format_function, rank, max_line_len, for i in xrange(leading_items): if i > 0: s += next_line_prefix - s += _formatArray(a[i], format_function, rank-1, max_line_len, + s += _formatArray(a[i], format_function, rank - 1, 
max_line_len, " " + next_line_prefix, separator, edge_items, summary_insert) - s = s.rstrip() + sep.rstrip() + '\n'*max(rank-1,1) + s = s.rstrip() + sep.rstrip() + '\n' * max(rank - 1, 1) if summary_insert1: s += next_line_prefix + summary_insert1 + "\n" @@ -518,17 +534,18 @@ def _formatArray(a, format_function, rank, max_line_len, for i in xrange(trailing_items, 1, -1): if leading_items or i != trailing_items: s += next_line_prefix - s += _formatArray(a[-i], format_function, rank-1, max_line_len, + s += _formatArray(a[-i], format_function, rank - 1, max_line_len, " " + next_line_prefix, separator, edge_items, summary_insert) - s = s.rstrip() + sep.rstrip() + '\n'*max(rank-1,1) + s = s.rstrip() + sep.rstrip() + '\n' * max(rank - 1, 1) if leading_items or trailing_items > 1: s += next_line_prefix - s += _formatArray(a[-1], format_function, rank-1, max_line_len, + s += _formatArray(a[-1], format_function, rank - 1, max_line_len, " " + next_line_prefix, separator, edge_items, - summary_insert).rstrip()+']\n' + summary_insert).rstrip() + ']\n' return s + class FloatFormat(object): def __init__(self, data, precision, suppress_small, sign=False): self.precision = precision @@ -546,6 +563,7 @@ def __init__(self, data, precision, suppress_small, sign=False): def fillFormat(self, data): import numpy.core.numeric as _nc + errstate = _nc.seterr(all='ignore') try: special = isnan(data) | isinf(data) @@ -560,7 +578,7 @@ def fillFormat(self, data): if max_val >= 1.e8: self.exp_format = True if not self.suppress_small and (min_val < 0.0001 - or max_val/min_val > 1000.): + or max_val / min_val > 1000.): self.exp_format = True finally: _nc.seterr(**errstate) @@ -587,7 +605,7 @@ def fillFormat(self, data): if _nc.any(special): self.max_str_len = max(self.max_str_len, len(_nan_str), - len(_inf_str)+1) + len(_inf_str) + 1) if self.sign: format = '%#+' else: @@ -599,6 +617,7 @@ def fillFormat(self, data): def __call__(self, x, strip_zeros=True): import numpy.core.numeric as _nc + err = _nc.seterr(invalid='ignore') try: if isnan(x): @@ -629,7 +648,7 @@ def __call__(self, x, strip_zeros=True): s = ' ' + s[:-3] + s[-2:] elif strip_zeros: z = s.rstrip('0') - s = z + ' '*(len(s)-len(z)) + s = z + ' ' * (len(s) - len(z)) return s @@ -640,7 +659,9 @@ def _digits(x, precision, format): _MAXINT = sys.maxint -_MININT = -sys.maxint-1 +_MININT = -sys.maxint - 1 + + class IntegerFormat(object): def __init__(self, data): try: @@ -661,6 +682,7 @@ def __call__(self, x): else: return "%s" % x + class LongFloatFormat(object): # XXX Have to add something to determine the width to use a la FloatFormat # Right now, things won't line up properly @@ -713,14 +735,15 @@ def __call__(self, x): i = self.imag_format(x.imag, strip_zeros=False) if not self.imag_format.exp_format: z = i.rstrip('0') - i = z + 'j' + ' '*(len(i)-len(z)) + i = z + 'j' + ' ' * (len(i) - len(z)) else: i = i + 'j' return r + i + class DatetimeFormat(object): def __init__(self, x, unit=None, - timezone=None, casting='same_kind'): + timezone=None, casting='same_kind'): # Get the unit from the dtype if unit is None: if x.dtype.kind == 'M': @@ -742,9 +765,10 @@ def __init__(self, x, unit=None, def __call__(self, x): return "'%s'" % datetime_as_string(x, - unit=self.unit, - timezone=self.timezone, - casting=self.casting) + unit=self.unit, + timezone=self.timezone, + casting=self.casting) + class TimedeltaFormat(object): def __init__(self, data): diff --git a/bcolz/bcolz_ext.pyx b/bcolz/bcolz_ext.pyx index 85fde5e8..50af4aae 100644 --- a/bcolz/bcolz_ext.pyx +++ 
b/bcolz/bcolz_ext.pyx @@ -8,19 +8,22 @@ import sys -import numpy as np -import bcolz -from bcolz import utils, attrs, array2string -import os, os.path +import os +import os.path import struct import shutil import tempfile import json + +import numpy as np import cython +import bcolz +from bcolz import utils, attrs, array2string + _KB = 1024 -_MB = 1024*_KB +_MB = 1024 * _KB # Directories for saving the data and metadata for carray persistency DATA_DIR = 'data' @@ -35,7 +38,7 @@ BLOSCPACK_HEADER_LENGTH = 16 BLOSC_HEADER_LENGTH = 16 FORMAT_VERSION = 1 MAX_FORMAT_VERSION = 255 -MAX_CHUNKS = (2**63)-1 +MAX_CHUNKS = (2 ** 63) - 1 # The type used for size values: indexes, coordinates, dimension # lengths, row numbers, shapes, chunk shapes, byte counts... @@ -48,37 +51,36 @@ IntType = np.dtype(np.int_) # numpy functions & objects from definitions cimport import_array, ndarray, dtype, \ - malloc, realloc, free, memcpy, memset, strdup, strcmp, \ - PyString_AsString, PyString_FromString, \ - PyString_FromStringAndSize, \ - Py_BEGIN_ALLOW_THREADS, Py_END_ALLOW_THREADS, \ - PyArray_GETITEM, PyArray_SETITEM, \ - npy_intp + malloc, realloc, free, memcpy, memset, strdup, strcmp, \ + PyString_AsString, PyString_FromString, \ + PyString_FromStringAndSize, \ + Py_BEGIN_ALLOW_THREADS, Py_END_ALLOW_THREADS, \ + PyArray_GETITEM, PyArray_SETITEM, \ + npy_intp #----------------------------------------------------------------- # Blosc routines cdef extern from "blosc.h": - - cdef enum: - BLOSC_MAX_OVERHEAD, - BLOSC_VERSION_STRING, - BLOSC_VERSION_DATE - - void blosc_get_versions(char *version_str, char *version_date) - int blosc_set_nthreads(int nthreads) - int blosc_compress(int clevel, int doshuffle, size_t typesize, - size_t nbytes, void *src, void *dest, - size_t destsize) nogil - int blosc_decompress(void *src, void *dest, size_t destsize) nogil - int blosc_getitem(void *src, int start, int nitems, void *dest) nogil - void blosc_free_resources() - void blosc_cbuffer_sizes(void *cbuffer, size_t *nbytes, - size_t *cbytes, size_t *blocksize) - void blosc_cbuffer_metainfo(void *cbuffer, size_t *typesize, int *flags) - void blosc_cbuffer_versions(void *cbuffer, int *version, int *versionlz) - void blosc_set_blocksize(size_t blocksize) + cdef enum: + BLOSC_MAX_OVERHEAD, + BLOSC_VERSION_STRING, + BLOSC_VERSION_DATE + + void blosc_get_versions(char *version_str, char *version_date) + int blosc_set_nthreads(int nthreads) + int blosc_compress(int clevel, int doshuffle, size_t typesize, + size_t nbytes, void *src, void *dest, + size_t destsize) nogil + int blosc_decompress(void *src, void *dest, size_t destsize) nogil + int blosc_getitem(void *src, int start, int nitems, void *dest) nogil + void blosc_free_resources() + void blosc_cbuffer_sizes(void *cbuffer, size_t *nbytes, + size_t *cbytes, size_t *blocksize) + void blosc_cbuffer_metainfo(void *cbuffer, size_t *typesize, int *flags) + void blosc_cbuffer_versions(void *cbuffer, int *version, int *versionlz) + void blosc_set_blocksize(size_t blocksize) #---------------------------------------------------------------------------- @@ -93,321 +95,327 @@ import_array() # Some utilities def _blosc_set_nthreads(nthreads): - """ - blosc_set_nthreads(nthreads) + """ + blosc_set_nthreads(nthreads) - Sets the number of threads that Blosc can use. + Sets the number of threads that Blosc can use. - Parameters - ---------- - nthreads : int - The desired number of threads to use. + Parameters + ---------- + nthreads : int + The desired number of threads to use. 
- Returns - ------- - out : int - The previous setting for the number of threads. + Returns + ------- + out : int + The previous setting for the number of threads. - """ - return blosc_set_nthreads(nthreads) + """ + return blosc_set_nthreads(nthreads) def blosc_version(): - """ - blosc_version() + """ + blosc_version() - Return the version of the Blosc library. + Return the version of the Blosc library. - """ - return (BLOSC_VERSION_STRING, BLOSC_VERSION_DATE) + """ + return ( BLOSC_VERSION_STRING, BLOSC_VERSION_DATE) # This is the same than in utils.py, but works faster in extensions cdef get_len_of_range(npy_intp start, npy_intp stop, npy_intp step): - """Get the length of a (start, stop, step) range.""" - cdef npy_intp n + """Get the length of a (start, stop, step) range.""" + cdef npy_intp n - n = 0 - if start < stop: - # Do not use a cython.cdiv here (do not ask me why!) - n = ((stop - start - 1) // step + 1) - return n + n = 0 + if start < stop: + # Do not use a cython.cdiv here (do not ask me why!) + n = ((stop - start - 1) // step + 1) + return n cdef clip_chunk(npy_intp nchunk, npy_intp chunklen, npy_intp start, npy_intp stop, npy_intp step): - """Get the limits of a certain chunk based on its length.""" - cdef npy_intp startb, stopb, blen, distance - - startb = start - nchunk * chunklen - stopb = stop - nchunk * chunklen - - # Check limits - if (startb >= chunklen) or (stopb <= 0): - return startb, stopb, 0 # null size - if startb < 0: - startb = 0 - if stopb > chunklen: - stopb = chunklen - - # step corrections - if step > 1: - # Just correcting startb is enough - distance = (nchunk * chunklen + startb) - start - if distance % step > 0: - startb += (step - (distance % step)) - if startb > chunklen: + """Get the limits of a certain chunk based on its length.""" + cdef npy_intp startb, stopb, blen, distance + + startb = start - nchunk * chunklen + stopb = stop - nchunk * chunklen + + # Check limits + if (startb >= chunklen) or (stopb <= 0): return startb, stopb, 0 # null size + if startb < 0: + startb = 0 + if stopb > chunklen: + stopb = chunklen + + # step corrections + if step > 1: + # Just correcting startb is enough + distance = (nchunk * chunklen + startb) - start + if distance % step > 0: + startb += (step - (distance % step)) + if startb > chunklen: + return startb, stopb, 0 # null size - # Compute size of the clipped block - blen = get_len_of_range(startb, stopb, step) + # Compute size of the clipped block + blen = get_len_of_range(startb, stopb, step) - return startb, stopb, blen + return startb, stopb, blen cdef int check_zeros(char *data, int nbytes): - """Check whether [data, data+nbytes] is zero or not.""" - cdef int i, iszero, chunklen, leftover - cdef size_t *sdata - - iszero = 1 - sdata = data - chunklen = cython.cdiv(nbytes, sizeof(size_t)) - leftover = nbytes % sizeof(size_t) - with nogil: - for i from 0 <= i < chunklen: - if sdata[i] != 0: - iszero = 0 - break - else: - data += nbytes - leftover - for i from 0 <= i < leftover: - if data[i] != 0: - iszero = 0 - break - return iszero + """Check whether [data, data+nbytes] is zero or not.""" + cdef int i, iszero, chunklen, leftover + cdef size_t *sdata + + iszero = 1 + sdata = data + chunklen = cython.cdiv(nbytes, sizeof(size_t)) + leftover = nbytes % sizeof(size_t) + with nogil: + for i from 0 <= i < chunklen: + if sdata[i] != 0: + iszero = 0 + break + else: + data += nbytes - leftover + for i from 0 <= i < leftover: + if data[i] != 0: + iszero = 0 + break + return iszero cdef int true_count(char *data, int 
nbytes): - """Count the number of true values in data (boolean).""" - cdef int i, count + """Count the number of true values in data (boolean).""" + cdef int i, count - with nogil: - count = 0 - for i from 0 <= i < nbytes: - count += (data[i]) - return count + with nogil: + count = 0 + for i from 0 <= i < nbytes: + count += (data[i]) + return count #------------------------------------------------------------- cdef class chunk: - """ - chunk(array, atom, cparams) - - Compressed in-memory container for a data chunk. - - This class is meant to be used only by the `carray` class. - - """ - - # To save space, keep these variables under a minimum - cdef char typekind, isconstant - cdef int atomsize, itemsize, blocksize - cdef int nbytes, cbytes, cdbytes - cdef int true_count - cdef char *data - cdef object atom, constant, dobject - - property dtype: - "The NumPy dtype for this chunk." - def __get__(self): - return self.atom - - def __cinit__(self, object dobject, object atom, object cparams, - object _memory=True, object _compr=False): - cdef int itemsize, footprint - cdef size_t nbytes, cbytes, blocksize - cdef dtype dtype_ - - self.atom = atom - self.atomsize = atom.itemsize - dtype_ = atom.base - self.itemsize = itemsize = dtype_.elsize - self.typekind = dtype_.kind - self.dobject = None - footprint = 0 - - if _compr: - # Data comes in an already compressed state inside a Python String - self.data = PyString_AsString(dobject) - self.dobject = dobject # Increment the reference so that data don't go - # Set size info for the instance - blosc_cbuffer_sizes(self.data, &nbytes, &cbytes, &blocksize) - else: - # Compress the data object (a NumPy object) - nbytes, cbytes, blocksize, footprint = self.compress_data( - dobject, cparams, _memory) - footprint += 128 # add the (aprox) footprint of this instance in bytes - - # Fill instance data - self.nbytes = nbytes - self.cbytes = cbytes + footprint - self.cdbytes = cbytes - self.blocksize = blocksize - - cdef compress_data(self, ndarray array, object cparams, object _memory): - """Compress data in `array` and put it in ``self.data``""" - cdef size_t nbytes, cbytes, blocksize, itemsize, footprint - cdef int clevel, shuffle - cdef char *dest - - # Compute the total number of bytes in this array - itemsize = array.itemsize - nbytes = itemsize * array.size - cbytes = 0 - footprint = 0 - - # Check whether incoming data can be expressed as a constant or not. - # Disk-based chunks are not allowed to do this. - self.isconstant = 0 - self.constant = None - if _memory and (array.strides[0] == 0 - or check_zeros(array.data, nbytes)): - - self.isconstant = 1 - # Get the NumPy constant. Avoid this NumPy quirk: - # np.array(['1'], dtype='S3').dtype != s[0].dtype - if array.dtype.kind != 'S': - self.constant = array[0] - else: - self.constant = np.array(array[0], dtype=array.dtype) - # Add overhead (64 bytes for the overhead of the numpy container) - footprint += 64 + self.constant.size * self.constant.itemsize - - if self.isconstant: - blocksize = 4*1024 # use 4 KB as a cache for blocks - # Make blocksize a multiple of itemsize - if blocksize % itemsize > 0: - blocksize = cython.cdiv(blocksize, itemsize) * itemsize - # Correct in case we have a large itemsize - if blocksize == 0: - blocksize = itemsize - else: - if self.typekind == 'b': - self.true_count = true_count(array.data, nbytes) - - if array.strides[0] == 0: - # The chunk is made of constants. Regenerate the actual data. 
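For readers tracking the chunking logic, clip_chunk above computes which part of a global (start, stop, step) range falls inside chunk number nchunk. A pure-Python transliteration, for illustration only (names follow the Cython code; this is not part of the patch):

    def clip_chunk_py(nchunk, chunklen, start, stop, step):
        """Pure-Python rendering of the cdef clip_chunk above."""
        startb = start - nchunk * chunklen
        stopb = stop - nchunk * chunklen
        # The range does not touch this chunk at all
        if startb >= chunklen or stopb <= 0:
            return startb, stopb, 0
        startb = max(startb, 0)
        stopb = min(stopb, chunklen)
        if step > 1:
            # Align startb with the global step (correcting startb is enough)
            distance = (nchunk * chunklen + startb) - start
            if distance % step > 0:
                startb += step - (distance % step)
                if startb > chunklen:
                    return startb, stopb, 0
        blen = len(range(startb, stopb, step))  # get_len_of_range
        return startb, stopb, blen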
- array = array.copy() - - # Compress data - dest = malloc(nbytes+BLOSC_MAX_OVERHEAD) - clevel = cparams.clevel - shuffle = cparams.shuffle - with nogil: - cbytes = blosc_compress(clevel, shuffle, itemsize, nbytes, - array.data, dest, nbytes+BLOSC_MAX_OVERHEAD) - if cbytes <= 0: - raise RuntimeError, "fatal error during Blosc compression: %d" % cbytes - # Free the unused data - self.data = realloc(dest, cbytes) - # Set size info for the instance - blosc_cbuffer_sizes(self.data, &nbytes, &cbytes, &blocksize) - - return (nbytes, cbytes, blocksize, footprint) - - def getdata(self): - """Get a compressed String object out of this chunk (for persistence).""" - cdef object string - - assert (not self.isconstant, - "This function can only be used for persistency") - string = PyString_FromStringAndSize(self.data, self.cdbytes) - return string - - cdef void _getitem(self, int start, int stop, char *dest): - """Read data from `start` to `stop` and return it as a numpy array.""" - cdef int ret, bsize, blen, nitems, nstart - cdef ndarray constants - - blen = stop - start - bsize = blen * self.atomsize - nitems = cython.cdiv(bsize, self.itemsize) - nstart = cython.cdiv(start * self.atomsize, self.itemsize) - - if self.isconstant: - # The chunk is made of constants - constants = np.ndarray(shape=(blen,), dtype=self.dtype, - buffer=self.constant, strides=(0,)).copy() - memcpy(dest, constants.data, bsize) - return - - # Fill dest with uncompressed data - with nogil: - if bsize == self.nbytes: - ret = blosc_decompress(self.data, dest, bsize) - else: - ret = blosc_getitem(self.data, nstart, nitems, dest) - if ret < 0: - raise RuntimeError, "fatal error during Blosc decompression: %d" % ret - - def __getitem__(self, object key): - """__getitem__(self, key) -> values.""" - cdef ndarray array - cdef object start, stop, step, clen, idx - - if isinstance(key, (int, long)): - # Quickly return a single element - array = np.empty(shape=(1,), dtype=self.dtype) - self._getitem(key, key+1, array.data) - return PyArray_GETITEM(array, array.data) - elif isinstance(key, slice): - (start, stop, step) = key.start, key.stop, key.step - elif isinstance(key, tuple) and self.dtype.shape != (): - # Build an array to guess indices - clen = cython.cdiv(self.nbytes, self.itemsize) - idx = np.arange(clen, dtype=np.int32).reshape(self.dtype.shape) - idx2 = idx(key) - if idx2.flags.contiguous: - # The slice represents a contiguous slice. Get start and stop. - start, stop = idx2.flatten()[[0,-1]] - step = 1 - else: - (start, stop, step) = key[0].start, key[0].stop, key[0].step - else: - raise IndexError, "key not suitable:", key - - # Get the corrected values for start, stop, step - clen = cython.cdiv(self.nbytes, self.atomsize) - (start, stop, step) = slice(start, stop, step).indices(clen) - - # Build a numpy container - array = np.empty(shape=(stop-start,), dtype=self.dtype) - # Read actual data - self._getitem(start, stop, array.data) - - # Return the value depending on the step - if step > 1: - return array[::step] - return array - - def __setitem__(self, object key, object value): - """__setitem__(self, key, value) -> None.""" - raise NotImplementedError + """ + chunk(array, atom, cparams) - def __str__(self): - """Represent the chunk as an string.""" - return str(self[:]) + Compressed in-memory container for a data chunk. 
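compress_data above drives the Blosc C API directly (blosc_compress plus a realloc to trim the buffer). For intuition, the same round-trip can be sketched with the python-blosc bindings; using that package is an assumption for illustration only, as the extension itself never imports it:

    import numpy as np
    import blosc  # python-blosc bindings; illustration only

    arr = np.arange(10000, dtype=np.int64)
    nbytes = arr.size * arr.itemsize
    # clevel and shuffle play the role of the cparams consumed above
    packed = blosc.compress(arr.tobytes(), typesize=arr.itemsize,
                            clevel=5, shuffle=blosc.SHUFFLE)
    print(len(packed) < nbytes)                 # cbytes < nbytes, typically
    assert blosc.decompress(packed) == arr.tobytes()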
- def __repr__(self): - """Represent the chunk as an string, with additional info.""" - cratio = self.nbytes / float(self.cbytes) - fullrepr = "chunk(%s, %s) nbytes: %d; cbytes: %d; ratio: %.2f\n%r" % \ - (self.shape, self.dtype, self.nbytes, self.cbytes, cratio, str(self)) - return fullrepr + This class is meant to be used only by the `carray` class. - def __dealloc__(self): - """Release C resources before destruction.""" - if self.dobject: - self.dobject = None # DECREF pointer to data object - else: - free(self.data) # explictly free the data area + """ + # To save space, keep these variables under a minimum + cdef char typekind, isconstant + cdef int atomsize, itemsize, blocksize + cdef int nbytes, cbytes, cdbytes + cdef int true_count + cdef char *data + cdef object atom, constant, dobject + + property dtype: + "The NumPy dtype for this chunk." + def __get__(self): + return self.atom + + def __cinit__(self, object dobject, object atom, object cparams, + object _memory=True, object _compr=False): + cdef int itemsize, footprint + cdef size_t nbytes, cbytes, blocksize + cdef dtype dtype_ + + self.atom = atom + self.atomsize = atom.itemsize + dtype_ = atom.base + self.itemsize = itemsize = dtype_.elsize + self.typekind = dtype_.kind + self.dobject = None + footprint = 0 + + if _compr: + # Data comes in an already compressed state inside a Python String + self.data = PyString_AsString(dobject) + self.dobject = dobject # Increment the reference so that data + # don't go + # Set size info for the instance + blosc_cbuffer_sizes(self.data, &nbytes, &cbytes, &blocksize) + else: + # Compress the data object (a NumPy object) + nbytes, cbytes, blocksize, footprint = self.compress_data( + dobject, cparams, _memory) + footprint += 128 # add the (aprox) footprint of this instance in bytes + + # Fill instance data + self.nbytes = nbytes + self.cbytes = cbytes + footprint + self.cdbytes = cbytes + self.blocksize = blocksize + + cdef compress_data(self, ndarray array, object cparams, object _memory): + """Compress data in `array` and put it in ``self.data``""" + cdef size_t nbytes, cbytes, blocksize, itemsize, footprint + cdef int clevel, shuffle + cdef char *dest + + # Compute the total number of bytes in this array + itemsize = array.itemsize + nbytes = itemsize * array.size + cbytes = 0 + footprint = 0 + + # Check whether incoming data can be expressed as a constant or not. + # Disk-based chunks are not allowed to do this. + self.isconstant = 0 + self.constant = None + if _memory and (array.strides[0] == 0 + or check_zeros(array.data, nbytes)): + + self.isconstant = 1 + # Get the NumPy constant. Avoid this NumPy quirk: + # np.array(['1'], dtype='S3').dtype != s[0].dtype + if array.dtype.kind != 'S': + self.constant = array[0] + else: + self.constant = np.array(array[0], dtype=array.dtype) + # Add overhead (64 bytes for the overhead of the numpy container) + footprint += 64 + self.constant.size * self.constant.itemsize + + if self.isconstant: + blocksize = 4 * 1024 # use 4 KB as a cache for blocks + # Make blocksize a multiple of itemsize + if blocksize % itemsize > 0: + blocksize = cython.cdiv(blocksize, itemsize) * itemsize + # Correct in case we have a large itemsize + if blocksize == 0: + blocksize = itemsize + else: + if self.typekind == 'b': + self.true_count = true_count(array.data, nbytes) + + if array.strides[0] == 0: + # The chunk is made of constants. Regenerate the actual data. 
+ array = array.copy() + + # Compress data + dest = malloc(nbytes + BLOSC_MAX_OVERHEAD) + clevel = cparams.clevel + shuffle = cparams.shuffle + with nogil: + cbytes = blosc_compress(clevel, shuffle, itemsize, nbytes, + array.data, dest, + nbytes + BLOSC_MAX_OVERHEAD) + if cbytes <= 0: + raise RuntimeError, "fatal error during Blosc compression: " \ + "%d" % cbytes + # Free the unused data + self.data = realloc(dest, cbytes) + # Set size info for the instance + blosc_cbuffer_sizes(self.data, &nbytes, &cbytes, &blocksize) + + return (nbytes, cbytes, blocksize, footprint) + + def getdata(self): + """Get a compressed String object out of this chunk (for + persistence).""" + cdef object string + + assert (not self.isconstant, + "This function can only be used for persistency") + string = PyString_FromStringAndSize(self.data, + self.cdbytes) + return string + + cdef void _getitem(self, int start, int stop, char *dest): + """Read data from `start` to `stop` and return it as a numpy array.""" + cdef int ret, bsize, blen, nitems, nstart + cdef ndarray constants + + blen = stop - start + bsize = blen * self.atomsize + nitems = cython.cdiv(bsize, self.itemsize) + nstart = cython.cdiv(start * self.atomsize, self.itemsize) + + if self.isconstant: + # The chunk is made of constants + constants = np.ndarray(shape=(blen,), dtype=self.dtype, + buffer=self.constant, strides=(0,)).copy() + memcpy(dest, constants.data, bsize) + return + + # Fill dest with uncompressed data + with nogil: + if bsize == self.nbytes: + ret = blosc_decompress(self.data, dest, bsize) + else: + ret = blosc_getitem(self.data, nstart, nitems, dest) + if ret < 0: + raise RuntimeError, "fatal error during Blosc decompression: %d" \ + % ret + + def __getitem__(self, object key): + """__getitem__(self, key) -> values.""" + cdef ndarray array + cdef object start, stop, step, clen, idx + + if isinstance(key, (int, long)): + # Quickly return a single element + array = np.empty(shape=(1,), dtype=self.dtype) + self._getitem(key, key + 1, array.data) + return PyArray_GETITEM(array, array.data) + elif isinstance(key, slice): + (start, stop, step) = key.start, key.stop, key.step + elif isinstance(key, tuple) and self.dtype.shape != (): + # Build an array to guess indices + clen = cython.cdiv(self.nbytes, self.itemsize) + idx = np.arange(clen, dtype=np.int32).reshape(self.dtype.shape) + idx2 = idx(key) + if idx2.flags.contiguous: + # The slice represents a contiguous slice. Get start and stop. 
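A small remark on chunk.__getitem__: just below, slice.indices normalizes negative or open-ended bounds against the chunk length before _getitem is called. For example:

    # slice.indices clips (start, stop, step) to a given length,
    # exactly as chunk.__getitem__ does with clen:
    s = slice(-3, None, 2)
    print(s.indices(10))   # -> (7, 10, 2)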
+ start, stop = idx2.flatten()[[0, -1]] + step = 1 + else: + (start, stop, step) = key[0].start, key[0].stop, key[0].step + else: + raise IndexError, "key not suitable:", key + + # Get the corrected values for start, stop, step + clen = cython.cdiv(self.nbytes, self.atomsize) + (start, stop, step) = slice(start, stop, step).indices(clen) + + # Build a numpy container + array = np.empty(shape=(stop - start,), dtype=self.dtype) + # Read actual data + self._getitem(start, stop, array.data) + + # Return the value depending on the step + if step > 1: + return array[::step] + return array + + def __setitem__(self, object key, object value): + """__setitem__(self, key, value) -> None.""" + raise NotImplementedError + + def __str__(self): + """Represent the chunk as an string.""" + return str(self[:]) + + def __repr__(self): + """Represent the chunk as an string, with additional info.""" + cratio = self.nbytes / float(self.cbytes) + fullrepr = "chunk(%s, %s) nbytes: %d; cbytes: %d; ratio: %.2f\n%r" % \ + (self.shape, self.dtype, self.nbytes, self.cbytes, cratio, + str(self)) + return fullrepr + + def __dealloc__(self): + """Release C resources before destruction.""" + if self.dobject: + self.dobject = None # DECREF pointer to data object + else: + free(self.data) # explictly free the data area cdef create_bloscpack_header(nchunks=None, format_version=FORMAT_VERSION): """ Create the bloscpack header string. @@ -450,16 +458,16 @@ cdef create_bloscpack_header(nchunks=None, format_version=FORMAT_VERSION): """ if not 0 <= nchunks <= MAX_CHUNKS and nchunks is not None: - raise ValueError( - "'nchunks' must be in the range 0 <= n <= %d, not '%s'" % - (MAX_CHUNKS, str(nchunks))) + raise ValueError( + "'nchunks' must be in the range 0 <= n <= %d, not '%s'" % + (MAX_CHUNKS, str(nchunks))) return (MAGIC + struct.pack('cython.cdiv(self._nbytes, self.atomsize) - - property mode: - "The mode used to create/open the `mode`." - def __get__(self): - return self._mode - def __set__(self, value): - self._mode = value - self.chunks.mode = value - - property nbytes: - "The original (uncompressed) size of this object (in bytes)." - def __get__(self): - return self._nbytes - - property ndim: - "The number of dimensions of this object." - def __get__(self): - return len(self.shape) - - property shape: - "The shape of this object." - def __get__(self): - return tuple((self.len,) + self._dtype.shape) - - property size: - "The size of this object." - def __get__(self): - return np.prod(self.shape) - - property rootdir: - "The on-disk directory used for persistency." 
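create_bloscpack_header above serializes the chunk count into a fixed 16-byte header (BLOSCPACK_HEADER_LENGTH). The exact struct layout is elided in this hunk, so the following sketch rests on assumptions: a 'blpk' magic, a one-byte format version, three reserved bytes, and a little-endian signed 64-bit chunk count (consistent with MAX_CHUNKS = (2 ** 63) - 1):

    import struct

    MAGIC = b'blpk'   # assumed value; not visible in this hunk

    def make_header(nchunks, format_version=1):
        return (MAGIC + struct.pack('<B', format_version)
                + b'\x00' * 3 + struct.pack('<q', nchunks))

    assert len(make_header(42)) == 16   # BLOSCPACK_HEADER_LENGTH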
- def __get__(self): - return self._rootdir - def __set__(self, value): - if not self.rootdir: - raise ValueError( - "cannot modify the rootdir value of an in-memory carray") - self._rootdir = value - self.chunks.rootdir = value - - def __cinit__(self, object array=None, object cparams=None, - object dtype=None, object dflt=None, - object expectedlen=None, object chunklen=None, - object rootdir=None, object mode="a"): - - self._rootdir = rootdir - if mode not in ('r', 'w', 'a'): - raise ValueError("mode should be 'r', 'w' or 'a'") - self._mode = mode - - if array is not None: - self.create_carray(array, cparams, dtype, dflt, - expectedlen, chunklen, rootdir, mode) - _new = True - elif rootdir is not None: - meta_info = self.read_meta() - self.open_carray(*meta_info) - _new = False - else: - raise ValueError("You need at least to pass an array or/and a rootdir") - - # Attach the attrs to this object - self._attrs = attrs.attrs(self._rootdir, self.mode, _new=_new) - - # Cache a len-1 array for accelerating self[int] case - self.arr1 = np.empty(shape=(1,), dtype=self._dtype) - - # Sentinels - self.sss_mode = False - self.wheretrue_mode = False - self.where_mode = False - self.idxcache = -1 # cache not initialized - - def create_carray(self, array, cparams, dtype, dflt, - expectedlen, chunklen, rootdir, mode): - """Create a new array.""" - cdef int itemsize, atomsize, chunksize - cdef ndarray lastchunkarr - cdef object array_, _dflt - - # Check defaults for cparams - if cparams is None: - cparams = bcolz.cparams() - - if not isinstance(cparams, bcolz.cparams): - raise ValueError, "`cparams` param must be an instance of `cparams` class" - - # Convert input to an appropriate type - if type(dtype) is str: - dtype = np.dtype(dtype) - array_ = utils.to_ndarray(array, dtype) - if dtype is None: - if len(array_.shape) == 1: - self._dtype = dtype = array_.dtype - else: - # Multidimensional array. The atom will have array_.shape[1:] dims. - # atom dimensions will be stored in `self._dtype`, which is different - # than `self.dtype` in that `self._dtype` dimensions are borrowed - # from `self.shape`. `self.dtype` will always be scalar (NumPy - # convention). 
- self._dtype = dtype = np.dtype((array_.dtype.base, array_.shape[1:])) - else: - self._dtype = dtype - # Checks for the dtype - if self._dtype.kind == 'O': - raise TypeError, "object dtypes are not supported in carray objects" - # Check that atom size is less than 2 GB - if dtype.itemsize >= 2**31: - raise ValueError, "atomic size is too large (>= 2 GB)" - - self.atomsize = atomsize = dtype.itemsize - self.itemsize = itemsize = dtype.base.itemsize - - # Check defaults for dflt - _dflt = np.zeros((), dtype=dtype) - if dflt is not None: - if dtype.shape == (): - _dflt[()] = dflt - else: - _dflt[:] = dflt - self._dflt = _dflt - - # Compute the chunklen/chunksize - if expectedlen is None: - # Try a guess - try: - expectedlen = len(array_) - except TypeError: - raise NotImplementedError( - "creating carrays from scalar objects not supported") - try: - self.expectedlen = expectedlen - except OverflowError: - raise OverflowError( - "The size cannot be larger than 2**31 on 32-bit platforms") - if chunklen is None: - # Try a guess - chunksize = utils.calc_chunksize((expectedlen * atomsize) / float(_MB)) - # Chunksize must be a multiple of atomsize - chunksize = cython.cdiv(chunksize, atomsize) * atomsize - # Protection against large itemsizes - if chunksize < atomsize: - chunksize = atomsize - else: - if not isinstance(chunklen, int) or chunklen < 1: - raise ValueError, "chunklen must be a positive integer" - chunksize = chunklen * atomsize - chunklen = cython.cdiv(chunksize, atomsize) - self._chunksize = chunksize - self._chunklen = chunklen - - # Book memory for last chunk (uncompressed) - # Use np.zeros here because they compress better - lastchunkarr = np.zeros(dtype=dtype, shape=(chunklen,)) - self.lastchunk = lastchunkarr.data - self.lastchunkarr = lastchunkarr - - # Create layout for data and metadata - self._cparams = cparams - if rootdir is None: - self.chunks = [] - else: - self.mkdirs(rootdir, mode) - metainfo = (dtype, cparams, self.shape[0], lastchunkarr, self._mode) - self.chunks = chunks(self._rootdir, metainfo=metainfo, _new=True) - # We can write the metainfo already - self.write_meta() - - # Finally, fill the chunks - self.fill_chunks(array_) - - # and flush the data pending... - self.flush() - - def open_carray(self, shape, cparams, dtype, dflt, - expectedlen, cbytes, chunklen): - """Open an existing array.""" - cdef ndarray lastchunkarr - cdef object array_, _dflt - cdef npy_intp calen - - if len(shape) == 1: - self._dtype = dtype - else: - # Multidimensional array. The atom will have array_.shape[1:] dims. - # atom dimensions will be stored in `self._dtype`, which is different - # than `self.dtype` in that `self._dtype` dimensions are borrowed - # from `self.shape`. `self.dtype` will always be scalar (NumPy - # convention). 
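The chunk-geometry computation in create_carray above boils down to a few integer adjustments around utils.calc_chunksize (which is only named, not defined, in this patch). A sketch:

    def chunk_geometry(expectedlen, atomsize, calc_chunksize):
        """How create_carray derives (chunklen, chunksize), sketched."""
        MB = 1024 * 1024
        chunksize = calc_chunksize((expectedlen * atomsize) / float(MB))
        # chunksize must be a multiple of atomsize ...
        chunksize = (chunksize // atomsize) * atomsize
        # ... and never smaller than one atom (large itemsizes)
        if chunksize < atomsize:
            chunksize = atomsize
        return chunksize // atomsize, chunksize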
- self._dtype = dtype = np.dtype((dtype.base, shape[1:])) - - self._cparams = cparams - self.atomsize = dtype.itemsize - self.itemsize = dtype.base.itemsize - self._chunklen = chunklen - self._chunksize = chunklen * self.atomsize - self._dflt = dflt - self.expectedlen = expectedlen - - # Book memory for last chunk (uncompressed) - # Use np.zeros here because they compress better - lastchunkarr = np.zeros(dtype=dtype, shape=(chunklen,)) - self.lastchunk = lastchunkarr.data - self.lastchunkarr = lastchunkarr - - # Check rootdir hierarchy - if not os.path.isdir(self._rootdir): - raise IOError("root directory does not exist") - self.datadir = os.path.join(self._rootdir, DATA_DIR) - if not os.path.isdir(self.datadir): - raise IOError("data directory does not exist") - self.metadir = os.path.join(self._rootdir, META_DIR) - if not os.path.isdir(self.metadir): - raise IOError("meta directory does not exist") - - calen = shape[0] # the length ot the carray - # Finally, open data directory - metainfo = (dtype, cparams, calen, lastchunkarr, self._mode) - self.chunks = chunks(self._rootdir, metainfo=metainfo, _new=False) - - # Update some counters - self.leftover = (calen % chunklen) * self.atomsize - self._cbytes = cbytes - self._nbytes = calen * self.atomsize - - if self._mode == "w": - # Remove all entries when mode is 'w' - self.resize(0) - - def fill_chunks(self, object array_): - """Fill chunks, either in-memory or on-disk.""" - cdef int leftover, chunklen - cdef npy_intp i, nchunks - cdef npy_intp nbytes, cbytes - cdef chunk chunk_ - cdef ndarray remainder - - # The number of bytes in incoming array - nbytes = self.itemsize * array_.size - self._nbytes = nbytes - - # Compress data in chunks - cbytes = 0 - chunklen = self._chunklen - nchunks = cython.cdiv(nbytes, self._chunksize) - for i from 0 <= i < nchunks: - assert i*chunklen < array_.size, "i, nchunks: %d, %d" % (i, nchunks) - chunk_ = chunk(array_[i*chunklen:(i+1)*chunklen], - self._dtype, self._cparams, - _memory = self._rootdir is None) - self.chunks.append(chunk_) - cbytes += chunk_.cbytes - self.leftover = leftover = nbytes % self._chunksize - if leftover: - remainder = array_[nchunks*chunklen:] - memcpy(self.lastchunk, remainder.data, leftover) - cbytes += self._chunksize # count the space in last chunk - self._cbytes = cbytes - - def mkdirs(self, object rootdir, object mode): - """Create the basic directory layout for persistent storage.""" - if os.path.exists(rootdir): - if self._mode != "w": - raise IOError( - "specified rootdir path '%s' already exists " - "and creation mode is '%s'" % (rootdir, mode)) - if os.path.isdir(rootdir): - shutil.rmtree(rootdir) - else: - os.remove(rootdir) - os.mkdir(rootdir) - self.datadir = os.path.join(rootdir, DATA_DIR) - os.mkdir(self.datadir) - self.metadir = os.path.join(rootdir, META_DIR) - os.mkdir(self.metadir) - - def write_meta(self): - """Write metadata persistently.""" - storagef = os.path.join(self.metadir, STORAGE_FILE) - with open(storagef, 'wb') as storagefh: - storagefh.write(json.dumps({ - "dtype": str(self.dtype), - "cparams": { - "clevel": self.cparams.clevel, - "shuffle": self.cparams.shuffle, - }, - "chunklen": self._chunklen, - "expectedlen": self.expectedlen, - "dflt": self.dflt.tolist(), - })) - storagefh.write("\n") - - def read_meta(self): - """Read persistent metadata.""" - - # First read the size info - metadir = os.path.join(self._rootdir, META_DIR) - shapef = os.path.join(metadir, SIZES_FILE) - with open(shapef, 'rb') as shapefh: - sizes = json.loads(shapefh.read()) - 
shape = sizes['shape'] - if type(shape) == list: - shape = tuple(shape) - nbytes = sizes["nbytes"] - cbytes = sizes["cbytes"] - - # Then the rest of metadata - storagef = os.path.join(metadir, STORAGE_FILE) - with open(storagef, 'rb') as storagefh: - data = json.loads(storagefh.read()) - dtype_ = np.dtype(data["dtype"]) - chunklen = data["chunklen"] - cparams = bcolz.cparams( - clevel = data["cparams"]["clevel"], - shuffle = data["cparams"]["shuffle"]) - expectedlen = data["expectedlen"] - dflt = data["dflt"] - return (shape, cparams, dtype_, dflt, expectedlen, cbytes, chunklen) - - def append(self, object array): - """ - append(array) - - Append a numpy `array` to this instance. - - Parameters - ---------- - array : NumPy-like object - The array to be appended. Must be compatible with shape and type of - the carray. - - """ - cdef int atomsize, itemsize, chunksize, leftover - cdef int nbytesfirst, chunklen, start, stop - cdef npy_intp nbytes, cbytes, bsize, i, nchunks - cdef ndarray remainder, arrcpy, dflts - cdef chunk chunk_ - - if self.mode == "r": - raise IOError( - "cannot modify data because mode is '%s'" % self.mode) - - arrcpy = utils.to_ndarray(array, self._dtype) - if arrcpy.dtype != self._dtype.base: - raise TypeError, "array dtype does not match with self" - # Appending a single row should be supported - if arrcpy.shape == self._dtype.shape: - arrcpy = arrcpy.reshape((1,)+arrcpy.shape) - if arrcpy.shape[1:] != self._dtype.shape: - raise ValueError, "array trailing dimensions does not match with self" - - atomsize = self.atomsize - itemsize = self.itemsize - chunksize = self._chunksize - chunks = self.chunks - leftover = self.leftover - bsize = arrcpy.size*itemsize - cbytes = 0 - - # Check if array fits in existing buffer - if (bsize + leftover) < chunksize: - # Data fits in lastchunk buffer. Just copy it - if arrcpy.strides[0] > 0: - memcpy(self.lastchunk+leftover, arrcpy.data, bsize) - else: - start = cython.cdiv(leftover, atomsize) - stop = cython.cdiv((leftover+bsize), atomsize) - self.lastchunkarr[start:stop] = arrcpy - leftover += bsize - else: - # Data does not fit in buffer. Break it in chunks. 
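write_meta and read_meta above give persistent carrays a small JSON bookkeeping layer under the rootdir. A sketch of inspecting it by hand; the directory name 'meta' and the file names 'sizes' and 'storage' are assumed here, since the META_DIR, SIZES_FILE and STORAGE_FILE constants are not visible in this hunk, and the rootdir path is hypothetical:

    import json
    import os

    rootdir = 'mydata.bcolz'   # hypothetical persistent carray
    meta = os.path.join(rootdir, 'meta')
    with open(os.path.join(meta, 'sizes')) as f:
        sizes = json.load(f)      # {'shape': ..., 'nbytes': ..., 'cbytes': ...}
    with open(os.path.join(meta, 'storage')) as f:
        storage = json.load(f)    # dtype, cparams, chunklen, expectedlen, dflt
    print(sizes['shape'], storage['dtype'], storage['chunklen'])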
- - # First, fill the last buffer completely (if needed) - if leftover: - nbytesfirst = chunksize - leftover - if arrcpy.strides[0] > 0: - memcpy(self.lastchunk+leftover, arrcpy.data, nbytesfirst) - else: - start = cython.cdiv(leftover, atomsize) - stop = cython.cdiv((leftover+nbytesfirst), atomsize) - self.lastchunkarr[start:stop] = arrcpy[start:stop] - # Compress the last chunk and add it to the list - chunk_ = chunk(self.lastchunkarr, self._dtype, self._cparams, - _memory = self._rootdir is None) - chunks.append(chunk_) - cbytes = chunk_.cbytes - else: - nbytesfirst = 0 - - # Then fill other possible chunks - nbytes = bsize - nbytesfirst - nchunks = cython.cdiv(nbytes, chunksize) - chunklen = self._chunklen - # Get a new view skipping the elements that have been already copied - remainder = arrcpy[cython.cdiv(nbytesfirst, atomsize):] - for i from 0 <= i < nchunks: - chunk_ = chunk( - remainder[i*chunklen:(i+1)*chunklen], self._dtype, self._cparams, - _memory = self._rootdir is None) - chunks.append(chunk_) - cbytes += chunk_.cbytes - - # Finally, deal with the leftover - leftover = nbytes % chunksize - if leftover: - remainder = remainder[nchunks*chunklen:] - if arrcpy.strides[0] > 0: - memcpy(self.lastchunk, remainder.data, leftover) + """Store the different carray chunks in a directory on-disk.""" + cdef object _rootdir, _mode + cdef object dtype, cparams, lastchunkarr + cdef object chunk_cached + cdef npy_intp nchunks, nchunk_cached, len + + property mode: + "The mode used to create/open the `mode`." + def __get__(self): + return self._mode + def __set__(self, value): + self._mode = value + + property rootdir: + "The on-disk directory used for persistency." + def __get__(self): + return self._rootdir + def __set__(self, value): + self._rootdir = value + + property datadir: + """The directory for data files.""" + def __get__(self): + return os.path.join(self.rootdir, DATA_DIR) + + def __cinit__(self, rootdir, metainfo=None, _new=False): + cdef ndarray lastchunkarr + cdef void *decompressed, *compressed + cdef int leftover + cdef char *lastchunk + cdef size_t chunksize + cdef object scomp + cdef int ret + cdef int itemsize, atomsize + + self._rootdir = rootdir + self.nchunks = 0 + self.nchunk_cached = -1 # no chunk cached initially + self.dtype, self.cparams, self.len, lastchunkarr, self._mode = metainfo + atomsize = self.dtype.itemsize + itemsize = self.dtype.base.itemsize + + if not _new: + self.nchunks = cython.cdiv(self.len, len(lastchunkarr)) + chunksize = len(lastchunkarr) * atomsize + lastchunk = lastchunkarr.data + leftover = (self.len % len(lastchunkarr)) * atomsize + if leftover: + # Fill lastchunk with data on disk + scomp = self.read_chunk(self.nchunks) + compressed = PyString_AsString(scomp) + with nogil: + ret = blosc_decompress(compressed, lastchunk, chunksize) + if ret < 0: + raise RuntimeError( + "error decompressing the last chunk (error code: " + "%d)" % ret) + + cdef read_chunk(self, nchunk): + """Read a chunk and return it in compressed form.""" + dname = "__%d%s" % (nchunk, EXTENSION) + schunkfile = os.path.join(self.datadir, dname) + if not os.path.exists(schunkfile): + raise ValueError("chunkfile %s not found" % schunkfile) + with open(schunkfile, 'rb') as schunk: + bloscpack_header = schunk.read(BLOSCPACK_HEADER_LENGTH) + blosc_header_raw = schunk.read(BLOSC_HEADER_LENGTH) + blosc_header = decode_blosc_header(blosc_header_raw) + ctbytes = blosc_header['ctbytes'] + nbytes = blosc_header['nbytes'] + # seek back BLOSC_HEADER_LENGTH bytes in file relative to 
current + # position + schunk.seek(-BLOSC_HEADER_LENGTH, 1) + scomp = schunk.read(ctbytes) + return scomp + + def __getitem__(self, nchunk): + cdef void *decompressed, *compressed + + if nchunk == self.nchunk_cached: + # Hit! + return self.chunk_cached else: - self.lastchunkarr[:len(remainder)] = remainder - - # Update some counters - self.leftover = leftover - self._cbytes += cbytes - self._nbytes += bsize - - def trim(self, object nitems): - """ - trim(nitems) - - Remove the trailing `nitems` from this instance. - - Parameters - ---------- - nitems : int - The number of trailing items to be trimmed. If negative, the object - is enlarged instead. - - """ - cdef int atomsize, leftover, leftover2 - cdef npy_intp cbytes, bsize, nchunk2 - cdef chunk chunk_ - - if not isinstance(nitems, (int, long, float)): - raise TypeError, "`nitems` must be an integer" - - # Check that we don't run out of space - if nitems > self.len: - raise ValueError, "`nitems` must be less than total length" - # A negative number of items means that we want to grow the object - if nitems <= 0: - self.resize(self.len - nitems) - return - - atomsize = self.atomsize - chunks = self.chunks - leftover = self.leftover - bsize = nitems * atomsize - cbytes = 0 - - # Check if items belong to the last chunk - if (leftover - bsize) > 0: - # Just update leftover counter - leftover -= bsize - else: - # nitems larger than last chunk - nchunk = cython.cdiv((self.len - nitems), self._chunklen) - leftover2 = (self.len - nitems) % self._chunklen - leftover = leftover2 * atomsize - - # Remove complete chunks - nchunk2 = lnchunk = cython.cdiv(self._nbytes, self._chunksize) - while nchunk2 > nchunk: - chunk_ = chunks.pop() - cbytes += chunk_.cbytes - nchunk2 -= 1 - - # Finally, deal with the leftover - if leftover: - self.lastchunkarr[:leftover2] = chunk_[:leftover2] - if self._rootdir: - # Last chunk is removed automatically by the chunks.pop() call, and - # always is counted as if it is not compressed (although it is in - # this state on-disk) - cbytes += chunk_.nbytes - - # Update some counters - self.leftover = leftover - self._cbytes -= cbytes - self._nbytes -= bsize - # Flush last chunk and update counters on-disk - self.flush() - - def resize(self, object nitems): - """ - resize(nitems) - - Resize the instance to have `nitems`. - - Parameters - ---------- - nitems : int - The final length of the object. If `nitems` is larger than the actual - length, new items will appended using `self.dflt` as filling values. - - """ - cdef object chunk - - if not isinstance(nitems, (int, long, float)): - raise TypeError, "`nitems` must be an integer" - - if nitems == self.len: - return - elif nitems < 0: - raise ValueError, "`nitems` cannot be negative" - - if nitems > self.len: - # Create a 0-strided array and append it to self - chunk = np.ndarray(nitems-self.len, dtype=self._dtype, - buffer=self._dflt, strides=(0,)) - self.append(chunk) - self.flush() - else: - # Just trim the excess of items - self.trim(self.len-nitems) - - def reshape(self, newshape): - """ - reshape(newshape) - - Returns a new carray containing the same data with a new shape. - - Parameters - ---------- - newshape : int or tuple of ints - The new shape should be compatible with the original shape. If - an integer, then the result will be a 1-D array of that length. - One shape dimension can be -1. In this case, the value is inferred - from the length of the array and remaining dimensions. - - Returns - ------- - reshaped_array : carray - A copy of the original carray. 
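chunks.__getitem__ above keeps a one-slot cache: only the most recently read chunk stays decoded, which is enough for the sequential access patterns carray favors. The pattern in isolation (a sketch, not part of the patch):

    class OneSlotCache(object):
        """Single-entry cache, as used by chunks.__getitem__."""
        def __init__(self, read_chunk):
            self.read_chunk = read_chunk
            self.nchunk_cached = -1    # no chunk cached initially
            self.chunk_cached = None
        def __getitem__(self, nchunk):
            if nchunk != self.nchunk_cached:   # miss: read and cache
                self.chunk_cached = self.read_chunk(nchunk)
                self.nchunk_cached = nchunk
            return self.chunk_cached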
- - """ - cdef npy_intp newlen, ilen, isize, osize, newsize, rsize, i - cdef object ishape, oshape, pos, newdtype, out - - # Enforce newshape as tuple - if isinstance(newshape, (int, long)): - newshape = (newshape,) - newsize = np.prod(newshape) - - ishape = self.shape - ilen = ishape[0] - isize = np.prod(ishape) - - # Check for -1 in newshape - if -1 in newshape: - if newshape.count(-1) > 1: - raise ValueError, "only one shape dimension can be -1" - pos = newshape.index(-1) - osize = np.prod(newshape[:pos] + newshape[pos+1:]) - if isize == 0: - newshape = newshape[:pos] + (0,) + newshape[pos+1:] - else: - newshape = newshape[:pos] + (isize/osize,) + newshape[pos+1:] - newsize = np.prod(newshape) - - # Check shape compatibility - if isize != newsize: - raise ValueError, "`newshape` is not compatible with the current one" - # Create the output container - newdtype = np.dtype((self._dtype.base, newshape[1:])) - newlen = newshape[0] - - # If shapes are both n-dimensional, convert first to 1-dim shape - # and then convert again to the final newshape. - if len(ishape) > 1 and len(newshape) > 1: - out = self.reshape(-1) - return out.reshape(newshape) - - if self._rootdir: - # If persistent, do the copy to a temporary dir - absdir = os.path.dirname(self._rootdir) - rootdir = tempfile.mkdtemp(suffix='__temp__', dir=absdir) - else: - rootdir = None - - # Create the final container and fill it - out = carray([], dtype=newdtype, cparams=self.cparams, expectedlen=newlen, - rootdir=rootdir, mode='w') - if newlen < ilen: - rsize = isize / newlen - for i from 0 <= i < newlen: - out.append(self[i*rsize:(i+1)*rsize].reshape(newdtype.shape)) - else: - for i from 0 <= i < ilen: - out.append(self[i].reshape(-1)) - out.flush() - - # Finally, rename the temporary data directory to self._rootdir - if self._rootdir: - shutil.rmtree(self._rootdir) - os.rename(rootdir, self._rootdir) - # Restore the rootdir and mode - out.rootdir = self._rootdir - out.mode = self._mode - - return out - - def copy(self, **kwargs): - """ - copy(**kwargs) - - Return a copy of this object. - - Parameters - ---------- - kwargs : list of parameters or dictionary - Any parameter supported by the carray constructor. - - Returns - ------- - out : carray object - The copy of this object. + scomp = self.read_chunk(nchunk) + # Data chunk should be compressed already + chunk_ = chunk(scomp, self.dtype, self.cparams, + _memory=False, _compr=True) + # Fill cache + self.nchunk_cached = nchunk + self.chunk_cached = chunk_ + return chunk_ + + def __setitem__(self, nchunk, chunk_): + self._save(nchunk, chunk_) + + def __len__(self): + return self.nchunks + + def append(self, chunk_): + """Append an new chunk to the carray.""" + self._save(self.nchunks, chunk_) + self.nchunks += 1 + + cdef _save(self, nchunk, chunk_): + """Save the `chunk_` as chunk #`nchunk`. 
""" + + if self.mode == "r": + raise IOError( + "cannot modify data because mode is '%s'" % self.mode) + + dname = "__%d%s" % (nchunk, EXTENSION) + schunkfile = os.path.join(self.datadir, dname) + bloscpack_header = create_bloscpack_header(1) + with open(schunkfile, 'wb') as schunk: + schunk.write(bloscpack_header) + data = chunk_.getdata() + schunk.write(data) + # Mark the cache as dirty if needed + if nchunk == self.nchunk_cached: + self.nchunk_cached = -1 + + def flush(self, chunk_): + """Flush the leftover chunk.""" + self._save(self.nchunks, chunk_) + + def pop(self): + """Remove the last chunk and return it.""" + nchunk = self.nchunks - 1 + chunk_ = self.__getitem__(nchunk) + dname = "__%d%s" % (nchunk, EXTENSION) + schunkfile = os.path.join(self.datadir, dname) + if not os.path.exists(schunkfile): + raise IOError("chunk filename %s does exist" % schunkfile) + os.remove(schunkfile) + + # When poping a chunk, we must be sure that we don't leave anything + # behind (i.e. the lastchunk) + dname = "__%d%s" % (nchunk + 1, EXTENSION) + schunkfile = os.path.join(self.datadir, dname) + if os.path.exists(schunkfile): + os.remove(schunkfile) + + self.nchunks -= 1 + return chunk_ +cdef class carray: """ - cdef object chunklen - - # Get defaults for some parameters - cparams = kwargs.pop('cparams', self._cparams) - expectedlen = kwargs.pop('expectedlen', self.len) + carray(array, cparams=None, dtype=None, dflt=None, expectedlen=None, + chunklen=None, rootdir=None, mode='a') - # Create a new, empty carray - ccopy = carray(np.empty(0, dtype=self._dtype), - cparams=cparams, - expectedlen=expectedlen, - **kwargs) + A compressed and enlargeable in-memory data container. - # Now copy the carray chunk by chunk - chunklen = self._chunklen - for i from 0 <= i < self.len by chunklen: - ccopy.append(self[i:i+chunklen]) - ccopy.flush() - - return ccopy - - def sum(self, dtype=None): - """ - sum(dtype=None) - - Return the sum of the array elements. + `carray` exposes a series of methods for dealing with the compressed + container in a NumPy-like way. Parameters ---------- + array : a NumPy-like object + This is taken as the input to create the carray. It can be any Python + object that can be converted into a NumPy object. The data type of + the resulting carray will be the same as this NumPy object. + cparams : instance of the `cparams` class, optional + Parameters to the internal Blosc compressor. dtype : NumPy dtype - The desired type of the output. If ``None``, the dtype of `self` is - used. An exception is when `self` has an integer type with less - precision than the default platform integer. In that case, the - default platform integer is used instead (NumPy convention). 
- - - Return value - ------------ - out : NumPy scalar with `dtype` - - """ - cdef chunk chunk_ - cdef npy_intp nchunk, nchunks - cdef object result - - if dtype is None: - dtype = self._dtype.base - # Check if we have less precision than required for ints - # (mimick NumPy logic) - if dtype.kind in ('b', 'i') and dtype.itemsize < IntType.itemsize: - dtype = IntType - else: - dtype = np.dtype(dtype) - if dtype.kind == 'S': - raise TypeError, "cannot perform reduce with flexible type" - - # Get a container for the result - result = np.zeros(1, dtype=dtype)[0] - - nchunks = cython.cdiv(self._nbytes, self._chunksize) - for nchunk from 0 <= nchunk < nchunks: - chunk_ = self.chunks[nchunk] - if chunk_.isconstant: - result += chunk_.constant * self._chunklen - elif self._dtype.type == np.bool_: - result += chunk_.true_count - else: - result += chunk_[:].sum(dtype=dtype) - if self.leftover: - leftover = self.len - nchunks * self._chunklen - result += self.lastchunkarr[:leftover].sum(dtype=dtype) - - return result - - def __len__(self): - return self.len - - def __sizeof__(self): - return self._cbytes - - cdef int getitem_cache(self, npy_intp pos, char *dest): - """Get a single item and put it in `dest`. It caches a complete block. - - It returns 1 if asked `pos` can be copied to `dest`. Else, this returns - 0. - - NOTE: As Blosc supports decompressing just a block inside a chunk, the - data that is cached is a *block*, as it is the least amount of data that - can be decompressed. This saves both time and memory. - - IMPORTANT: Any update operation (e.g. __setitem__) *must* disable this - cache by setting self.idxcache = -2. - """ - cdef int ret, atomsize, blocksize, offset - cdef int idxcache, posinbytes, blocklen - cdef npy_intp nchunk, nchunks, chunklen - cdef chunk chunk_ - - atomsize = self.atomsize - nchunks = cython.cdiv(self._nbytes, self._chunksize) - chunklen = self._chunklen - nchunk = cython.cdiv(pos, chunklen) - - # Check whether pos is in the last chunk - if nchunk == nchunks and self.leftover: - posinbytes = (pos % chunklen) * atomsize - memcpy(dest, self.lastchunk + posinbytes, atomsize) - return 1 - - # Locate the *block* inside the chunk - chunk_ = self.chunks[nchunk] - blocksize = chunk_.blocksize - blocklen = cython.cdiv(blocksize, atomsize) - - if atomsize > blocksize: - # This request cannot be resolved here - return 0 - - # Check whether the cache block has to be initialized - if self.idxcache < 0: - self.blockcache = np.empty(shape=(blocklen,), dtype=self._dtype) - self.datacache = self.blockcache.data - # We don't want this to contribute to cbytes counter! - # if self.idxcache == -1: - # # Absolute first time. Add the cache size to cbytes counter. - # self._cbytes += chunksize - - # Check if block is cached - idxcache = cython.cdiv(pos, blocklen) * blocklen - if idxcache == self.idxcache: - # Hit! - posinbytes = (pos % blocklen) * atomsize - memcpy(dest, self.datacache + posinbytes, atomsize) - return 1 - - # No luck. Read a complete block. - offset = idxcache % chunklen - chunk_._getitem(offset, offset+blocklen, self.datacache) - # Copy the interesting bits to dest - posinbytes = (pos % blocklen) * atomsize - memcpy(dest, self.datacache + posinbytes, atomsize) - # Update the cache index - self.idxcache = idxcache - return 1 - - def __getitem__(self, object key): - """ - x.__getitem__(key) <==> x[key] - - Returns values based on `key`. 
All the functionality of - ``ndarray.__getitem__()`` is supported (including fancy indexing), plus a - special support for expressions: - - Parameters - ---------- - key : string - It will be interpret as a boolean expression (computed via `eval`) and - the elements where these values are true will be returned as a NumPy - array. - - See Also - -------- - eval + Force this `dtype` for the carray (rather than the `array` one). + dflt : Python or NumPy scalar + The value to be used when enlarging the carray. If None, + the default is + filling with zeros. + expectedlen : int, optional + A guess on the expected length of this object. This will serve to + decide the best `chunklen` used for compression and memory I/O + purposes. + chunklen : int, optional + The number of items that fits into a chunk. By specifying it you can + explicitely set the chunk size used for compression and memory I/O. + Only use it if you know what are you doing. + rootdir : str, optional + The directory where all the data and metadata will be stored. If + specified, then the carray object will be disk-based (i.e. all chunks + will live on-disk, not in memory) and persistent (i.e. it can be + restored in other session, e.g. via the `open()` top-level function). + mode : str, optional + The mode that a *persistent* carray should be created/opened. The + values can be: + + * 'r' for read-only + * 'w' for read/write. During carray creation, the `rootdir` will be + removed if it exists. During carray opening, the carray will be + resized to 0. + * 'a' for append (possible data inside `rootdir` will not be + removed). """ - cdef int chunklen + cdef int itemsize, atomsize, _chunksize, _chunklen, leftover + cdef int nrowsinbuf, _row + cdef int sss_mode, wheretrue_mode, where_mode cdef npy_intp startb, stopb - cdef npy_intp nchunk, keychunk, nchunks - cdef npy_intp nwrow, blen - cdef ndarray arr1 - cdef object start, stop, step - cdef object arr - - chunklen = self._chunklen - - # Check for integer - # isinstance(key, int) is not enough in Cython (?) - if isinstance(key, (int, long)) or isinstance(key, np.int_): - if key < 0: - # To support negative values - key += self.len - if key >= self.len: - raise IndexError, "index out of range" - arr1 = self.arr1 - if self.getitem_cache(key, arr1.data): - if self.itemsize == self.atomsize: - return PyArray_GETITEM(arr1, arr1.data) + cdef npy_intp start, stop, step, nextelement + cdef npy_intp _nrow, nrowsread + cdef npy_intp _nbytes, _cbytes + cdef npy_intp nhits, limit, skip + cdef npy_intp expectedlen + cdef char *lastchunk + cdef object lastchunkarr, where_arr, arr1 + cdef object _cparams, _dflt + cdef object _dtype, chunks + cdef object _rootdir, datadir, metadir, _mode + cdef object _attrs + cdef ndarray iobuf, where_buf + # For block cache + cdef int idxcache + cdef ndarray blockcache + cdef char *datacache + + property attrs: + "The attribute accessor." + def __get__(self): + return self._attrs + + property cbytes: + "The compressed size of this object (in bytes)." + def __get__(self): + return self._cbytes + + property chunklen: + "The chunklen of this object (in rows)." + def __get__(self): + return self._chunklen + + property cparams: + "The compression parameters for this object." + def __get__(self): + return self._cparams + + property dflt: + "The default value of this object." + def __get__(self): + return self._dflt + + property dtype: + "The dtype of this object." 
+        def __get__(self):
+            return self._dtype.base
+
+    property len:
+        "The length (leading dimension) of this object."
+        def __get__(self):
+            # Important to do the cast in order to get a npy_intp result
+            return cython.cdiv(self._nbytes,
+                               self.atomsize)
+
+    property mode:
+        "The mode used to create/open this object."
+        def __get__(self):
+            return self._mode
+        def __set__(self, value):
+            self._mode = value
+            self.chunks.mode = value
+
+    property nbytes:
+        "The original (uncompressed) size of this object (in bytes)."
+        def __get__(self):
+            return self._nbytes
+
+    property ndim:
+        "The number of dimensions of this object."
+        def __get__(self):
+            return len(self.shape)
+
+    property shape:
+        "The shape of this object."
+        def __get__(self):
+            return tuple((self.len,) + self._dtype.shape)
+
+    property size:
+        "The size of this object."
+        def __get__(self):
+            return np.prod(self.shape)
+
+    property rootdir:
+        "The on-disk directory used for persistency."
+        def __get__(self):
+            return self._rootdir
+        def __set__(self, value):
+            if not self.rootdir:
+                raise ValueError(
+                    "cannot modify the rootdir value of an in-memory carray")
+            self._rootdir = value
+            self.chunks.rootdir = value
+
+    def __cinit__(self, object array=None, object cparams=None,
+                  object dtype=None, object dflt=None,
+                  object expectedlen=None, object chunklen=None,
+                  object rootdir=None, object mode="a"):
+
+        self._rootdir = rootdir
+        if mode not in ('r', 'w', 'a'):
+            raise ValueError("mode should be 'r', 'w' or 'a'")
+        self._mode = mode
+
+        if array is not None:
+            self.create_carray(array, cparams, dtype, dflt,
+                               expectedlen, chunklen, rootdir, mode)
+            _new = True
+        elif rootdir is not None:
+            meta_info = self.read_meta()
+            self.open_carray(*meta_info)
+            _new = False
         else:
-          return arr1[0]
-      # Fallback action: use the slice code
-      return np.squeeze(self[slice(key, None, 1)])
-    # Slices
-    elif isinstance(key, slice):
-      (start, stop, step) = key.start, key.stop, key.step
-      if step and step <= 0 :
-        raise NotImplementedError("step in slice can only be positive")
-    # Multidimensional keys
-    elif isinstance(key, tuple):
-      if len(key) == 0:
-        raise ValueError("empty tuple not supported")
-      elif len(key) == 1:
-        return self[key[0]]
-      # An n-dimensional slice
-      # First, retrieve elements in the leading dimension
-      arr = self[key[0]]
-      # Then, keep only the required elements in other dimensions
-      if type(key[0]) == slice:
-        arr = arr[(slice(None),) + key[1:]]
-      else:
-        arr = arr[key[1:]]
-      # Force a copy in case returned array is not contiguous
-      if not arr.flags.contiguous:
-        arr = arr.copy()
-      return arr
-    # List of integers (case of fancy indexing)
-    elif isinstance(key, list):
-      # Try to convert to a integer array
-      try:
-        key = np.array(key, dtype=np.int_)
-      except:
-        raise IndexError, "key cannot be converted to an array of indices"
-      return self[key]
-    # A boolean or integer array (case of fancy indexing)
-    elif hasattr(key, "dtype"):
-      if key.dtype.type == np.bool_:
-        # A boolean array
-        if len(key) != self.len:
-          raise IndexError, "boolean array length must match len(self)"
-        if isinstance(key, carray):
-          count = key.sum()
+            raise ValueError(
+                "You need to pass at least an array and/or a rootdir")
+
+        # Attach the attrs to this object
+        self._attrs = attrs.attrs(self._rootdir, self.mode, _new=_new)
+
+        # Cache a len-1 array for accelerating self[int] case
+        self.arr1 = np.empty(shape=(1,), dtype=self._dtype)
+
+        # Sentinels
+        self.sss_mode = False
+        self.wheretrue_mode = False
+        self.where_mode = False
+        self.idxcache = -1  # cache not
initialized + + def create_carray(self, array, cparams, dtype, dflt, + expectedlen, chunklen, rootdir, mode): + """Create a new array.""" + cdef int itemsize, atomsize, chunksize + cdef ndarray lastchunkarr + cdef object array_, _dflt + + # Check defaults for cparams + if cparams is None: + cparams = bcolz.cparams() + + if not isinstance(cparams, bcolz.cparams): + raise ValueError, "`cparams` param must be an instance of " \ + "`cparams` class" + + # Convert input to an appropriate type + if type(dtype) is str: + dtype = np.dtype(dtype) + array_ = utils.to_ndarray(array, dtype) + if dtype is None: + if len(array_.shape) == 1: + self._dtype = dtype = array_.dtype + else: + # Multidimensional array. The atom will have array_.shape[ + # 1:] dims. + # atom dimensions will be stored in `self._dtype`, which is + # different + # than `self.dtype` in that `self._dtype` dimensions are + # borrowed + # from `self.shape`. `self.dtype` will always be scalar (NumPy + # convention). + self._dtype = dtype = np.dtype( + (array_.dtype.base, array_.shape[1:])) else: - count = -1 - return np.fromiter(self.where(key), dtype=self._dtype, count=count) - elif np.issubsctype(key, np.int_): - # An integer array - return np.array([self[i] for i in key], dtype=self._dtype) - else: - raise IndexError, \ - "arrays used as indices must be of integer (or boolean) type" - # An boolean expression (case of fancy indexing) - elif type(key) is str: - # Evaluate - result = bcolz.eval(key) - if result.dtype.type != np.bool_: - raise IndexError, "only boolean expressions supported" - if len(result) != self.len: - raise IndexError, "boolean expression outcome must match len(self)" - # Call __getitem__ again - return self[result] - # All the rest not implemented - else: - raise NotImplementedError, "key not supported: %s" % repr(key) - - # From now on, will only deal with [start:stop:step] slices - - # Get the corrected values for start, stop, step - (start, stop, step) = slice(start, stop, step).indices(self.len) - - # Build a numpy container - blen = get_len_of_range(start, stop, step) - arr = np.empty(shape=(blen,), dtype=self._dtype) - if blen == 0: - # If empty, return immediately - return arr - - # Fill it from data in chunks - nwrow = 0 - nchunks = cython.cdiv(self._nbytes, self._chunksize) - if self.leftover > 0: - nchunks += 1 - for nchunk from 0 <= nchunk < nchunks: - # Compute start & stop for each block - startb, stopb, blen = clip_chunk(nchunk, chunklen, start, stop, step) - if blen == 0: - continue - # Get the data chunk and assign it to result array - if nchunk == nchunks-1 and self.leftover: - arr[nwrow:nwrow+blen] = self.lastchunkarr[startb:stopb:step] - else: - arr[nwrow:nwrow+blen] = self.chunks[nchunk][startb:stopb:step] - nwrow += blen - - return arr - - def __setitem__(self, object key, object value): - """ - x.__setitem__(key, value) <==> x[key] = value - - Sets values based on `key`. All the functionality of - ``ndarray.__setitem__()`` is supported (including fancy indexing), plus a - special support for expressions: - - Parameters - ---------- - key : string - It will be interpret as a boolean expression (computed via `eval`) and - the elements where these values are true will be set to `value`. 
- - See Also - -------- - eval + self._dtype = dtype + # Checks for the dtype + if self._dtype.kind == 'O': + raise TypeError, "object dtypes are not supported in carray " \ + "objects" + # Check that atom size is less than 2 GB + if dtype.itemsize >= 2 ** 31: + raise ValueError, "atomic size is too large (>= 2 GB)" + + self.atomsize = atomsize = dtype.itemsize + self.itemsize = itemsize = dtype.base.itemsize + + # Check defaults for dflt + _dflt = np.zeros((), dtype=dtype) + if dflt is not None: + if dtype.shape == (): + _dflt[()] = dflt + else: + _dflt[:] = dflt + self._dflt = _dflt + + # Compute the chunklen/chunksize + if expectedlen is None: + # Try a guess + try: + expectedlen = len(array_) + except TypeError: + raise NotImplementedError( + "creating carrays from scalar objects not supported") + try: + self.expectedlen = expectedlen + except OverflowError: + raise OverflowError( + "The size cannot be larger than 2**31 on 32-bit platforms") + if chunklen is None: + # Try a guess + chunksize = utils.calc_chunksize( + (expectedlen * atomsize) / float(_MB)) + # Chunksize must be a multiple of atomsize + chunksize = cython.cdiv(chunksize, atomsize) * atomsize + # Protection against large itemsizes + if chunksize < atomsize: + chunksize = atomsize + else: + if not isinstance(chunklen, int) or chunklen < 1: + raise ValueError, "chunklen must be a positive integer" + chunksize = chunklen * atomsize + chunklen = cython.cdiv(chunksize, atomsize) + self._chunksize = chunksize + self._chunklen = chunklen + + # Book memory for last chunk (uncompressed) + # Use np.zeros here because they compress better + lastchunkarr = np.zeros(dtype=dtype, shape=(chunklen,)) + self.lastchunk = lastchunkarr.data + self.lastchunkarr = lastchunkarr + + # Create layout for data and metadata + self._cparams = cparams + if rootdir is None: + self.chunks = [] + else: + self.mkdirs(rootdir, mode) + metainfo = ( + dtype, cparams, self.shape[0], lastchunkarr, self._mode) + self.chunks = chunks(self._rootdir, metainfo=metainfo, _new=True) + # We can write the metainfo already + self.write_meta() + + # Finally, fill the chunks + self.fill_chunks(array_) + + # and flush the data pending... + self.flush() + + def open_carray(self, shape, cparams, dtype, dflt, + expectedlen, cbytes, chunklen): + """Open an existing array.""" + cdef ndarray lastchunkarr + cdef object array_, _dflt + cdef npy_intp calen + + if len(shape) == 1: + self._dtype = dtype + else: + # Multidimensional array. The atom will have array_.shape[1:] + # dims. + # atom dimensions will be stored in `self._dtype`, which is + # different + # than `self.dtype` in that `self._dtype` dimensions are borrowed + # from `self.shape`. `self.dtype` will always be scalar (NumPy + # convention). 
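The `self._dtype` vs `self.dtype` distinction described in the comment above relies on NumPy sub-array dtypes; a small standalone illustration of that mechanism (names and values are illustrative only):

    import numpy as np

    atom = np.dtype((np.float64, (3, 4)))  # base dtype plus trailing dims
    atom.base      # dtype('float64') -> what `self.dtype` exposes
    atom.shape     # (3, 4)           -> trailing dims borrowed by `self.shape`
    atom.itemsize  # 96               -> the per-row "atomsize" (12 * 8 bytes)
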
+ self._dtype = dtype = np.dtype((dtype.base, shape[1:])) + + self._cparams = cparams + self.atomsize = dtype.itemsize + self.itemsize = dtype.base.itemsize + self._chunklen = chunklen + self._chunksize = chunklen * self.atomsize + self._dflt = dflt + self.expectedlen = expectedlen + + # Book memory for last chunk (uncompressed) + # Use np.zeros here because they compress better + lastchunkarr = np.zeros(dtype=dtype, shape=(chunklen,)) + self.lastchunk = lastchunkarr.data + self.lastchunkarr = lastchunkarr + + # Check rootdir hierarchy + if not os.path.isdir(self._rootdir): + raise IOError("root directory does not exist") + self.datadir = os.path.join(self._rootdir, DATA_DIR) + if not os.path.isdir(self.datadir): + raise IOError("data directory does not exist") + self.metadir = os.path.join(self._rootdir, META_DIR) + if not os.path.isdir(self.metadir): + raise IOError("meta directory does not exist") + + calen = shape[0] # the length ot the carray + # Finally, open data directory + metainfo = (dtype, cparams, calen, lastchunkarr, self._mode) + self.chunks = chunks(self._rootdir, metainfo=metainfo, _new=False) + + # Update some counters + self.leftover = (calen % chunklen) * self.atomsize + self._cbytes = cbytes + self._nbytes = calen * self.atomsize + + if self._mode == "w": + # Remove all entries when mode is 'w' + self.resize(0) + + def fill_chunks(self, object array_): + """Fill chunks, either in-memory or on-disk.""" + cdef int leftover, chunklen + cdef npy_intp i, nchunks + cdef npy_intp nbytes, cbytes + cdef chunk chunk_ + cdef ndarray remainder + + # The number of bytes in incoming array + nbytes = self.itemsize * array_.size + self._nbytes = nbytes + + # Compress data in chunks + cbytes = 0 + chunklen = self._chunklen + nchunks = cython.cdiv(nbytes, self._chunksize) + for i from 0 <= i < nchunks: + assert i * chunklen < array_.size, "i, nchunks: %d, %d" % ( + i, nchunks) + chunk_ = chunk(array_[i * chunklen:(i + 1) * chunklen], + self._dtype, self._cparams, + _memory=self._rootdir is None) + self.chunks.append(chunk_) + cbytes += chunk_.cbytes + self.leftover = leftover = nbytes % self._chunksize + if leftover: + remainder = array_[nchunks * chunklen:] + memcpy(self.lastchunk, remainder.data, leftover) + cbytes += self._chunksize # count the space in last chunk + self._cbytes = cbytes + + def mkdirs(self, object rootdir, object mode): + """Create the basic directory layout for persistent storage.""" + if os.path.exists(rootdir): + if self._mode != "w": + raise IOError( + "specified rootdir path '%s' already exists " + "and creation mode is '%s'" % (rootdir, mode)) + if os.path.isdir(rootdir): + shutil.rmtree(rootdir) + else: + os.remove(rootdir) + os.mkdir(rootdir) + self.datadir = os.path.join(rootdir, DATA_DIR) + os.mkdir(self.datadir) + self.metadir = os.path.join(rootdir, META_DIR) + os.mkdir(self.metadir) + + def write_meta(self): + """Write metadata persistently.""" + storagef = os.path.join(self.metadir, STORAGE_FILE) + with open(storagef, 'wb') as storagefh: + storagefh.write(json.dumps({ + "dtype": str(self.dtype), + "cparams": { + "clevel": self.cparams.clevel, + "shuffle": self.cparams.shuffle, + }, + "chunklen": self._chunklen, + "expectedlen": self.expectedlen, + "dflt": self.dflt.tolist(), + })) + storagefh.write("\n") + + def read_meta(self): + """Read persistent metadata.""" + + # First read the size info + metadir = os.path.join(self._rootdir, META_DIR) + shapef = os.path.join(metadir, SIZES_FILE) + with open(shapef, 'rb') as shapefh: + sizes = 
json.loads(shapefh.read()) + shape = sizes['shape'] + if type(shape) == list: + shape = tuple(shape) + nbytes = sizes["nbytes"] + cbytes = sizes["cbytes"] + + # Then the rest of metadata + storagef = os.path.join(metadir, STORAGE_FILE) + with open(storagef, 'rb') as storagefh: + data = json.loads(storagefh.read()) + dtype_ = np.dtype(data["dtype"]) + chunklen = data["chunklen"] + cparams = bcolz.cparams( + clevel=data["cparams"]["clevel"], + shuffle=data["cparams"]["shuffle"]) + expectedlen = data["expectedlen"] + dflt = data["dflt"] + return (shape, cparams, dtype_, dflt, expectedlen, cbytes, chunklen) + + def append(self, object array): + """ + append(array) + + Append a numpy `array` to this instance. + + Parameters + ---------- + array : NumPy-like object + The array to be appended. Must be compatible with shape and + type of + the carray. + + """ + cdef int atomsize, itemsize, chunksize, leftover + cdef int nbytesfirst, chunklen, start, stop + cdef npy_intp nbytes, cbytes, bsize, i, nchunks + cdef ndarray remainder, arrcpy, dflts + cdef chunk chunk_ + + if self.mode == "r": + raise IOError( + "cannot modify data because mode is '%s'" % self.mode) + + arrcpy = utils.to_ndarray(array, self._dtype) + if arrcpy.dtype != self._dtype.base: + raise TypeError, "array dtype does not match with self" + # Appending a single row should be supported + if arrcpy.shape == self._dtype.shape: + arrcpy = arrcpy.reshape((1,) + arrcpy.shape) + if arrcpy.shape[1:] != self._dtype.shape: + raise ValueError, "array trailing dimensions does not match with " \ + "self" + + atomsize = self.atomsize + itemsize = self.itemsize + chunksize = self._chunksize + chunks = self.chunks + leftover = self.leftover + bsize = arrcpy.size * itemsize + cbytes = 0 + + # Check if array fits in existing buffer + if (bsize + leftover) < chunksize: + # Data fits in lastchunk buffer. Just copy it + if arrcpy.strides[0] > 0: + memcpy(self.lastchunk + leftover, arrcpy.data, bsize) + else: + start = cython.cdiv(leftover, atomsize) + stop = cython.cdiv((leftover + bsize), atomsize) + self.lastchunkarr[start:stop] = arrcpy + leftover += bsize + else: + # Data does not fit in buffer. Break it in chunks. 
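The two branches above are easy to exercise from Python: appends smaller than one chunk only touch the uncompressed last-chunk buffer, while larger appends compress complete chunks on the way in. A minimal sketch, assuming only the public carray API documented in this class:

    import numpy as np
    import bcolz

    ca = bcolz.carray(np.arange(10), chunklen=4)  # chunks [0..3], [4..7]; items 8, 9 buffered
    ca.append([10])                # 3 items buffered: still below chunklen, no compression
    ca.append(np.arange(11, 24))   # overflows: the buffer and full chunks are compressed
    len(ca), ca.nbytes, ca.cbytes  # 24 items; cbytes tracks the compressed size
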
+ + # First, fill the last buffer completely (if needed) + if leftover: + nbytesfirst = chunksize - leftover + if arrcpy.strides[0] > 0: + memcpy(self.lastchunk + leftover, arrcpy.data, nbytesfirst) + else: + start = cython.cdiv(leftover, atomsize) + stop = cython.cdiv((leftover + nbytesfirst), atomsize) + self.lastchunkarr[start:stop] = arrcpy[start:stop] + # Compress the last chunk and add it to the list + chunk_ = chunk(self.lastchunkarr, self._dtype, self._cparams, + _memory=self._rootdir is None) + chunks.append(chunk_) + cbytes = chunk_.cbytes + else: + nbytesfirst = 0 + + # Then fill other possible chunks + nbytes = bsize - nbytesfirst + nchunks = cython.cdiv(nbytes, chunksize) + chunklen = self._chunklen + # Get a new view skipping the elements that have been already + # copied + remainder = arrcpy[cython.cdiv(nbytesfirst, atomsize):] + for i from 0 <= i < nchunks: + chunk_ = chunk( + remainder[i * chunklen:(i + 1) * chunklen], self._dtype, + self._cparams, + _memory=self._rootdir is None) + chunks.append(chunk_) + cbytes += chunk_.cbytes + + # Finally, deal with the leftover + leftover = nbytes % chunksize + if leftover: + remainder = remainder[nchunks * chunklen:] + if arrcpy.strides[0] > 0: + memcpy(self.lastchunk, remainder.data, leftover) + else: + self.lastchunkarr[:len(remainder)] = remainder + + # Update some counters + self.leftover = leftover + self._cbytes += cbytes + self._nbytes += bsize + + def trim(self, object nitems): + """ + trim(nitems) + + Remove the trailing `nitems` from this instance. + + Parameters + ---------- + nitems : int + The number of trailing items to be trimmed. If negative, + the object + is enlarged instead. + + """ + cdef int atomsize, leftover, leftover2 + cdef npy_intp cbytes, bsize, nchunk2 + cdef chunk chunk_ + + if not isinstance(nitems, (int, long, float)): + raise TypeError, "`nitems` must be an integer" + + # Check that we don't run out of space + if nitems > self.len: + raise ValueError, "`nitems` must be less than total length" + # A negative number of items means that we want to grow the object + if nitems <= 0: + self.resize(self.len - nitems) + return + + atomsize = self.atomsize + chunks = self.chunks + leftover = self.leftover + bsize = nitems * atomsize + cbytes = 0 + + # Check if items belong to the last chunk + if (leftover - bsize) > 0: + # Just update leftover counter + leftover -= bsize + else: + # nitems larger than last chunk + nchunk = cython.cdiv((self.len - nitems), self._chunklen) + leftover2 = (self.len - nitems) % self._chunklen + leftover = leftover2 * atomsize + + # Remove complete chunks + nchunk2 = lnchunk = cython.cdiv(self._nbytes, + self._chunksize) + while nchunk2 > nchunk: + chunk_ = chunks.pop() + cbytes += chunk_.cbytes + nchunk2 -= 1 + + # Finally, deal with the leftover + if leftover: + self.lastchunkarr[:leftover2] = chunk_[:leftover2] + if self._rootdir: + # Last chunk is removed automatically by the chunks.pop( + # ) call, and + # always is counted as if it is not compressed (although + # it is in + # this state on-disk) + cbytes += chunk_.nbytes + + # Update some counters + self.leftover = leftover + self._cbytes -= cbytes + self._nbytes -= bsize + # Flush last chunk and update counters on-disk + self.flush() + + def resize(self, object nitems): + """ + resize(nitems) + + Resize the instance to have `nitems`. + + Parameters + ---------- + nitems : int + The final length of the object. If `nitems` is larger than the + actual + length, new items will appended using `self.dflt` as filling + values. 
+ + """ + cdef object chunk + + if not isinstance(nitems, (int, long, float)): + raise TypeError, "`nitems` must be an integer" + + if nitems == self.len: + return + elif nitems < 0: + raise ValueError, "`nitems` cannot be negative" + + if nitems > self.len: + # Create a 0-strided array and append it to self + chunk = np.ndarray(nitems - self.len, dtype=self._dtype, + buffer=self._dflt, strides=(0,)) + self.append(chunk) + self.flush() + else: + # Just trim the excess of items + self.trim(self.len - nitems) + + def reshape(self, newshape): + """ + reshape(newshape) + + Returns a new carray containing the same data with a new shape. + + Parameters + ---------- + newshape : int or tuple of ints + The new shape should be compatible with the original shape. If + an integer, then the result will be a 1-D array of that length. + One shape dimension can be -1. In this case, the value is inferred + from the length of the array and remaining dimensions. + + Returns + ------- + reshaped_array : carray + A copy of the original carray. + + """ + cdef npy_intp newlen, ilen, isize, osize, newsize, rsize, i + cdef object ishape, oshape, pos, newdtype, out + + # Enforce newshape as tuple + if isinstance(newshape, (int, long)): + newshape = (newshape,) + newsize = np.prod(newshape) + + ishape = self.shape + ilen = ishape[0] + isize = np.prod(ishape) + + # Check for -1 in newshape + if -1 in newshape: + if newshape.count(-1) > 1: + raise ValueError, "only one shape dimension can be -1" + pos = newshape.index(-1) + osize = np.prod(newshape[:pos] + newshape[pos + 1:]) + if isize == 0: + newshape = newshape[:pos] + (0,) + newshape[pos + 1:] + else: + newshape = newshape[:pos] + (isize / osize,) + newshape[ + pos + 1:] + newsize = np.prod(newshape) + + # Check shape compatibility + if isize != newsize: + raise ValueError, "`newshape` is not compatible with the current " \ + "one" + # Create the output container + newdtype = np.dtype((self._dtype.base, newshape[1:])) + newlen = newshape[0] + + # If shapes are both n-dimensional, convert first to 1-dim shape + # and then convert again to the final newshape. + if len(ishape) > 1 and len(newshape) > 1: + out = self.reshape(-1) + return out.reshape(newshape) - """ - cdef int chunklen - cdef npy_intp startb, stopb - cdef npy_intp nchunk, keychunk, nchunks - cdef npy_intp nwrow, blen, vlen - cdef chunk chunk_ - cdef object start, stop, step - cdef object cdata, arr - - if self.mode == "r": - raise IOError( - "cannot modify data because mode is '%s'" % self.mode) - - # We are going to modify data. Mark block cache as dirty. - if self.idxcache >= 0: - # -2 means that cbytes counter has not to be changed - self.idxcache = -2 - - # Check for integer - # isinstance(key, int) is not enough in Cython (?) 
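In practice the reshape() defined above behaves like its NumPy namesake, except that it returns a freshly filled carray (a copy); a hedged example:

    import numpy as np
    import bcolz

    ca = bcolz.carray(np.arange(12))
    cb = ca.reshape((3, -1))   # the -1 is inferred as 12 // 3 == 4
    cb.shape                   # (3, 4); rows are stored as (4,)-shaped atoms
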
- if isinstance(key, (int, long)) or isinstance(key, np.int_): - if key < 0: - # To support negative values - key += self.len - if key >= self.len: - raise IndexError, "index out of range" - (start, stop, step) = key, key+1, 1 - # Slices - elif isinstance(key, slice): - (start, stop, step) = key.start, key.stop, key.step - if step: - if step <= 0 : - raise NotImplementedError("step in slice can only be positive") - # Multidimensional keys - elif isinstance(key, tuple): - if len(key) == 0: - raise ValueError("empty tuple not supported") - elif len(key) == 1: - self[key[0]] = value - return - # An n-dimensional slice - # First, retrieve elements in the leading dimension - arr = self[key[0]] - # Then, assing only the requested elements in other dimensions - if type(key[0]) == slice: - arr[(slice(None),) + key[1:]] = value - else: - arr[key[1:]] = value - # Finally, update this superset of values in self - self[key[0]] = arr - return - # List of integers (case of fancy indexing) - elif isinstance(key, list): - # Try to convert to a integer array - try: - key = np.array(key, dtype=np.int_) - except: - raise IndexError, "key cannot be converted to an array of indices" - self[key] = value - return - # A boolean or integer array (case of fancy indexing) - elif hasattr(key, "dtype"): - if key.dtype.type == np.bool_: - # A boolean array - if len(key) != self.len: - raise ValueError, "boolean array length must match len(self)" - self.bool_update(key, value) - return - elif np.issubsctype(key, np.int_): - # An integer array - value = utils.to_ndarray(value, self._dtype, arrlen=len(key)) - # XXX This could be optimised, but it works like this - for i, item in enumerate(key): - self[item] = value[i] - return - else: - raise IndexError, \ - "arrays used as indices must be of integer (or boolean) type" - # An boolean expression (case of fancy indexing) - elif type(key) is str: - # Evaluate - result = bcolz.eval(key) - if result.dtype.type != np.bool_: - raise IndexError, "only boolean expressions supported" - if len(result) != self.len: - raise IndexError, "boolean expression outcome must match len(self)" - # Call __setitem__ again - self[result] = value - return - # All the rest not implemented - else: - raise NotImplementedError, "key not supported: %s" % repr(key) - - # Get the corrected values for start, stop, step - (start, stop, step) = slice(start, stop, step).indices(self.len) - - # Build a numpy object out of value - vlen = get_len_of_range(start, stop, step) - if vlen == 0: - # If range is empty, return immediately - return - value = utils.to_ndarray(value, self._dtype, arrlen=vlen) - - # Fill it from data in chunks - nwrow = 0 - chunklen = self._chunklen - nchunks = cython.cdiv(self._nbytes, self._chunksize) - if self.leftover > 0: - nchunks += 1 - for nchunk from 0 <= nchunk < nchunks: - # Compute start & stop for each block - startb, stopb, blen = clip_chunk(nchunk, chunklen, start, stop, step) - if blen == 0: - continue - # Modify the data in chunk - if nchunk == nchunks-1 and self.leftover: - self.lastchunkarr[startb:stopb:step] = value[nwrow:nwrow+blen] - else: - # Get the data chunk - chunk_ = self.chunks[nchunk] - self._cbytes -= chunk_.cbytes - # Get all the values there - cdata = chunk_[:] - # Overwrite it with data from value - cdata[startb:stopb:step] = value[nwrow:nwrow+blen] - # Replace the chunk - chunk_ = chunk(cdata, self._dtype, self._cparams, - _memory = self._rootdir is None) - self.chunks[nchunk] = chunk_ - # Update cbytes counter - self._cbytes += chunk_.cbytes - nwrow += 
blen - - # Safety check - assert (nwrow == vlen) - - # This is a private function that is specific for `eval` - def _getrange(self, npy_intp start, npy_intp blen, ndarray out): - cdef int chunklen - cdef npy_intp startb, stopb - cdef npy_intp nwrow, stop, cblen - cdef npy_intp schunk, echunk, nchunk, nchunks - cdef chunk chunk_ - - # Check that we are inside limits - nrows = cython.cdiv(self._nbytes, self.atomsize) - if (start + blen) > nrows: - blen = nrows - start - - # Fill `out` from data in chunks - nwrow = 0 - stop = start + blen - nchunks = cython.cdiv(self._nbytes, self._chunksize) - chunklen = cython.cdiv(self._chunksize, self.atomsize) - schunk = cython.cdiv(start, chunklen) - echunk = cython.cdiv((start+blen), chunklen) - for nchunk from schunk <= nchunk <= echunk: - # Compute start & stop for each block - startb = start % chunklen - stopb = chunklen - if (start + startb) + chunklen > stop: - # XXX I still have to explain why this expression works - # for chunklen > (start + blen) - stopb = (stop - start) + startb - cblen = stopb - startb - if cblen == 0: - continue - # Get the data chunk and assign it to result array - if nchunk == nchunks and self.leftover: - out[nwrow:nwrow+cblen] = self.lastchunkarr[startb:stopb] - else: - chunk_ = self.chunks[nchunk] - chunk_._getitem(startb, stopb, out.data+nwrow*self.atomsize) - nwrow += cblen - start += cblen + if self._rootdir: + # If persistent, do the copy to a temporary dir + absdir = os.path.dirname(self._rootdir) + rootdir = tempfile.mkdtemp(suffix='__temp__', dir=absdir) + else: + rootdir = None + + # Create the final container and fill it + out = carray([], dtype=newdtype, cparams=self.cparams, + expectedlen=newlen, + rootdir=rootdir, mode='w') + if newlen < ilen: + rsize = isize / newlen + for i from 0 <= i < newlen: + out.append( + self[i * rsize:(i + 1) * rsize].reshape(newdtype.shape)) + else: + for i from 0 <= i < ilen: + out.append(self[i].reshape(-1)) + out.flush() - cdef void bool_update(self, boolarr, value): - """Update self in positions where `boolarr` is true with `value` array.""" - cdef int chunklen - cdef npy_intp startb, stopb - cdef npy_intp nchunk, nchunks, nrows - cdef npy_intp nwrow, blen, vlen, n - cdef chunk chunk_ - cdef object cdata, boolb - - vlen = boolarr.sum() # number of true values in bool array - value = utils.to_ndarray(value, self._dtype, arrlen=vlen) - - # Fill it from data in chunks - nwrow = 0 - chunklen = self._chunklen - nchunks = cython.cdiv(self._nbytes, self._chunksize) - if self.leftover > 0: - nchunks += 1 - nrows = cython.cdiv(self._nbytes, self.atomsize) - for nchunk from 0 <= nchunk < nchunks: - # Compute start & stop for each block - startb, stopb, _ = clip_chunk(nchunk, chunklen, 0, nrows, 1) - # Get boolean values for this chunk - n = nchunk * chunklen - boolb = boolarr[n+startb:n+stopb] - blen = boolb.sum() - if blen == 0: - continue - # Modify the data in chunk - if nchunk == nchunks-1 and self.leftover: - self.lastchunkarr[boolb] = value[nwrow:nwrow+blen] - else: - # Get the data chunk + # Finally, rename the temporary data directory to self._rootdir + if self._rootdir: + shutil.rmtree(self._rootdir) + os.rename(rootdir, self._rootdir) + # Restore the rootdir and mode + out.rootdir = self._rootdir + out.mode = self._mode + + return out + + def copy(self, **kwargs): + """ + copy(**kwargs) + + Return a copy of this object. + + Parameters + ---------- + kwargs : list of parameters or dictionary + Any parameter supported by the carray constructor. 
+ + Returns + ------- + out : carray object + The copy of this object. + + """ + cdef object chunklen + + # Get defaults for some parameters + cparams = kwargs.pop('cparams', self._cparams) + expectedlen = kwargs.pop('expectedlen', self.len) + + # Create a new, empty carray + ccopy = carray(np.empty(0, dtype=self._dtype), + cparams=cparams, + expectedlen=expectedlen, + **kwargs) + + # Now copy the carray chunk by chunk + chunklen = self._chunklen + for i from 0 <= i < self.len by chunklen: + ccopy.append(self[i:i + chunklen]) + ccopy.flush() + + return ccopy + + def sum(self, dtype=None): + """ + sum(dtype=None) + + Return the sum of the array elements. + + Parameters + ---------- + dtype : NumPy dtype + The desired type of the output. If ``None``, the dtype of + `self` is + used. An exception is when `self` has an integer type with less + precision than the default platform integer. In that case, the + default platform integer is used instead (NumPy convention). + + + Return value + ------------ + out : NumPy scalar with `dtype` + + """ + cdef chunk chunk_ + cdef npy_intp nchunk, nchunks + cdef object result + + if dtype is None: + dtype = self._dtype.base + # Check if we have less precision than required for ints + # (mimick NumPy logic) + if dtype.kind in ('b', 'i') and dtype.itemsize < IntType.itemsize: + dtype = IntType + else: + dtype = np.dtype(dtype) + if dtype.kind == 'S': + raise TypeError, "cannot perform reduce with flexible type" + + # Get a container for the result + result = np.zeros(1, dtype=dtype)[0] + + nchunks = cython.cdiv(self._nbytes, self._chunksize) + for nchunk from 0 <= nchunk < nchunks: + chunk_ = self.chunks[nchunk] + if chunk_.isconstant: + result += chunk_.constant * self._chunklen + elif self._dtype.type == np.bool_: + result += chunk_.true_count + else: + result += chunk_[:].sum(dtype=dtype) + if self.leftover: + leftover = self.len - nchunks * self._chunklen + result += self.lastchunkarr[:leftover].sum(dtype=dtype) + + return result + + def __len__(self): + return self.len + + def __sizeof__(self): + return self._cbytes + + cdef int getitem_cache(self, npy_intp pos, char *dest): + """Get a single item and put it in `dest`. It caches a complete block. + + It returns 1 if asked `pos` can be copied to `dest`. Else, + this returns + 0. + + NOTE: As Blosc supports decompressing just a block inside a chunk, the + data that is cached is a *block*, as it is the least amount of data + that + can be decompressed. This saves both time and memory. + + IMPORTANT: Any update operation (e.g. __setitem__) *must* disable this + cache by setting self.idxcache = -2. 
+ """ + cdef int ret, atomsize, blocksize, offset + cdef int idxcache, posinbytes, blocklen + cdef npy_intp nchunk, nchunks, chunklen + cdef chunk chunk_ + + atomsize = self.atomsize + nchunks = cython.cdiv(self._nbytes, self._chunksize) + chunklen = self._chunklen + nchunk = cython.cdiv(pos, chunklen) + + # Check whether pos is in the last chunk + if nchunk == nchunks and self.leftover: + posinbytes = (pos % chunklen) * atomsize + memcpy(dest, self.lastchunk + posinbytes, atomsize) + return 1 + + # Locate the *block* inside the chunk chunk_ = self.chunks[nchunk] - self._cbytes -= chunk_.cbytes - # Get all the values there - cdata = chunk_[:] - # Overwrite it with data from value - cdata[boolb] = value[nwrow:nwrow+blen] - # Replace the chunk - chunk_ = chunk(cdata, self._dtype, self._cparams, - _memory = self._rootdir is None) - self.chunks[nchunk] = chunk_ - # Update cbytes counter - self._cbytes += chunk_.cbytes - nwrow += blen - - # Safety check - assert (nwrow == vlen) - - def __iter__(self): - - if not self.sss_mode: - self.start = 0 - self.stop = cython.cdiv(self._nbytes, self.atomsize) - self.step = 1 - if not (self.sss_mode or self.where_mode or self.wheretrue_mode): - self.nhits = 0 - self.limit = sys.maxint - self.skip = 0 - # Initialize some internal values - self.startb = 0 - self.nrowsread = self.start - self._nrow = self.start - self.step - self._row = -1 # a sentinel - if self.where_mode and isinstance(self.where_arr, carray): - self.nrowsinbuf = self.where_arr.chunklen - else: - self.nrowsinbuf = self._chunklen - - return self - - def iter(self, start=0, stop=None, step=1, limit=None, skip=0): - """ - iter(start=0, stop=None, step=1, limit=None, skip=0) - - Iterator with `start`, `stop` and `step` bounds. - - Parameters - ---------- - start : int - The starting item. - stop : int - The item after which the iterator stops. - step : int - The number of items incremented during each iteration. Cannot be - negative. - limit : int - A maximum number of elements to return. The default is return - everything. - skip : int - An initial number of elements to skip. The default is 0. - - Returns - ------- - out : iterator - - See Also - -------- - where, wheretrue - - """ - # Check limits - if step <= 0: - raise NotImplementedError, "step param can only be positive" - self.start, self.stop, self.step = \ - slice(start, stop, step).indices(self.len) - self.reset_sentinels() - self.sss_mode = True - if limit is not None: - self.limit = limit + skip - self.skip = skip - return iter(self) - - def wheretrue(self, limit=None, skip=0): - """ - wheretrue(limit=None, skip=0) - - Iterator that returns indices where this object is true. - - This is currently only useful for boolean carrays that are unidimensional. - - Parameters - ---------- - limit : int - A maximum number of elements to return. The default is return - everything. - skip : int - An initial number of elements to skip. The default is 0. - - Returns - ------- - out : iterator - - See Also - -------- - iter, where - - """ - # Check self - if self._dtype.base.type != np.bool_: - raise ValueError, "`self` is not an array of booleans" - if self.ndim > 1: - raise NotImplementedError, "`self` is not unidimensional" - self.reset_sentinels() - self.wheretrue_mode = True - if limit is not None: - self.limit = limit + skip - self.skip = skip - return iter(self) - - def where(self, boolarr, limit=None, skip=0): - """ - where(boolarr, limit=None, skip=0) - - Iterator that returns values of this object where `boolarr` is true. 
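The cache bookkeeping in getitem_cache() reduces to integer arithmetic on block boundaries; a standalone sketch of the same indexing, with illustrative values:

    blocklen = 1024                            # items per Blosc block
    pos = 2500                                 # absolute item index being read
    idxcache = (pos // blocklen) * blocklen    # first item of the cached block
    posinblock = pos % blocklen                # offset of `pos` inside the block
    assert idxcache <= pos < idxcache + blocklen
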
- - This is currently only useful for boolean carrays that are unidimensional. - - Parameters - ---------- - boolarr : a carray or NumPy array of boolean type - The boolean values. - limit : int - A maximum number of elements to return. The default is return - everything. - skip : int - An initial number of elements to skip. The default is 0. - - Returns - ------- - out : iterator - - See Also - -------- - iter, wheretrue + blocksize = chunk_.blocksize + blocklen = cython.cdiv(blocksize, atomsize) + + if atomsize > blocksize: + # This request cannot be resolved here + return 0 + + # Check whether the cache block has to be initialized + if self.idxcache < 0: + self.blockcache = np.empty(shape=(blocklen,), dtype=self._dtype) + self.datacache = self.blockcache.data + # We don't want this to contribute to cbytes counter! + # if self.idxcache == -1: + # # Absolute first time. Add the cache size to cbytes counter. + # self._cbytes += chunksize + + # Check if block is cached + idxcache = cython.cdiv(pos, blocklen) * blocklen + if idxcache == self.idxcache: + # Hit! + posinbytes = (pos % blocklen) * atomsize + memcpy(dest, self.datacache + posinbytes, atomsize) + return 1 + + # No luck. Read a complete block. + offset = idxcache % chunklen + chunk_._getitem(offset, offset + blocklen, self.datacache) + # Copy the interesting bits to dest + posinbytes = (pos % blocklen) * atomsize + memcpy(dest, self.datacache + posinbytes, atomsize) + # Update the cache index + self.idxcache = idxcache + return 1 - """ - # Check input - if self.ndim > 1: - raise NotImplementedError, "`self` is not unidimensional" - if not hasattr(boolarr, "dtype"): - raise ValueError, "`boolarr` is not an array" - if boolarr.dtype.type != np.bool_: - raise ValueError, "`boolarr` is not an array of booleans" - if len(boolarr) != self.len: - raise ValueError, "`boolarr` must be of the same length than ``self``" - self.reset_sentinels() - self.where_mode = True - self.where_arr = boolarr - if limit is not None: - self.limit = limit + skip - self.skip = skip - return iter(self) - - def __next__(self): - cdef char *vbool - cdef int nhits_buf - - self.nextelement = self._nrow + self.step - while (self.nextelement < self.stop) and (self.nhits < self.limit): - if self.nextelement >= self.nrowsread: - # Skip until there is interesting information - while self.nextelement >= self.nrowsread + self.nrowsinbuf: - self.nrowsread += self.nrowsinbuf - # Compute the end for this iteration - self.stopb = self.stop - self.nrowsread - if self.stopb > self.nrowsinbuf: - self.stopb = self.nrowsinbuf - self._row = self.startb - self.step - - # Skip chunks with zeros if in wheretrue_mode - if self.wheretrue_mode and self.check_zeros(self): - self.nrowsread += self.nrowsinbuf - self.nextelement += self.nrowsinbuf - continue - - if self.where_mode: - # Skip chunks with zeros in where_arr - if self.check_zeros(self.where_arr): - self.nrowsread += self.nrowsinbuf - self.nextelement += self.nrowsinbuf - continue - # Read a chunk of the boolean array - self.where_buf = self.where_arr[ - self.nrowsread:self.nrowsread+self.nrowsinbuf] - - # Read a data chunk - self.iobuf = self[self.nrowsread:self.nrowsread+self.nrowsinbuf] - self.nrowsread += self.nrowsinbuf - - # Check if we can skip this buffer - if (self.wheretrue_mode or self.where_mode) and self.skip > 0: - if self.wheretrue_mode: - nhits_buf = self.iobuf.sum() - else: - nhits_buf = self.where_buf.sum() - if (self.nhits + nhits_buf) < self.skip: - self.nhits += nhits_buf - self.nextelement += 
self.nrowsinbuf - continue - - self._row += self.step - self._nrow = self.nextelement - if self._row + self.step >= self.stopb: - # Compute the start row for the next buffer - self.startb = (self._row + self.step) % self.nrowsinbuf - self.nextelement = self._nrow + self.step - - # Return a value depending on the mode we are - if self.wheretrue_mode: - vbool = (self.iobuf.data + self._row) - if vbool[0]: - self.nhits += 1 - if self.nhits <= self.skip: - continue - return self._nrow + def __getitem__(self, object key): + """ + x.__getitem__(key) <==> x[key] + + Returns values based on `key`. All the functionality of + ``ndarray.__getitem__()`` is supported (including fancy indexing), + plus a + special support for expressions: + + Parameters + ---------- + key : string + It will be interpret as a boolean expression (computed via + `eval`) and + the elements where these values are true will be returned as a + NumPy + array. + + See Also + -------- + eval + + """ + + cdef int chunklen + cdef npy_intp startb, stopb + cdef npy_intp nchunk, keychunk, nchunks + cdef npy_intp nwrow, blen + cdef ndarray arr1 + cdef object start, stop, step + cdef object arr + + chunklen = self._chunklen + + # Check for integer + # isinstance(key, int) is not enough in Cython (?) + if isinstance(key, (int, long)) or isinstance(key, np.int_): + if key < 0: + # To support negative values + key += self.len + if key >= self.len: + raise IndexError, "index out of range" + arr1 = self.arr1 + if self.getitem_cache(key, arr1.data): + if self.itemsize == self.atomsize: + return PyArray_GETITEM(arr1, arr1.data) + else: + return arr1[0] + # Fallback action: use the slice code + return np.squeeze(self[slice(key, None, 1)]) + # Slices + elif isinstance(key, slice): + (start, stop, step) = key.start, key.stop, key.step + if step and step <= 0: + raise NotImplementedError("step in slice can only be positive") + # Multidimensional keys + elif isinstance(key, tuple): + if len(key) == 0: + raise ValueError("empty tuple not supported") + elif len(key) == 1: + return self[key[0]] + # An n-dimensional slice + # First, retrieve elements in the leading dimension + arr = self[key[0]] + # Then, keep only the required elements in other dimensions + if type(key[0]) == slice: + arr = arr[(slice(None),) + key[1:]] + else: + arr = arr[key[1:]] + # Force a copy in case returned array is not contiguous + if not arr.flags.contiguous: + arr = arr.copy() + return arr + # List of integers (case of fancy indexing) + elif isinstance(key, list): + # Try to convert to a integer array + try: + key = np.array(key, dtype=np.int_) + except: + raise IndexError, "key cannot be converted to an array of " \ + "indices" + return self[key] + # A boolean or integer array (case of fancy indexing) + elif hasattr(key, "dtype"): + if key.dtype.type == np.bool_: + # A boolean array + if len(key) != self.len: + raise IndexError, "boolean array length must match len(" \ + "self)" + if isinstance(key, carray): + count = key.sum() + else: + count = -1 + return np.fromiter(self.where(key), dtype=self._dtype, + count=count) + elif np.issubsctype(key, np.int_): + # An integer array + return np.array([self[i] for i in key], dtype=self._dtype) + else: + raise IndexError, \ + "arrays used as indices must be of integer (or boolean) " \ + "type" + # An boolean expression (case of fancy indexing) + elif type(key) is str: + # Evaluate + result = bcolz.eval(key) + if result.dtype.type != np.bool_: + raise IndexError, "only boolean expressions supported" + if len(result) != self.len: + 
raise IndexError, "boolean expression outcome must match " \ + "len(self)" + # Call __getitem__ again + return self[result] + # All the rest not implemented else: - continue - if self.where_mode: - vbool = (self.where_buf.data + self._row) - if not vbool[0]: - continue - self.nhits += 1 - if self.nhits <= self.skip: - continue - # Return the current value in I/O buffer - if self.itemsize == self.atomsize: - return PyArray_GETITEM( - self.iobuf, self.iobuf.data + self._row * self.atomsize) - else: - return self.iobuf[self._row] - - else: - # Release buffers - self.iobuf = np.empty(0, dtype=self._dtype) - self.where_buf = np.empty(0, dtype=np.bool_) - self.reset_sentinels() - raise StopIteration # end of iteration - - cdef reset_sentinels(self): - """Reset sentinels for iterator.""" - self.sss_mode = False - self.wheretrue_mode = False - self.where_mode = False - self.where_arr = None - self.nhits = 0 - self.limit = sys.maxint - self.skip = 0 - - cdef int check_zeros(self, object barr): - """Check for zeros. Return 1 if all zeros, else return 0.""" - cdef int bsize - cdef npy_intp nchunk - cdef carray carr - cdef ndarray ndarr - cdef chunk chunk_ - - if isinstance(barr, carray): - # Check for zero'ed chunks in carrays - carr = barr - nchunk = cython.cdiv(self.nrowsread, self.nrowsinbuf) - if nchunk < len(carr.chunks): - chunk_ = carr.chunks[nchunk] - if chunk_.isconstant and chunk_.constant in (0, ''): - return 1 - else: - # Check for zero'ed chunks in ndarrays - ndarr = barr - bsize = self.nrowsinbuf - if self.nrowsread + bsize > self.len: - bsize = self.len - self.nrowsread - if check_zeros(ndarr.data + self.nrowsread, bsize): - return 1 - return 0 - - def _update_disk_sizes(self): - """Update the sizes on-disk.""" - sizes = dict() - if self._rootdir: - sizes['shape'] = self.shape - sizes['nbytes'] = self.nbytes - sizes['cbytes'] = self.cbytes - rowsf = os.path.join(self.metadir, SIZES_FILE) - with open(rowsf, 'wb') as rowsfh: - rowsfh.write(json.dumps(sizes)) - rowsfh.write('\n') - - def flush(self): - """Flush data in internal buffers to disk. - - This call should typically be done after performing modifications - (__settitem__(), append()) in persistence mode. If you don't do this, you - risk loosing part of your modifications. + raise NotImplementedError, "key not supported: %s" % repr(key) + + # From now on, will only deal with [start:stop:step] slices + + # Get the corrected values for start, stop, step + (start, stop, step) = slice(start, stop, step).indices(self.len) + + # Build a numpy container + blen = get_len_of_range(start, stop, step) + arr = np.empty(shape=(blen,), dtype=self._dtype) + if blen == 0: + # If empty, return immediately + return arr + + # Fill it from data in chunks + nwrow = 0 + nchunks = cython.cdiv(self._nbytes, self._chunksize) + if self.leftover > 0: + nchunks += 1 + for nchunk from 0 <= nchunk < nchunks: + # Compute start & stop for each block + startb, stopb, blen = clip_chunk(nchunk, chunklen, start, stop, + step) + if blen == 0: + continue + # Get the data chunk and assign it to result array + if nchunk == nchunks - 1 and self.leftover: + arr[nwrow:nwrow + blen] = self.lastchunkarr[startb:stopb:step] + else: + arr[nwrow:nwrow + blen] = self.chunks[nchunk][ + startb:stopb:step] + nwrow += blen + + return arr + + def __setitem__(self, object key, object value): + """ + x.__setitem__(key, value) <==> x[key] = value + + Sets values based on `key`. 
All the functionality of + ``ndarray.__setitem__()`` is supported (including fancy indexing), + plus a + special support for expressions: + + Parameters + ---------- + key : string + It will be interpret as a boolean expression (computed via + `eval`) and + the elements where these values are true will be set to `value`. + + See Also + -------- + eval + + """ + cdef int chunklen + cdef npy_intp startb, stopb + cdef npy_intp nchunk, keychunk, nchunks + cdef npy_intp nwrow, blen, vlen + cdef chunk chunk_ + cdef object start, stop, step + cdef object cdata, arr + + if self.mode == "r": + raise IOError( + "cannot modify data because mode is '%s'" % self.mode) + + # We are going to modify data. Mark block cache as dirty. + if self.idxcache >= 0: + # -2 means that cbytes counter has not to be changed + self.idxcache = -2 + + # Check for integer + # isinstance(key, int) is not enough in Cython (?) + if isinstance(key, (int, long)) or isinstance(key, np.int_): + if key < 0: + # To support negative values + key += self.len + if key >= self.len: + raise IndexError, "index out of range" + (start, stop, step) = key, key + 1, 1 + # Slices + elif isinstance(key, slice): + (start, stop, step) = key.start, key.stop, key.step + if step: + if step <= 0: + raise NotImplementedError( + "step in slice can only be positive") + # Multidimensional keys + elif isinstance(key, tuple): + if len(key) == 0: + raise ValueError("empty tuple not supported") + elif len(key) == 1: + self[key[0]] = value + return + # An n-dimensional slice + # First, retrieve elements in the leading dimension + arr = self[key[0]] + # Then, assing only the requested elements in other dimensions + if type(key[0]) == slice: + arr[(slice(None),) + key[1:]] = value + else: + arr[key[1:]] = value + # Finally, update this superset of values in self + self[key[0]] = arr + return + # List of integers (case of fancy indexing) + elif isinstance(key, list): + # Try to convert to a integer array + try: + key = np.array(key, dtype=np.int_) + except: + raise IndexError, "key cannot be converted to an array of " \ + "indices" + self[key] = value + return + # A boolean or integer array (case of fancy indexing) + elif hasattr(key, "dtype"): + if key.dtype.type == np.bool_: + # A boolean array + if len(key) != self.len: + raise ValueError, "boolean array length must match len(" \ + "self)" + self.bool_update(key, value) + return + elif np.issubsctype(key, np.int_): + # An integer array + value = utils.to_ndarray(value, self._dtype, arrlen=len(key)) + # XXX This could be optimised, but it works like this + for i, item in enumerate(key): + self[item] = value[i] + return + else: + raise IndexError, \ + "arrays used as indices must be of integer (or boolean) " \ + "type" + # An boolean expression (case of fancy indexing) + elif type(key) is str: + # Evaluate + result = bcolz.eval(key) + if result.dtype.type != np.bool_: + raise IndexError, "only boolean expressions supported" + if len(result) != self.len: + raise IndexError, "boolean expression outcome must match " \ + "len(self)" + # Call __setitem__ again + self[result] = value + return + # All the rest not implemented + else: + raise NotImplementedError, "key not supported: %s" % repr(key) + + # Get the corrected values for start, stop, step + (start, stop, step) = slice(start, stop, step).indices(self.len) + + # Build a numpy object out of value + vlen = get_len_of_range(start, stop, step) + if vlen == 0: + # If range is empty, return immediately + return + value = utils.to_ndarray(value, self._dtype, 
arrlen=vlen) + + # Fill it from data in chunks + nwrow = 0 + chunklen = self._chunklen + nchunks = cython.cdiv(self._nbytes, self._chunksize) + if self.leftover > 0: + nchunks += 1 + for nchunk from 0 <= nchunk < nchunks: + # Compute start & stop for each block + startb, stopb, blen = clip_chunk(nchunk, chunklen, start, stop, + step) + if blen == 0: + continue + # Modify the data in chunk + if nchunk == nchunks - 1 and self.leftover: + self.lastchunkarr[startb:stopb:step] = value[ + nwrow:nwrow + blen] + else: + # Get the data chunk + chunk_ = self.chunks[nchunk] + self._cbytes -= chunk_.cbytes + # Get all the values there + cdata = chunk_[:] + # Overwrite it with data from value + cdata[startb:stopb:step] = value[nwrow:nwrow + blen] + # Replace the chunk + chunk_ = chunk(cdata, self._dtype, self._cparams, + _memory=self._rootdir is None) + self.chunks[nchunk] = chunk_ + # Update cbytes counter + self._cbytes += chunk_.cbytes + nwrow += blen + + # Safety check + assert (nwrow == vlen) + + # This is a private function that is specific for `eval` + def _getrange(self, npy_intp start, npy_intp blen, ndarray out): + cdef int chunklen + cdef npy_intp startb, stopb + cdef npy_intp nwrow, stop, cblen + cdef npy_intp schunk, echunk, nchunk, nchunks + cdef chunk chunk_ + + # Check that we are inside limits + nrows = cython.cdiv(self._nbytes, self.atomsize) + if (start + blen) > nrows: + blen = nrows - start + + # Fill `out` from data in chunks + nwrow = 0 + stop = start + blen + nchunks = cython.cdiv(self._nbytes, self._chunksize) + chunklen = cython.cdiv(self._chunksize, self.atomsize) + schunk = cython.cdiv(start, chunklen) + echunk = cython.cdiv((start + blen), chunklen) + for nchunk from schunk <= nchunk <= echunk: + # Compute start & stop for each block + startb = start % chunklen + stopb = chunklen + if (start + startb) + chunklen > stop: + # XXX I still have to explain why this expression works + # for chunklen > (start + blen) + stopb = (stop - start) + startb + cblen = stopb - startb + if cblen == 0: + continue + # Get the data chunk and assign it to result array + if nchunk == nchunks and self.leftover: + out[nwrow:nwrow + cblen] = self.lastchunkarr[startb:stopb] + else: + chunk_ = self.chunks[nchunk] + chunk_._getitem(startb, stopb, + out.data + nwrow * self.atomsize) + nwrow += cblen + start += cblen + + cdef void bool_update(self, boolarr, value): + """Update self in positions where `boolarr` is true with `value` + array.""" + cdef int chunklen + cdef npy_intp startb, stopb + cdef npy_intp nchunk, nchunks, nrows + cdef npy_intp nwrow, blen, vlen, n + cdef chunk chunk_ + cdef object cdata, boolb + + vlen = boolarr.sum() # number of true values in bool array + value = utils.to_ndarray(value, self._dtype, arrlen=vlen) + + # Fill it from data in chunks + nwrow = 0 + chunklen = self._chunklen + nchunks = cython.cdiv(self._nbytes, self._chunksize) + if self.leftover > 0: + nchunks += 1 + nrows = cython.cdiv(self._nbytes, self.atomsize) + for nchunk from 0 <= nchunk < nchunks: + # Compute start & stop for each block + startb, stopb, _ = clip_chunk(nchunk, chunklen, 0, nrows, 1) + # Get boolean values for this chunk + n = nchunk * chunklen + boolb = boolarr[n + startb:n + stopb] + blen = boolb.sum() + if blen == 0: + continue + # Modify the data in chunk + if nchunk == nchunks - 1 and self.leftover: + self.lastchunkarr[boolb] = value[nwrow:nwrow + blen] + else: + # Get the data chunk + chunk_ = self.chunks[nchunk] + self._cbytes -= chunk_.cbytes + # Get all the values there + cdata = chunk_[:] 
+                # Overwrite it with data from value
+                cdata[boolb] = value[nwrow:nwrow + blen]
+                # Replace the chunk
+                chunk_ = chunk(cdata, self._dtype, self._cparams,
+                               _memory=self._rootdir is None)
+                self.chunks[nchunk] = chunk_
+                # Update cbytes counter
+                self._cbytes += chunk_.cbytes
+            nwrow += blen
+
+        # Safety check
+        assert (nwrow == vlen)
+
+    def __iter__(self):
+
+        if not self.sss_mode:
+            self.start = 0
+            self.stop = cython.cdiv(self._nbytes, self.atomsize)
+            self.step = 1
+        if not (self.sss_mode or self.where_mode or self.wheretrue_mode):
+            self.nhits = 0
+            self.limit = sys.maxint
+            self.skip = 0
+        # Initialize some internal values
+        self.startb = 0
+        self.nrowsread = self.start
+        self._nrow = self.start - self.step
+        self._row = -1  # a sentinel
+        if self.where_mode and isinstance(self.where_arr, carray):
+            self.nrowsinbuf = self.where_arr.chunklen
+        else:
+            self.nrowsinbuf = self._chunklen
+
+        return self
+
+    def iter(self, start=0, stop=None, step=1, limit=None, skip=0):
+        """
+        iter(start=0, stop=None, step=1, limit=None, skip=0)
+
+        Iterator with `start`, `stop` and `step` bounds.
+
+        Parameters
+        ----------
+        start : int
+            The starting item.
+        stop : int
+            The item after which the iterator stops.
+        step : int
+            The number of items incremented during each iteration.  Cannot be
+            negative.
+        limit : int
+            A maximum number of elements to return.  The default is to return
+            everything.
+        skip : int
+            An initial number of elements to skip.  The default is 0.
+
+        Returns
+        -------
+        out : iterator
+
+        See Also
+        --------
+        where, wheretrue
+
+        """
+        # Check limits
+        if step <= 0:
+            raise NotImplementedError, "step param can only be positive"
+        self.start, self.stop, self.step = \
+            slice(start, stop, step).indices(self.len)
+        self.reset_sentinels()
+        self.sss_mode = True
+        if limit is not None:
+            self.limit = limit + skip
+        self.skip = skip
+        return iter(self)
+
+    def wheretrue(self, limit=None, skip=0):
+        """
+        wheretrue(limit=None, skip=0)
+
+        Iterator that returns indices where this object is true.
+
+        This is currently only useful for boolean carrays that are
+        unidimensional.
+
+        Parameters
+        ----------
+        limit : int
+            A maximum number of elements to return.  The default is to return
+            everything.
+        skip : int
+            An initial number of elements to skip.  The default is 0.
+
+        Returns
+        -------
+        out : iterator
+
+        See Also
+        --------
+        iter, where
+
+        """
+        # Check self
+        if self._dtype.base.type != np.bool_:
+            raise ValueError, "`self` is not an array of booleans"
+        if self.ndim > 1:
+            raise NotImplementedError, "`self` is not unidimensional"
+        self.reset_sentinels()
+        self.wheretrue_mode = True
+        if limit is not None:
+            self.limit = limit + skip
+        self.skip = skip
+        return iter(self)
+
+    def where(self, boolarr, limit=None, skip=0):
+        """
+        where(boolarr, limit=None, skip=0)
+
+        Iterator that returns values of this object where `boolarr` is true.
+
+        This is currently only useful for carrays that are unidimensional.
+
+        Parameters
+        ----------
+        boolarr : a carray or NumPy array of boolean type
+            The boolean values.
+        limit : int
+            A maximum number of elements to return.  The default is to return
+            everything.
+        skip : int
+            An initial number of elements to skip.  The default is 0.
+
+        Returns
+        -------
+        out : iterator
+
+        See Also
+        --------
+        iter, wheretrue
+
+        """
+        # Check input
+        if self.ndim > 1:
+            raise NotImplementedError, "`self` is not unidimensional"
+        if not hasattr(boolarr, "dtype"):
+            raise ValueError, "`boolarr` is not an array"
+        if boolarr.dtype.type != np.bool_:
+            raise ValueError, "`boolarr` is not an array of booleans"
+        if len(boolarr) != self.len:
+            raise ValueError, "`boolarr` must be of the same length as ``self``"
+        self.reset_sentinels()
+        self.where_mode = True
+        self.where_arr = boolarr
+        if limit is not None:
+            self.limit = limit + skip
+        self.skip = skip
+        return iter(self)
+
+    def __next__(self):
+        cdef char *vbool
+        cdef int nhits_buf
+
+        self.nextelement = self._nrow + self.step
+        while (self.nextelement < self.stop) and (self.nhits < self.limit):
+            if self.nextelement >= self.nrowsread:
+                # Skip until there is interesting information
+                while self.nextelement >= self.nrowsread + self.nrowsinbuf:
+                    self.nrowsread += self.nrowsinbuf
+                # Compute the end for this iteration
+                self.stopb = self.stop - self.nrowsread
+                if self.stopb > self.nrowsinbuf:
+                    self.stopb = self.nrowsinbuf
+                self._row = self.startb - self.step
+
+                # Skip chunks with zeros if in wheretrue_mode
+                if self.wheretrue_mode and self.check_zeros(self):
+                    self.nrowsread += self.nrowsinbuf
+                    self.nextelement += self.nrowsinbuf
+                    continue
+
+                if self.where_mode:
+                    # Skip chunks with zeros in where_arr
+                    if self.check_zeros(self.where_arr):
+                        self.nrowsread += self.nrowsinbuf
+                        self.nextelement += self.nrowsinbuf
+                        continue
+                    # Read a chunk of the boolean array
+                    self.where_buf = self.where_arr[
+                        self.nrowsread:self.nrowsread + self.nrowsinbuf]
+
+                # Read a data chunk
+                self.iobuf = self[
+                    self.nrowsread:self.nrowsread + self.nrowsinbuf]
+                self.nrowsread += self.nrowsinbuf
+
+                # Check if we can skip this buffer
+                if (self.wheretrue_mode or self.where_mode) and self.skip > 0:
+                    if self.wheretrue_mode:
+                        nhits_buf = self.iobuf.sum()
+                    else:
+                        nhits_buf = self.where_buf.sum()
+                    if (self.nhits + nhits_buf) < self.skip:
+                        self.nhits += nhits_buf
+                        self.nextelement += self.nrowsinbuf
+                        continue
+
+            self._row += self.step
+            self._nrow = self.nextelement
+            if self._row + self.step >= self.stopb:
+                # Compute the start row for the next buffer
+                self.startb = (self._row + self.step) % self.nrowsinbuf
+            self.nextelement = self._nrow + self.step
+
+            # Return a value depending on the mode we are in
+            if self.wheretrue_mode:
+                vbool = <char *>(self.iobuf.data + self._row)
+                if vbool[0]:
+                    self.nhits += 1
+                    if self.nhits <= self.skip:
+                        continue
+                    return self._nrow
+                else:
+                    continue
+            if self.where_mode:
+                vbool = <char *>(self.where_buf.data + self._row)
+                if not vbool[0]:
+                    continue
+                self.nhits += 1
+                if self.nhits <= self.skip:
+                    continue
+            # Return the current value in I/O buffer
+            if self.itemsize == self.atomsize:
+                return PyArray_GETITEM(
+                    self.iobuf, self.iobuf.data + self._row * self.atomsize)
+            else:
+                return self.iobuf[self._row]
-        """
-        cdef chunk chunk_
-        cdef npy_intp nchunks
-        cdef int leftover_atoms
-
-        if self._rootdir is None:
-            return
-
-        if self.leftover:
-            leftover_atoms = cython.cdiv(self.leftover, self.atomsize)
-            chunk_ = chunk(self.lastchunkarr[:leftover_atoms], self.dtype,
-                           self.cparams,
-                           _memory = self._rootdir is None)
-            # Flush this chunk to disk
-            self.chunks.flush(chunk_)
-
-        # Finally, update the sizes metadata on-disk
-        self._update_disk_sizes()
-
-        # XXX This does not work.  Will have to realize how to properly
-        # flush buffers before self going away...
-    # def __del__(self):
-    #   # Make a flush to disk if this object get disposed
-    #   self.flush()
-
-    def __str__(self):
-        return array2string(self)
-
-    def __repr__(self):
-        snbytes = utils.human_readable_size(self._nbytes)
-        scbytes = utils.human_readable_size(self._cbytes)
-        cratio = self._nbytes / float(self._cbytes)
-        header = "carray(%s, %s)\n" % (self.shape, self.dtype)
-        header += "  nbytes: %s; cbytes: %s; ratio: %.2f\n" % (
-            snbytes, scbytes, cratio)
-        header += "  cparams := %r\n" % self.cparams
-        if self._rootdir:
-            header += "  rootdir := '%s'\n" % self._rootdir
-        fullrepr = header + str(self)
-        return fullrepr
+        else:
+            # Release buffers
+            self.iobuf = np.empty(0, dtype=self._dtype)
+            self.where_buf = np.empty(0, dtype=np.bool_)
+            self.reset_sentinels()
+            raise StopIteration  # end of iteration
+
+    cdef reset_sentinels(self):
+        """Reset sentinels for iterator."""
+        self.sss_mode = False
+        self.wheretrue_mode = False
+        self.where_mode = False
+        self.where_arr = None
+        self.nhits = 0
+        self.limit = sys.maxint
+        self.skip = 0
+
+    cdef int check_zeros(self, object barr):
+        """Check for zeros.  Return 1 if all zeros, else return 0."""
+        cdef int bsize
+        cdef npy_intp nchunk
+        cdef carray carr
+        cdef ndarray ndarr
+        cdef chunk chunk_
+
+        if isinstance(barr, carray):
+            # Check for zero'ed chunks in carrays
+            carr = barr
+            nchunk = cython.cdiv(self.nrowsread, self.nrowsinbuf)
+            if nchunk < len(carr.chunks):
+                chunk_ = carr.chunks[nchunk]
+                if chunk_.isconstant and chunk_.constant in (0, ''):
+                    return 1
+        else:
+            # Check for zero'ed chunks in ndarrays
+            ndarr = barr
+            bsize = self.nrowsinbuf
+            if self.nrowsread + bsize > self.len:
+                bsize = self.len - self.nrowsread
+            if check_zeros(ndarr.data + self.nrowsread, bsize):
+                return 1
+        return 0
+
+    def _update_disk_sizes(self):
+        """Update the sizes on-disk."""
+        sizes = dict()
+        if self._rootdir:
+            sizes['shape'] = self.shape
+            sizes['nbytes'] = self.nbytes
+            sizes['cbytes'] = self.cbytes
+            rowsf = os.path.join(self.metadir, SIZES_FILE)
+            with open(rowsf, 'wb') as rowsfh:
+                rowsfh.write(json.dumps(sizes))
+                rowsfh.write('\n')
+
+    def flush(self):
+        """Flush data in internal buffers to disk.
+
+        This call should typically be done after performing modifications
+        (__setitem__(), append()) in persistence mode.  If you don't do
+        this, you risk losing part of your modifications.
+
+        """
+        cdef chunk chunk_
+        cdef npy_intp nchunks
+        cdef int leftover_atoms
+
+        if self._rootdir is None:
+            return
+
+        if self.leftover:
+            leftover_atoms = cython.cdiv(self.leftover, self.atomsize)
+            chunk_ = chunk(self.lastchunkarr[:leftover_atoms], self.dtype,
+                           self.cparams,
+                           _memory=self._rootdir is None)
+            # Flush this chunk to disk
+            self.chunks.flush(chunk_)
+
+        # Finally, update the sizes metadata on-disk
+        self._update_disk_sizes()
+
+    # XXX This does not work.  Will have to figure out how to properly
+    # flush buffers before self goes away...
+    # def __del__(self):
+    #   # Make a flush to disk if this object gets disposed
+    #   self.flush()
+
+    def __str__(self):
+        return array2string(self)
+
+    def __repr__(self):
+        snbytes = utils.human_readable_size(self._nbytes)
+        scbytes = utils.human_readable_size(self._cbytes)
+        cratio = self._nbytes / float(self._cbytes)
+        header = "carray(%s, %s)\n" % (self.shape, self.dtype)
+        header += "  nbytes: %s; cbytes: %s; ratio: %.2f\n" % (
+            snbytes, scbytes, cratio)
+        header += "  cparams := %r\n" % self.cparams
+        if self._rootdir:
+            header += "  rootdir := '%s'\n" % self._rootdir
+        fullrepr = header + str(self)
+        return fullrepr
diff --git a/bench/arange.py b/bench/arange.py
index 59cffb4f..b1fa4c77 100644
--- a/bench/arange.py
+++ b/bench/arange.py
@@ -1,6 +1,9 @@
+from time import time
+
 import numpy as np
+
 import bcolz
-from time import time
+
 
 N = 1e8
 dtype = 'i4'
@@ -9,12 +12,12 @@
 
 t0 = time()
 a = np.arange(start, stop, step, dtype=dtype)
-print "Time numpy.arange() --> %.3f" % (time()-t0)
+print "Time numpy.arange() --> %.3f" % (time() - t0)
 
 t0 = time()
 ac = bcolz.arange(start, stop, step, dtype=dtype)
-print "Time bcolsz.arange() --> %.3f" % (time()-t0)
+print "Time bcolz.arange() --> %.3f" % (time() - t0)
 
 print "ac-->", `ac`
 
-#assert(np.all(a == ac))
+# assert(np.all(a == ac))
diff --git a/bench/concat.py b/bench/concat.py
index 594a51bf..814f7cf5 100644
--- a/bench/concat.py
+++ b/bench/concat.py
@@ -13,11 +13,14 @@
 # python bench/concat.py style arraysize nchunks nrepeats clevel
 #
 
-import sys, math
+import sys
+import math
+import time
+
 import numpy
-from numpy.testing import assert_array_equal, assert_array_almost_equal
+
 import bcolz
-import time
+
 
 def concat(data):
     tlen = sum(x.shape[0] for x in data)
@@ -25,11 +28,12 @@
     pos = 0
     for x in data:
         step = x.shape[0]
-        alldata[pos:pos+step] = x
+        alldata[pos:pos + step] = x
         pos += step
 
     return alldata
 
+
 def append(data, clevel):
     alldata = bcolz.carray(data[0], cparams=bcolz.cparams(clevel))
     for carr in data[1:]:
@@ -37,6 +41,7 @@
 
     return alldata
 
+
 if len(sys.argv) < 2:
     print "Pass at least one of these styles: 'numpy', 'concat' or 'bcolz' "
     sys.exit(1)
@@ -45,7 +50,7 @@
 if len(sys.argv) == 2:
     N, K, T, clevel = (1000000, 10, 3, 1)
 else:
-    N,K,T = [int(arg) for arg in sys.argv[2:5]]
+    N, K, T = [int(arg) for arg in sys.argv[2:5]]
 if len(sys.argv) > 5:
     clevel = int(sys.argv[5])
 else:
@@ -53,7 +58,7 @@
 
 # The next datasets allow for very high compression ratios
 a = [numpy.arange(N, dtype='f8') for _ in range(K)]
-print("problem size: (%d) x %d = 10^%g" % (N, K, math.log10(N*K)))
+print("problem size: (%d) x %d = 10^%g" % (N, K, math.log10(N * K)))
 
 t = time.time()
 if style == 'numpy':
@@ -72,5 +77,5 @@
 if style == 'bcolz':
     size = r.cbytes
 else:
-    size = r.size*r.dtype.itemsize
-print("size of the final container: %.3f MB" % (size / float(1024*1024)) )
+    size = r.size * r.dtype.itemsize
+print("size of the final container: %.3f MB" % (size / float(1024 * 1024)) )
diff --git a/bench/ctable-query.py b/bench/ctable-query.py
index f368db1c..e6d8bcb4 100644
--- a/bench/ctable-query.py
+++ b/bench/ctable-query.py
@@ -2,20 +2,24 @@
 # is needed in order to execute this. A comparison with SQLite3 and
 # PyTables (if installed) is also done.
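For reference, the ctable query path this benchmark drives looks roughly like this from user code; a minimal sketch, assuming a stock bcolz install (the field names `f0`/`f1` are just the NumPy structured-array defaults):

import numpy as np

import bcolz

# A tiny two-column ctable built from a structured array
t = bcolz.ctable(np.fromiter(((i, i * 2.) for i in range(100)),
                             dtype="f8,f8", count=100))

# where() evaluates the boolean expression (via numexpr when available)
# and yields only the matching rows; the second argument selects the
# columns to return, as with `squery` above
out = [row for row in t.where("(f0 > 90) & (f1 < 195)", "f0,f1")]
print len(out)  # 7 rows: f0 in 91..97
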
-import sys, math -import os, os.path +import sys +import math +import os +import os.path import subprocess import getopt - import sqlite3 +from time import time + import numpy as np + import bcolz -from time import time -NR = 1e5 # the number of rows -NC = 500 # the number of columns -mv = 1e10 # the mean value for entries (sig digits = 17 - log10(mv)) -clevel = 3 # the compression level + +NR = 1e5 # the number of rows +NC = 500 # the number of columns +mv = 1e10 # the mean value for entries (sig digits = 17 - log10(mv)) +clevel = 3 # the compression level show = False # show statistics # The query for a ctable squery = "(f2>.9) & ((f8>.3) & (f8<.4))" # the ctable query @@ -30,7 +34,7 @@ def show_rss(explain): global tref # Build the command to obtain memory info newtref = time() - print "Time (%20s) --> %.3f" % (explain, newtref-tref), + print "Time (%20s) --> %.3f" % (explain, newtref - tref), tref = newtref if show: cmd = "cat /proc/%s/status" % os.getpid() @@ -42,27 +46,30 @@ def show_rss(explain): else: print + def enter(): global tref tref = time() + def after_create(mess=""): global tref - if mess: mess = ", "+mess - show_rss("creation"+mess) + if mess: mess = ", " + mess + show_rss("creation" + mess) + def after_query(mess=""): global tref - if mess: mess = ", "+mess - show_rss("query"+mess) + if mess: mess = ", " + mess + show_rss("query" + mess) def test_numpy(): enter() - t = np.fromiter((mv+np.random.rand(NC)-mv for i in xrange(int(NR))), + t = np.fromiter((mv + np.random.rand(NC) - mv for i in xrange(int(NR))), dtype=dt) after_create() - out = np.fromiter(((row['f1'],row['f3']) for row in t[eval(nquery)]), + out = np.fromiter(((row['f1'], row['f3']) for row in t[eval(nquery)]), dtype="f8,f8") after_query() return out @@ -70,13 +77,14 @@ def test_numpy(): def test_numexpr(): import numexpr as ne + enter() - t = np.fromiter((mv+np.random.rand(NC)-mv for i in xrange(int(NR))), + t = np.fromiter((mv + np.random.rand(NC) - mv for i in xrange(int(NR))), dtype=dt) after_create() - map_field = dict(("f%s"%i, t["f%s"%i]) for i in range(NC)) - out = np.fromiter(((row['f1'],row['f3']) for row in + map_field = dict(("f%s" % i, t["f%s" % i]) for i in range(NC)) + out = np.fromiter(((row['f1'], row['f3']) for row in t[ne.evaluate(squery, map_field)]), dtype="f8,f8") after_query() @@ -85,10 +93,11 @@ def test_numexpr(): def test_ctable(clevel): enter() - tc = bcolz.fromiter((mv+np.random.rand(NC)-mv for i in xrange(int(NR))), - dtype=dt, - cparams=bcolz.cparams(clevel), - count=int(NR)) + tc = bcolz.fromiter( + (mv + np.random.rand(NC) - mv for i in xrange(int(NR))), + dtype=dt, + cparams=bcolz.cparams(clevel), + count=int(NR)) after_create() out = np.fromiter((row for row in tc.where(squery, 'f1,f3')), @@ -104,37 +113,38 @@ def test_sqlite(): con = sqlite3.connect(":memory:") # Create table - fields = "(%s)" % ",".join(["f%d real"%i for i in range(NC)]) + fields = "(%s)" % ",".join(["f%d real" % i for i in range(NC)]) con.execute("create table bench %s" % fields) # Insert a NR rows of data vals = "(%s)" % ",".join(["?" 
for i in range(NC)]) with con: con.executemany("insert into bench values %s" % vals, - (mv+np.random.rand(NC)-mv for i in xrange(int(NR)))) + (mv + np.random.rand(NC) - mv for i in + xrange(int(NR)))) after_create() out = np.fromiter( (row for row in con.execute( - "select f1, f3 from bench where %s" % sqlquery)), + "select f1, f3 from bench where %s" % sqlquery)), dtype="f8,f8") after_query("non-indexed") # Create indexes - con.execute("create index f1idx on bench (f1)") - con.execute("create index f2idx on bench (f8)") + con.execute("CREATE INDEX f1idx ON bench (f1)") + con.execute("CREATE INDEX f2idx ON bench (f8)") after_create("index") out = np.fromiter( (row for row in con.execute( - "select f1, f3 from bench where %s" % sqlquery)), + "select f1, f3 from bench where %s" % sqlquery)), dtype="f8,f8") after_query("indexed") return out -if __name__=="__main__": +if __name__ == "__main__": global dt usage = """usage: %s [-s] [-m method] [-c ncols] [-r nrows] [-z clevel] @@ -168,8 +178,8 @@ def test_sqlite(): np.random.seed(12) # so as to get reproducible results # The dtype for tables - #dt = np.dtype("f8,"*NC) # aligned fields - dt = np.dtype("f8,"*(NC-1)+"i1") # unaligned fields + # dt = np.dtype("f8,"*NC) # aligned fields + dt = np.dtype("f8," * (NC - 1) + "i1") # unaligned fields if method == "numexpr": mess = "numexpr (+numpy)" diff --git a/bench/eval-profile.py b/bench/eval-profile.py index f23a672d..ef583949 100644 --- a/bench/eval-profile.py +++ b/bench/eval-profile.py @@ -3,31 +3,34 @@ # execute this. import math +from time import time + import numpy as np import numexpr as ne + import bcolz -from time import time + def compute_bcolz(sexpr, clevel, vm): # Uncomment the next for disabling threading - #bcolz.set_nthreads(1) + # bcolz.set_nthreads(1) #bcolz.blosc_set_nthreads(1) print("*** bcolz (using compression clevel = %d):" % clevel) x = cx # comment this for using numpy arrays in inputs t0 = time() cout = bcolz.eval(sexpr, vm=vm, cparams=bcolz.cparams(clevel)) - print("Time for bcolz.eval (%s) --> %.3f" % (vm, time()-t0,)) + print("Time for bcolz.eval (%s) --> %.3f" % (vm, time() - t0,)) #print(", cratio (out): %.1f" % (cout.nbytes / float(cout.cbytes))) #print "cout-->", repr(cout) -if __name__=="__main__": +if __name__ == "__main__": - N = 1e8 # the number of elements in x - clevel = 3 # the compression level + N = 1e8 # the number of elements in x + clevel = 3 # the compression level sexpr = "(x+1)<0" sexpr = "(((.25*x + .75)*x - 1.5)*x - 2)<0" - #sexpr = "(((.25*x + .75)*x - 1.5)*x - 2)" + # sexpr = "(((.25*x + .75)*x - 1.5)*x - 2)" doprofile = 0 print("Creating inputs...") @@ -39,13 +42,14 @@ def compute_bcolz(sexpr, clevel, vm): t0 = time() cout = ne.evaluate(sexpr) - print "Time for numexpr --> %.3f" % (time()-t0,) + print "Time for numexpr --> %.3f" % (time() - t0,) if doprofile: import pstats import cProfile as prof + prof.run('compute_bcolz(sexpr, clevel=clevel, vm="numexpr")', - #prof.run('compute_bcolz(sexpr, clevel=clevel, vm="python")', + #prof.run('compute_bcolz(sexpr, clevel=clevel, vm="python")', 'eval.prof') stats = pstats.Stats('eval.prof') stats.strip_dirs() diff --git a/bench/eval.py b/bench/eval.py index 164b2782..38e5c5fb 100644 --- a/bench/eval.py +++ b/bench/eval.py @@ -3,33 +3,38 @@ # execute this. 
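The bcolz.eval() calls exercised by these benchmarks follow this pattern; a minimal sketch, assuming a stock bcolz install:

import numpy as np

import bcolz

x = bcolz.carray(np.linspace(0, 100, 1000))

# eval() finds `x` in the caller's namespace and evaluates the
# expression chunk by chunk; `vm` picks the engine ("numexpr" when
# installed, "python" otherwise) and `cparams` the output compression
cout = bcolz.eval("(((.25*x + .75)*x - 1.5)*x - 2) < 0",
                  vm="python", cparams=bcolz.cparams(3))
print repr(cout)  # a compressed boolean carray with the result
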
import math +from time import time + import numpy as np import numexpr as ne + import bcolz -from time import time -N = 1e7 # the number of elements in x -clevel = 9 # the compression level -sexprs = [ "(x+1)<0", - "(2*x**2+.3*y**2+z+1)<0", - "((.25*x + .75)*x - 1.5)*x - 2", - "(((.25*x + .75)*x - 1.5)*x - 2)<0", - ] + +N = 1e7 # the number of elements in x +clevel = 9 # the compression level +sexprs = ["(x+1)<0", + "(2*x**2+.3*y**2+z+1)<0", + "((.25*x + .75)*x - 1.5)*x - 2", + "(((.25*x + .75)*x - 1.5)*x - 2)<0", +] # Initial dataset -#x = np.arange(N) -x = np.linspace(0,100,N) +# x = np.arange(N) +x = np.linspace(0, 100, N) doprofile = False + def compute_ref(sexpr): t0 = time() out = eval(sexpr) - print "Time for plain numpy --> %.3f" % (time()-t0,) + print "Time for plain numpy --> %.3f" % (time() - t0,) t0 = time() out = ne.evaluate(sexpr) - print "Time for numexpr (numpy) --> %.3f" % (time()-t0,) + print "Time for numexpr (numpy) --> %.3f" % (time() - t0,) + def compute_carray(sexpr, clevel, vm): # Uncomment the next for disabling threading @@ -40,12 +45,12 @@ def compute_carray(sexpr, clevel, vm): x, y, z = cx, cy, cz t0 = time() cout = bcolz.eval(sexpr, vm=vm, cparams=bcolz.cparams(clevel)) - print "Time for bcolz.eval (%s) --> %.3f" % (vm, time()-t0,), + print "Time for bcolz.eval (%s) --> %.3f" % (vm, time() - t0,), print ", cratio (out): %.1f" % (cout.nbytes / float(cout.cbytes)) #print "cout-->", repr(cout) -if __name__=="__main__": +if __name__ == "__main__": print "Creating inputs..." @@ -67,8 +72,9 @@ def compute_carray(sexpr, clevel, vm): import cProfile as prof #prof.run('compute_carray(sexpr, clevel=clevel, vm="numexpr")', prof.run('compute_carray(sexpr, clevel=0, vm="numexpr")', - #prof.run('compute_carray(sexpr, clevel=clevel, vm="python")', - #prof.run('compute_carray(sexpr, clevel=0, vm="python")', + #prof.run('compute_carray(sexpr, clevel=clevel, + # vm="python")', + #prof.run('compute_carray(sexpr, clevel=0, vm="python")', 'eval.prof') stats = pstats.Stats('eval.prof') stats.strip_dirs() diff --git a/bench/expression.py b/bench/expression.py index fbce39ef..2c87fa24 100644 --- a/bench/expression.py +++ b/bench/expression.py @@ -2,15 +2,17 @@ # ctable objects. Numexpr is needed in order to execute this. 
import math +from time import time + import numpy as np -from numpy.testing import assert_array_equal, assert_array_almost_equal import numexpr as ne + import bcolz -from time import time -N = 1e7 # the number of elements in x -clevel = 3 # the compression level -#sexpr = "(x+1)<0" # the expression to compute + +N = 1e7 # the number of elements in x +clevel = 3 # the compression level +# sexpr = "(x+1)<0" # the expression to compute #sexpr = "(2*x**3+.3*y**2+z+1)<0" # the expression to compute sexpr = "((.25*x + .75)*x - 1.5)*x - 2" # a computer-friendly polynomial #sexpr = "(((.25*x + .75)*x - 1.5)*x - 2)<0" # a computer-friendly polynomial @@ -29,17 +31,17 @@ z = np.arange(N) cy = bcolz.carray(y, cparams=cparams) cz = bcolz.carray(z, cparams=cparams) - t = bcolz.ctable((cx, cy, cz), names=['x','y','z']) + t = bcolz.ctable((cx, cy, cz), names=['x', 'y', 'z']) print "Evaluating '%s' with 10^%d points" % (sexpr, int(math.log10(N))) t0 = time() out = eval(sexpr) -print "Time for plain numpy--> %.3f" % (time()-t0,) +print "Time for plain numpy--> %.3f" % (time() - t0,) t0 = time() out = ne.evaluate(sexpr) -print "Time for numexpr (numpy)--> %.3f" % (time()-t0,) +print "Time for numexpr (numpy)--> %.3f" % (time() - t0,) # Uncomment the next for disabling threading #ne.set_num_threads(1) @@ -52,7 +54,7 @@ t0 = time() #cout = t.eval(sexpr, kernel=kernel, cparams=cparams) cout = t.eval(sexpr, cparams=cparams) - print "Time for ctable (%s) --> %.3f" % (kernel, time()-t0,) + print "Time for ctable (%s) --> %.3f" % (kernel, time() - t0,) #print "cout-->", repr(cout) #assert_array_equal(out, cout, "Arrays are not equal") diff --git a/bench/fill.py b/bench/fill.py index 8296b48e..ac1ebca1 100644 --- a/bench/fill.py +++ b/bench/fill.py @@ -1,28 +1,30 @@ +from time import time + import numpy as np + import bcolz -from time import time + N = 1e8 dtype = 'i4' t0 = time() a = np.ones(N, dtype=dtype) -print "Time numpy.ones() --> %.4f" % (time()-t0) +print "Time numpy.ones() --> %.4f" % (time() - t0) t0 = time() ac = bcolz.fill(N, dtype=dtype, dflt=1) -#ac = bcolz.carray(a) -print "Time carray.fill(dflt=1) --> %.4f" % (time()-t0) +# ac = bcolz.carray(a) +print "Time carray.fill(dflt=1) --> %.4f" % (time() - t0) print "ac-->", `ac` t0 = time() sa = a.sum() -print "Time a.sum() --> %.4f" % (time()-t0) +print "Time a.sum() --> %.4f" % (time() - t0) t0 = time() sac = ac.sum() -print "Time ac.sum() --> %.4f" % (time()-t0) - +print "Time ac.sum() --> %.4f" % (time() - t0) -assert(sa == sac) +assert (sa == sac) diff --git a/bench/fromiter.py b/bench/fromiter.py index f70f7bd7..06d06fe9 100644 --- a/bench/fromiter.py +++ b/bench/fromiter.py @@ -1,26 +1,29 @@ # Benchmark for assessing the `fromiter()` speed. -import numpy as np -from numpy.testing import assert_array_equal, assert_array_almost_equal -import bcolz import itertools as it from time import time +import numpy as np +from numpy.testing import assert_array_equal + +import bcolz + + N = int(1e6) # the number of elements in x -clevel = 2 # the compression level +clevel = 2 # the compression level print "Creating inputs with %d elements..." % N -x = xrange(N) # not a true iterable, but can be converted -y = xrange(1,N+1) -z = xrange(2,N+2) +x = xrange(N) # not a true iterable, but can be converted +y = xrange(1, N + 1) +z = xrange(2, N + 2) print "Starting benchmark now for creating arrays..." 
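The two fromiter() flavors timed below differ only in `count`; a minimal sketch, assuming a stock bcolz install:

import bcolz

# With a known `count`, fromiter() can size the container up front;
# count=-1 consumes the iterable until exhaustion, growing as it goes
c = bcolz.fromiter((i * i for i in xrange(1000)), dtype='f8', count=1000)
print len(c), c[10]  # 1000 100.0
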
# Create a ndarray -#x = (i for i in xrange(N)) # true iterable +# x = (i for i in xrange(N)) # true iterable t0 = time() out = np.fromiter(x, dtype='f8', count=N) -print "Time for array--> %.3f" % (time()-t0,) +print "Time for array--> %.3f" % (time() - t0,) print "out-->", len(out) #bcolz.set_num_threads(bcolz.ncores//2) @@ -29,7 +32,7 @@ #x = (i for i in xrange(N)) # true iterable t0 = time() cout = bcolz.fromiter(x, dtype='f8', count=N, cparams=bcolz.cparams(clevel)) -print "Time for carray--> %.3f" % (time()-t0,) +print "Time for carray--> %.3f" % (time() - t0,) print "cout-->", len(cout) assert_array_equal(out, cout, "Arrays are not equal") @@ -37,29 +40,29 @@ #x = (i for i in xrange(N)) # true iterable t0 = time() cout = bcolz.fromiter(x, dtype='f8', count=-1, cparams=bcolz.cparams(clevel)) -print "Time for carray (count=-1)--> %.3f" % (time()-t0,) +print "Time for carray (count=-1)--> %.3f" % (time() - t0,) print "cout-->", len(cout) assert_array_equal(out, cout, "Arrays are not equal") # Retrieve from a structured ndarray -gen = ((i,j,k) for i,j,k in it.izip(x,y,z)) +gen = ((i, j, k) for i, j, k in it.izip(x, y, z)) t0 = time() out = np.fromiter(gen, dtype="f8,f8,f8", count=N) -print "Time for structured array--> %.3f" % (time()-t0,) +print "Time for structured array--> %.3f" % (time() - t0,) print "out-->", len(out) # Retrieve from a ctable -gen = ((i,j,k) for i,j,k in it.izip(x,y,z)) +gen = ((i, j, k) for i, j, k in it.izip(x, y, z)) t0 = time() cout = bcolz.fromiter(gen, dtype="f8,f8,f8", count=N) -print "Time for ctable--> %.3f" % (time()-t0,) +print "Time for ctable--> %.3f" % (time() - t0,) print "out-->", len(cout) assert_array_equal(out, cout[:], "Arrays are not equal") # Retrieve from a ctable (with unknown size) -gen = ((i,j,k) for i,j,k in it.izip(x,y,z)) +gen = ((i, j, k) for i, j, k in it.izip(x, y, z)) t0 = time() cout = bcolz.fromiter(gen, dtype="f8,f8,f8", count=-1) -print "Time for ctable (count=-1)--> %.3f" % (time()-t0,) +print "Time for ctable (count=-1)--> %.3f" % (time() - t0,) print "out-->", len(cout) assert_array_equal(out, cout[:], "Arrays are not equal") diff --git a/bench/getitem.py b/bench/getitem.py index 005d4a8d..d7425603 100644 --- a/bench/getitem.py +++ b/bench/getitem.py @@ -1,32 +1,35 @@ # Benchmark for getitem +from time import time + import numpy as np + import bcolz -from time import time -N = 1e7 # the number of elements in x -M = 100000 # the elements to get -clevel = 1 # the compression level + +N = 1e7 # the number of elements in x +M = 100000 # the elements to get +clevel = 1 # the compression level print "Creating inputs with %d elements..." % N cparams = bcolz.cparams(clevel) -#x = np.arange(N) +# x = np.arange(N) x = np.zeros(N, dtype="f8") y = x.copy() z = x.copy() cx = bcolz.carray(x, cparams=cparams) cy = cx.copy() cz = cx.copy() -ct = bcolz.ctable((cx, cy, cz), names=['x','y','z']) +ct = bcolz.ctable((cx, cy, cz), names=['x', 'y', 'z']) t = ct[:] print "Starting benchmark now for getting %d elements..." 
% M # Retrieve from a ndarray t0 = time() vals = [x[i] for i in xrange(0, M, 3)] -print "Time for array--> %.3f" % (time()-t0,) +print "Time for array--> %.3f" % (time() - t0,) print "vals-->", len(vals) #bcolz.set_num_threads(bcolz.ncores//2) @@ -35,20 +38,20 @@ t0 = time() cvals = [cx[i] for i in xrange(0, M, 3)] #cvals = cx[:M:3][:].tolist() -print "Time for carray--> %.3f" % (time()-t0,) +print "Time for carray--> %.3f" % (time() - t0,) print "vals-->", len(cvals) assert vals == cvals # Retrieve from a structured ndarray t0 = time() vals = [t[i] for i in xrange(0, M, 3)] -print "Time for structured array--> %.3f" % (time()-t0,) +print "Time for structured array--> %.3f" % (time() - t0,) print "vals-->", len(vals) # Retrieve from a ctable t0 = time() cvals = [ct[i] for i in xrange(0, M, 3)] #cvals = ct[:M:3][:].tolist() -print "Time for ctable--> %.3f" % (time()-t0,) +print "Time for ctable--> %.3f" % (time() - t0,) print "vals-->", len(cvals) assert vals == cvals diff --git a/bench/iter.py b/bench/iter.py index b7bac9da..f2bf9ef8 100644 --- a/bench/iter.py +++ b/bench/iter.py @@ -1,9 +1,12 @@ # Benchmark to compare times for iterators in generator contexts by # using carrays vs plain numpy arrays. +from time import time + import numpy as np + import bcolz -from time import time + N = 1e6 @@ -11,15 +14,15 @@ b = bcolz.carray(a) t0 = time() -#sum1 = sum(a) +# sum1 = sum(a) sum1 = sum((v for v in a[2::3] if v < 10)) -t1 = time()-t0 +t1 = time() - t0 print "Summing using numpy iterator: %.3f" % t1 t0 = time() #sum2 = sum(b) sum2 = sum((v for v in b.iter(2, None, 3) if v < 10)) -t2 = time()-t0 -print "Summing using carray iterator: %.3f speedup: %.2f" % (t2, t1/t2) +t2 = time() - t0 +print "Summing using carray iterator: %.3f speedup: %.2f" % (t2, t1 / t2) assert sum1 == sum2, "Summations are not equal!" diff --git a/bench/iterator.py b/bench/iterator.py index 588ed0fd..4f27dc42 100644 --- a/bench/iterator.py +++ b/bench/iterator.py @@ -1,13 +1,16 @@ # Benchmark for iterators +from time import time + import numpy as np + import bcolz -from time import time -N = 1e8 # the number of elements in x -clevel = 5 # the compression level + +N = 1e8 # the number of elements in x +clevel = 5 # the compression level sexpr = "(x-1) < 10." # the expression to compute -#sexpr = "((x-1) % 1000) == 0." # the expression to compute +# sexpr = "((x-1) % 1000) == 0." # the expression to compute #sexpr = "(2*x**3+.3*y**2+z+1)<0" # the expression to compute cparams = bcolz.cparams(clevel) @@ -23,31 +26,31 @@ z = np.arange(N) cy = bcolz.carray(y, cparams=cparams) cz = bcolz.carray(z, cparams=cparams) - ct = bcolz.ctable((cx, cy, cz), names=['x','y','z']) + ct = bcolz.ctable((cx, cy, cz), names=['x', 'y', 'z']) print "Evaluating...", sexpr t0 = time() cbout = ct.eval(sexpr) -print "Time for evaluation--> %.3f" % (time()-t0,) +print "Time for evaluation--> %.3f" % (time() - t0,) print "Converting to numy arrays" bout = cbout[:] t = ct[:] t0 = time() cbool = bcolz.carray(bout, cparams=cparams) -print "Time for converting boolean--> %.3f" % (time()-t0,) +print "Time for converting boolean--> %.3f" % (time() - t0,) print "cbool-->", repr(cbool) t0 = time() vals = [v for v in cbool.wheretrue()] -print "Time for wheretrue()--> %.3f" % (time()-t0,) +print "Time for wheretrue()--> %.3f" % (time() - t0,) print "vals-->", len(vals) print "Starting benchmark now..." 
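The carray iterators driven below (`iter()`, `where()`, `wheretrue()`) are the ones documented earlier in this patch; a minimal sketch, assuming a stock bcolz install:

import numpy as np

import bcolz

a = bcolz.carray(np.arange(10))
b = bcolz.carray(np.arange(10) > 6)  # a boolean carray

print list(a.iter(2, 9, 3))  # bounded iteration: [2, 5, 8]
print list(b.wheretrue())    # indices of the true values: [7, 8, 9]
print list(a.where(b))       # values of `a` where `b` is true: [7, 8, 9]
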
# Retrieve from a ndarray t0 = time() vals = [v for v in x[bout]] -print "Time for array--> %.3f" % (time()-t0,) +print "Time for array--> %.3f" % (time() - t0,) #print "vals-->", len(vals) #bcolz.set_num_threads(bcolz.ncores//2) @@ -56,20 +59,20 @@ t0 = time() #cvals = [v for v in cx[cbout]] cvals = [v for v in cx.where(cbout)] -print "Time for carray--> %.3f" % (time()-t0,) +print "Time for carray--> %.3f" % (time() - t0,) #print "vals-->", len(cvals) assert vals == cvals # Retrieve from a structured ndarray t0 = time() vals = [tuple(v) for v in t[bout]] -print "Time for structured array--> %.3f" % (time()-t0,) +print "Time for structured array--> %.3f" % (time() - t0,) #print "vals-->", len(vals) # Retrieve from a ctable t0 = time() #cvals = [tuple(v) for v in ct[cbout]] cvals = [v for v in ct.where(cbout)] -print "Time for ctable--> %.3f" % (time()-t0,) +print "Time for ctable--> %.3f" % (time() - t0,) #print "vals-->", len(cvals) assert vals == cvals diff --git a/bench/large_carray.py b/bench/large_carray.py index 43ff766f..be6c9ebc 100644 --- a/bench/large_carray.py +++ b/bench/large_carray.py @@ -1,8 +1,10 @@ -## Benchmark to check the creation of an array of length > 2**32 (5e9) +# # Benchmark to check the creation of an array of length > 2**32 (5e9) -import bcolz from time import time +import bcolz + + t0 = time() #cn = bcolz.zeros(5e9, dtype="i1") cn = bcolz.zeros(5e9, dtype="i1", rootdir='large_carray-bench', mode='w') diff --git a/bench/query.py b/bench/query.py index 7898e7ca..0a4c6d2a 100644 --- a/bench/query.py +++ b/bench/query.py @@ -2,17 +2,18 @@ # Numexpr is needed in order to execute this. import math +from time import time + import numpy as np -from numpy.testing import assert_array_equal, assert_array_almost_equal -import numexpr as ne + import bcolz -from time import time -N = 1e7 # the number of elements in x -clevel = 5 # the compression level -sexpr = "(x+1)<10" # small number of items -#sexpr = "(x+1)<1000000" # large number -sexpr = "(2*x*x*x+.3*y**2+z+1)<10" # small number + +N = 1e7 # the number of elements in x +clevel = 5 # the compression level +sexpr = "(x+1)<10" # small number of items +# sexpr = "(x+1)<1000000" # large number +sexpr = "(2*x*x*x+.3*y**2+z+1)<10" # small number #sexpr = "(2*x*x*x+.3*y**2+z+1)<1e15" # medium number #sexpr = "(2*x*x*x+.3*y**2+z+1)<1e20" # large number @@ -29,22 +30,22 @@ z = np.arange(N) cy = bcolz.carray(y, cparams=cparams) cz = bcolz.carray(z, cparams=cparams) - t = bcolz.ctable((cx, cy, cz), names=['x','y','z']) + t = bcolz.ctable((cx, cy, cz), names=['x', 'y', 'z']) nt = t[:] print "Querying '%s' with 10^%d points" % (sexpr, int(math.log10(N))) t0 = time() out = [r for r in x[eval(sexpr)]] -print "Time for numpy--> %.3f" % (time()-t0,) +print "Time for numpy--> %.3f" % (time() - t0,) t0 = time() out = [r for r in t[eval(sexpr)]] -print "Time for structured array--> %.3f" % (time()-t0,) +print "Time for structured array--> %.3f" % (time() - t0,) t0 = time() out = [r for r in cx[sexpr]] -print "Time for carray --> %.3f" % (time()-t0,) +print "Time for carray --> %.3f" % (time() - t0,) # Uncomment the next for disabling threading #ne.set_num_threads(1) @@ -58,7 +59,7 @@ cout = [r for r in t.where(sexpr)] #cout = [r['x'] for r in t.where(sexpr)] #cout = [r['y'] for r in t.where(sexpr, colnames=['x', 'y'])] -print "Time for ctable--> %.3f" % (time()-t0,) +print "Time for ctable--> %.3f" % (time() - t0,) print "cout-->", len(cout), cout[:10] #assert_array_equal(out, cout, "Arrays are not equal") diff --git 
a/bench/serialization.py b/bench/serialization.py index f1c9ff11..3dab240b 100644 --- a/bench/serialization.py +++ b/bench/serialization.py @@ -1,6 +1,9 @@ +from time import time + import numpy as np + import bcolz -from time import time + N = int(1e7) CLEVEL = 5 @@ -9,50 +12,50 @@ t0 = time() ac = bcolz.carray(a, cparams=bcolz.cparams(clevel=CLEVEL)) -print "time creation (memory) ->", round(time()-t0, 3) +print "time creation (memory) ->", round(time() - t0, 3) print "data (memory):", repr(ac) t0 = time() b = bcolz.carray(a, cparams=bcolz.cparams(clevel=CLEVEL), rootdir='myarray', mode='w') b.flush() -print "time creation (disk) ->", round(time()-t0, 3) -#print "meta (disk):", b.read_meta() +print "time creation (disk) ->", round(time() - t0, 3) +# print "meta (disk):", b.read_meta() t0 = time() an = np.array(a) -print "time creation (numpy) ->", round(time()-t0, 3) +print "time creation (numpy) ->", round(time() - t0, 3) t0 = time() c = bcolz.carray(rootdir='myarray') -print "time open (disk) ->", round(time()-t0, 3) +print "time open (disk) ->", round(time() - t0, 3) #print "meta (disk):", c.read_meta() print "data (disk):", repr(c) t0 = time() print sum(ac) -print "time sum (memory, iter) ->", round(time()-t0, 3) +print "time sum (memory, iter) ->", round(time() - t0, 3) t0 = time() print sum(c) -print "time sum (disk, iter) ->", round(time()-t0, 3) +print "time sum (disk, iter) ->", round(time() - t0, 3) t0 = time() print bcolz.eval('sum(ac)') -print "time sum (memory, eval) ->", round(time()-t0, 3) +print "time sum (memory, eval) ->", round(time() - t0, 3) t0 = time() print bcolz.eval('sum(c)') -print "time sum (disk, eval) ->", round(time()-t0, 3) +print "time sum (disk, eval) ->", round(time() - t0, 3) t0 = time() print ac.sum() -print "time sum (memory, method) ->", round(time()-t0, 3) +print "time sum (memory, method) ->", round(time() - t0, 3) t0 = time() print c.sum() -print "time sum (disk, method) ->", round(time()-t0, 3) +print "time sum (disk, method) ->", round(time() - t0, 3) t0 = time() print a.sum() -print "time sum (numpy, method) ->", round(time()-t0, 3) +print "time sum (numpy, method) ->", round(time() - t0, 3) diff --git a/bench/sum.py b/bench/sum.py index 6ad83437..95e64375 100644 --- a/bench/sum.py +++ b/bench/sum.py @@ -1,28 +1,31 @@ +from time import time + import numpy as np + import bcolz -from time import time + N = 1e8 -#a = np.arange(N, dtype='f8') -a = np.random.randint(0,10,N).astype('bool') +# a = np.arange(N, dtype='f8') +a = np.random.randint(0, 10, N).astype('bool') t0 = time() sa = a.sum() -print "Time sum() numpy --> %.3f" % (time()-t0) +print "Time sum() numpy --> %.3f" % (time() - t0) t0 = time() ac = bcolz.carray(a, cparams=bcolz.cparams(9)) -print "Time carray conv --> %.3f" % (time()-t0) +print "Time carray conv --> %.3f" % (time() - t0) print "ac-->", `ac` t0 = time() sac = ac.sum() #sac = ac.sum(dtype=np.dtype('i8')) -print "Time sum() carray --> %.3f" % (time()-t0) +print "Time sum() carray --> %.3f" % (time() - t0) # t0 = time() # sac = sum(i for i in ac) # print "Time sum() carray (iter) --> %.3f" % (time()-t0) print "sa, sac-->", sa, sac, type(sa), type(sac) -assert(sa == sac) +assert (sa == sac) diff --git a/bench/zeros.py b/bench/zeros.py index a4f4e2bd..6a37b2eb 100644 --- a/bench/zeros.py +++ b/bench/zeros.py @@ -1,18 +1,21 @@ +from time import time + import numpy as np + import bcolz -from time import time + N = 2e8 dtype = 'i4' t0 = time() a = np.zeros(N, dtype=dtype) -print "Time numpy.zeros() --> %.4f" % (time()-t0) +print "Time 
numpy.zeros() --> %.4f" % (time() - t0) t0 = time() ac = bcolz.zeros(N, dtype=dtype) -#ac = bcolz.carray(a) -print "Time bcolz.zeros() --> %.4f" % (time()-t0) +# ac = bcolz.carray(a) +print "Time bcolz.zeros() --> %.4f" % (time() - t0) print "ac-->", `ac` diff --git a/doc/conf.py b/doc/conf.py index 48e76c60..a59ee3ba 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -3,7 +3,8 @@ # bcolz documentation build configuration file, created by # sphinx-quickstart on Mon Dec 13 13:54:01 2010. # -# This file is execfile()d with the current directory set to its containing dir. +# This file is execfile()d with the current directory set to its containing +# dir. # # Note that not all possible configuration values are present in this # autogenerated file. @@ -11,24 +12,26 @@ # All configuration values have a default; values that are commented out # serve to show the default. -import sys, os - # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. -#sys.path.insert(0, os.path.abspath('.')) +# sys.path.insert(0, os.path.abspath('.')) -# -- General configuration ----------------------------------------------------- +# -- General configuration +# ----------------------------------------------------- # If your documentation needs a minimal Sphinx version, state it here. #needs_sphinx = '1.0' -# Add any Sphinx extension module names here, as strings. They can be extensions +# Add any Sphinx extension module names here, as strings. They can be +# extensions # coming with Sphinx (named 'sphinx.ext.*') or your custom ones. -#extensions = ['sphinx.ext.autodoc', 'sphinx.ext.coverage', 'sphinx.ext.ifconfig', 'sphinx.ext.viewcode'] +#extensions = ['sphinx.ext.autodoc', 'sphinx.ext.coverage', +# 'sphinx.ext.ifconfig', 'sphinx.ext.viewcode'] # `viewcode` dona alguns problemes: # http://bitbucket.org/birkenfeld/sphinx/issue/515/keyerror-while-building -extensions = ['sphinx.ext.autodoc', 'sphinx.ext.coverage', 'sphinx.ext.ifconfig'] +extensions = ['sphinx.ext.autodoc', 'sphinx.ext.coverage', + 'sphinx.ext.ifconfig'] # Add any paths that contain templates here, relative to this directory. templates_path = ['_templates'] @@ -69,7 +72,8 @@ # directories to ignore when looking for source files. exclude_patterns = ['_build'] -# The reST default role (used for this markup: `text`) to use for all documents. +# The reST default role (used for this markup: `text`) to use for all +# documents. #default_role = None # If true, '()' will be appended to :func: etc. cross-reference text. @@ -90,7 +94,8 @@ #modindex_common_prefix = [] -# -- Options for HTML output --------------------------------------------------- +# -- Options for HTML output +# --------------------------------------------------- # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. @@ -170,7 +175,8 @@ htmlhelp_basename = 'bcolzdoc' -# -- Options for LaTeX output -------------------------------------------------- +# -- Options for LaTeX output +# -------------------------------------------------- # The paper size ('letter' or 'a4'). #latex_paper_size = 'letter' @@ -179,10 +185,11 @@ #latex_font_size = '10pt' # Grouping the document tree into LaTeX files. List of tuples -# (source start file, target name, title, author, documentclass [howto/manual]). +# (source start file, target name, title, author, documentclass [ +# howto/manual]). 
 latex_documents = [
-  ('index', 'bcolz.tex', u'bcolz Documentation',
-   u'Francesc Alted', 'manual'),
+    ('index', 'bcolz.tex', u'bcolz Documentation',
+     u'Francesc Alted', 'manual'),
 ]
 
 # Appendices only appear in the latex output, so bad luck
@@ -212,7 +219,8 @@
 #latex_domain_indices = True
 
 
-# -- Options for manual page output --------------------------------------------
+# -- Options for manual page output
+# --------------------------------------------
 
 # One entry per manual page. List of tuples
 # (source start file, name, description, authors, manual section).
diff --git a/doc/index.rst b/doc/index.rst
index f4af1124..a2bb9d5c 100644
--- a/doc/index.rst
+++ b/doc/index.rst
@@ -1,7 +1,7 @@
 .. bcolz documentation master file, created by
-   sphinx-quickstart on Mon Dec 13 13:54:01 2010.
-   You can adapt this file completely to your liking, but it should at least
-   contain the root `toctree` directive.
+   sphinx-quickstart on Mon Dec 13 13:54:01 2010.
+   You can adapt this file completely to your liking, but it should at least
+   contain the root `toctree` directive.
 
 Welcome to bcolz's documentation!
 ==================================
@@ -9,14 +9,14 @@ Welcome to bcolz's documentation!
 Contents:
 
 .. toctree::
-   :maxdepth: 2
+   :maxdepth: 2
 
-   intro
-   install
-   tutorial
-   reference
-   opt-tips
-   defaults
+   intro
+   install
+   tutorial
+   reference
+   opt-tips
+   defaults
 
 Indices and tables
 ==================
diff --git a/pavement.py b/pavement.py
index 6ff95c1e..9ded8e59 100644
--- a/pavement.py
+++ b/pavement.py
@@ -1,4 +1,4 @@
-########################################################################
+# #######################################################################
 #
 #       License: BSD
 #       Created: December 14, 2010
@@ -6,13 +6,16 @@
 #
 ########################################################################
 
-import sys, os, glob
+import sys
+import os
+import glob
 import textwrap
+from distutils.core import Extension
+from distutils.dep_util import newer
 
 from paver.easy import *
 from paver.setuputils import setup
-from distutils.core import Extension
-from distutils.dep_util import newer
+
 
 # Some functions for showing errors and warnings.
 def _print_admonition(kind, head, body):
@@ -23,25 +26,28 @@ def _print_admonition(kind, head, body):
     for line in tw.wrap(body):
         print line
 
+
 def exit_with_error(head, body=''):
     _print_admonition('error', head, body)
     sys.exit(1)
 
+
 def print_warning(head, body=''):
     _print_admonition('warning', head, body)
 
+
 def check_import(pkgname, pkgver):
     try:
         mod = __import__(pkgname)
     except ImportError:
-        exit_with_error(
-            "You need %(pkgname)s %(pkgver)s or greater to run bcolz!"
-            % {'pkgname': pkgname, 'pkgver': pkgver} )
+        exit_with_error(
+            "You need %(pkgname)s %(pkgver)s or greater to run bcolz!"
+            % {'pkgname': pkgname, 'pkgver': pkgver})
     else:
         if mod.__version__ < pkgver:
             exit_with_error(
                 "You need %(pkgname)s %(pkgver)s or greater to run bcolz!"
-                % {'pkgname': pkgname, 'pkgver': pkgver} )
+                % {'pkgname': pkgname, 'pkgver': pkgver})
 
     print ( "* Found %(pkgname)s %(pkgver)s package installed."
% {'pkgname': pkgname, 'pkgver': mod.__version__} ) @@ -65,6 +71,7 @@ def check_import(pkgname, pkgver): cython = False try: from Cython.Compiler.Main import Version + if Version.version >= min_cython_version: cython = True except: @@ -93,6 +100,7 @@ def check_import(pkgname, pkgver): inc_dirs = ['bcolz', 'blosc'] # Include NumPy header dirs from numpy.distutils.misc_util import get_numpy_include_dirs + inc_dirs.extend(get_numpy_include_dirs()) cython_pyxfiles = glob.glob('bcolz/*.pyx') cython_cfiles = [fn.split('.')[0] + '.c' for fn in cython_pyxfiles] @@ -117,13 +125,14 @@ def check_import(pkgname, pkgver): @task def cythonize(): for fn in glob.glob('bcolz/*.pyx'): - dest = fn.split('.')[0] + '.c' - if newer(fn, dest): - if not cython: - exit_with_error( - "Need Cython >= %s to generate extensions." - % min_cython_version) - sh("cython " + fn) + dest = fn.split('.')[0] + '.c' + if newer(fn, dest): + if not cython: + exit_with_error( + "Need Cython >= %s to generate extensions." + % min_cython_version) + sh("cython " + fn) + @task @needs('html', 'setuptools.command.sdist') @@ -131,15 +140,18 @@ def sdist(): """Generate a source distribution for the package.""" pass + @task @needs(['cythonize', 'setuptools.command.build']) def build(): - pass + pass + @task @needs(['cythonize', 'setuptools.command.build_ext']) def build_ext(): - pass + pass + @task @needs('paver.doctools.html') @@ -150,6 +162,7 @@ def html(options): builtdocs = path("doc") / options.builddir / "html" builtdocs.move(destdir) + @task def pdf(options): """Build the docs in PDF format.""" @@ -162,14 +175,13 @@ def pdf(options): # Options for Paver tasks options( - sphinx = Bunch( - docroot = "doc", - builddir = "_build" + sphinx=Bunch( + docroot="doc", + builddir="_build" ), ) - classifiers = """\ Development Status :: 4 - Beta Intended Audience :: Developers @@ -184,10 +196,10 @@ def pdf(options): # Package options setup( - name = 'bcolz', - version = VERSION, - description = 'A columnar and compressed data container.', - long_description = """\ + name='bcolz', + version=VERSION, + description='A columnar and compressed data container.', + long_description="""\ bcolz is a columnar and compressed data container. Column storage allows for efficiently querying tables with a large number of columns. It also @@ -197,26 +209,27 @@ def pdf(options): compressor that is optimized for binary data. 
""", - classifiers = filter(None, classifiers.split("\n")), - author = 'Francesc Alted', - author_email = 'francesc@blosc.io', - url = "https://github.com/Blosc/bcolz", - license = 'http://www.opensource.org/licenses/bsd-license.php', + classifiers=filter(None, classifiers.split("\n")), + author='Francesc Alted', + author_email='francesc@blosc.io', + url="https://github.com/Blosc/bcolz", + license='http://www.opensource.org/licenses/bsd-license.php', # It is better to upload manually to PyPI - #download_url = "http://bcolz.blosc.org/download/bcolz-%s/bcolz-%s.tar.gz" % (VERSION, VERSION), - platforms = ['any'], - ext_modules = [ - Extension( "bcolz.bcolz_ext", - include_dirs=inc_dirs, - sources = cython_cfiles + blosc_files, - depends = ["bcolz/definitions.pxd"] + blosc_files, - library_dirs=lib_dirs, - libraries=libs, - extra_link_args=LFLAGS, - extra_compile_args=CFLAGS ), + #download_url = "http://bcolz.blosc.org/download/bcolz-%s/bcolz-%s.tar + # .gz" % (VERSION, VERSION), + platforms=['any'], + ext_modules=[ + Extension("bcolz.bcolz_ext", + include_dirs=inc_dirs, + sources=cython_cfiles + blosc_files, + depends=["bcolz/definitions.pxd"] + blosc_files, + library_dirs=lib_dirs, + libraries=libs, + extra_link_args=LFLAGS, + extra_compile_args=CFLAGS), ], - packages = ['bcolz', 'bcolz.tests'], - include_package_data = True, + packages=['bcolz', 'bcolz.tests'], + include_package_data=True, ) diff --git a/persistence.rst b/persistence.rst index 8b1ae1d0..b9d90c5e 100644 --- a/persistence.rst +++ b/persistence.rst @@ -7,30 +7,30 @@ RFC for a persistence layer for bcolz :Version: 0.1 (August 19, 2012) -The original bcolz container (up to version 0.4) consisted on -basically a list of compressed in-memory blocks. This document -explains how to extend it to allow to store the data blocks on disk -too. + The original bcolz container (up to version 0.4) consisted on + basically a list of compressed in-memory blocks. This document + explains how to extend it to allow to store the data blocks on disk + too. -The goals of this proposal are: + The goals of this proposal are: -1. Allow to work with data directly on disk, exactly on the same way - than data in memory. + 1. Allow to work with data directly on disk, exactly on the same way + than data in memory. -2. Must support the same access capabilities than bcolz objects - including: append data, modifying data and direct access to data. + 2. Must support the same access capabilities than bcolz objects + including: append data, modifying data and direct access to data. -3. Transparent data compression must be possible. + 3. Transparent data compression must be possible. -4. User metadata addition must be possible too. + 4. User metadata addition must be possible too. -5. The data should be easily 'shardeable' for optimal behaviour in a - distributed storage environment. + 5. The data should be easily 'shardeable' for optimal behaviour in a + distributed storage environment. -This, in combination with a distributed filesystem, and combined with -a system that is aware of the physical topology of the -underlying storage media would allow to almost replace the need for -a distributed infrastructure for data (e.g. Disco/Hadoop). + This, in combination with a distributed filesystem, and combined with + a system that is aware of the physical topology of the + underlying storage media would allow to almost replace the need for + a distributed infrastructure for data (e.g. Disco/Hadoop). 
The layout ========== diff --git a/setup.py b/setup.py index de0be4ee..0780266b 100644 --- a/setup.py +++ b/setup.py @@ -1,13 +1,15 @@ -######################################################################## +# ####################################################################### # -# License: BSD +# License: BSD # Created: August 16, 2012 # Author: Francesc Alted - francesc@blosc.io # ######################################################################## -import sys, os +from __future__ import absolute_import +import sys +import os from distutils.core import Extension from distutils.core import setup import textwrap @@ -24,25 +26,28 @@ def _print_admonition(kind, head, body): for line in tw.wrap(body): print line + def exit_with_error(head, body=''): _print_admonition('error', head, body) sys.exit(1) + def print_warning(head, body=''): _print_admonition('warning', head, body) + def check_import(pkgname, pkgver): try: mod = __import__(pkgname) except ImportError: - exit_with_error( - "You need %(pkgname)s %(pkgver)s or greater to run bcolz!" - % {'pkgname': pkgname, 'pkgver': pkgver} ) + exit_with_error( + "You need %(pkgname)s %(pkgver)s or greater to run bcolz!" + % {'pkgname': pkgname, 'pkgver': pkgver}) else: if mod.__version__ < pkgver: exit_with_error( "You need %(pkgname)s %(pkgver)s or greater to run bcolz!" - % {'pkgname': pkgname, 'pkgver': pkgver} ) + % {'pkgname': pkgname, 'pkgver': pkgver}) print ( "* Found %(pkgname)s %(pkgver)s package installed." % {'pkgname': pkgname, 'pkgver': mod.__version__} ) @@ -75,12 +80,12 @@ def check_import(pkgname, pkgver): except: exit_with_error( "You need %(pkgname)s %(pkgver)s or greater to compile bcolz!" - % {'pkgname': 'Cython', 'pkgver': min_cython_version} ) + % {'pkgname': 'Cython', 'pkgver': min_cython_version}) if Version.version < min_cython_version: exit_with_error( "At least Cython %s is needed so as to generate extensions!" - % (min_cython_version) ) + % (min_cython_version)) else: print ( "* Found %(pkgname)s %(pkgver)s package installed." % {'pkgname': 'Cython', 'pkgver': Version.version} ) @@ -105,7 +110,7 @@ def check_import(pkgname, pkgver): print_warning( "Numexpr %s installed, but version is not >= %s. " "Disabling support for it." % ( - numexpr.__version__, min_numexpr_version)) + numexpr.__version__, min_numexpr_version)) ########### End of checks ########## @@ -124,6 +129,7 @@ def check_import(pkgname, pkgver): inc_dirs = ['blosc'] # Include NumPy header dirs from numpy.distutils.misc_util import get_numpy_include_dirs + inc_dirs.extend(get_numpy_include_dirs()) optional_libs = [] @@ -144,7 +150,6 @@ def check_import(pkgname, pkgver): # Add some macros here for debugging purposes, if needed def_macros = [] - classifiers = """\ Development Status :: 4 - Beta Intended Audience :: Developers @@ -156,10 +161,10 @@ def check_import(pkgname, pkgver): Operating System :: Microsoft :: Windows Operating System :: Unix """ -setup(name = "bcolz", - version = VERSION, - description = 'columnar and compressed data containers.', - long_description = """\ +setup(name="bcolz", + version=VERSION, + description='columnar and compressed data containers.', + long_description="""\ bcolz provides columnar and compressed data containers. Column storage allows for efficiently querying tables with a large number of columns. It @@ -169,32 +174,33 @@ def check_import(pkgname, pkgver): a high-performance compressor that is optimized for binary data. 
""", - classifiers = filter(None, classifiers.split("\n")), - author = 'Francesc Alted', - author_email = 'francesc@blosc.io', - maintainer = 'Francesc Alted', - maintainer_email = 'francesc@blosc.io', - url = 'https://github.com/Blosc/bcolz', - license = 'http://www.opensource.org/licenses/bsd-license.php', + classifiers=filter(None, classifiers.split("\n")), + author='Francesc Alted', + author_email='francesc@blosc.io', + maintainer='Francesc Alted', + maintainer_email='francesc@blosc.io', + url='https://github.com/Blosc/bcolz', + license='http://www.opensource.org/licenses/bsd-license.php', # It is better to upload manually to PyPI - #download_url = 'http://github.com/downloads/Blosc/bcolz/python-bcolz-%s.tar.gz' % (VERSION,), - platforms = ['any'], - cmdclass = {'build_ext': build_ext}, - ext_modules = [ - Extension( "bcolz.bcolz_ext", - include_dirs=inc_dirs, - define_macros=def_macros, - sources = [ "bcolz/bcolz_ext.pyx", - "blosc/blosc.c", "blosc/blosclz.c", - "blosc/shuffle.c" ], - depends = [ "blosc/blosc.h", "blosc/blosclz.h", - "blosc/shuffle.h" ], - library_dirs=lib_dirs, - libraries=libs, - extra_link_args=LFLAGS, - extra_compile_args=CFLAGS ), - ], - packages = ['bcolz', 'bcolz.tests'], + #download_url = 'http://github.com/downloads/Blosc/bcolz/python-bcolz + # -%s.tar.gz' % (VERSION,), + platforms=['any'], + cmdclass={'build_ext': build_ext}, + ext_modules=[ + Extension("bcolz.bcolz_ext", + include_dirs=inc_dirs, + define_macros=def_macros, + sources=["bcolz/bcolz_ext.pyx", + "blosc/blosc.c", "blosc/blosclz.c", + "blosc/shuffle.c"], + depends=["blosc/blosc.h", "blosc/blosclz.h", + "blosc/shuffle.h"], + library_dirs=lib_dirs, + libraries=libs, + extra_link_args=LFLAGS, + extra_compile_args=CFLAGS), + ], + packages=['bcolz', 'bcolz.tests'], )