Skip to content
This repository has been archived by the owner on Dec 11, 2023. It is now read-only.

introduction of an abstraction layer for the "results array" #187

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
124 changes: 102 additions & 22 deletions bcolz/ctable.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
import os
import shutil
from .py2help import _inttypes, _strtypes, imap, xrange
import weakref

_inttypes += (np.integer,)
islice = itertools.islice
Expand Down Expand Up @@ -222,8 +223,36 @@ def __init__(self, columns=None, names=None, **kwargs):
# Attach the attrs to this object
self.attrs = attrs.attrs(self.rootdir, self.mode, _new=_new)

# Cache a structured array of len 1 for ctable[int] acceleration
self._arr1 = np.empty(shape=(1,), dtype=self.dtype)
# Initialise output structure cache
self._outstruc_update_cache()

def __new__(cls, *args, **kwargs):
# keep track of all ctable instances to be able to update their
# output structure caches when the output processor changes
if not hasattr(cls, '_instances'):
cls._instances = []
new_instance = object.__new__(cls)
cls._instances.append(weakref.ref(new_instance))
return new_instance

@classmethod
def _update_outstruc_processor(cls, processor):
bcolz.ctable._outstruc_allocate = processor.allocate
bcolz.ctable._outstruc_update_cache = processor.update_cache
bcolz.ctable._outstruc_fromindices = processor.fromindices
bcolz.ctable._outstruc_fromboolarr = processor.fromboolarr
assert hasattr(processor, '__setitem__')

if not hasattr(cls, '_instances'):
return

live_instances = []
for instance in cls._instances:
if instance() is not None:
instance()._outstruc_update_cache()
live_instances.append(instance)
cls._instances = live_instances


def create_ctable(self, columns, names, **kwargs):
"""Create a ctable anew."""
Expand Down Expand Up @@ -487,8 +516,9 @@ def addcol(self, newcol, name=None, pos=None, move=False, **kwargs):

# Insert the column
self.cols.insert(name, pos, newcol)
# Update _arr1
self._arr1 = np.empty(shape=(1,), dtype=self.dtype)
# Update output structure cache
self._outstruc_update_cache()


if self.auto_flush:
self.flush()
Expand Down Expand Up @@ -540,8 +570,9 @@ def delcol(self, name=None, pos=None, keep=False):
if not keep:
col.purge()

# Update _arr1
self._arr1 = np.empty(shape=(1,), dtype=self.dtype)
# Update output structure cache
self._outstruc_update_cache()


if self.auto_flush:
self.flush()
Expand Down Expand Up @@ -1013,11 +1044,7 @@ def _where(self, boolarr, colnames=None):

if colnames is None:
colnames = self.names
cols = [self.cols[name][boolarr] for name in colnames]
dtype = np.dtype([(name, self.cols[name].dtype) for name in colnames])
result = np.rec.fromarrays(cols, dtype=dtype).view(np.ndarray)

return result
return self._outstruc_fromboolarr(boolarr, colnames).ra

def __getitem__(self, key):
"""Returns values based on `key`.
Expand All @@ -1043,10 +1070,10 @@ def __getitem__(self, key):
# First, check for integer
if isinstance(key, _inttypes):
# Get a copy of the len-1 array
ra = self._arr1.copy()
result = self._outstruc_allocate(1)
# Fill it
ra[0] = tuple([self.cols[name][key] for name in self.names])
return ra[0]
result[0] = [self.cols[name][key] for name in self.names]
return result.ra
# Slices
elif type(key) == slice:
(start, stop, step) = key.start, key.stop, key.step
Expand All @@ -1060,7 +1087,7 @@ def __getitem__(self, key):
# List of integers (case of fancy indexing), or list of column names
elif type(key) is list:
if len(key) == 0:
return np.empty(0, self.dtype)
return self._outstruc_allocate(0, self.dtype).ra
strlist = [type(v) for v in key] == [str for v in key]
# Range of column names
if strlist:
Expand All @@ -1072,15 +1099,14 @@ def __getitem__(self, key):
except:
raise IndexError(
"key cannot be converted to an array of indices")
return np.fromiter((self[i] for i in key),
dtype=self.dtype, count=len(key))
return self._outstruc_fromindices(key).ra
# A boolean array (case of fancy indexing)
elif hasattr(key, "dtype"):
if key.dtype.type == np.bool_:
return self._where(key)
elif np.issubsctype(key, np.int_):
# An integer array
return np.array([self[i] for i in key], dtype=self.dtype)
return self._outstruc_fromindices(key).ra
else:
raise IndexError(
"arrays used as indices must be integer (or boolean)")
Expand All @@ -1105,12 +1131,12 @@ def __getitem__(self, key):
(start, stop, step) = slice(start, stop, step).indices(self.len)
# Build a numpy container
n = utils.get_len_of_range(start, stop, step)
ra = np.empty(shape=(n,), dtype=self.dtype)
result = self._outstruc_allocate(n, self.dtype)
# Fill it
for name in self.names:
ra[name][:] = self.cols[name][start:stop:step]
result[name] = self.cols[name][start:stop:step]

return ra
return result.ra

def __setitem__(self, key, value):
"""Sets values based on `key`.
Expand Down Expand Up @@ -1231,7 +1257,22 @@ def _get_stats(self):
return (nbytes, cbytes, cratio)

def __str__(self):
return array2string(self)
if self._outstruc_allocate.__func__ == OutputStructure_numpy.allocate:
return array2string(self)

# if a custom output structure is configured, use numpy for
# bcolz string representation for consistent output formatting
current_allocate_fn = self._outstruc_allocate
OutputStructure_numpy.update_cache(self)
def tmp_allocate(*args, **kwargs):
return OutputStructure_numpy.allocate(self, *args, **kwargs)
self._outstruc_allocate = tmp_allocate

result = array2string(self)

del self._outstruc_allocate
self._outstruc_update_cache()
return result

def __repr__(self):
nbytes, cbytes, cratio = self._get_stats()
Expand All @@ -1247,6 +1288,45 @@ def __repr__(self):
return fullrepr


class OutputStructure_numpy(object):
__slots__ = ['ra']

@staticmethod
def update_cache(ctable_):
ctable_._outstruc_cache = np.empty(shape=(1,), dtype=ctable_.dtype)

@staticmethod
def allocate(ctable_, size, dtype=None):
result = object.__new__(OutputStructure_numpy)
if size == 1:
result.ra = ctable_._outstruc_cache.copy()
else:
result.ra = np.empty(size, dtype)
return result

@staticmethod
def fromindices(ctable_, iter):
result = object.__new__(OutputStructure_numpy)
result.ra = np.fromiter((ctable_[i] for i in iter),
dtype=ctable_.dtype, count=len(iter))
return result

@staticmethod
def fromboolarr(ctable_, boolarr, colnames):
result = object.__new__(OutputStructure_numpy)

dtype = np.dtype([(name, ctable_.cols[name].dtype) for name in colnames])
cols = [ctable_.cols[name][boolarr] for name in colnames]
result.ra = np.rec.fromarrays(cols, dtype=dtype).view(np.ndarray)
return result

def __setitem__(self, key, value):
if isinstance(key, int):
self.ra[key] = tuple(value)
else:
self.ra[key][:] = value


# Local Variables:
# mode: python
# tab-width: 4
Expand Down
23 changes: 23 additions & 0 deletions bcolz/defaults.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from __future__ import absolute_import

import bcolz
from bcolz.ctable import OutputStructure_numpy


class Defaults(object):
Expand Down Expand Up @@ -70,6 +71,22 @@ def eval_out_flavor(self, value):
self.check_choices('eval_out_flavor', value)
self.__eval_out_flavor = value

@property
def ctable_out_implementation(self):
return self.__ctable_out_implementation

@ctable_out_implementation.setter
def ctable_out_implementation(self, value):
if value is None:
value = OutputStructure_numpy
try:
bcolz.ctable._update_outstruc_processor(value)
self.__ctable_out_implementation = value
except (AttributeError, AssertionError):
bcolz.ctable._update_outstruc_processor(OutputStructure_numpy)
raise NotImplementedError(
'The output structure implementation is incomplete')

@property
def cparams(self):
return self.__cparams
Expand All @@ -90,6 +107,12 @@ def cparams(self, value):
'numpy'. Default is 'carray'.
"""

defaults.ctable_out_implementation = None
"""
The implementation of the output structure abstraction layer for the
output object in `__getitem__()`.
"""

defaults.eval_vm = "numexpr" if bcolz.numexpr_here else "python"
"""
The virtual machine to be used in computations (via `eval`). It can
Expand Down