Skip to content
This repository has been archived by the owner on Dec 11, 2023. It is now read-only.

pandas out_flavor for ctable #184

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
245 changes: 230 additions & 15 deletions bcolz/ctable.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
import numpy as np
import bcolz
from bcolz import utils, attrs, array2string
if bcolz.pandas_here:
import pandas as pd
import itertools
from collections import namedtuple
import json
Expand Down Expand Up @@ -1013,11 +1015,9 @@ def _where(self, boolarr, colnames=None):

if colnames is None:
colnames = self.names
cols = [self.cols[name][boolarr] for name in colnames]
dtype = np.dtype([(name, self.cols[name].dtype) for name in colnames])
result = np.rec.fromarrays(cols, dtype=dtype).view(np.ndarray)
result = OutputStructure.fromboolarr(self, boolarr, colnames)

return result
return result.ra

def __getitem__(self, key):
"""Returns values based on `key`.
Expand All @@ -1043,10 +1043,10 @@ def __getitem__(self, key):
# First, check for integer
if isinstance(key, _inttypes):
# Get a copy of the len-1 array
ra = self._arr1.copy()
result = OutputStructure(1, self.dtype)
# Fill it
ra[0] = tuple([self.cols[name][key] for name in self.names])
return ra[0]
result[0] = tuple([self.cols[name][key] for name in self.names])
return result.ra
# Slices
elif type(key) == slice:
(start, stop, step) = key.start, key.stop, key.step
Expand All @@ -1060,7 +1060,7 @@ def __getitem__(self, key):
# List of integers (case of fancy indexing), or list of column names
elif type(key) is list:
if len(key) == 0:
return np.empty(0, self.dtype)
return OutputStructure(0, self.dtype).ra
strlist = [type(v) for v in key] == [str for v in key]
# Range of column names
if strlist:
Expand All @@ -1072,15 +1072,16 @@ def __getitem__(self, key):
except:
raise IndexError(
"key cannot be converted to an array of indices")
return np.fromiter((self[i] for i in key),
dtype=self.dtype, count=len(key))
result = OutputStructure.fromindices(self, key)
return result.ra
# A boolean array (case of fancy indexing)
elif hasattr(key, "dtype"):
if key.dtype.type == np.bool_:
return self._where(key)
elif np.issubsctype(key, np.int_):
# An integer array
return np.array([self[i] for i in key], dtype=self.dtype)
result = OutputStructure.fromindices(self, key)
return result.ra
else:
raise IndexError(
"arrays used as indices must be integer (or boolean)")
Expand All @@ -1105,12 +1106,12 @@ def __getitem__(self, key):
(start, stop, step) = slice(start, stop, step).indices(self.len)
# Build a numpy container
n = utils.get_len_of_range(start, stop, step)
ra = np.empty(shape=(n,), dtype=self.dtype)
result = OutputStructure(n, self.dtype)
# Fill it
for name in self.names:
ra[name][:] = self.cols[name][start:stop:step]
result[name] = self.cols[name][start:stop:step]

return ra
return result.ra

def __setitem__(self, key, value):
"""Sets values based on `key`.
Expand Down Expand Up @@ -1247,7 +1248,221 @@ def __repr__(self):
return fullrepr


# Local Variables:

class OutputStructureEngine(object):
    """Flavor-dispatching base class for ctable output containers.

    Each public entry point builds a method name of the form
    ``_<operation>_<flavor>``, where ``<flavor>`` is the current value of
    ``bcolz.defaults.ctable_out_flavor``, and delegates to it.  When the
    class has no such method, `_fallback` raises ``NotImplementedError``.
    The per-flavor methods themselves are expected on subclasses.
    """

    # holds the return array
    ra = None

    # poor-man's FIFO cache, better would be a LRU cache from python3 or its
    # backport
    template_cache = {}
    template_order = []
    # flavor the cached templates were created for
    template_type = None
    template_maxsize = 10

    @classmethod
    def _push_to_cache(cls, key, value):
        """Store `value` under `key`, evicting the oldest entry when full."""
        if key in cls.template_cache:
            # Refresh in place: appending the key to `template_order` a
            # second time would make a later eviction delete it twice.
            cls.template_cache[key] = value
            return

        if len(cls.template_cache) >= cls.template_maxsize:
            # remove first inserted
            oldest = cls.template_order.pop(0)
            cls.template_cache.pop(oldest, None)
        cls.template_order.append(key)
        cls.template_cache[key] = value

    @classmethod
    def _try_cache(cls, key):
        """Return the template cached under `key`, or None on a miss.

        Cached templates are only valid for the flavor they were built
        for, so the cache is emptied whenever the configured flavor
        differs from the one recorded in `template_type`.
        """
        flavor = bcolz.defaults.ctable_out_flavor
        if cls.template_type != flavor:
            cls.template_cache.clear()
            del cls.template_order[:]
            cls.template_type = flavor
            return None

        return cls.template_cache.get(key)

    # dispatcher functions
    def __init__(self, size, dtype):
        """Allocate an output array and return it encapsulated in a class
        abstracting data access."""

        method = '_allocate_' + bcolz.defaults.ctable_out_flavor
        allocate = getattr(self, method, self._fallback)
        self.ra = allocate(size, dtype)

    @classmethod
    def fromindices(cls, ctable_, iter):
        """Create an output array from an iterator of row indices and return
        it encapsulated in a class abstracting data access."""

        method = '_fromindices_' + bcolz.defaults.ctable_out_flavor
        fromindices = getattr(cls, method, cls._fallback)
        return fromindices(ctable_, iter)

    @classmethod
    def fromboolarr(cls, ctable_, boolarr, colnames):
        """Create an output array from a boolean row selector array and
        return it encapsulated in a class abstracting data access."""

        method = '_fromboolarr_' + bcolz.defaults.ctable_out_flavor
        fromboolarr = getattr(cls, method, cls._fallback)
        return fromboolarr(ctable_, boolarr, colnames)

    def __setitem__(self, key, value):
        """Abstract data access to an output array."""

        method = '_setitem_' + bcolz.defaults.ctable_out_flavor
        setitem = getattr(self, method, self._fallback)
        return setitem(key, value)

    @classmethod
    def _fallback(cls, *args, **kwargs):
        """Raise NotImplementedError naming the missing flavor method.

        The operation name is taken from the calling dispatcher's function
        name; all arguments are ignored.
        """
        import inspect
        caller = inspect.stack()[1][3].strip('_')
        # __init__ is the dispatcher for the '_allocate_<flavor>' methods
        operation = 'allocate' if caller == 'init' else caller
        raise NotImplementedError('_%s_%s not implemented.' %
                                  (operation,
                                   bcolz.defaults.ctable_out_flavor)
                                  )


class OutputStructure(OutputStructureEngine):
    """Output containers for the 'numpy' and 'pandas' flavors.

    Provides the ``_allocate_*``, ``_fromindices_*``, ``_fromboolarr_*`` and
    ``_setitem_*`` methods that the base class dispatches to.  The wrapped
    container is available through the ``ra`` attribute.
    """

    ### numpy implementation ###
    @classmethod
    def _allocate_numpy(cls, size, dtype):
        """Return an uninitialised numpy array with `size` rows of `dtype`."""
        if size == 1:
            # only cache size-1 numpy arrays
            result = cls._try_cache(dtype)
            if result is None:
                result = np.empty(shape=(1,), dtype=dtype)
                cls._push_to_cache(dtype, result)

            # hand out a copy so the cached template is never written to
            return result.copy()

        else:
            return np.empty(size, dtype)

    @classmethod
    def _fromindices_numpy(cls, ctable_, iter):
        """Wrap a numpy array holding the rows of `ctable_` listed in `iter`.

        `iter` must support ``len()`` because the row count is passed to
        ``np.fromiter``.
        """
        # bypass __init__: the array is built directly, not pre-allocated
        result = object.__new__(cls)
        result.ra = np.fromiter((ctable_[i] for i in iter),
                                dtype=ctable_.dtype, count=len(iter))
        return result

    @classmethod
    def _fromboolarr_numpy(cls, ctable_, boolarr, colnames):
        """Wrap a numpy array with columns `colnames` of the rows selected
        by the boolean array `boolarr`."""
        result = object.__new__(cls)

        dtype = np.dtype([(name, ctable_.cols[name].dtype)
                          for name in colnames])
        cols = [ctable_.cols[name][boolarr] for name in colnames]
        result.ra = np.rec.fromarrays(cols, dtype=dtype).view(np.ndarray)
        return result

    def _setitem_numpy(self, key, value):
        """Assign one row (integer `key`) or the contents of the field
        selected by `key` (any other key)."""
        if isinstance(key, int):
            self.ra[key] = value
        else:
            self.ra[key][:] = value

    ### pandas implementation ###
    @classmethod
    def _allocate_pandas(cls, size, dtype):
        """Return a dataframe with `size` rows built from a cached template."""
        # cache templates of pandas dataframes for faster instantiation
        template = cls._try_cache(dtype)
        if template is None:
            template = pd.DataFrame(np.empty(shape=(0,), dtype=dtype))
            cls._push_to_cache(dtype, template)

        return allocate_like(template, size)

    @classmethod
    def _fromindices_pandas(cls, ctable_, iter):
        """Wrap a dataframe holding the rows of `ctable_` listed in `iter`.

        `iter` must support ``len()`` and is used to index every column.
        """
        result = object.__new__(cls)
        result.ra = cls._allocate_pandas(len(iter), ctable_.dtype)

        # every column of the table is returned, so iterate its own names
        for name in ctable_.names:
            result[name] = ctable_.cols[name][iter]

        return result

    @classmethod
    def _fromboolarr_pandas(cls, ctable_, boolarr, colnames):
        """Wrap a dataframe with columns `colnames` of the rows selected by
        the boolean array `boolarr`."""
        dtype = np.dtype([(name, ctable_.cols[name].dtype)
                          for name in colnames])
        result = object.__new__(cls)
        # the number of selected rows determines the frame length
        result.ra = cls._allocate_pandas(len(boolarr[boolarr]), dtype)

        for name in colnames:
            result[name] = ctable_.cols[name][boolarr]

        return result

    def _setitem_pandas(self, key, value):
        """Assign one row (integer `key`) or one column (any other key).

        Values are written straight into the arrays held by the frame's
        block manager (``self.ra._data``) instead of going through the
        public pandas indexing API.
        """
        if isinstance(key, int):
            # locate the block and in-block position of the first
            # len(value) columns, then write the row value by value
            blknos = self.ra._data._blknos[range(len(value))]
            blklocs = self.ra._data._blklocs[range(len(value))]
            for i, (blkno, blkloc) in enumerate(zip(blknos, blklocs)):
                self.ra._data.blocks[blkno].values[blkloc, key] = value[i]
        else:
            # efficiently setting pandas columns
            loc = self.ra._data.items.get_loc(key)
            blkno = self.ra._data._blknos[loc]
            blkloc = self.ra._data._blklocs[loc]
            self.ra._data.blocks[blkno].values[blkloc, :] = value


# avoid making pandas a requirement for bcolz
# needs a more polished solution eventually
try:
    # Index comes first so that it is bound even when one of the internal
    # import paths below is missing.
    from pandas import Index
    from pandas.core.internals import BlockManager
    from pandas.core.frame import DataFrame
    from pandas.core.common import CategoricalDtype
    from pandas.core.categorical import Categorical
except ImportError:
    pass

try:
    from pandas.core.index import RangeIndex
except ImportError:
    try:
        from pandas.core.index import Int64Index

        def RangeIndex(start, stop, step, **kwargs):
            """Stand-in for ``RangeIndex`` built on ``Int64Index``."""
            return Int64Index(np.arange(start, stop, step), **kwargs)
    except ImportError:
        pass


def allocate_like(df, size, keep_categories=False):
    """High-performance pandas dataframe constructor for numpy dtype
    columns + categoricals working from a template dataframe.

    This significantly speeds up dataframe instantiation for dataframes
    with only a few rows; gains for large dataframes are minimal.

    Parameters
    ----------
    df : DataFrame
        Template whose columns and block layout are replicated.
    size : int
        Number of rows of the new dataframe.
    keep_categories : bool
        If True, categorical columns reuse the template's categories;
        otherwise they start with an empty category index.

    Returns
    -------
    DataFrame
        A new dataframe with `size` rows whose values are uninitialised.
    """

    # define axes (ideally uses PR #9977 for MUCH better performance)
    axes = [df.columns.values.tolist(), RangeIndex(0, size, 1, fastpath=True)]

    # allocate and create blocks
    blocks = []
    for block in df._data.blocks:
        # special treatment for non-ordinary block types
        if isinstance(block.dtype, CategoricalDtype):
            if keep_categories:
                categories = block.values.categories
            else:
                categories = Index([])
            # allocate `size` codes (not the template's length) so the block
            # matches the row axis defined above
            values = Categorical(values=np.empty(shape=(size,),
                                                 dtype=block.values.codes.dtype),
                                 categories=categories,
                                 fastpath=True)
        # ordinary block types
        else:
            new_shape = (block.values.shape[0], size)
            values = np.empty(shape=new_shape, dtype=block.dtype)

        new_block = block.make_block_same_class(
            values=values, placement=block.mgr_locs.as_array)
        blocks.append(new_block)

    # create block manager
    mgr = BlockManager(blocks, axes)

    # create dataframe
    return DataFrame(mgr)


# mode: python
# tab-width: 4
# fill-column: 78
Expand Down
23 changes: 23 additions & 0 deletions bcolz/defaults.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from __future__ import absolute_import

import bcolz
import warnings


class Defaults(object):
Expand All @@ -22,6 +23,7 @@ def __init__(self):

# Choices setup
self.choices['eval_out_flavor'] = ("carray", "numpy")
self.choices['ctable_out_flavor'] = ("numpy", "pandas")
self.choices['eval_vm'] = ("numexpr", "python")

def check_choices(self, name, value):
Expand Down Expand Up @@ -70,6 +72,21 @@ def eval_out_flavor(self, value):
self.check_choices('eval_out_flavor', value)
self.__eval_out_flavor = value

@property
def ctable_out_flavor(self):
    """The currently configured output flavor for ctable reads."""
    return self.__ctable_out_flavor

@ctable_out_flavor.setter
def ctable_out_flavor(self, value):
    """Set the output flavor.

    A value outside the registered choices is still accepted, so that
    custom OutputStructure implementations can be plugged in; it only
    triggers a warning.
    """
    try:
        self.check_choices('ctable_out_flavor', value)
    except ValueError:
        # The default's name must be given as a string literal here: the
        # bare name `ctable_out_flavor` is not defined inside this method.
        warnings.warn(
            "'%s' is not implemented out of the box for '%s' default."
            % (value, 'ctable_out_flavor')
            + " Provide your own OutputStructure implementation.")
    self.__ctable_out_flavor = value

@property
def cparams(self):
return self.__cparams
Expand All @@ -90,6 +107,12 @@ def cparams(self, value):
'numpy'. Default is 'carray'.
"""

defaults.ctable_out_flavor = "numpy"
"""
The flavor for the output object returned when reading rows from a
`ctable`. It can be 'numpy' or 'pandas'. Default is 'numpy'.
"""

defaults.eval_vm = "numexpr" if bcolz.numexpr_here else "python"
"""
The virtual machine to be used in computations (via `eval`). It can
Expand Down