In [1]:
from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every bare expression in a cell, not only the last one;
# later cells rely on this to display several values from a single cell.
InteractiveShell.ast_node_interactivity = "all"

In [2]:
# Execute the implementation script in this notebook's namespace. The cells
# below use np, pa, vaex, _VaexBuffer and _VaexColumn without importing them,
# so they are expected to be provided by this script (verify on a fresh kernel).
%run vaex_implementation_v13.py

# Testing _VaexBuffer

In [3]:
# Five evenly spaced float64 values in [0, 1], used as the backing array for
# the _VaexBuffer tests. The values are built explicitly: np.ndarray(shape=...)
# only allocates memory without initialising it, so its contents are arbitrary
# and differ between kernel sessions.
x = np.linspace(0, 1, 5)

In [4]:
# Inspect the test array. Each bare expression below is echoed as its own
# output because ast_node_interactivity is set to "all" in the first cell.
x  # the array itself
x.strides  # bytes to step per element along each dimension
x.dtype.itemsize  # size of one element in bytes
x.copy()  # an independent copy of the array

array([0.  , 0.25, 0.5 , 0.75, 1.  ])

(8,)

8

array([0.  , 0.25, 0.5 , 0.75, 1.  ])

In [5]:
# Wrap the NumPy array in the buffer class under test and echo its repr.
x_buffer = _VaexBuffer(x)
x_buffer

VaexBuffer({'bufsize': 40, 'ptr': 2319359411344, 'device': 'CPU'})

In [6]:
import pytest

# Reference values taken straight from the backing NumPy array.
expected_bufsize = 5*x.itemsize
expected_ptr = x.__array_interface__['data'][0]

# Size in bytes and start address must mirror the wrapped array.
assert x_buffer.bufsize == expected_bufsize
assert x_buffer.ptr == expected_ptr

# The repr reports the same size/pointer and a CPU device.
assert x_buffer.__repr__() == f"VaexBuffer({{'bufsize': {5*x.itemsize}, 'ptr': {x.__array_interface__['data'][0]}, 'device': 'CPU'}})"
assert x_buffer.__dlpack_device__() == (1, None)

# DLPack export is expected to be unimplemented for now.
with pytest.raises(NotImplementedError):
    assert x_buffer.__dlpack__()

# Testing _VaexColumn methods

In [7]:
# Test data
# NOTE(review): indices, dictionary and dict_array are not referenced by any
# cell visible in this notebook section -- confirm whether they are still needed.
indices = pa.array([0, 1, 2, 1, 2])
dictionary = pa.array(['foo', 'bar', 'baz'])
dict_array = pa.DictionaryArray.from_arrays(indices, dictionary)

# One column per backing-array flavour (NumPy / Arrow, plain / masked,
# dictionary-encoded). The keyword order defines the column order that the
# positional assertions further down rely on.
the_df = vaex.from_arrays(
    numpy_int=np.array([1, 2, 3, 4, 0]), # Numpy int
    numpy_float=np.array([1.5, 2.5, 3.5, 4.5, 0]), # Numpy float
    numpy_bool=np.array([True, False, True, True, True]), # Numpy bool
    
    numpy_int_m=np.ma.array([1, 2, 3, 4, 0], mask=[0, 0, 0, 1, 1], dtype=int), # Numpy masked int
    numpy_float_m=np.ma.array([1.5, 2.5, 3.5, 4.5, 0], mask=[False, True, True, True, False], dtype=float), # Numpy masked float
    numpy_bool_m=np.ma.array([True, False, True, True, True], mask=[1, 0, 0, 1, 0], dtype=bool), # Numpy masked bool
    
    arrow_int = pa.array([0, 1, 2, 3, 0]), # Arrow integer
    arrow_float = pa.array([0.5, 1.5, 2.5, 3.5, 0.5]), # Arrow float
    arrow_bool = pa.array([True, False, True, False, True]), # Arrow bool
    
    arrow_int_m = pa.array([0, 1, 2, None, 0], mask=np.array([0, 0, 0, 1, 1], dtype=bool)), # Arrow masked integer
    arrow_float_m = pa.array([0.5, 1.5, 2.5, None, 0.5], mask=np.array([0, 0, 0, 1, 0], dtype=bool)), # Arrow masked float
    arrow_bool_m = pa.array([True, False, True, None, True], mask=np.array([0, 0, 1, 1, 0], dtype=bool)), # Arrow masked bool
    
    arrow_dict = pa.DictionaryArray.from_arrays(pa.array([0, 1, 2, 0, 1]), pa.array(['aap', 'noot', 'mies'])), # Arrow dictionary
    arrow_dict_m = pa.DictionaryArray.from_arrays(pa.array([0, 1, 2, 0, 1], mask=np.array([0, 1, 1, 0, 0], dtype=bool)), pa.array(['aap', 'noot', 'mies'])) # Arrow dictionary, masked
)
# Separate single-column frame with a masked string column that is
# ordinal-encoded below. NOTE(review): add_df is not used by any visible cell,
# and the value 'MISSING' is not among the categories passed to ordinal_encode.
add_df = vaex.from_arrays(
    vaex_cat = np.ma.array(['red', 'green', 'blue', 'green', 'MISSING'], mask=[False, False, True, False, False]) # Vaex dictionary
)
add_df = add_df.ordinal_encode('vaex_cat', ['red', 'green', 'blue'])
# Echo the main test frame.
the_df


#,numpy_int,numpy_float,numpy_bool,numpy_int_m,numpy_float_m,numpy_bool_m,arrow_int,arrow_float,arrow_bool,arrow_int_m,arrow_float_m,arrow_bool_m,arrow_dict,arrow_dict_m
0,1,1.5,True,1,1.5,--,0,0.5,True,0,0.5,True,aap,aap
1,2,2.5,False,2,--,False,1,1.5,False,1,1.5,False,noot,--
2,3,3.5,True,3,--,True,2,2.5,True,2,2.5,--,mies,--
3,4,4.5,True,--,--,--,3,3.5,False,--,--,--,aap,aap
4,0,0.0,True,--,0.0,True,0,0.5,True,--,0.5,True,noot,noot


In [50]:
# test arrow masked float

column = the_df.arrow_float_m
# Bind the wrapper to `vaex_c`, the name every assertion below uses.
vaex_c = _VaexColumn(column)

# The wrapper keeps the original expression and allows copies.
assert vaex_c._col.tolist() == column.tolist()
assert vaex_c._allow_copy == True

assert vaex_c.size == 5
assert vaex_c.offset == 0

assert vaex_c.dtype[0]==2 # 2: float kind
assert vaex_c.dtype[1] == 64 # 64: bit width of float64
assert vaex_c.dtype == (2, 64, '<f8', '=')

# A non-categorical column must refuse to describe categories.
with pytest.raises(TypeError):
    assert vaex_c.describe_categorical

# NOTE(review): (3, 1) presumably means "bit mask, null flagged by 1" per the
# dataframe interchange protocol -- confirm against the implementation.
assert vaex_c.describe_null == (3, 1)

In [51]:
# test arrow dict (unmasked Arrow dictionary column)

column2 = the_df.arrow_dict
vaex_c2 = _VaexColumn(column2)

# Values and length must match the source expression.
assert vaex_c2._col.tolist() == column2.tolist()
assert vaex_c2.size == 5

# dtype tuple; the first two entries are the kind code and the bit width.
expected_dtype = (23, 64, 'u', '=')
assert vaex_c2.dtype[0] == expected_dtype[0]  # 23: categorical
assert vaex_c2.dtype[1] == expected_dtype[1]
assert vaex_c2.dtype == expected_dtype

# Category mapping: code -> label, in dictionary order.
expected_categories = {0: 'aap', 1: 'noot', 2: 'mies'}
assert vaex_c2.describe_categorical == (False, True, expected_categories)
assert vaex_c2.describe_null == (3, 1)

In [52]:
# test object dtype / not supported
# Not implemented for now, because an object column falls under 'O': _k.STRING.

df = vaex.from_arrays(a=np.array([None, True, False]))
column3 = df.a
vaex_c3 = _VaexColumn(column3)

# Wrapping itself works; values and length are passed through.
assert vaex_c3._col.tolist() == column3.tolist()
assert vaex_c3.size == 3

# Both type-dependent properties must raise for this column.
for attr in ('dtype', 'describe_null'):
    with pytest.raises(NotImplementedError):
        assert getattr(vaex_c3, attr)

In [53]:
# test string column / not handled yet

df2 = vaex.from_dict({"A": ["a", "b", "cdef", "", "g"]})
column4 = df2.A
vaex_c4 = _VaexColumn(column4)

# Wrapping itself works; values and length are passed through.
assert vaex_c4._col.tolist() == column4.tolist()
assert vaex_c4.size == 5

# Both type-dependent properties must raise for string data.
for attr in ('dtype', 'describe_null'):
    with pytest.raises(NotImplementedError):
        assert getattr(vaex_c4, attr)

# Testing _VaexDataFrame methods

In [101]:
# Build the interchange wrapper once and reuse it for all checks on the_df.
the_dfx = the_df.__dataframe__()

# Basic shape / metadata of the wrapped frame.
assert the_dfx._allow_copy == True
assert the_dfx.num_columns() == 14
assert the_dfx.num_rows() == 5
assert the_dfx.num_chunks() == 1

# Column lookup by position and by name.
assert df2.__dataframe__().column_names() == ['A']
assert the_dfx.get_column(1)._col.tolist() == the_df.numpy_float.tolist()
assert the_dfx.get_column_by_name('arrow_dict')._col.tolist() == the_df.arrow_dict.tolist()

# Every wrapped column must carry the same values as its source expression.
for col in the_dfx.get_columns():
    source_values = the_df[col._col.expression].tolist()
    assert col._col.tolist() == source_values

# Echo the frame restricted to three selected columns.
the_dfx.select_columns_by_name(('numpy_bool', 'arrow_int_m', 'arrow_dict'))._df

#,numpy_bool,arrow_int_m,arrow_dict
0,True,0,aap
1,False,1,noot
2,True,2,mies
3,True,--,aap
4,True,--,noot
