In [1]:
import numpy as np
import pandas as pd
import pyarrow as pa
import vaex

# Execute the prototype implementations inside this kernel's namespace.
# Names used later but never defined in this notebook (e.g. `_VaexColumn`,
# `_from_dataframe_to_vaex`, `_PandasColumn`, `from_dataframe`) are presumably
# provided by these scripts -- TODO confirm which script defines which name.
%run vaex_implementation_v13.py
%run pandas_implementation.py

In [2]:
from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every bare expression in a cell, not only the last one.
# The attribute-inspection cells below rely on this to show all their results.
InteractiveShell.ast_node_interactivity = "all"

# NumPy format string research

In [3]:
# Case 1: build the dtype from the NumPy scalar type itself (no format string involved).
dt1 = np.dtype(np.int8) 
dt1

dtype('int8')

In [4]:
# Inspect the pieces of the int8 dtype; each bare expression is echoed below, in order.
dt1.byteorder  # '|' is reported for this one-byte type
dt1.itemsize  # element size in bytes
dt1.name
dt1.kind  # one-character kind code
dt1.type is np.int8  # the scalar type the dtype was built from

'|'

1

'int8'

'i'

True

In [5]:
# Case 2: build the dtype from a format string: '>' byte order, 'i' kind, 4-byte items
# (matching the byteorder / kind / itemsize values echoed by the following cell).
dt2 = np.dtype('>i4')
dt2

dtype('>i4')

In [6]:
# Same inspection for the '>i4' dtype; each bare expression is echoed below, in order.
dt2.byteorder  # '>' is kept exactly as given in the format string
dt2.itemsize  # element size in bytes
dt2.name
dt2.kind  # one-character kind code
dt2.type is np.int32  # a 4-byte 'i' dtype resolves to the int32 scalar type

'>'

4

'int32'

'i'

True

In [7]:
# Case 3: boolean dtype from the format string '|b1'; note that the repr below
# is normalised to dtype('bool') rather than echoing the format string back.
dt3 = np.dtype('|b1')
dt3

dtype('bool')

In [8]:
# Same inspection for the boolean dtype; each bare expression is echoed below, in order.
dt3.byteorder  # '|' again, as for the one-byte int8 dtype
dt3.itemsize  # element size in bytes
dt3.name
dt3.kind  # one-character kind code
dt3.type is np.bool_

'|'

1

'bool'

'b'

True

# Vaex

In [9]:
# NOTE(review): `indices`, `dictionary` and `dict_array` are not used by any visible cell;
# the arrow_dict columns below build their own DictionaryArray inline. Candidates for removal.
indices = pa.array([0, 1, 2, 1, 2])
dictionary = pa.array(['foo', 'bar', 'baz'])
dict_array = pa.DictionaryArray.from_arrays(indices, dictionary)

# Test frame with one column per combination of backing array (NumPy / Arrow),
# value type (int / float / bool / dictionary) and masked vs. unmasked data.
the_df = vaex.from_arrays(
    numpy_int=np.array([1, 2, 3, 4, 0]), # Numpy int
    numpy_float=np.array([1.5, 2.5, 3.5, 4.5, 0]), # Numpy float
    numpy_bool=np.array([True, False, True, True, True]), # Numpy bool
    
    numpy_int_m=np.ma.array([1, 2, 3, 4, 0], mask=[0, 0, 0, 1, 1], dtype=int), # Numpy masked int
    numpy_float_m=np.ma.array([1.5, 2.5, 3.5, 4.5, 0], mask=[False, True, True, True, False], dtype=float), # Numpy masked float
    numpy_bool_m=np.ma.array([True, False, True, True, True], mask=[1, 0, 0, 1, 0], dtype=bool), # Numpy masked bool
    
    arrow_int = pa.array([0, 1, 2, 3, 0]), # Arrow integer
    arrow_float = pa.array([0.5, 1.5, 2.5, 3.5, 0.5]), # Arrow float
    arrow_bool = pa.array([True, False, True, False, True]), # Arrow bool
    
    arrow_int_m = pa.array([0, 1, 2, None, 0], mask=np.array([0, 0, 0, 1, 1], dtype=bool)), # Arrow masked integer
    arrow_float_m = pa.array([0.5, 1.5, 2.5, None, 0.5], mask=np.array([0, 0, 0, 1, 0], dtype=bool)), # Arrow masked float
    arrow_bool_m = pa.array([True, False, True, None, True], mask=np.array([0, 0, 1, 1, 0], dtype=bool)), # Arrow masked bool
    
    arrow_dict = pa.DictionaryArray.from_arrays(pa.array([0, 1, 2, 0, 1]), pa.array(['aap', 'noot', 'mies'])), # Arrow dictionary
    arrow_dict_m = pa.DictionaryArray.from_arrays(pa.array([0, 1, 2, 0, 1], mask=np.array([0, 1, 1, 0, 0], dtype=bool)), pa.array(['aap', 'noot', 'mies'])) # Arrow dictionary with masked indices
)
# NOTE(review): `add_df` is built and ordinal-encoded here but is never joined into `the_df`
# nor used by any later visible cell, so the 'vaex_cat' column is not exercised below.
add_df = vaex.from_arrays(
    vaex_cat = np.ma.array(['red', 'green', 'blue', 'green', 'MISSING'], mask=[False, False, True, False, False]) # Vaex ordinal-encoded categorical
)
add_df = add_df.ordinal_encode('vaex_cat', ['red', 'green', 'blue'])
# Display the frame (only five rows, so showing it in full is fine).
the_df

#,numpy_int,numpy_float,numpy_bool,numpy_int_m,numpy_float_m,numpy_bool_m,arrow_int,arrow_float,arrow_bool,arrow_int_m,arrow_float_m,arrow_bool_m,arrow_dict,arrow_dict_m
0,1,1.5,True,1,1.5,--,0,0.5,True,0,0.5,True,aap,aap
1,2,2.5,False,2,--,False,1,1.5,False,1,1.5,False,noot,--
2,3,3.5,True,3,--,True,2,2.5,True,2,2.5,--,mies,--
3,4,4.5,True,--,--,--,3,3.5,False,--,--,--,aap,aap
4,0,0.0,True,--,0.0,True,0,0.5,True,--,0.5,True,noot,noot


In [10]:
# Report, one line per column of the_df, the dtype tuple exposed by the
# currently loaded `_VaexColumn` wrapper.
for name in the_df.get_column_names():
    print(_VaexColumn(the_df[name]).dtype)

(<_DtypeKind.INT: 0>, 32, '<i4', '=')
(<_DtypeKind.FLOAT: 2>, 64, '<f8', '=')
(<_DtypeKind.BOOL: 20>, 8, '|b1', '|')
(<_DtypeKind.INT: 0>, 32, '<i4', '=')
(<_DtypeKind.FLOAT: 2>, 64, '<f8', '=')
(<_DtypeKind.BOOL: 20>, 8, '|b1', '|')
(<_DtypeKind.INT: 0>, 64, '<i8', '=')
(<_DtypeKind.FLOAT: 2>, 64, '<f8', '=')
(<_DtypeKind.BOOL: 20>, 8, '|b1', '|')
(<_DtypeKind.INT: 0>, 64, '<i8', '=')
(<_DtypeKind.FLOAT: 2>, 64, '<f8', '=')
(<_DtypeKind.BOOL: 20>, 8, '|b1', '|')
(<_DtypeKind.CATEGORICAL: 23>, 64, 'u', '=')
(<_DtypeKind.CATEGORICAL: 23>, 64, 'u', '=')


In [11]:
# Load the v14 implementation over the earlier one. The dtype report that follows
# shows the format strings changing from NumPy-style codes ('<i4', '<f8', '|b1')
# to single-character codes ('i', 'g', 'b'), so `_VaexColumn` is presumably rebound here.
%run vaex_implementation_v14.py

In [12]:
# Repeat the per-column dtype report so it can be compared line by line
# with the report produced before the implementation was reloaded.
column_names = the_df.get_column_names()
for name in column_names:
    wrapped = _VaexColumn(the_df[name])
    print(wrapped.dtype)

(<_DtypeKind.INT: 0>, 32, 'i', '=')
(<_DtypeKind.FLOAT: 2>, 64, 'g', '=')
(<_DtypeKind.BOOL: 20>, 8, 'b', '|')
(<_DtypeKind.INT: 0>, 32, 'i', '=')
(<_DtypeKind.FLOAT: 2>, 64, 'g', '=')
(<_DtypeKind.BOOL: 20>, 8, 'b', '|')
(<_DtypeKind.INT: 0>, 64, 'l', '=')
(<_DtypeKind.FLOAT: 2>, 64, 'g', '=')
(<_DtypeKind.BOOL: 20>, 8, 'b', '|')
(<_DtypeKind.INT: 0>, 64, 'l', '=')
(<_DtypeKind.FLOAT: 2>, 64, 'g', '=')
(<_DtypeKind.BOOL: 20>, 8, 'b', '|')
(<_DtypeKind.CATEGORICAL: 23>, 64, 'u', '=')
(<_DtypeKind.CATEGORICAL: 23>, 64, 'u', '=')


In [13]:
# Round trip: export the_df through __dataframe__() and rebuild a frame from that
# object. The displayed result should match the_df itself, shown in the next cell.
_from_dataframe_to_vaex(the_df.__dataframe__())

#,numpy_int,numpy_float,numpy_bool,numpy_int_m,numpy_float_m,numpy_bool_m,arrow_int,arrow_float,arrow_bool,arrow_int_m,arrow_float_m,arrow_bool_m,arrow_dict,arrow_dict_m
0,1,1.5,True,1,1.5,--,0,0.5,True,0,0.5,True,aap,aap
1,2,2.5,False,2,--,False,1,1.5,False,1,1.5,False,noot,--
2,3,3.5,True,3,--,True,2,2.5,True,2,2.5,--,mies,--
3,4,4.5,True,--,--,--,3,3.5,False,--,--,--,aap,aap
4,0,0.0,True,--,0.0,True,0,0.5,True,--,0.5,True,noot,noot


In [14]:
# Original frame, displayed for a visual comparison with the round-tripped frame above.
the_df

#,numpy_int,numpy_float,numpy_bool,numpy_int_m,numpy_float_m,numpy_bool_m,arrow_int,arrow_float,arrow_bool,arrow_int_m,arrow_float_m,arrow_bool_m,arrow_dict,arrow_dict_m
0,1,1.5,True,1,1.5,--,0,0.5,True,0,0.5,True,aap,aap
1,2,2.5,False,2,--,False,1,1.5,False,1,1.5,False,noot,--
2,3,3.5,True,3,--,True,2,2.5,True,2,2.5,--,mies,--
3,4,4.5,True,--,--,--,3,3.5,False,--,--,--,aap,aap
4,0,0.0,True,--,0.0,True,0,0.5,True,--,0.5,True,noot,noot


# Pandas

In [15]:
# Small pandas frame with one column per dtype kind checked in the next cell:
# integer ('a'), categorical ('B'), float ('c') and object-dtype strings ('D').
# The dtypes are set in a single astype() call so the frame is built in one
# expression and the cell stays idempotent on re-run.
dfp = pd.DataFrame(
    {
        "a": [1, 2, 3],
        "B": [3, 4, 5],
        "c": [1.5, 2.5, 3.5],
        "D": ["a", "b", "cdef"],
    }
).astype({"B": "category", "D": "object"})
# Pass the frame through from_dataframe and display the result.
from_dataframe(dfp)

Unnamed: 0,a,B,c,D
0,1,3,1.5,a
1,2,4,2.5,b
2,3,5,3.5,cdef


In [17]:
# Expected format string (element 2 of the dtype tuple) for each column of dfp.
# NOTE(review): these look like Apache Arrow C Data Interface codes ('l' int64,
# 'g' float64, 'u' utf-8 string, 'U' large utf-8 string) -- confirm against the
# pandas implementation script, in particular 'U' for the categorical column.
format_strings = {'a': 'l', 'B': 'U', 'c': 'g', 'D': 'u'}
for col in dfp.columns:
    column = _PandasColumn(dfp[col])
    print(column.dtype)
    actual_format = column.dtype[2]
    # Name the offending column so a failure is diagnosable without re-running.
    assert actual_format == format_strings[col], (
        f"column {col!r}: expected format string {format_strings[col]!r}, "
        f"got {actual_format!r}"
    )

(<_DtypeKind.INT: 0>, 64, 'l', '=')
(23, 64, 'U', '=')
(<_DtypeKind.FLOAT: 2>, 64, 'g', '=')
(<_DtypeKind.STRING: 21>, 8, 'u', '=')
