# Version 12
**2021/09/1-** *Update protocol implementation*

Changes:
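- `_buffers` added in `_from_dataframe_to_vaex`
- `allow_copy: bool = True` added to all the classes
- `get_buffers` now incorporates three `_get_*_buffer` methods:
    - `_get_data_buffer` for the data
    - `_get_validity_buffer` for the masks
    - `_get_offsets_buffer` for the offsets of variable-size data (still an open question: `get_buffers()` currently returns an empty `'offsets'` entry)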

Also added:
- correction of sentinel values in `convert_categorical_column`

In [1]:
# Display the value of every expression statement in a cell, not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
# Run the v12 implementation script in this notebook's namespace. The names used
# below without an import (np, pa, vaex, `_from_dataframe_to_vaex`,
# `buffer_to_ndarray`) presumably come from it -- TODO confirm.
%run vaex_implementation_v12.py

### Test data

In [3]:
# Small three-column frame (bool, int, float) without missing values.
x1 = np.array([True, True, False])
y1 = np.array([1, 2, 0])
z1 = np.array([9.2, 10.5, 11.8])
df1 = vaex.from_arrays(x=x1, y=y1, z=z1)
df1

#,x,y,z
0,True,1,9.2
1,True,2,10.5
2,False,0,11.8


In [4]:
# Inspect df1 through the object returned by `__dataframe__()`: the frame-level
# metadata, then for column 'x' the underlying column dtype (`_col.dtype`), the
# protocol dtype tuple (`dtype`), all buffers from `get_buffers()`, and the data
# buffer on its own.

df1.__dataframe__().metadata
df1.__dataframe__().get_column_by_name('x')._col.dtype
df1.__dataframe__().get_column_by_name('x').dtype
df1.__dataframe__().get_column_by_name('x').get_buffers()
df1.__dataframe__().get_column_by_name('x')._get_data_buffer()

{}

bool

(<_DtypeKind.BOOL: 20>, 8, '|b1', '|')

{'data': (VaexBuffer({'bufsize': 3, 'ptr': 2061009841488, 'device': 'CPU'}),
  (<_DtypeKind.BOOL: 20>, 8, '|b1', '|')),
 'validity': (VaexBuffer({'bufsize': 3, 'ptr': 2061009841872, 'device': 'CPU'}),
  (<_DtypeKind.BOOL: 20>, 8, '|b1', '|')),
 'offsets': {}}

(VaexBuffer({'bufsize': 3, 'ptr': 2061009841488, 'device': 'CPU'}),
 (<_DtypeKind.BOOL: 20>, 8, '|b1', '|'))

In [5]:
# Round-trip df1: build a new vaex DataFrame from its `__dataframe__()` object
# with `_from_dataframe_to_vaex`. The displayed frame should equal df1.
df1_new = _from_dataframe_to_vaex(df1.__dataframe__())
df1_new

#,x,y,z
0,True,1,9.2
1,True,2,10.5
2,False,0,11.8


In [6]:
# `_buffers` attached to the converted frame: the whole list (three entries for
# the three columns here), then the first entry on its own.

df1_new._buffers
df1_new._buffers[0]

[VaexBuffer({'bufsize': 3, 'ptr': 2061009841488, 'device': 'CPU'}),
 VaexBuffer({'bufsize': 12, 'ptr': 2061011946608, 'device': 'CPU'}),
 VaexBuffer({'bufsize': 24, 'ptr': 2061011947632, 'device': 'CPU'})]

VaexBuffer({'bufsize': 3, 'ptr': 2061009841488, 'device': 'CPU'})

In [7]:
# Decode the first stored buffer back into an array, using the protocol dtype
# tuple of column 'x'.
buffer_to_ndarray(df1_new._buffers[0], df1_new.__dataframe__().get_column_by_name('x').dtype)

array([ True,  True, False])

In [8]:
# Check the validity buffer of column 'x': `a` is the (buffer, dtype) pair stored
# under the "validity" key of `get_buffers()`.
# NOTE(review): 'x' has no missing values and the decoded mask is all False, so
# False presumably marks a valid entry here -- confirm the convention.

a = df1.__dataframe__().get_column_by_name('x').get_buffers()["validity"]
buffer_to_ndarray(a[0], a[1])

array([False, False, False])

### Test data 2

In [9]:
# Stand-alone Arrow dictionary array.
# NOTE(review): `indices`, `dictionary` and `dict_array` are not referenced by
# the frame built below or by any other visible cell.
indices = pa.array([0, 1, 2, 1, 2])
dictionary = pa.array(['foo', 'bar', 'baz'])
dict_array = pa.DictionaryArray.from_arrays(indices, dictionary)

# One column per combination of backend (NumPy / Arrow), value type and
# masked / unmasked, plus plain and masked Arrow dictionary columns.
the_df = vaex.from_arrays(
    numpy_int=np.array([1, 2, 3, 4, 0]), # Numpy int
    numpy_float=np.array([1.5, 2.5, 3.5, 4.5, 0]), # Numpy float
    numpy_bool=np.array([True, False, True, True, True]), # Numpy bool
    
    numpy_int_m=np.ma.array([1, 2, 3, 4, 0], mask=[0, 0, 0, 1, 1], dtype=int), # Numpy masked int
    numpy_float_m=np.ma.array([1.5, 2.5, 3.5, 4.5, 0], mask=[False, True, True, True, False], dtype=float), # Numpy masked float
    numpy_bool_m=np.ma.array([True, False, True, True, True], mask=[1, 0, 0, 1, 0], dtype=bool), # Numpy masked bool
    
    arrow_int = pa.array([0, 1, 2, 3, 0]), # Arrow integer
    arrow_float = pa.array([0.5, 1.5, 2.5, 3.5, 0.5]), # Arrow float
    arrow_bool = pa.array([True, False, True, False, True]), # Arrow bool
    
    arrow_int_m = pa.array([0, 1, 2, None, 0], mask=np.array([0, 0, 0, 1, 1], dtype=bool)), # Arrow masked integer
    arrow_float_m = pa.array([0.5, 1.5, 2.5, None, 0.5], mask=np.array([0, 0, 0, 1, 0], dtype=bool)), # Arrow masked float
    arrow_bool_m = pa.array([True, False, True, None, True], mask=np.array([0, 0, 1, 1, 0], dtype=bool)), # Arrow masked bool
    
    arrow_dict = pa.DictionaryArray.from_arrays(pa.array([0, 1, 2, 0, 1]), pa.array(['aap', 'noot', 'mies'])), # Arrow dictionary
    arrow_dict_m = pa.DictionaryArray.from_arrays(pa.array([0, 1, 2, 0, 1], mask=np.array([0, 1, 1, 0, 0], dtype=bool)), pa.array(['aap', 'noot', 'mies'])) # Arrow dictionary, masked
)

In [10]:
# Display the mixed-type frame; masked entries are rendered as `--`.
the_df

#,numpy_int,numpy_float,numpy_bool,numpy_int_m,numpy_float_m,numpy_bool_m,arrow_int,arrow_float,arrow_bool,arrow_int_m,arrow_float_m,arrow_bool_m,arrow_dict,arrow_dict_m
0,1,1.5,True,1,1.5,--,0,0.5,True,0,0.5,True,aap,aap
1,2,2.5,False,2,--,False,1,1.5,False,1,1.5,False,noot,--
2,3,3.5,True,3,--,True,2,2.5,True,2,2.5,--,mies,--
3,4,4.5,True,--,--,--,3,3.5,False,--,--,--,aap,aap
4,0,0.0,True,--,0.0,True,0,0.5,True,--,0.5,True,noot,noot


In [11]:
# Round-trip the mixed-type frame through `_from_dataframe_to_vaex`; the
# displayed result should equal `the_df`, including the masked entries.
the_df_new = _from_dataframe_to_vaex(the_df.__dataframe__())
the_df_new

#,numpy_int,numpy_float,numpy_bool,numpy_int_m,numpy_float_m,numpy_bool_m,arrow_int,arrow_float,arrow_bool,arrow_int_m,arrow_float_m,arrow_bool_m,arrow_dict,arrow_dict_m
0,1,1.5,True,1,1.5,--,0,0.5,True,0,0.5,True,aap,aap
1,2,2.5,False,2,--,False,1,1.5,False,1,1.5,False,noot,--
2,3,3.5,True,3,--,True,2,2.5,True,2,2.5,--,mies,--
3,4,4.5,True,--,--,--,3,3.5,False,--,--,--,aap,aap
4,0,0.0,True,--,0.0,True,0,0.5,True,--,0.5,True,noot,noot


In [12]:
# Frame-level metadata, then the protocol dtype tuple (`.dtype`) next to the
# underlying column dtype (`._col.dtype`) for a dictionary column, a masked
# dictionary column and a plain integer column.
the_df.__dataframe__().metadata
the_df.__dataframe__().get_column_by_name('arrow_dict').dtype
the_df.__dataframe__().get_column_by_name('arrow_dict')._col.dtype
the_df.__dataframe__().get_column_by_name('arrow_dict_m').dtype
the_df.__dataframe__().get_column_by_name('arrow_dict_m')._col.dtype
the_df.__dataframe__().get_column_by_name('numpy_int').dtype
the_df.__dataframe__().get_column_by_name('numpy_int')._col.dtype

{}

(<_DtypeKind.CATEGORICAL: 23>, 64, 'u', '=')

dictionary<values=string, indices=int64, ordered=0>

(<_DtypeKind.CATEGORICAL: 23>, 64, 'u', '=')

dictionary<values=string, indices=int64, ordered=0>

(<_DtypeKind.INT: 0>, 32, '<i4', '=')

int32

In [13]:
# `_get_data_buffer()` returns a (buffer, dtype) pair; for the dictionary column
# it is unpacked here as the codes buffer and the dtype tuple of the codes.
col = the_df.__dataframe__().get_column_by_name('arrow_dict')
codes_buffer, codes_dtype = col._get_data_buffer()
codes_buffer, codes_dtype

(VaexBuffer({'bufsize': 40, 'ptr': 5255416780224, 'device': 'CPU'}),
 (<_DtypeKind.INT: 0>, 64, '<i8', '='))

In [14]:
# Buffers attached to the converted frame (14 entries for the 14 columns).
the_df_new._buffers

[VaexBuffer({'bufsize': 20, 'ptr': 2061011945968, 'device': 'CPU'}),
 VaexBuffer({'bufsize': 40, 'ptr': 2061004051008, 'device': 'CPU'}),
 VaexBuffer({'bufsize': 5, 'ptr': 2061009841472, 'device': 'CPU'}),
 VaexBuffer({'bufsize': 20, 'ptr': 2061011946000, 'device': 'CPU'}),
 VaexBuffer({'bufsize': 40, 'ptr': 2061004050576, 'device': 'CPU'}),
 VaexBuffer({'bufsize': 5, 'ptr': 2061009842400, 'device': 'CPU'}),
 VaexBuffer({'bufsize': 40, 'ptr': 5255416779456, 'device': 'CPU'}),
 VaexBuffer({'bufsize': 40, 'ptr': 5255416779584, 'device': 'CPU'}),
 VaexBuffer({'bufsize': 5, 'ptr': 2061009841520, 'device': 'CPU'}),
 VaexBuffer({'bufsize': 40, 'ptr': 5255416779840, 'device': 'CPU'}),
 VaexBuffer({'bufsize': 40, 'ptr': 5255416779968, 'device': 'CPU'}),
 VaexBuffer({'bufsize': 5, 'ptr': 2061009842176, 'device': 'CPU'}),
 VaexBuffer({'bufsize': 40, 'ptr': 5255416780224, 'device': 'CPU'}),
 VaexBuffer({'bufsize': 40, 'ptr': 5255416780544, 'device': 'CPU'})]

### Test data 3

In [15]:
# Numeric data with ordinal_encode and categorize test
colors = ['red', 'blue', 'green', 'blue']
ds = vaex.from_arrays(
    colors=colors, 
    year=[2012, 2013, 2015, 2019], 
    weekday=[0, 1, 4, 6], 
    x = [1, 2, 3, 4], 
    y = [1.5, 2.5, 3.5, 4.5])
# Encode 'colors' as integer codes in the given order (red=0, green=1, blue=2).
df = ds.ordinal_encode('colors', ['red', 'green', 'blue'])
# Mark 'year' (by value range) and 'weekday' (by explicit labels) as categorical.
df = df.categorize('year', min_value=2012, max_value=2019)
df = df.categorize('weekday', labels=['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'])
df

#,year,weekday,x,y,colors
0,2012,0,1,1.5,0
1,2013,1,2,2.5,2
2,2015,4,3,3.5,1
3,2019,6,4,4.5,2


In [16]:
# For each categorical column: the protocol dtype tuple (`.dtype`) followed by
# the underlying column dtype (`._col.dtype`).
df.__dataframe__().get_column_by_name('colors').dtype
df.__dataframe__().get_column_by_name('colors')._col.dtype
df.__dataframe__().get_column_by_name('year').dtype
df.__dataframe__().get_column_by_name('year')._col.dtype
df.__dataframe__().get_column_by_name('weekday').dtype
df.__dataframe__().get_column_by_name('weekday')._col.dtype

(<_DtypeKind.CATEGORICAL: 23>, 64, 'u', '=')

uint64

(<_DtypeKind.CATEGORICAL: 23>, 64, 'u', '=')

int32

(<_DtypeKind.CATEGORICAL: 23>, 64, 'u', '=')

int32

In [17]:
# Round-trip the categorical frame. The converted frame shows labels
# ('Mon', 'red', ...) for 'weekday' and 'colors' where `df` showed integer codes.
df2 = _from_dataframe_to_vaex(df.__dataframe__())
df2

#,year,weekday,x,y,colors
0,2012,Mon,1,1.5,red
1,2013,Tue,2,2.5,blue
2,2015,Fri,3,3.5,green
3,2019,Sun,4,4.5,blue


In [18]:
# Data buffer of the ordinal-encoded 'colors' column, unpacked into the buffer
# and its dtype tuple.
col1 = df.__dataframe__().get_column_by_name('colors')
codes_buffer1, codes_dtype1 = col1._get_data_buffer()
codes_buffer1, codes_dtype1

(VaexBuffer({'bufsize': 32, 'ptr': 2061013108752, 'device': 'CPU'}),
 (<_DtypeKind.UINT: 1>, 64, '<u8', '='))

In [19]:
# Protocol dtype tuples of the three categorical columns after the round trip.
df2.__dataframe__().get_column_by_name('year').dtype
df2.__dataframe__().get_column_by_name('weekday').dtype
df2.__dataframe__().get_column_by_name('colors').dtype

(23, 64, '|O', '=')

(23, 64, '|O', '=')

(23, 64, '|O', '=')

### Test data 4

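Chunked data: an Arrow table built from five copies of the same record batch, so each column consists of five chunks.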

In [20]:
# Three Arrow arrays (int, float with a null, bool with a null) form one
# record batch.
data = [
     pa.array([1, 2, 3, 4]),
     pa.array([1.5, 2.5, None, 4.5]),
     pa.array([True, None, False, True])
]
batch = pa.RecordBatch.from_arrays(data, ['f0', 'f1', 'f2'])
# Repeat the same batch five times so the table's columns are chunked.
batches = [batch] * 5
table = pa.Table.from_batches(batches)
# First column of the table.
c = table[0]

# Number of chunks in that column.
c.num_chunks

5

In [21]:
# Build a vaex DataFrame from the chunked Arrow table.
vc = vaex.from_arrow_table(table)
vc

#,f0,f1,f2
0,1,1.5,True
1,2,2.5,--
2,3,--,False
3,4,4.5,True
4,1,1.5,True
...,...,...,...
15,4,4.5,True
16,1,1.5,True
17,2,2.5,--
18,3,--,False


In [22]:
# Round-trip the chunked frame through `_from_dataframe_to_vaex` and display the
# result; it should equal `vc`.
vc_new = _from_dataframe_to_vaex(vc.__dataframe__())
vc_new

#,f0,f1,f2
0,1,1.5,True
1,2,2.5,--
2,3,--,False
3,4,4.5,True
4,1,1.5,True
...,...,...,...
15,4,4.5,True
16,1,1.5,True
17,2,2.5,--
18,3,--,False


In [23]:
# `_buffers` of the converted chunked frame. The output is a nested list --
# apparently one inner list of three buffers per chunk; TODO confirm.
vc_new._buffers

[[VaexBuffer({'bufsize': 32, 'ptr': 5255416781824, 'device': 'CPU'}),
  VaexBuffer({'bufsize': 32, 'ptr': 5255416781696, 'device': 'CPU'}),
  VaexBuffer({'bufsize': 4, 'ptr': 2061009841984, 'device': 'CPU'})],
 [VaexBuffer({'bufsize': 32, 'ptr': 5255416781824, 'device': 'CPU'}),
  VaexBuffer({'bufsize': 32, 'ptr': 5255416781696, 'device': 'CPU'}),
  VaexBuffer({'bufsize': 4, 'ptr': 2061009843840, 'device': 'CPU'})],
 [VaexBuffer({'bufsize': 32, 'ptr': 5255416781824, 'device': 'CPU'}),
  VaexBuffer({'bufsize': 32, 'ptr': 5255416781696, 'device': 'CPU'}),
  VaexBuffer({'bufsize': 4, 'ptr': 2061009844128, 'device': 'CPU'})],
 [VaexBuffer({'bufsize': 32, 'ptr': 5255416781824, 'device': 'CPU'}),
  VaexBuffer({'bufsize': 32, 'ptr': 5255416781696, 'device': 'CPU'}),
  VaexBuffer({'bufsize': 4, 'ptr': 2061009844144, 'device': 'CPU'})],
 [VaexBuffer({'bufsize': 32, 'ptr': 5255416781824, 'device': 'CPU'}),
  VaexBuffer({'bufsize': 32, 'ptr': 5255416781696, 'device': 'CPU'}),
  VaexBuffer({'bufsi