# Version 13
**2021/09/03**

Update:
- **optimize sentinel values check/correction**<br>
Instead of using a Pandas Series, a mask is generated and applied to the Arrow array.
- **select_columns()**<br>
Using `select_columns_by_name`.
- **optimize _get_data_buffer**<br>
Using `self._col.index_values()` to get the codes from the Arrow dictionary. Previously, the indices were first transferred to a Vaex dataframe and then to the buffer.
- Tried to optimize `_get_data_buffer()` for the case of `categorize` in Vaex; left it as it is.

In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
%run pandas_implementation.py
%run vaex_implementation_v13.py

## Sentinel value
Correction of sentinel value in convert_categorical_column

In [3]:
# Integer column "A" plus a categorical copy "B"; the second entry of "B"
# is then nulled so the categorical codes contain a missing value.
df = pd.DataFrame({"A": [1, 2, 5, 1]})
df = df.assign(B=df["A"].astype("category"))
df.loc[1, "B"] = np.nan

In [4]:
df

Unnamed: 0,A,B
0,1,1.0
1,2,
2,5,5.0
3,1,1.0


In [5]:
# Research how the sentinel values are stored
col = df.__dataframe__().get_column_by_name('B')
col.describe_null

codes_buffer, codes_dtype = col.get_buffers()["data"]
codes = buffer_to_ndarray(codes_buffer, codes_dtype)
codes
codes.dtype

(2, -1)

array([ 0, -1,  2,  0], dtype=int8)

dtype('int8')

In [6]:
# The easiest way would be to construct a mask for arrow array
sentinel = [col.describe_null[1]] * col.size
sentinel

[-1, -1, -1, -1]

In [7]:
# True/1 where sentinel value appears in the codes
mask = codes == sentinel
mask

array([False,  True, False, False])

In [8]:
# Constructing arrow array with mask
pa.array(codes, mask=mask)
pa.array(codes, mask=mask).type

<pyarrow.lib.Int8Array object at 0x0000029C76A5E7C0>
[
  0,
  null,
  2,
  0
]

DataType(int8)

In [9]:
# Try the method
from_dataframe_to_vaex(df)

#,A,B
0,1,1
1,2,--
2,5,5
3,1,1


## select_columns
By integer sequence

In [10]:
x1 = np.array([True, True, False])
y1 = np.array([1, 2, 0])
z1 = np.array([9.2, 10.5, 11.8])
df1 = vaex.from_arrays(x=x1, y=y1, z=z1)
df1

#,x,y,z
0,True,1,9.2
1,True,2,10.5
2,False,0,11.8


In [11]:
df1[:,1]

Expression = y
Length: 3 dtype: int32 (column)
-------------------------------
0  1
1  2
2  0

In [12]:
# Idea: implement positional selection on top of select_columns_by_name.
# Step one is translating the positional indices into column names.
indices = (0, 2)
names = [df1[:, position].expression for position in indices]
names

['x', 'z']

In [13]:
# Try select_columns_by_name
df1.__dataframe__().select_columns_by_name(names)._df

#,x,z
0,True,9.2
1,True,10.5
2,False,11.8


In [14]:
# Testing the method
df1_new = _from_dataframe_to_vaex(df1.__dataframe__())
df1_new

#,x,y,z
0,True,1,9.2
1,True,2,10.5
2,False,0,11.8


## _get_data_buffer
Optimize the Arrow dictionary case: try to avoid creating a separate Vaex dataframe, which is currently done to sidestep issues with transferring Arrow indices through the buffer.

In [15]:
# Stand-alone dictionary-encoded Arrow array (codes + categories).
# It is not referenced by the dataframe built below.
indices = pa.array([0, 1, 2, 1, 2])
dictionary = pa.array(['foo', 'bar', 'baz'])
dict_array = pa.DictionaryArray.from_arrays(indices, dictionary)

# One column per backend (NumPy / Arrow), dtype (int / float / bool /
# dictionary) and masked / unmasked variant. The dict's insertion order
# is the order in which the columns are passed to vaex.
columns = {
    # NumPy arrays without missing values
    "numpy_int": np.array([1, 2, 3, 4, 0]),
    "numpy_float": np.array([1.5, 2.5, 3.5, 4.5, 0]),
    "numpy_bool": np.array([True, False, True, True, True]),

    # NumPy masked arrays
    "numpy_int_m": np.ma.array(
        [1, 2, 3, 4, 0], mask=[0, 0, 0, 1, 1], dtype=int
    ),
    "numpy_float_m": np.ma.array(
        [1.5, 2.5, 3.5, 4.5, 0],
        mask=[False, True, True, True, False],
        dtype=float,
    ),
    "numpy_bool_m": np.ma.array(
        [True, False, True, True, True], mask=[1, 0, 0, 1, 0], dtype=bool
    ),

    # Arrow arrays without missing values
    "arrow_int": pa.array([0, 1, 2, 3, 0]),
    "arrow_float": pa.array([0.5, 1.5, 2.5, 3.5, 0.5]),
    "arrow_bool": pa.array([True, False, True, False, True]),

    # Arrow arrays built with an explicit mask
    "arrow_int_m": pa.array(
        [0, 1, 2, None, 0], mask=np.array([0, 0, 0, 1, 1], dtype=bool)
    ),
    "arrow_float_m": pa.array(
        [0.5, 1.5, 2.5, None, 0.5],
        mask=np.array([0, 0, 0, 1, 0], dtype=bool),
    ),
    "arrow_bool_m": pa.array(
        [True, False, True, None, True],
        mask=np.array([0, 0, 1, 1, 0], dtype=bool),
    ),

    # Arrow dictionary arrays: plain, and with masked indices
    "arrow_dict": pa.DictionaryArray.from_arrays(
        pa.array([0, 1, 2, 0, 1]),
        pa.array(['aap', 'noot', 'mies']),
    ),
    "arrow_dict_m": pa.DictionaryArray.from_arrays(
        pa.array([0, 1, 2, 0, 1], mask=np.array([0, 1, 1, 0, 0], dtype=bool)),
        pa.array(['aap', 'noot', 'mies']),
    ),
}

the_df = vaex.from_arrays(**columns)

In [16]:
the_df

#,numpy_int,numpy_float,numpy_bool,numpy_int_m,numpy_float_m,numpy_bool_m,arrow_int,arrow_float,arrow_bool,arrow_int_m,arrow_float_m,arrow_bool_m,arrow_dict,arrow_dict_m
0,1,1.5,True,1,1.5,--,0,0.5,True,0,0.5,True,aap,aap
1,2,2.5,False,2,--,False,1,1.5,False,1,1.5,False,noot,--
2,3,3.5,True,3,--,True,2,2.5,True,2,2.5,--,mies,--
3,4,4.5,True,--,--,--,3,3.5,False,--,--,--,aap,aap
4,0,0.0,True,--,0.0,True,0,0.5,True,--,0.5,True,noot,noot


In [17]:
# Try with index_values()

col2 = the_df.__dataframe__().get_column_by_name('arrow_dict')
indices_list = col2._col.index_values()
indices_list

Expression = index_values(arrow_dict)
Length: 5 dtype: int64 (expression)
-----------------------------------
0  0
1  1
2  2
3  0
4  1

In [18]:
# Perfect!
# select the data to be saved to _VaexBuffer
indices_list.dtype
indices_list.to_numpy()

int64

array([0, 1, 2, 0, 1], dtype=int64)

In [19]:
# Try the transfer
buffer = _VaexBuffer(indices_list.to_numpy())
dtype = col2._dtype_from_vaexdtype(indices_list.dtype)
buffer, dtype

(VaexBuffer({'bufsize': 40, 'ptr': 7763975476864, 'device': 'CPU'}),
 (<_DtypeKind.INT: 0>, 64, '<i8', '='))

In [20]:
buffer_to_ndarray(buffer, dtype)

array([0, 1, 2, 0, 1], dtype=int64)

In [21]:
# Works!

In [22]:
the_df_new = _from_dataframe_to_vaex(the_df.__dataframe__())
the_df_new

#,numpy_int,numpy_float,numpy_bool,numpy_int_m,numpy_float_m,numpy_bool_m,arrow_int,arrow_float,arrow_bool,arrow_int_m,arrow_float_m,arrow_bool_m,arrow_dict,arrow_dict_m
0,1,1.5,True,1,1.5,--,0,0.5,True,0,0.5,True,aap,aap
1,2,2.5,False,2,--,False,1,1.5,False,1,1.5,False,noot,--
2,3,3.5,True,3,--,True,2,2.5,True,2,2.5,--,mies,--
3,4,4.5,True,--,--,--,3,3.5,False,--,--,--,aap,aap
4,0,0.0,True,--,0.0,True,0,0.5,True,--,0.5,True,noot,noot


### Test data 3

In [23]:
# Test frame for categoricals created inside Vaex: "colors" goes through
# ordinal_encode, while "year" and "weekday" go through categorize.
colors = ['red', 'blue', 'green', 'blue']
weekday_labels = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']

ds = vaex.from_arrays(
    colors=colors,
    year=[2012, 2013, 2015, 2019],
    weekday=[0, 1, 4, 6],
    x=[1, 2, 3, 4],
    y=[1.5, 2.5, 3.5, 4.5],
)

# Same three steps as before, expressed as a single chain.
df3 = (
    ds.ordinal_encode('colors', ['red', 'green', 'blue'])
    .categorize('year', min_value=2012, max_value=2019)
    .categorize('weekday', labels=weekday_labels)
)
df3

#,year,weekday,x,y,colors
0,2012,0,1,1.5,0
1,2013,1,2,2.5,2
2,2015,4,3,3.5,1
3,2019,6,4,4.5,2


In [24]:
_from_dataframe_to_vaex(df3.__dataframe__())

#,year,weekday,x,y,colors
0,2012,Mon,1,1.5,red
1,2013,Tue,2,2.5,blue
2,2015,Fri,3,3.5,green
3,2019,Sun,4,4.5,blue


### Test data 4

In [25]:
# Three Arrow columns: integers, floats with one null, booleans with one null.
data = [
    pa.array([1, 2, 3, 4]),
    pa.array([1.5, 2.5, None, 4.5]),
    pa.array([True, None, False, True]),
]
batch = pa.RecordBatch.from_arrays(data, names=['f0', 'f1', 'f2'])

# Repeating the same batch five times gives a table with chunked columns.
batches = [batch] * 5
table = pa.Table.from_batches(batches)
c = table.column(0)

c.num_chunks

5

In [26]:
vc = vaex.from_arrow_table(table)
vc

#,f0,f1,f2
0,1,1.5,True
1,2,2.5,--
2,3,--,False
3,4,4.5,True
4,1,1.5,True
...,...,...,...
15,4,4.5,True
16,1,1.5,True
17,2,2.5,--
18,3,--,False


In [27]:
vc_new = _from_dataframe_to_vaex(vc.__dataframe__())
vc_new

#,f0,f1,f2
0,1,1.5,True
1,2,2.5,--
2,3,--,False
3,4,4.5,True
4,1,1.5,True
...,...,...,...
15,4,4.5,True
16,1,1.5,True
17,2,2.5,--
18,3,--,False
