In [16]:
%run vaex_implementation_v6.py

In [17]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [18]:
import vaex
import numpy as np

**2021/08/19-20**

## Masked NumPy array
Vaex remembers a mask from np.ma.array. It can be retreived with `.values.mask`.<br>
In version 6 the protocol assignes null value 4 in the `describe_null()` method.

In [19]:
df_n = vaex.from_arrays(
    int1=np.ma.array([1, 0], mask=[0, 1], dtype=int),
    float1=np.ma.array([3.14, 0], mask=[True, False], dtype=float),
    bool1=np.ma.array([True, True], mask=[1, 0], dtype=bool)
)
df_n

#,int1,float1,bool1
0,1,--,--
1,--,0.0,True


In [20]:
df_n.int1.dtype
df_n.int1.dtype.kind
df_n.int1.is_masked
df_n.int1.values.mask
df_n.int1.values.mask.dtype.itemsize

int32

'i'

True

array([False,  True])

1

In [21]:
df_n.float1.dtype
df_n.float1.dtype.kind
df_n.float1.is_masked
df_n.float1.values.mask
df_n.int1.values.data

float64

'f'

True

array([ True, False])

array([1, 0])

In [22]:
df_n.bool1.dtype
df_n.bool1.dtype.kind
df_n.bool1.is_masked
df_n.bool1.values.mask
df_n.bool1.values.data

bool

'b'

True

array([ True, False])

array([ True,  True])

In [23]:
df_n.__dataframe__().get_column_by_name('int1').describe_null
df_n.__dataframe__().get_column_by_name('float1').describe_null
df_n.__dataframe__().get_column_by_name('bool1').describe_null

(4, None)

(4, None)

(4, None)

## Masked arrow array
Can't get the data about the masks. Vaex also doesn't remember it in `is_masked` attribute.<br>
In version 6 the protocol doesn't implement arrow dtypes.

In [24]:
a = pa.array([0, 1, 2, None, 0], mask=np.array([0, 0, 0, 1, 0], dtype=bool))
df_a = vaex.from_arrays(a=a)
df_a

#,a
0,0
1,1
2,2
3,--
4,0


In [25]:
df_a.a.is_masked # it should be masked!!
df_a.a.dtype
df_a.a.dtype.kind

df_a.a
df_a.a.values

False

int64

'i'

Expression = a
Length: 5 dtype: int64 (column)
-------------------------------
0   0
1   1
2   2
3  --
4   0

<pyarrow.lib.Int64Array object at 0x000001D221F0F580>
[
  0,
  1,
  2,
  null,
  0
]

## Masked categorical Vaex
Vaex remembers a mask from ordinal_encode. It can be retreived with `.values.mask`.<br>
In version 6 the protocol assignes null value 4 in the `describe_null()` method.

In [26]:
# ordinal_encode!
colors = ['red', 'green', 'blue', 'green', 'MISSING']
mask   = [False, False, False, False, True]
# Can be:
# mask = [0, 0, 0, 0, 1]
colors = np.ma.array(colors, mask=mask)
ds = vaex.from_arrays(colors=colors)
df2 = ds.ordinal_encode('colors', ['red', 'green', 'blue'])

df2
df2.colors.is_masked
df2.colors.values.data
df2.colors.values.mask
df2.colors.values.mask.dtype.itemsize

df2.colors.dtype
df2.colors.dtype.is_encoded
df2.colors.dtype.kind
df2.is_category('colors')
df2.__dataframe__().get_column_by_name('colors').describe_null
df2.__dataframe__().get_column_by_name('colors').describe_categorical

#,colors
0,0
1,1
2,2
3,1
4,--


True

array([0, 1, 2, 1, 4], dtype=uint64)

array([False, False, False, False,  True])

1

uint64

False

'u'

True

(4, None)

(False, True, {0: 'red', 1: 'green', 2: 'blue'})

## Masked categorical arrow dict
Can't get the data about the masks. Vaex also doesn't remember it in `is_masked` attribute.<br>
In version 6 the protocol doesn't implement arrow dtypes.

In [28]:
# arrow dic
indices = pa.array([0, 1, 0, 1, 2, 0, None, 2])
dictionary = pa.array(['aap', 'noot', 'mies'])
c = pa.DictionaryArray.from_arrays(indices, dictionary)
df3 = vaex.from_arrays(c = c)

df3
df3.c.is_masked
df3.c.dtype
df3.c.dtype.is_encoded
df3.c.dtype.index_type # dtype.is_encoded = True !!!
df3.c.dtype.kind
df3.c.evaluate().indices

df3.is_category('c')
# Not handled in version 6 of the protocol:
#df3.__dataframe__().get_column_by_name('c').describe_null
#df3.__dataframe__().get_column_by_name('c').describe_categorical

#,c
0,aap
1,noot
2,aap
3,noot
4,mies
5,aap
6,--
7,mies


False

dictionary<values=string, indices=int64, ordered=0>

True

int64

'O'

<pyarrow.lib.Int64Array object at 0x000001D221F14580>
[
  0,
  1,
  0,
  1,
  2,
  0,
  null,
  2
]

True

# Play around with masks

Adding `get_mask()` to 6th version of the protocol and testing how to get the array from the buffer of masks. The trouble is that a mask can have bitwidth of 1 and the `buffer_to_ndarray` doesn't handle that (*update: I was wrong here, mask is a normal bool array*).

In [29]:
# Researching a mask from masked categorical vaex dataframe
df2.colors
df2.colors.values.mask.dtype
df2.colors.values.mask

Expression = colors
Length: 5 dtype: uint64 (column)
--------------------------------
0   0
1   1
2   2
3   1
4  --

dtype('bool')

array([False, False, False, False,  True])

In [30]:
# Researching data for buffer of masks from masked categorical vaex dataframe
df2.colors.values.mask.dtype.kind
df2.colors.values.mask.dtype.itemsize
df2.colors.values.mask.dtype.str
df2.colors.values.mask.dtype.byteorder

'b'

1

'|b1'

'|'

In [31]:
a1, b1, = df2.__dataframe__().get_column_by_name('colors').get_data_buffer()
a1, b1
buffer_to_ndarray(a1, b1)

(VaexBuffer({'bufsize': 40, 'ptr': 2001971508560, 'device': 'CPU'}),
 (<_DtypeKind.UINT: 1>, 64, '<u8', '='))

array([0, 1, 2, 1, 4], dtype=uint64)

In [32]:
a, b = df2.__dataframe__().get_column_by_name('colors').get_mask()
a, b
#buffer_to_ndarray(a, b)

(VaexBuffer({'bufsize': 5, 'ptr': 2001977677456, 'device': 'CPU'}),
 (<_DtypeKind.BOOL: 20>, 1, '|b1', '|'))

In [33]:
b[1]
a.bufsize
a.bufsize // b[1]

1

5

5

In [34]:
# Constructing ndarray from mask buffer
ctypes_type = np.ctypeslib.as_ctypes_type('bool')
ctypes_type
data_pointer = ctypes.cast(a.ptr, ctypes.POINTER(ctypes_type))
data_pointer
np.ctypeslib.as_array(data_pointer, shape=(a.bufsize // (b[1]),))

ctypes.c_bool

<__main__.LP_c_bool at 0x1d221f4a940>

array([False, False, False, False,  True])

In [35]:
# test array of mask that has higher bitwidth than 1
test = vaex.from_arrays(t=np.array([1,0,0,1], dtype=bool))
test

#,t
0,True
1,False
2,False
3,True


In [36]:
t1, t2 = test.__dataframe__().get_column_by_name('t').get_data_buffer()
t1, t2

(VaexBuffer({'bufsize': 4, 'ptr': 2001977677056, 'device': 'CPU'}),
 (<_DtypeKind.BOOL: 20>, 8, '|b1', '|'))

In [37]:
# Constructing ndarray from test mask buffer
ctypes_type = np.ctypeslib.as_ctypes_type('bool')
ctypes_type
data_pointer = ctypes.cast(t1.ptr, ctypes.POINTER(ctypes_type))
data_pointer
np.ctypeslib.as_array(data_pointer, shape=(t1.bufsize // (t2[1]//8),))

ctypes.c_bool

<__main__.LP_c_bool at 0x1d221f4adc0>

array([ True, False, False,  True])

## Test
Test for methods `get_mask` and `buffer_to_ndarray` to work on masks.

In [38]:
df_n

#,int1,float1,bool1
0,1,--,--
1,--,0.0,True


In [39]:
mask_buffer_df_n1, mask_dtype_df_n1 = df_n.__dataframe__().get_column_by_name('int1').get_mask()
buffer_to_ndarray(mask_buffer_df_n1, mask_dtype_df_n1)

array([False,  True])

In [40]:
mask_buffer_df_n2, mask_dtype_df_n2 = df_n.__dataframe__().get_column_by_name('float1').get_mask()
buffer_to_ndarray(mask_buffer_df_n2, mask_dtype_df_n2)

array([ True, False])

In [41]:
df2

#,colors
0,0
1,1
2,2
3,1
4,--


In [42]:
mask_buffer_df2, mask_dtype_df2 = df2.__dataframe__().get_column_by_name('colors').get_mask()
buffer_to_ndarray(mask_buffer_df2, mask_dtype_df2)

array([False, False, False, False,  True])

### Can we get the mask back into vaex dataframe?

In [43]:
df2.colors.values.mask.dtype.itemsize

1

In [44]:
mask_from_buffer = buffer_to_ndarray(mask_buffer_df2, mask_dtype_df2)
mask_from_buffer.dtype.itemsize

1

In [45]:
c5 = ['red', 'green', 'blue', 'green', 'MISSING']
c5 = np.ma.array(colors, mask=mask_from_buffer)
ds5 = vaex.from_arrays(colors=colors)
df5 = ds.ordinal_encode('colors', ['red', 'green', 'blue'])
df5

#,colors
0,0
1,1
2,2
3,1
4,--


In [46]:
df2

#,colors
0,0
1,1
2,2
3,1
4,--


** **It works! Yey** **

### Test unmasked data
`get_mask` raises an error.

In [47]:
x = np.array([True, True, False])
y = np.array([1, 2, 0])
z = np.array([9.2, 10.5, 11.8])
df = vaex.from_arrays(x=x, y=y, z=z)

In [48]:
df.__dataframe__().get_column_by_name('x').get_mask()

RuntimeError: This column uses NaN as null so does not have a separate mask