In [1]:
# Configure IPython so that every top-level expression in a cell is displayed,
# not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [23]:
import vaex
import numpy as np
import pyarrow as pa

## Open questions
- Integer and boolean arrays containing `np.nan` are converted to float.
- Integer and boolean arrays containing `None` are converted to object.

# Numeric missing
## NaN/Null NumPy array

In [11]:
# Two float columns holding NaN and one boolean column holding None.
x, y = np.array([1, np.nan, 3]), np.array([np.nan, 2.5, 3.5])
z = np.array([True, False, None])
df_num1 = vaex.from_arrays(
    x=x,
    y=y,
    z=z,
)
df_num1

#,x,y,z
0,1.0,,True
1,,2.5,False
2,3.0,3.5,


In [12]:
# Report the masked flag and the dtype of column x.
print(
    f"Is column x masked? {df_num1.x.is_masked}",
    f"Data type of column x: {df_num1.x.dtype}",
    sep="\n",
)

Is column x masked? False
Data type of column x : float64


In [13]:
# Report the masked flag and the dtype of column y.
print(
    f"Is column y masked? {df_num1.y.is_masked}",
    f"Data type of column y: {df_num1.y.dtype}",
    sep="\n",
)

Is column y masked? False
Data type of column y : float64


In [14]:
# Report the masked flag and the dtype of column z (the boolean column built with None).
print(
    f"Is column z masked? {df_num1.z.is_masked}",
    f"Data type of column z: {df_num1.z.dtype}",
    sep="\n",
)

Is column z masked? False
Data type of column z : object


## Masked NumPy array

In [18]:
# One masked NumPy column per primitive type; each mask hides exactly one of the two entries.
df_num2 = vaex.from_arrays(
    int1=np.ma.array([1, 0], dtype=int, mask=[False, True]),
    float1=np.ma.array([3.14, 0], dtype=float, mask=[False, True]),
    bool1=np.ma.array([True, True], dtype=bool, mask=[True, False]),
)
df_num2

#,int1,float1,bool1
0,1,3.14,--
1,--,--,True


In [17]:
# Report the masked flag, dtype, and mask details (values, dtype, item size) of column int1.
print(
    f"Is column int1 masked? {df_num2.int1.is_masked}",
    f"Data type of column int1: {df_num2.int1.dtype}",
    f"Mask values: {df_num2.int1.values.mask}",
    f"Data type of mask: {df_num2.int1.values.mask.dtype}",
    f"Size of a mask array item: {df_num2.int1.values.mask.dtype.itemsize}",
    sep="\n",
)

Is column int1 masked? True
Data type of column int1 : int32
Mask values: [False  True]
Data type of mask: bool
Size of a mask array item: 1


In [19]:
# Report the masked flag, dtype, and mask details (values, dtype, item size) of column float1.
print(
    f"Is column float1 masked? {df_num2.float1.is_masked}",
    f"Data type of column float1: {df_num2.float1.dtype}",
    f"Mask values: {df_num2.float1.values.mask}",
    f"Data type of mask: {df_num2.float1.values.mask.dtype}",
    f"Size of a mask array item: {df_num2.float1.values.mask.dtype.itemsize}",
    sep="\n",
)

Is column float1 masked? True
Data type of column float1 : float64
Mask values: [False  True]
Data type of mask: bool
Size of a mask array item: 1


In [20]:
# Report the masked flag, dtype, and mask details (values, dtype, item size) of column bool1.
print(
    f"Is column bool1 masked? {df_num2.bool1.is_masked}",
    f"Data type of column bool1: {df_num2.bool1.dtype}",
    f"Mask values: {df_num2.bool1.values.mask}",
    f"Data type of mask: {df_num2.bool1.values.mask.dtype}",
    f"Size of a mask array item: {df_num2.bool1.values.mask.dtype.itemsize}",
    sep="\n",
)

Is column bool1 masked? True
Data type of column bool1 : bool
Mask values: [ True False]
Data type of mask: bool
Size of a mask array item: 1


## Masked Arrow array

In [24]:
# Arrow array built with an explicit boolean mask; only the last slot is masked out.
numpa = pa.array(
    [0, 1, 2, None],
    mask=np.array([False, False, False, True]),
)
df_num3 = vaex.from_arrays(n=numpa)
df_num3

#,n
0,0
1,1
2,2
3,--


In [43]:
# Open question: is_masked is reported for the Arrow-backed column n -- it should be masked...?
# The null positions are read from the Arrow values via is_null().
print(
    f"Is column n masked? {df_num3.n.is_masked }",
    f"Data type of column n: {df_num3.n.dtype}",
    f"Mask values: {df_num3.n.values.is_null().tolist()}",
    sep="\n",
)

Is column n masked? False
Data type of column n: int64
Mask values: [False, False, False, True]


## Nullable Arrow Array

In [26]:
# Arrow array created without an explicit mask; the trailing None is the missing entry.
m = pa.array([0, 1, 2, None])
# Wrap the Arrow array as the single column "m" of a vaex DataFrame and display it.
df_num4 = vaex.from_arrays(m=m)
df_num4

#,m
0,0
1,1
2,2
3,--


In [27]:
# Report the masked flag and the dtype of the Arrow-backed column m.
print(
    f"Is column m masked? {df_num4.m.is_masked}",
    f"Data type of column m: {df_num4.m.dtype}",
    sep="\n",
)

Is column m masked? False
Data type of column m : int64


# Categorical missing

## Categorize method
The `categorize` method cannot be called on a column that contains missing values!

## Ordinal_encode method

In [28]:
# The last entry is a placeholder string hidden by the mask, so it plays the role of a missing value.
mask = [False, False, False, False, True]
colors = np.ma.array(['red', 'green', 'blue', 'green', 'MISSING'], mask=mask)
ds = vaex.from_arrays(colors=colors)
# Encode the three known colour labels as ordinals, in the order given.
df2 = ds.ordinal_encode('colors', ['red', 'green', 'blue'])
df2

#,colors
0,0
1,1
2,2
3,1
4,--


In [29]:
# Inspect the ordinal-encoded column `colors` of df2: masked flag, dtype, and mask details.
# The dtype must be read from df2.colors itself (not from the unrelated df_num4.m column);
# re-run this cell to refresh the recorded output.
print(f"Is column colors masked? {df2.colors.is_masked}")
print(f"Data type of column colors: {df2.colors.dtype}")
print(f"Mask values: {df2.colors.values.mask}")
print(f"Data type of mask: {df2.colors.values.mask.dtype}")
print(f"Size of a mask array item: {df2.colors.values.mask.dtype.itemsize}")

Is column colors masked? True
Data type of column colors: int64
Mask values: [False False False False  True]
Data type of mask: bool
Size of a mask array item: 1


## Arrow dictionary

In [32]:
# Dictionary-encoded Arrow column: integer indices into a string dictionary; the last index is None.
dictionary = pa.array(['aap', 'noot', 'mies'])
indices = pa.array([0, 1, 0, 1, 2, None])
c = pa.DictionaryArray.from_arrays(indices, dictionary)
df3 = vaex.from_arrays(c=c)
df3

#,c
0,aap
1,noot
2,aap
3,noot
4,mies
5,--


In [44]:
# Inspect the dictionary-encoded column `c` of df3. The labels name the column actually
# inspected (`c`), not `colors` from the previous section; re-run to refresh the recorded output.
print(f"Is column c masked? {df3.c.is_masked}")
# dtype.index_type is presumably the integer type of the dictionary indices -- TODO confirm.
print(f"Data type of column c: {df3.c.dtype.index_type}")
print(f"Mask values: {df3.c.values.is_null().tolist()}")

Is column colors masked? False
Data type of column colors: int64
Mask values: [False, False, False, False, False, True]
