In [1]:
# Display the value of every top-level expression in a cell, not only the last
# one; the multi-line inspection cells in this notebook rely on this setting.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
import numpy as np
import pyarrow as pa
import vaex

**2021/08/16-17**
# Numeric missing
## NaN/Null NumPy array

In [3]:
# Three ways a "missing" number can enter a plain (unmasked) NumPy array:
# NaN in a float array (x), None in an otherwise numeric list (y), and NaN
# mixed with Python ints (z).
x = np.array([np.nan, 2.5, 3.5])
y = np.array([9.2, 10.5, None])
z = np.array([1, np.nan, 3])
df_num1 = vaex.from_arrays(x=x, y=y, z=z)
df_num1

#,x,y,z
0,,9.2,1.0
1,2.5,10.5,
2,3.5,,3.0


In [4]:
# x carries its gap as NaN inside an ordinary float64 array, so it is not masked.
df_num1.x.is_masked
df_num1.x.dtype
df_num1.x.dtype.kind

False

float64

'f'

In [5]:
# The None in y makes the column object dtype; it is still not reported as masked.
df_num1.y.dtype
df_num1.y.is_masked

object

False

In [6]:
# The NaN in z turns the integer literals into a float64 column.
df_num1.z.dtype
df_num1.z.dtype.kind

float64

'f'

## Virtual boolean test

In [7]:
# NaN mixed with booleans: the values are stored as floats (True -> 1.0,
# False -> 0.0). Note that this adds column w to the existing df_num1.
df_num1['w'] = np.array([np.nan, True, False])
df_num1

#,x,y,z,w
0,,9.2,1.0,
1,2.5,10.5,,1.0
2,3.5,,3.0,0.0


In [8]:
# w is an unmasked float64 column: the boolean type was lost once NaN was present.
df_num1.w.is_masked
df_num1.w.dtype
df_num1.w.dtype.kind

False

float64

'f'

In [9]:
# Pure boolean array with no missing entries, as a baseline for comparison.
df_num1['q'] = np.array([True, True, False])
df_num1

#,x,y,z,w,q
0,,9.2,1.0,,True
1,2.5,10.5,,1.0,True
2,3.5,,3.0,0.0,False


In [10]:
# q keeps a genuine bool dtype (kind 'b') and is not masked.
df_num1.q.is_masked
df_num1.q.dtype
df_num1.q.dtype.kind

False

bool

'b'

In [11]:
# None mixed with booleans: the column becomes object dtype instead of bool.
df_num1['p'] = np.array([None, True, False])
df_num1

#,x,y,z,w,q,p
0,,9.2,1.0,,True,
1,2.5,10.5,,1.0,True,True
2,3.5,,3.0,0.0,False,False


In [12]:
# p is an unmasked object column (kind 'O').
df_num1.p.is_masked
df_num1.p.dtype
df_num1.p.dtype.kind

False

object

'O'

## Masked NumPy array

In [13]:
# NumPy masked arrays: the second element of each column is hidden by its mask
# (given once as 0/1 integers and once as booleans).
masked_columns = {
    "int1": np.ma.array([1, 0], mask=[0, 1], dtype=int),
    "float1": np.ma.array([3.14, 0], mask=[False, True], dtype=float),
}
df_num2 = vaex.from_arrays(**masked_columns)
df_num2

#,int1,float1
0,1,3.14
1,--,--


In [14]:
# A NumPy masked column is reported as masked, and the mask itself is
# available through .values.mask.
df_num2.int1.dtype
df_num2.int1.dtype.kind
df_num2.int1.is_masked
df_num2.int1.values.mask

int32

'i'

True

array([False,  True])

In [15]:
# Same checks for the masked float column.
df_num2.float1.dtype
df_num2.float1.dtype.kind
df_num2.float1.is_masked
df_num2.float1.values.mask
# Underlying data behind the mask.
# NOTE(review): this reads int1, not float1 — possibly a copy-paste slip;
# confirm which column's raw data was meant to be shown here.
df_num2.int1.values.data

float64

'f'

True

array([False,  True])

array([1, 0])

## Masked Arrow array

In [16]:
# Arrow integer array built with an explicit boolean mask. In pyarrow a True
# mask entry marks the value as null, so position 3 is null here (its value is
# also given as None). `pa` is imported with the other dependencies at the top
# of the notebook.
numpa = pa.array([0, 1, 2, None, 0], mask=np.array([0, 0, 0, 1, 0], dtype=bool))
df_num3 = vaex.from_arrays(n=numpa)
df_num3

#,n
0,0
1,1
2,2
3,--
4,0


In [17]:
# Observation: is_masked is False here even though row 3 is null (rendered
# as "--"). The author expected True.
# NOTE(review): presumably is_masked only reflects NumPy masked arrays and not
# Arrow nulls — confirm against the vaex documentation/source.
df_num3.n.is_masked
df_num3.n.dtype
df_num3.n.dtype.kind

df_num3.n

False

int64

'i'

Expression = n
Length: 5 dtype: int64 (column)
-------------------------------
0   0
1   1
2   2
3  --
4   0

## Nullable Arrow Array

In [18]:
# Arrow array whose null comes only from the None entry (no explicit mask argument).
m = pa.array([0, 1, 2, None, 0])
df_num4 = vaex.from_arrays(m=m)
df_num4

#,m
0,0
1,1
2,2
3,--
4,0


In [19]:
# As with the explicitly masked Arrow column n, is_masked is False although
# row 3 is null (rendered as "--").
df_num4.m.is_masked
df_num4.m.dtype
df_num4.m.dtype.kind

df_num4.m

False

int64

'i'

Expression = m
Length: 5 dtype: int64 (column)
-------------------------------
0   0
1   1
2   2
3  --
4   0

# Categorical missing

In [20]:
# Mark integer columns as categorical with DataFrame.categorize: `year` via a
# value range, `weekday` via explicit labels.
# Caveat noted by the author: categorize cannot be called on a column that
# contains missing values.
df1 = (
    vaex.from_arrays(year=[2012, 2013, 2019], weekday=[0, 4, 6])
    .categorize('year', min_value=2012, max_value=2019)
    .categorize('weekday', labels=['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'])
)

df1
df1.year.dtype
df1.weekday.dtype
df1.is_category('year')
df1.is_category('weekday')

#,year,weekday
0,2012,0
1,2013,4
2,2019,6


int32

int32

True

True

In [21]:
# Ordinal-encode a masked string column: the last entry ('MISSING') is hidden
# by the mask, and only the three real colours are passed as category values.
mask = [False, False, False, False, True]
colors = np.ma.array(['red', 'green', 'blue', 'green', 'MISSING'], mask=mask)
ds = vaex.from_arrays(colors=colors)
df2 = ds.ordinal_encode('colors', ['red', 'green', 'blue'])

# Inspect the encoded column: mask status, raw codes, mask, dtype, category flag.
df2
df2.colors.is_masked
df2.colors.values.data
df2.colors.values.mask
df2.colors.dtype
df2.colors.dtype.kind
df2.is_category('colors')

#,colors
0,0
1,1
2,2
3,1
4,--


True

array([0, 1, 2, 1, 4], dtype=uint64)

array([False, False, False, False,  True])

uint64

'u'

True

In [22]:
# Arrow dictionary-encoded array: integer indices into a string dictionary,
# with a null index at position 6.
indices = pa.array([0, 1, 0, 1, 2, 0, None, 2])
dictionary = pa.array(['aap', 'noot', 'mies'])
c = pa.DictionaryArray.from_arrays(indices, dictionary)
df3 = vaex.from_arrays(c = c)

# The column is recognised as categorical, but is_masked is False despite the
# null in row 6.
df3
df3.c.is_masked
df3.c.dtype
df3.c.dtype.kind
df3.is_category('c')

#,c
0,aap
1,noot
2,aap
3,noot
4,mies
5,aap
6,--
7,mies


False

dictionary<values=string, indices=int64, ordered=0>

'O'

True