# Version 9
**2021/08/20&23**

- make mask handling easier (just use `.ismissing` for all data)
- correct `get_mask` (itemsize must be multiplied by 8)!

In [1]:
# Echo the result of every top-level expression in a cell, not only the last
# one (the cell that shows both add_df and add_df_done relies on this).
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
# Execute the implementation script inside this notebook's namespace.
# NOTE(review): the cells below use _from_dataframe_to_vaex, np, pa and vaex
# without importing them, so they are presumably defined by this script - confirm.
%run vaex_implementation_v9.py

### Test data

In [3]:
# Stand-alone Arrow dictionary array.
# NOTE(review): indices / dictionary / dict_array are not referenced again in
# the visible part of the notebook - confirm whether they are still needed.
indices = pa.array([0, 1, 2, 1, 2])
dictionary = pa.array(['foo', 'bar', 'baz'])
dict_array = pa.DictionaryArray.from_arrays(indices, dictionary)

# Fixture frame: one column per (backend, dtype, with/without missing values).
# In every mask below a truthy entry marks that row as missing (see the "--"
# cells in the recorded output).
the_df = vaex.from_arrays(
    numpy_int=np.array([1, 2, 3, 4, 0]), # NumPy int
    numpy_float=np.array([1.5, 2.5, 3.5, 4.5, 0]), # NumPy float
    numpy_bool=np.array([True, False, True, True, True]), # NumPy bool
    
    numpy_int_m=np.ma.array([1, 2, 3, 4, 0], mask=[0, 0, 0, 1, 1], dtype=int), # NumPy masked int, 2 missing
    numpy_float_m=np.ma.array([1.5, 2.5, 3.5, 4.5, 0], mask=[False, True, True, True, False], dtype=float), # NumPy masked float, 3 missing
    numpy_bool_m=np.ma.array([True, False, True, True, True], mask=[1, 0, 0, 1, 0], dtype=bool), # NumPy masked bool, 2 missing
    
    arrow_int = pa.array([0, 1, 2, 3, 0]), # Arrow integer
    arrow_float = pa.array([0.5, 1.5, 2.5, 3.5, 0.5]), # Arrow float
    arrow_bool = pa.array([True, False, True, False, True]), # Arrow bool
    
    arrow_int_m = pa.array([0, 1, 2, None, 0], mask=np.array([0, 0, 0, 1, 1], dtype=bool)), # Arrow integer with nulls, 2 missing
    arrow_float_m = pa.array([0.5, 1.5, 2.5, None, 0.5], mask=np.array([0, 0, 0, 1, 0], dtype=bool)), # Arrow float with nulls, 1 missing
    arrow_bool_m = pa.array([True, False, True, None, True], mask=np.array([0, 0, 1, 1, 0], dtype=bool)), # Arrow bool with nulls, 2 missing
    
    arrow_dict = pa.DictionaryArray.from_arrays(pa.array([0, 1, 2, 0, 1]), pa.array(['aap', 'noot', 'mies'])), # Arrow dictionary
    arrow_dict_m = pa.DictionaryArray.from_arrays(pa.array([0, 1, 2, 0, 1], mask=np.array([0, 1, 1, 0, 0], dtype=bool)), pa.array(['aap', 'noot', 'mies'])) # Arrow dictionary with null indices, 2 missing
)
# Separate one-column frame for a vaex ordinal-encoded categorical.
add_df = vaex.from_arrays(
    vaex_cat = np.ma.array(['red', 'green', 'blue', 'green', 'MISSING'], mask=[False, False, True, False, False]) # masked string column to be encoded below
)
# 'MISSING' is not among the encoded values; the recorded output shows that row
# as missing after encoding, in addition to the explicitly masked 'blue' row.
add_df = add_df.ordinal_encode('vaex_cat', ['red', 'green', 'blue'])
# Display the fixture frame.
the_df

#,numpy_int,numpy_float,numpy_bool,numpy_int_m,numpy_float_m,numpy_bool_m,arrow_int,arrow_float,arrow_bool,arrow_int_m,arrow_float_m,arrow_bool_m,arrow_dict,arrow_dict_m
0,1,1.5,True,1,1.5,--,0,0.5,True,0,0.5,True,aap,aap
1,2,2.5,False,2,--,False,1,1.5,False,1,1.5,False,noot,--
2,3,3.5,True,3,--,True,2,2.5,True,2,2.5,--,mies,--
3,4,4.5,True,--,--,--,3,3.5,False,--,--,--,aap,aap
4,0,0.0,True,--,0.0,True,0,0.5,True,--,0.5,True,noot,noot


### Call _from_dataframe_to_vaex

In [4]:
# Round trip: obtain the interchange object via __dataframe__() and build a new
# frame from it; the recorded output matches the original frame row for row.
the_df_done = _from_dataframe_to_vaex(the_df.__dataframe__())
the_df_done

#,numpy_int,numpy_float,numpy_bool,numpy_int_m,numpy_float_m,numpy_bool_m,arrow_int,arrow_float,arrow_bool,arrow_int_m,arrow_float_m,arrow_bool_m,arrow_dict,arrow_dict_m
0,1,1.5,True,1,1.5,--,0,0.5,True,0,0.5,True,aap,aap
1,2,2.5,False,2,--,False,1,1.5,False,1,1.5,False,noot,--
2,3,3.5,True,3,--,True,2,2.5,True,2,2.5,--,mies,--
3,4,4.5,True,--,--,--,3,3.5,False,--,--,--,aap,aap
4,0,0.0,True,--,0.0,True,0,0.5,True,--,0.5,True,noot,noot


In [5]:
# Show the ordinal-encoded frame first (integer codes), then the frame rebuilt
# from its interchange object (the recorded output shows category labels).
add_df
add_df_done = _from_dataframe_to_vaex(add_df.__dataframe__())
add_df_done

#,vaex_cat
0,0
1,1
2,--
3,1
4,--


#,vaex_cat
0,red
1,green
2,--
3,green
4,--


### Testing

In [6]:
# Round-trip checks for the frames rebuilt by _from_dataframe_to_vaex.
#
# Every column is checked the same way, so the checks are table-driven: one
# helper, one table of expected null counts. Each assertion carries the column
# name so a failure points at the offending column.


def _assert_roundtrip(before_df, after_df, name, null_count, dict_encoded=False):
    """Assert that column *name* survived the interchange round trip.

    Checks, in order: equal values, the rebuilt column is not flagged
    ``is_masked``, the rebuilt frame's interchange column reports
    *null_count* missing values, and the dtype is unchanged.  For
    dictionary-encoded columns (``dict_encoded=True``) only the dtype's
    ``index_type`` is compared.
    """
    before, after = before_df[name], after_df[name]
    assert before.tolist() == after.tolist(), f"{name}: values differ"
    assert not after.is_masked, f"{name}: rebuilt column is masked"
    found = after_df.__dataframe__().get_column_by_name(name).null_count
    assert found == null_count, f"{name}: null_count is {found}, expected {null_count}"
    if dict_encoded:
        assert before.dtype.index_type == after.dtype.index_type, f"{name}: index_type differs"
    else:
        assert before.dtype == after.dtype, f"{name}: dtype differs"


assert the_df.__dataframe__().metadata == the_df_done.__dataframe__().metadata

# Column name -> number of missing values put into the fixture.
_expected_null_counts = {
    # numpy
    'numpy_int': 0,
    'numpy_float': 0,
    'numpy_bool': 0,
    'numpy_int_m': 2,
    'numpy_float_m': 3,
    'numpy_bool_m': 2,
    # arrow
    'arrow_int': 0,
    'arrow_float': 0,
    'arrow_bool': 0,
    'arrow_int_m': 2,
    'arrow_float_m': 1,
    'arrow_bool_m': 2,
}
for _name, _null_count in _expected_null_counts.items():
    _assert_roundtrip(the_df, the_df_done, _name, _null_count)

# arrow dict: only the index type of the dictionary dtype is compared
for _name, _null_count in {'arrow_dict': 0, 'arrow_dict_m': 2}.items():
    _assert_roundtrip(the_df, the_df_done, _name, _null_count, dict_encoded=True)

# vaex ordered: the source column holds integer codes, so it is compared with
# the indices of the rebuilt Arrow dictionary array instead of its labels.
assert add_df['vaex_cat'].tolist() == add_df_done['vaex_cat'].values.indices.tolist(), "vaex_cat: codes differ"
assert not add_df_done['vaex_cat'].is_masked, "vaex_cat: rebuilt column is masked"
assert isinstance(add_df_done['vaex_cat'].values, pa.DictionaryArray), "vaex_cat: not a DictionaryArray"
assert add_df_done.__dataframe__().get_column_by_name('vaex_cat').null_count == 2, "vaex_cat: null_count != 2"
assert add_df['vaex_cat'].dtype.index_type == add_df_done['vaex_cat'].dtype.index_type, "vaex_cat: index_type differs"


### Virtual columns with missing data

In [7]:
# Work on a copy: add_virtual_column modifies the frame it is called on, and a
# plain `the_df_v = the_df` only creates a second name for the same frame, so
# the_df itself would gain the "virtual" column and the earlier cells would no
# longer be re-runnable with the same result.
the_df_v = the_df.copy()
# The virtual column combines two columns with missing values; the recorded
# output shows it missing wherever either input is missing.
the_df_v.add_virtual_column("virtual", "sqrt(numpy_float_m**2 + arrow_float_m**2)")
the_df_v

#,numpy_int,numpy_float,numpy_bool,numpy_int_m,numpy_float_m,numpy_bool_m,arrow_int,arrow_float,arrow_bool,arrow_int_m,arrow_float_m,arrow_bool_m,arrow_dict,arrow_dict_m,virtual
0,1,1.5,True,1,1.5,--,0,0.5,True,0,0.5,True,aap,aap,1.5811388300841898
1,2,2.5,False,2,--,False,1,1.5,False,1,1.5,False,noot,--,--
2,3,3.5,True,3,--,True,2,2.5,True,2,2.5,--,mies,--,--
3,4,4.5,True,--,--,--,3,3.5,False,--,--,--,aap,aap,--
4,0,0.0,True,--,0.0,True,0,0.5,True,--,0.5,True,noot,noot,0.5


In [8]:
# Same round trip as before, now for the frame that includes the virtual column.
the_df_v_done = _from_dataframe_to_vaex(the_df_v.__dataframe__())
the_df_v_done

#,numpy_int,numpy_float,numpy_bool,numpy_int_m,numpy_float_m,numpy_bool_m,arrow_int,arrow_float,arrow_bool,arrow_int_m,arrow_float_m,arrow_bool_m,arrow_dict,arrow_dict_m,virtual
0,1,1.5,True,1,1.5,--,0,0.5,True,0,0.5,True,aap,aap,1.5811388300841898
1,2,2.5,False,2,--,False,1,1.5,False,1,1.5,False,noot,--,--
2,3,3.5,True,3,--,True,2,2.5,True,2,2.5,--,mies,--,--
3,4,4.5,True,--,--,--,3,3.5,False,--,--,--,aap,aap,--
4,0,0.0,True,--,0.0,True,0,0.5,True,--,0.5,True,noot,noot,0.5


### Testing with virtual

In [10]:
# The virtual column must survive the round trip like a regular column:
# same values, not flagged as masked, three missing entries, same dtype.
virtual_before = the_df_v['virtual']
virtual_after = the_df_v_done['virtual']

assert virtual_before.tolist() == virtual_after.tolist()
assert not virtual_after.is_masked
virtual_exchange_col = the_df_v_done.__dataframe__().get_column_by_name('virtual')
assert virtual_exchange_col.null_count == 3
assert virtual_before.dtype == virtual_after.dtype