# Version 7
**2021/08/19** *Cleaning up the code*

- For categorical columns we now use an Arrow Dictionary in `convert_categorical_column()`. Previously we joined the ndarrays in `_from_dataframe_to_vaex()` and made them categorical with `categorize`. Now the output of `convert_categorical_column()` is an Arrow Dictionary, so there is no need to categorize the column separately.
- Added a check for Arrow Dictionary in `get_data_buffer()`, so the protocol now works for this dtype as well 🎉

In [1]:
# Execute the vaex-side implementation script. Later cells call names such as
# _from_dataframe_to_vaex without importing them, so they are presumably
# defined by this script — confirm.
%run vaex_implementation_v7.py

# Display the value of every expression statement in a cell, not only the last
# one (cells below rely on this to show a table and a dict from one cell).
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import vaex
import pandas as pd
import pyarrow as pa

# NumPy
Numeric and categorical (ordinal_encode and categorize)

In [2]:
# Build a small vaex frame mixing a string column with numeric ones, then mark
# three of its columns as categorical in three different ways.
colors = ['red', 'blue', 'green', 'blue']
ds = vaex.from_arrays(
    colors=colors,
    year=[2012, 2013, 2015, 2019],
    weekday=[0, 1, 4, 6],
    x=[1, 2, 3, 4],
    y=[1.5, 2.5, 3.5, 4.5],
)
df = (
    ds
    # colors: explicit value list, so red -> 0, green -> 1, blue -> 2
    .ordinal_encode('colors', ['red', 'green', 'blue'])
    # year: integer range 2012..2019
    .categorize('year', min_value=2012, max_value=2019)
    # weekday: codes 0..6 labelled with the day names
    .categorize('weekday', labels=['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'])
)
df

#,year,weekday,x,y,colors
0,2012,0,1,1.5,0
1,2013,1,2,2.5,2
2,2015,4,3,3.5,1
3,2019,6,4,4.5,2


In [3]:
# Rebuild a vaex frame from df's interchange object and display it, followed by
# the metadata that the interchange object reports.
vaex_from = _from_dataframe_to_vaex(df.__dataframe__())
vaex_from
df.__dataframe__().metadata

#,year,weekday,x,y,colors
0,2012,Mon,1,1.5,red
1,2013,Tue,2,2.5,blue
2,2015,Fri,3,3.5,green
3,2019,Sun,4,4.5,blue


{'vaex.cetagories_bool': {'year': True,
  'weekday': True,
  'x': False,
  'y': False,
  'colors': True},
 'vaex.cetagories': {'year': [2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019],
  'weekday': ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'],
  'colors': ['red', 'green', 'blue']}}

In [4]:
# Show the round-tripped frame again, then the dictionary indices behind its
# year column; the output below is [0, 1, 3, 7] for 2012, 2013, 2015, 2019.
vaex_from
vaex_from.year.values.indices.to_pylist()

#,year,weekday,x,y,colors
0,2012,Mon,1,1.5,red
1,2013,Tue,2,2.5,blue
2,2015,Fri,3,3.5,green
3,2019,Sun,4,4.5,blue


[0, 1, 3, 7]

# Arrow
Numeric and dictionary

In [5]:
# Arrow-backed frame: one dictionary-encoded column (c) plus int32, float64
# and boolean columns.
dictionary = pa.array(['aap', 'noot', 'mies'])
indices = pa.array([0, 1, 0, 1, 2])
c = pa.DictionaryArray.from_arrays(indices, dictionary)
df2 = vaex.from_arrays(
    c=c,
    x=pa.array([1, 2, 3, 4, 5], type=pa.int32()),
    y=pa.array([1.5, 42, 12, 144, 200.5], type=pa.float64()),
    z=pa.array([True, False, False, True, True]),
)
df2

#,c,x,y,z
0,aap,1,1.5,True
1,noot,2,42.0,False
2,aap,3,12.0,False
3,noot,4,144.0,True
4,mies,5,200.5,True


In [6]:
# Round-trip the Arrow-backed frame through the interchange protocol and show
# the interchange metadata.
# NOTE(review): in the output below column z reads True on every row, but df2
# was built with z = [True, False, False, True, True] — the boolean column does
# not appear to survive the round trip; confirm and fix in the implementation.
arrow_from = _from_dataframe_to_vaex(df2.__dataframe__())
arrow_from
df2.__dataframe__().metadata

#,c,x,y,z
0,aap,1,1.5,True
1,noot,2,42.0,True
2,aap,3,12.0,True
3,noot,4,144.0,True
4,mies,5,200.5,True


{'vaex.cetagories_bool': {'c': True, 'x': False, 'y': False, 'z': False},
 'vaex.cetagories': {'c': ['aap', 'noot', 'mies']}}

# Test with Pandas

In [7]:
# Execute the pandas-side implementation script; from_dataframe, called in the
# cells below, is presumably defined there — confirm.
%run pandas_implementation.py

In [8]:
# Small pandas frame used to exercise the pandas-side conversions below.
dfp = pd.DataFrame(data=dict(a=[1, 2, 3], b=[3, 4, 5],c=[1.5, 2.5, 3.5], d=[9, 10, 11]))
# NOTE(review): the right-hand side is a 4-row DataFrame while dfp has 3 rows;
# the displayed result below keeps only 1, 2, 5, so the trailing 1 is dropped —
# confirm this is intended.
dfp["A"] = pd.DataFrame({"A": [1, 2, 5, 1]})
# Categorical copy of column "A".
dfp["B"] = dfp["A"].astype("category") # Not working yet
dfp

Unnamed: 0,a,b,c,d,A,B
0,1,3,1.5,9,1,1
1,2,4,2.5,10,2,2
2,3,5,3.5,11,5,5


In [9]:
# Pass the pandas frame through from_dataframe; the result shown below has the
# same values as dfp.
from_dataframe(dfp)

Unnamed: 0,a,b,c,d,A,B
0,1,3,1.5,9,1,1
1,2,4,2.5,10,2,2
2,3,5,3.5,11,5,5


In [10]:
# Convert the same pandas frame with from_dataframe_to_vaex; the values shown
# below match dfp, including the categorical column B.
from_dataframe_to_vaex(dfp)

#,a,b,c,d,A,B
0,1,3,1.5,9,1,1
1,2,4,2.5,10,2,2
2,3,5,3.5,11,5,5


In [11]:
# Introduce a missing value into column B to exercise null handling.
# np is imported here explicitly: no earlier cell of this notebook imports it,
# so the assignment previously worked only if a %run script happened to leave
# np in the interactive namespace.
import numpy as np

dfp.at[1, 'B'] = np.nan  # Set one item to null
dfp

Unnamed: 0,a,b,c,d,A,B
0,1,3,1.5,9,1,1.0
1,2,4,2.5,10,2,
2,3,5,3.5,11,5,5.0


In [12]:
# Repeat the from_dataframe conversion now that B contains a missing value; the
# result below still shows the empty entry in row 1.
from_dataframe(dfp)

Unnamed: 0,a,b,c,d,A,B
0,1,3,1.5,9,1,1.0
1,2,4,2.5,10,2,
2,3,5,3.5,11,5,5.0


In [13]:
# Known failure: with a missing value in B this call raises the ValueError
# shown below ("Indices must be integer type").
from_dataframe_to_vaex(dfp) # errors - need to add support for sentinel values

ValueError: Indices must be integer type