In [1]:
from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every top-level expression in a cell, not only the last one;
# cells further down list several expressions and show one output for each.
InteractiveShell.ast_node_interactivity = "all"

In [2]:
%run vaex_implementation_v4.py

In [3]:
%run pandas_implementation.py

## Missing values and Pandas dataframe
**2021/08/13**

In [4]:
# Build the demo frame with its missing values in place from the start.
# Column A is created directly as the nullable ``Int64`` extension dtype. Assigning
# None into a plain int64 column first relies on an implicit upcast (deprecated in
# newer pandas), so that intermediate step is avoided. The resulting frame is the same:
#   A: Int64   [1, <NA>, 5, 1]      -> null set with None
#   B: float64 [1.5, 2.5, NaN, 1.5] -> null set with np.nan
#   C: bool    [True, False, False, True]
df = pd.DataFrame(
    {
        "A": pd.array([1, None, 5, 1], dtype="Int64"),  # Set one item to null with None
        "B": [1.5, 2.5, np.nan, 1.5],  # Set one item to null with np.nan
        "C": [True, False, False, True],
    }
)

In [5]:
df

Unnamed: 0,A,B,C
0,1.0,1.5,True
1,,2.5,False
2,5.0,,False
3,1.0,1.5,True


In [6]:
df['A'].dtype
df['B'].dtype
df['C'].dtype

Int64Dtype()

dtype('float64')

dtype('bool')

In [7]:
df['A'].dtype.kind
df['B'].dtype.kind
df['C'].dtype.kind

'i'

'f'

'b'

In [8]:
from_dataframe(df)

AttributeError: 'Int64Dtype' object has no attribute 'str'

In [9]:
from_dataframe_to_vaex(df)

AttributeError: 'Int64Dtype' object has no attribute 'str'

## Pandas boolean

In [10]:
# One way is to replace a value with np.nan.
# That also doesn't work, as the column dtype changes to object and so it is not supported.
#df['C'] = df['C'].replace(False, np.NaN)

# The other option is creating a new column.
# This tactic uses a mask, so it will not be supported by the protocol.
df["D"] = pd.array([True, False, pd.NA, True], dtype="boolean")
df

Unnamed: 0,A,B,C,D
0,1.0,1.5,True,True
1,,2.5,False,False
2,5.0,,False,
3,1.0,1.5,True,True


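I get the same error for both the Pandas and the Vaex implementation, because the dtype changes from 'bool' to 'BooleanDtype', which is an extension array type. See https://github.com/data-apis/dataframe-api/issues/52. Note that the tracebacks below name `Int64Dtype`: the error shown is raised for the nullable-integer column `A`, so these runs do not exercise column `D` in isolation.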

In [11]:
df['D'].dtype

BooleanDtype

In [12]:
from_dataframe(df)

AttributeError: 'Int64Dtype' object has no attribute 'str'

In [13]:
from_dataframe_to_vaex(df)

AttributeError: 'Int64Dtype' object has no attribute 'str'

## Pandas categorical with missing

There is again the same error, as the array is changed into an extension array and its dtype is not supported.

In [14]:
# Swap the nullable-boolean column D for a categorical column E derived from A.
# errors="ignore" keeps the cell re-runnable: on a second run D is already gone and a
# plain drop would raise KeyError.
df = df.drop(columns="D", errors="ignore")
df["E"] = df["A"].astype("category")
# Row 1 of A is already null, so E is expected to be null there too; the explicit
# assignment is kept so the missing entry is set visibly in this cell.
df.at[1, 'E'] = None  # Set one item to null
df

Unnamed: 0,A,B,C,E
0,1.0,1.5,True,1.0
1,,2.5,False,
2,5.0,,False,5.0
3,1.0,1.5,True,1.0


In [15]:
from_dataframe(df)

AttributeError: 'Int64Dtype' object has no attribute 'str'

In [16]:
from_dataframe_to_vaex(df)

AttributeError: 'Int64Dtype' object has no attribute 'str'

## Vaex and categorize() when joining categorical columns with missing values from Pandas
There is an error when using `categorize()` function due to the value for missing value (-1) being out of range.

The calculated codes always run from `0` to `n-1`. Because the code for a missing value is `-1`, an error is raised when Vaex calculates the difference between the max and min value of the data and then compares it to the length of the column: as there shouldn't be more unique values than the number of rows, it gives an error.

I tried changing `-1` to `0` and then changing it back to `None`. This is a first idea that will have to be optimised.

In [17]:
# Testing the procedure described above:

df_vv = vaex.from_arrays(year=[2012, None, 2019], weekday=[0, 4, 6])

# Save the location of missing values.
# `== None` is deliberate here: it is applied to the vaex column expression to build a
# per-row check, it is not meant as a Python identity test (`is None`).
df_vv['check'] = df_vv.year == None
# Replace the missing value with the default 0
# NOTE(review): 0 lies outside the min_value..max_value range passed to categorize()
# below - confirm that categorize() tolerates an out-of-range placeholder.
df_vv['year'] = df_vv.func.where(df_vv.year == None, 0, df_vv.year)
# dtype must be changed to int (not necessary in the protocol)
df_vv['year'] = df_vv.year.astype('int64')
# Now we can categorize the columns
df_vv = df_vv.categorize('year', min_value=2012, max_value=2019)
df_vv = df_vv.categorize('weekday', labels=['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'])
# and change the placeholder back to a missing value, using the saved `check` column
df_vv['year'] = df_vv.func.where(df_vv.check, None, df_vv.year)
df_vv

#,year,weekday,check
0,2012.0,0,False
1,,4,True
2,2019.0,6,False


In [18]:
# Preview only: the result is not assigned back, and `df_vv.dtypes` further down
# still lists the `check` column.
df_vv.drop('check')

#,year,weekday
0,2012.0,0
1,,4
2,2019.0,6


In [19]:
df_vv.category_labels('year')

[2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019]

In [20]:
df_vv.category_labels('weekday')

['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']

In [21]:
df_vv.is_category('year')

True

### Testing the difference between `None`, `np.nan` in dtype

In [22]:
df_vvv = vaex.from_arrays(year=[2012, np.nan, 2019], weekday=[0, 4, 6])
df_vvv.dtypes

year       float64
weekday      int32
dtype: object

In [23]:
df_v = vaex.from_arrays(year=[2012, 2013, 2019], weekday=[0, 4, 6])
df_v = df_v.categorize('year', min_value=2012, max_value=2019)
df_v = df_v.categorize('weekday', labels=['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'])
df_v.dtypes

year       int32
weekday    int32
dtype: object

In [24]:
df_vv.dtypes

year       object
weekday     int32
check        bool
dtype: object

There are different types depending on how missing value is used:
- np.nan gives float dtype
- None gives an object
- categorical without missing values stays int

How is that in Pandas?
----> In Pandas, a categorical column with or without missing data has the `category` dtype.

In [25]:
df

Unnamed: 0,A,B,C,E
0,1.0,1.5,True,1.0
1,,2.5,False,
2,5.0,,False,5.0
3,1.0,1.5,True,1.0


In [26]:
df.dtypes

A       Int64
B     float64
C        bool
E    category
dtype: object

In [27]:
df["E"] = pd.Series([1, 2, 5, 1]).astype("category")

In [28]:
df.dtypes

A       Int64
B     float64
C        bool
E    category
dtype: object

In [29]:
df

Unnamed: 0,A,B,C,E
0,1.0,1.5,True,1
1,,2.5,False,2
2,5.0,,False,5
3,1.0,1.5,True,1


## After change in protocol (version 5)

In [30]:
# Try again with the corrected code for categorize with missing values
%run vaex_implementation_v5.py

In [31]:
df

Unnamed: 0,A,B,C,E
0,1.0,1.5,True,1
1,,2.5,False,2
2,5.0,,False,5
3,1.0,1.5,True,1


In [32]:
from_dataframe_to_vaex(df.drop('E',axis=1))

AttributeError: 'Int64Dtype' object has no attribute 'str'

In [33]:
from_dataframe_to_vaex(df)

AttributeError: 'Int64Dtype' object has no attribute 'str'

### Works for Pandas dataframes (Pandas -> Vaex)

Let's try the other way around

In [34]:
df_v

#,year,weekday
0,2012,0
1,2013,4
2,2019,6


In [35]:
# Introduce a missing value into the `year` column (categorized in an earlier cell):
# rows where year == 2013 become None, all other rows keep their value.
df_v['year'] = df_v.func.where(df_v.year == 2013, None, df_v.year)
# Alternative kept for reference: map 2013 to NaN instead of None.
#df_v['year'] = df_v.year.map({2012: 2012, 2013: np.nan, 2019:2019})
df_v

#,year,weekday
0,2012.0,0
1,,4
2,2019.0,6


In [36]:
# Let's first try the Pandas implementation
# ---> doesn't work, as the categorical column with missing values becomes object type!
from_dataframe(df_v)

ValueError: Data type object not supported by exchangeprotocol

In [37]:
df_v.dtypes

year       object
weekday     int32
dtype: object

In [38]:
df_v.is_category('year')

True

In [39]:
from_dataframe_to_vaex(df_v)

#,year,weekday
0,2012.0,0
1,,4
2,2019.0,6
