In [1]:
# Execute the pandas prototype script inside the notebook namespace.
# NOTE(review): `pd` and `np` are used in later cells but never imported in this
# notebook — presumably they are leaked in by this script; verify, or import them explicitly.
%run pandas_implementation.py

from IPython.core.interactiveshell import InteractiveShell
# Display the value of every expression statement in a cell, not only the last one
# (later cells rely on this to show several arrays from a single cell).
InteractiveShell.ast_node_interactivity = "all"

# Research how the Pandas implementation works

In [2]:
# Column "A" holds plain strings; "B" is an object-dtype copy of "A".
df = pd.DataFrame({"A": ["a", "b", "cdef", "", "g"]}).assign(
    B=lambda frame: frame["A"].astype("object")
)
# Null out a single entry of "B" so that missing-value handling can be checked.
df.loc[1, "B"] = np.nan

# Interchange-protocol column object for "B" (correctness / null-handling checks).
col = df.__dataframe__().get_column_by_name("B")

In [3]:
df

Unnamed: 0,A,B
0,a,a
1,b,
2,cdef,cdef
3,,
4,g,g


## First, let's research `_get_data_buffer`

In [4]:
df["B"].dtype

dtype('O')

In [5]:
df["B"].to_numpy()

array(['a', nan, 'cdef', '', 'g'], dtype=object)

In [6]:
# Object-dtype NumPy view of column "B" (strings plus one NaN).
buf = df["B"].to_numpy()
# NOTE(review): bytearray(buf) copies the array's raw memory. For this object array the
# 40 bytes shown below look like five 8-byte object pointers, not UTF-8 text, and they
# stay in front of the encoded strings appended later — presumably the data buffer
# should start empty (`bytearray()`); confirm against `_get_data_buffer`.
b = bytearray(buf)
b

bytearray(b'p\xe4\xe4\x0b\x15\x02\x00\x00\xb0]\xfa\x0f\x15\x02\x00\x00\xf0\x0c\xa5\x0e\x15\x02\x00\x00pF\x8b\x0b\x15\x02\x00\x00p;\xe7\x0b\x15\x02\x00\x00')

In [7]:
# Append the UTF-8 encoding of every string element to the data buffer `b`.
# Non-string entries (the NaN standing in for the missing value) contribute no bytes.
for value in buf:
    if isinstance(value, str):
        # Encode once and reuse the result for both the buffer and the debug print.
        encoded = value.encode(encoding="utf-8")
        b.extend(encoded)
        print(encoded)

b'a'
b'cdef'
b''
b'g'


In [8]:
b

bytearray(b'p\xe4\xe4\x0b\x15\x02\x00\x00\xb0]\xfa\x0f\x15\x02\x00\x00\xf0\x0c\xa5\x0e\x15\x02\x00\x00pF\x8b\x0b\x15\x02\x00\x00p;\xe7\x0b\x15\x02\x00\x00acdefg')

In [9]:
np.frombuffer(b, dtype="uint8")

array([112, 228, 228,  11,  21,   2,   0,   0, 176,  93, 250,  15,  21,
         2,   0,   0, 240,  12, 165,  14,  21,   2,   0,   0, 112,  70,
       139,  11,  21,   2,   0,   0, 112,  59, 231,  11,  21,   2,   0,
         0,  97,  99, 100, 101, 102, 103], dtype=uint8)

## And `_get_offset_buffer`

In [10]:
# Reuse the object array from the data-buffer experiment as the values to index.
values = buf
ptr = 0  # running byte position within the encoded data
offsets = [ptr]  # offsets list is seeded with the initial position, 0

values

array(['a', nan, 'cdef', '', 'g'], dtype=object)

In [11]:
# Build the offsets list: one entry is appended per value, holding the byte position
# reached after that value has been accounted for.
for v in values:
    # For missing values (in this case, `np.nan` values), we don't increment the pointer
    if isinstance(v, str):
        # Use a dedicated name so the bytearray `b` built in the data-buffer cells is not overwritten.
        encoded = v.encode(encoding="utf-8")
        ptr += len(encoded)
        print(type(v))
        print(f"b = {encoded}")
        print(len(encoded))
        print(f"ptr = {ptr}")

    offsets.append(ptr)

<class 'str'>
b = b'a'
1
ptr = 1
<class 'str'>
b = b'cdef'
4
ptr = 5
<class 'str'>
b = b''
0
ptr = 5
<class 'str'>
b = b'g'
1
ptr = 6


In [12]:
offsets

[0, 1, 1, 5, 5, 6]

## Test

First, the code was copy-pasted into the Vaex implementation. Let's see whether it works.

In [13]:
%run vaex_implementation_v18.py

In [14]:
dfv = vaex.from_arrays(a =  np.array(["a", "b", "cdef", "", "g"]))
dfv

#,a
0,a
1,b
2,cdef
3,
4,g


In [15]:
dfv.a.dtype

string

In [16]:
dfv.__dataframe__().get_column_by_name('a').get_buffers()

{'data': (VaexBuffer({'bufsize': 7, 'ptr': 2289550682464, 'device': 'CPU'}),
  (<_DtypeKind.STRING: 21>, 8, 'u', '=')),
 'validity': (VaexBuffer({'bufsize': 5, 'ptr': 2289496678976, 'device': 'CPU'}),
  (<_DtypeKind.BOOL: 20>, 8, '|b1', '|')),
 'offsets': (VaexBuffer({'bufsize': 48, 'ptr': 2289505517024, 'device': 'CPU'}),
  (<_DtypeKind.INT: 0>, 64, 'l', '='))}

In [17]:
# Fetch the interchange buffers backing the Vaex string column "a" (no missing values).
# Each entry of the returned dict is a (buffer, dtype) pair.
buffers = dfv.__dataframe__().get_column_by_name('a').get_buffers()

# Retrieve the data buffer containing the UTF-8 code units
dbuffer, bdtype = buffers["data"]

# Retrieve the offsets buffer containing the index offsets demarcating the beginning and end of each string
obuffer, odtype = buffers["offsets"]

# Retrieve the mask buffer; in the output below it is all False because nothing is missing
mbuffer, mdtype = buffers["validity"]

# Convert the buffers to NumPy arrays
dt = (_DtypeKind.UINT, 8, None, None)  # note: to go from STRING to an equivalent ndarray, the data buffer is described as uint8 (i.e., a byte array)
dbuf = buffer_to_ndarray(dbuffer, dt)

obuf = buffer_to_ndarray(obuffer, odtype)
mbuf = buffer_to_ndarray(mbuffer, mdtype)

In [18]:
dbuf
obuf
mbuf

array([ 97,  98,  99, 100, 101, 102, 103], dtype=uint8)

array([0, 1, 2, 6, 6, 7], dtype=int64)

array([False, False, False, False, False])

## Now let's research `convert_string_column`

In [19]:
# Rebuild every string by slicing the data buffer between consecutive offsets.
str_list = []
for start, stop in zip(obuf[:-1], obuf[1:]):

    # Code units that belong to this string
    code_units = dbuf[start:stop]
    print(code_units)

    # Raw bytes of those code units
    raw = bytes(code_units)
    print(raw)

    # Decoded text
    text = raw.decode(encoding="utf-8")
    print(text)

    # Collect the result
    str_list.append(text)

[97]
b'a'
a
[98]
b'b'
b
[ 99 100 101 102]
b'cdef'
cdef
[]
b''

[103]
b'g'
g


In [20]:
str_list

['a', 'b', 'cdef', '', 'g']

In [21]:
np.asarray(str_list, dtype="object")

array(['a', 'b', 'cdef', '', 'g'], dtype=object)

## Let's try the implementation

In [22]:
%run vaex_implementation_v18.py

In [23]:
dfv.__dataframe__().get_column_by_name('a')._col

Expression = a
Length: 5 dtype: string (column)
--------------------------------
0     a
1     b
2  cdef
3
4     g

In [24]:
convert_string_column(dfv.__dataframe__().get_column_by_name('a'))

(<pyarrow.lib.StringArray object at 0x00000215151BEE80>
 [
   "a",
   "b",
   "cdef",
   "",
   "g"
 ],
 {'data': (VaexBuffer({'bufsize': 7, 'ptr': 2289550681120, 'device': 'CPU'}),
   (<_DtypeKind.STRING: 21>, 8, 'u', '=')),
  'validity': (VaexBuffer({'bufsize': 5, 'ptr': 2289505494432, 'device': 'CPU'}),
   (<_DtypeKind.BOOL: 20>, 8, '|b1', '|')),
  'offsets': (VaexBuffer({'bufsize': 48, 'ptr': 2289505517856, 'device': 'CPU'}),
   (<_DtypeKind.INT: 0>, 64, 'l', '='))})

### Works! =)

## Now missing-value handling

In [25]:
dfvm = vaex.from_arrays(a =  np.array(["a", None, "cdef", "", "g"]))
dfvm

#,a
0,a
1,--
2,cdef
3,
4,g


In [26]:
dfvm.__dataframe__().get_column_by_name('a').null_count

1

In [27]:
# Same buffer extraction as for `dfv`, now for the Vaex column that contains one missing value.
# NOTE(review): this cell duplicates the earlier extraction with an `_m` suffix — a shared helper would avoid the copy.
buffers_m = dfvm.__dataframe__().get_column_by_name('a').get_buffers()

# Retrieve the data buffer containing the UTF-8 code units
dbuffer_m, bdtype_m = buffers_m["data"]

# Retrieve the offsets buffer containing the index offsets demarcating the beginning and end of each string
obuffer_m, odtype_m = buffers_m["offsets"]

# Retrieve the mask buffer; in the output below it is True at index 1, the missing entry
mbuffer_m, mdtype_m = buffers_m["validity"]

# Convert the buffers to NumPy arrays
dt_m = (_DtypeKind.UINT, 8, None, None)  # note: to go from STRING to an equivalent ndarray, the data buffer is described as uint8 (i.e., a byte array)
dbuf_m = buffer_to_ndarray(dbuffer_m, dt_m)

obuf_m = buffer_to_ndarray(obuffer_m, odtype_m)
mbuf_m = buffer_to_ndarray(mbuffer_m, mdtype_m)

In [28]:
dbuf_m
obuf_m
mbuf_m

array([ 97,  99, 100, 101, 102, 103], dtype=uint8)

array([0, 1, 1, 5, 5, 6], dtype=int64)

array([False,  True, False, False, False])

### This all works fine; now let's fix `convert_string_column`

In [29]:
%run vaex_implementation_v18.py

In [30]:
convert_string_column(dfvm.__dataframe__().get_column_by_name('a'))

(<pyarrow.lib.StringArray object at 0x0000021515228340>
 [
   "a",
   null,
   "cdef",
   "",
   "g"
 ],
 {'data': (VaexBuffer({'bufsize': 6, 'ptr': 2289550682768, 'device': 'CPU'}),
   (<_DtypeKind.STRING: 21>, 8, 'u', '=')),
  'validity': (VaexBuffer({'bufsize': 5, 'ptr': 2289505494560, 'device': 'CPU'}),
   (<_DtypeKind.BOOL: 20>, 8, '|b1', '|')),
  'offsets': (VaexBuffer({'bufsize': 48, 'ptr': 2289505516704, 'device': 'CPU'}),
   (<_DtypeKind.INT: 0>, 64, 'l', '='))})

## That's it!

## To finish, let's check Pandas -> Vaex

In [31]:
%run vaex_implementation_v18.py

In [32]:
df

Unnamed: 0,A,B
0,a,a
1,b,
2,cdef,cdef
3,,
4,g,g


In [33]:
from_dataframe_to_vaex(df)

#,A,B
0,a,a
1,b,--
2,cdef,cdef
3,,
4,g,g


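### We need to invert masks coming from Pandas: its validity buffer uses 1 for valid entries and 0 for missing ones, whereas the Vaex mask is True where a value is missing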

In [34]:
# Fetch the interchange buffers backing the pandas column "B" (one entry was set to null).
buffers_p = df.__dataframe__().get_column_by_name('B').get_buffers()

# Retrieve the data buffer containing the UTF-8 code units
dbuffer_p, bdtype_p = buffers_p["data"]

# Retrieve the offsets buffer containing the index offsets demarcating the beginning and end of each string
obuffer_p, odtype_p = buffers_p["offsets"]

# Retrieve the validity buffer. In the output below it is a uint8 array holding 1 for
# present values and 0 at index 1 (the null entry) — the opposite polarity of the
# boolean Vaex mask, which is True at the missing index.
mbuffer_p, mdtype_p = buffers_p["validity"]

# Convert the buffers to NumPy arrays
dt_p = (_DtypeKind.UINT, 8, None, None)  # note: to go from STRING to an equivalent ndarray, the data buffer is described as uint8 (i.e., a byte array)
dbuf_p = buffer_to_ndarray(dbuffer_p, dt_p)

obuf_p = buffer_to_ndarray(obuffer_p, odtype_p)
mbuf_p = buffer_to_ndarray(mbuffer_p, mdtype_p)

In [35]:
dbuf_p
obuf_p
mbuf_p

array([ 97,  99, 100, 101, 102, 103], dtype=uint8)

array([0, 1, 1, 5, 5, 6], dtype=int64)

array([1, 0, 1, 1, 1], dtype=uint8)

In [36]:
np.invert(np.asarray(mbuf_p, dtype="bool"))

array([False,  True, False, False, False])

In [37]:
mbuf_p.dtype == "uint8"

True

In [49]:
# NOTE(review): the execution counter jumps from 37 to 49 at this cell, so whatever ran
# in between is not part of the notebook — presumably the Vaex implementation was edited
# and re-run; confirm this output reproduces with Restart Kernel -> Run All.
from_dataframe_to_vaex(df)

#,A,B
0,a,a
1,b,--
2,cdef,cdef
3,,
4,g,g


In [50]:
df.__dataframe__().get_column_by_name('B').describe_null

(4, 0)

In [51]:
# NOTE(review): if these tuples follow the interchange protocol's ColumnNullType
# (3 = USE_BITMASK, 4 = USE_BYTEMASK; second item = the mask value that marks a null),
# the (3, 0) shown below does not match the Vaex validity buffer printed earlier, which
# is a 5-byte boolean array that is True at the missing index — confirm describe_null.
from_dataframe_to_vaex(df).__dataframe__().get_column_by_name('B').describe_null

(3, 0)