In [1]:
import numpy as np
import pandas as pd

# 1. Japanese Vowels

### Load a dataset: Japanese Vowels

In [2]:
from sktime.datasets import load_japanese_vowels

In [3]:
# Load the features (X) and the class labels (y) together in one call.
X, y = load_japanese_vowels(return_X_y=True)

In [4]:
# Rows are instances and columns are dimensions: 540 instances x 12 dimensions.
X.shape

(540, 12)

In [5]:
X.head(2)

Unnamed: 0,dim_0,dim_1,dim_2,dim_3,dim_4,dim_5,dim_6,dim_7,dim_8,dim_9,dim_10,dim_11
0,0 1.860936 1 1.891651 2 1.939205 3...,0 -0.207383 1 -0.193249 2 -0.239664 3...,0 0.261557 1 0.235363 2 0.258561 3...,0 -0.214562 1 -0.249118 2 -0.291458 3...,0 -0.171253 1 -0.112890 2 -0.041053 3...,0 -0.118167 1 -0.112238 2 -0.102034 3...,0 -0.277557 1 -0.311997 2 -0.383300 3...,0 0.025668 1 -0.027122 2 0.019013 3...,0 0.126701 1 0.171457 2 0.169510 3...,0 -0.306756 1 -0.289431 2 -0.314894 3...,0 -0.213076 1 -0.247722 2 -0.227908 3...,0 0.088728 1 0.093011 2 0.074638 3...
1,0 1.303905 1 1.288280 2 1.332021 3...,0 0.067256 1 0.018672 2 -0.058744 3...,0 0.597720 1 0.631579 2 0.601928 3...,0 -0.271474 1 -0.355112 2 -0.347913 3...,0 -0.236808 1 -0.119216 2 -0.053463 3...,0 -0.411125 1 -0.434425 2 -0.421753 3...,0 -0.014826 1 -0.078036 2 -0.028479 3...,0 0.113175 1 0.178121 2 0.145073 3...,0 -0.058230 1 -0.106430 2 -0.159488 3...,0 -0.173138 1 -0.181910 2 -0.127751 3...,0 0.093058 1 0.093031 2 0.019092 3...,0 0.099247 1 0.099183 2 0.113546 3...


In [29]:
# NOTE: The row index of X is not unique (the check below evaluates to False),
# so label-based lookups such as `X.loc[i]` can match more than one row.
# This notebook therefore addresses rows by position with `X.iloc`.
X.index.is_unique

False

In [7]:
# One label per instance: the length matches the 540 rows of X.
y.shape

(540,)

In [8]:
y[:2]

0    0
1    0
dtype: object

In [9]:
# List the distinct class labels. Per the output they are the strings
# '0' through '8' (object dtype), not integers.
np.unique(y)

array(['0', '1', '2', '3', '4', '5', '6', '7', '8'], dtype=object)

In [10]:
# Pull out one entry (first row, first column) to inspect what a single
# cell of this dataframe actually holds.
cell = X.iloc[0,0]

In [11]:
type(cell)

pandas.core.series.Series

In [12]:
cell.shape

(29,)

### Investigate the nature of this dataframe

In [13]:
# Record the distinct lengths of the series stored in the cells of X,
# visiting every (row, column) position in row-major order.
lens = set()
for ridx, cidx in np.ndindex(*X.shape):
    cell = X.iloc[ridx, cidx]
    # Each cell is expected to hold a one-dimensional series.
    assert cell.ndim == 1
    lens.add(len(cell))
print(lens)

# All length 29.

{29}


In [14]:
# Check that every nested series carries the same temporal index by
# comparing each cell's index with the one visited immediately before it.
idx_last = None
for ridx, cidx in np.ndindex(*X.shape):
    idx = X.iloc[ridx, cidx].index
    # The very first cell has no predecessor to compare against.
    assert idx_last is None or (idx == idx_last).all()
    idx_last = idx

print("All temporal indices the same:\n", idx)

All temporal indices the same:
 RangeIndex(start=0, stop=29, step=1)


In [15]:
# Count the missing values inside each nested series and keep only the
# distinct counts; a result of {0} means no cell has any missing data.
miss = {
    X.iloc[ridx, cidx].isnull().sum()
    for ridx, cidx in np.ndindex(*X.shape)
}

print("Missing data counts among cells:\n", miss)

Missing data counts among cells:
 {0}


### Investigate data format conversion utilities

In [16]:
from sktime.datatypes._panel._convert import (
    are_columns_nested,
    is_nested_dataframe
)

In [17]:
are_columns_nested(X)

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True])

In [18]:
is_nested_dataframe(X)

True

In [19]:
# Build a variant of X with one extra plain (scalar-valued) column, so the
# nesting checks can be tried on a frame that mixes nested and non-nested
# columns. `assign` returns a new frame and leaves X untouched.
testX = X.assign(non_nested_dim=29)
testX.head(2)

Unnamed: 0,dim_0,dim_1,dim_2,dim_3,dim_4,dim_5,dim_6,dim_7,dim_8,dim_9,dim_10,dim_11,non_nested_dim
0,0 1.860936 1 1.891651 2 1.939205 3...,0 -0.207383 1 -0.193249 2 -0.239664 3...,0 0.261557 1 0.235363 2 0.258561 3...,0 -0.214562 1 -0.249118 2 -0.291458 3...,0 -0.171253 1 -0.112890 2 -0.041053 3...,0 -0.118167 1 -0.112238 2 -0.102034 3...,0 -0.277557 1 -0.311997 2 -0.383300 3...,0 0.025668 1 -0.027122 2 0.019013 3...,0 0.126701 1 0.171457 2 0.169510 3...,0 -0.306756 1 -0.289431 2 -0.314894 3...,0 -0.213076 1 -0.247722 2 -0.227908 3...,0 0.088728 1 0.093011 2 0.074638 3...,29
1,0 1.303905 1 1.288280 2 1.332021 3...,0 0.067256 1 0.018672 2 -0.058744 3...,0 0.597720 1 0.631579 2 0.601928 3...,0 -0.271474 1 -0.355112 2 -0.347913 3...,0 -0.236808 1 -0.119216 2 -0.053463 3...,0 -0.411125 1 -0.434425 2 -0.421753 3...,0 -0.014826 1 -0.078036 2 -0.028479 3...,0 0.113175 1 0.178121 2 0.145073 3...,0 -0.058230 1 -0.106430 2 -0.159488 3...,0 -0.173138 1 -0.181910 2 -0.127751 3...,0 0.093058 1 0.093031 2 0.019092 3...,0 0.099247 1 0.099183 2 0.113546 3...,29


In [20]:
are_columns_nested(testX)

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True, False])

In [21]:
# NOTE: per the output this still reports True, even though
# `are_columns_nested(testX)` flagged the `non_nested_dim` column as not nested.
is_nested_dataframe(testX)

True

In [22]:
from sktime.datatypes._panel._convert import (
    from_nested_to_multi_index,
    from_nested_to_3d_numpy,
    from_nested_to_2d_array,
)

In [23]:
from IPython.display import display

# Convert first, then show only the first two rows of the result.
# NOTE(review): in the output below, the dim_* cells still hold whole nested
# series and the (instance, timepoints) pair (0, 0) appears twice, so the
# frame was evidently not unnested. Possibly related to the non-nested
# column added to testX -- TODO confirm.
testX_multi_index = from_nested_to_multi_index(testX)
display(testX_multi_index.head(2))

print("‼️‼️‼️‼️‼️ NOTE THAT `from_nested_to_multi_index` COMPLETELY FAILS!")

Unnamed: 0_level_0,Unnamed: 1_level_0,dim_0,dim_1,dim_2,dim_3,dim_4,dim_5,dim_6,dim_7,dim_8,dim_9,dim_10,dim_11,non_nested_dim
instance,timepoints,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,0,0 1.860936 1 1.891651 2 1.939205 3...,0 -0.207383 1 -0.193249 2 -0.239664 3...,0 0.261557 1 0.235363 2 0.258561 3...,0 -0.214562 1 -0.249118 2 -0.291458 3...,0 -0.171253 1 -0.112890 2 -0.041053 3...,0 -0.118167 1 -0.112238 2 -0.102034 3...,0 -0.277557 1 -0.311997 2 -0.383300 3...,0 0.025668 1 -0.027122 2 0.019013 3...,0 0.126701 1 0.171457 2 0.169510 3...,0 -0.306756 1 -0.289431 2 -0.314894 3...,0 -0.213076 1 -0.247722 2 -0.227908 3...,0 0.088728 1 0.093011 2 0.074638 3...,29
0,0,0 1.860936 1 1.891651 2 1.939205 3...,0 -0.207383 1 -0.193249 2 -0.239664 3...,0 0.261557 1 0.235363 2 0.258561 3...,0 -0.214562 1 -0.249118 2 -0.291458 3...,0 -0.171253 1 -0.112890 2 -0.041053 3...,0 -0.118167 1 -0.112238 2 -0.102034 3...,0 -0.277557 1 -0.311997 2 -0.383300 3...,0 0.025668 1 -0.027122 2 0.019013 3...,0 0.126701 1 0.171457 2 0.169510 3...,0 -0.306756 1 -0.289431 2 -0.314894 3...,0 -0.213076 1 -0.247722 2 -0.227908 3...,0 0.088728 1 0.093011 2 0.074638 3...,29


‼️‼️‼️‼️‼️ NOTE THAT `from_nested_to_multi_index` COMPLETELY FAILS!


In [24]:
Xnp = from_nested_to_3d_numpy(X)

In [25]:
# Note the axis order of the converted array:
# (n_instances, n_columns, n_timepoints).
Xnp.shape

(540, 12, 29)

In [26]:
Xnp2 = from_nested_to_2d_array(X)

In [27]:
# The 12 columns x 29 timepoints are flattened side by side: 12 * 29 = 348.
Xnp2.shape

(540, 348)

In [28]:
# NOTE: despite the "2d_array" in the function name, the result printed below
# is a pandas DataFrame.
# Its column labels follow the pattern `<column>__<timepoint>`, e.g. `dim_0__0`.
print(type(Xnp2))
Xnp2

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,dim_0__0,dim_0__1,dim_0__2,dim_0__3,dim_0__4,dim_0__5,dim_0__6,dim_0__7,dim_0__8,dim_0__9,...,dim_11__19,dim_11__20,dim_11__21,dim_11__22,dim_11__23,dim_11__24,dim_11__25,dim_11__26,dim_11__27,dim_11__28
0,1.860936,1.891651,1.939205,1.717517,1.741191,1.684695,1.637373,1.643283,1.607030,1.617907,...,-0.175986,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0
1,1.303905,1.288280,1.332021,1.436550,1.510069,1.492324,1.731416,1.655716,1.560145,1.981364,...,0.075934,0.131027,0.127282,0.102709,0.119152,0.088565,0.054478,0.0,0.0,0.0
2,1.462484,1.309815,1.418207,1.585858,1.651997,1.486123,1.612464,1.723449,1.908451,2.125260,...,0.066179,0.046339,-0.015915,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0
3,1.160837,1.217979,1.234654,1.457268,1.534783,1.505516,1.552154,1.497978,1.543273,1.623602,...,-0.098007,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0
4,1.665670,1.685376,1.541171,1.479049,1.602405,1.616784,1.560852,1.586381,1.552360,1.756285,...,0.026577,0.021173,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
265,1.350661,1.408054,1.330171,1.142174,1.165411,1.091788,1.186173,1.361751,1.330389,1.121027,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0
266,0.868252,0.988576,1.147993,1.202193,1.207963,1.193889,1.184650,1.173024,1.038514,1.078472,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0
267,1.407858,1.223804,1.034481,1.031508,1.258624,1.379946,1.365356,1.427606,1.505263,1.478791,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0
268,1.172216,0.886567,1.114773,1.254748,1.315500,1.444622,1.451150,1.451324,1.320915,1.295548,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0
