# Dataset Array test

In this notebook we test the `DatasetArray` functionalities.

## Importing libraries

In [1]:
import pandas as pd
from caits.dataset._dataset3 import CaitsArray, DatasetArray
from caits.filtering import filter_butterworth

## Dataset loading

For this notebook, we use the data/AirQuality.csv.

In [2]:
data = pd.read_csv("data/AirQuality.csv", sep=";", decimal=",")
print(data.iloc[:, 6:8])


      PT08.S2(NMHC)  NOx(GT)
0            1046.0    166.0
1             955.0    103.0
2             939.0    131.0
3             948.0    172.0
4             836.0    131.0
...             ...      ...
9466            NaN      NaN
9467            NaN      NaN
9468            NaN      NaN
9469            NaN      NaN
9470            NaN      NaN

[9471 rows x 2 columns]


In [3]:
data_X = data.iloc[:, 2:-5]
data_X = data_X.fillna(data_X.mean())
data_y = data.iloc[:, -5:-2]
data_y = data_y.fillna(data_y.mean())

In [4]:
data_X


Unnamed: 0,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3)
0,2.600000,1360.000000,150.000000,11.900000,1046.000000,166.000000,1056.000000,113.000000,1692.000000,1268.000000
1,2.000000,1292.000000,112.000000,9.400000,955.000000,103.000000,1174.000000,92.000000,1559.000000,972.000000
2,2.200000,1402.000000,88.000000,9.000000,939.000000,131.000000,1140.000000,114.000000,1555.000000,1074.000000
3,2.200000,1376.000000,80.000000,9.200000,948.000000,172.000000,1092.000000,122.000000,1584.000000,1203.000000
4,1.600000,1272.000000,51.000000,6.500000,836.000000,131.000000,1205.000000,116.000000,1490.000000,1110.000000
...,...,...,...,...,...,...,...,...,...,...
9466,-34.207524,1048.990061,-159.090093,1.865683,894.595276,168.616971,794.990168,58.148873,1391.479641,975.072032
9467,-34.207524,1048.990061,-159.090093,1.865683,894.595276,168.616971,794.990168,58.148873,1391.479641,975.072032
9468,-34.207524,1048.990061,-159.090093,1.865683,894.595276,168.616971,794.990168,58.148873,1391.479641,975.072032
9469,-34.207524,1048.990061,-159.090093,1.865683,894.595276,168.616971,794.990168,58.148873,1391.479641,975.072032


In [5]:
data_y


Unnamed: 0,T,RH,AH
0,13.600000,48.90000,0.757800
1,13.300000,47.70000,0.725500
2,11.900000,54.00000,0.750200
3,11.000000,60.00000,0.786700
4,11.200000,59.60000,0.788800
...,...,...,...
9466,9.778305,39.48538,-6.837604
9467,9.778305,39.48538,-6.837604
9468,9.778305,39.48538,-6.837604
9469,9.778305,39.48538,-6.837604


In [6]:
data_X_vals = data_X.values
data_X_axis_names = {"axis_1": {name: i for i, name in enumerate(list(data_X.columns))}}
data_y_vals = data_y.values
data_y_axis_names = {"axis_1": {name: i for i, name in enumerate((data_y.columns))}}
data_X = CaitsArray(values=data_X_vals, axis_names=data_X_axis_names)
data_y = CaitsArray(values=data_y_vals, axis_names=data_y_axis_names)
datasetArrayObj = DatasetArray(data_X, data_y)


In [7]:
len(datasetArrayObj)


9471

In [8]:
datasetArrayObj


DatasetArray object with 9471 instances.

## Indexing

In this subsection we test the various indexing methods that can be used.

### Indexing using integer

This returns a `DatasetArray` object, consisting of a single instance `X[int, :], y[int, :]`.

In [9]:
datasetArrayObj[3]


DatasetArray object with 10 instances.

### Indexing using slice

This returns a `DatasetArray` object, consisting of multiple instances `X[slice, ...], y[slice, ...]`

In [10]:
datasetArrayObj[1:6]

DatasetArray object with 5 instances.

In [11]:
datasetArrayObj[:11]

DatasetArray object with 11 instances.

In [12]:
datasetArrayObj[11:]

DatasetArray object with 9460 instances.

In [13]:
datasetArrayObj[:]

DatasetArray object with 9471 instances.

### Indexing using tuple of integers

This returns a single scalar.

In [14]:
datasetArrayObj[1,1]

(1292.0, 47.7)

In [15]:
datasetArrayObj[1,5]

(103.0, None)

### Indexing using a tuple of a slice and an integer

In [16]:
datasetArrayObj[1:5,2]

DatasetArray object with 4 instances.

In [17]:
datasetArrayObj[:,"CO(GT)"]

DatasetArray object with 9471 instances.

In [18]:
datasetArrayObj[:,"RH"]

DatasetArray object with 9471 instances.

In [19]:
datasetArrayObj[:,"CO(GT)":]

DatasetArray object with 9471 instances.

In [20]:
datasetArrayObj[:, "NMHC(GT)":"NO2(GT)"]

DatasetArray object with 9471 instances.

In [21]:
datasetArrayObj[1:78, "NMHC(GT)":"NO2(GT)"]

DatasetArray object with 78 instances.

In [22]:
datasetArrayObj[1:59, :"AH"]

DatasetArray object with 9471 instances.

In [23]:
datasetArrayObj[[0,1,5], 1]

DatasetArray object with 3 instances.

In [24]:
datasetArrayObj[[0, 1, 5], :4]

DatasetArray object with 3 instances.

In [25]:
datasetArrayObj[:10, ["CO(GT)", "T", "RH", "AH"]]

DatasetArray object with 11 instances.

In [26]:
datasetArrayObj[[0,1,5], "CO(GT)"]

DatasetArray object with 3 instances.

In [27]:
datasetArrayObj[[0,1,2,3], ["CO(GT)", "RH", "NO2(GT)", "AH"]]

DatasetArray object with 2 instances.

In [28]:
datasetArrayObj[[1,2], [0,1]]

DatasetArray object with 2 instances.

## Loops

In this subsection we test looping capabilites of a `DatasetArray` object.


### For loop

In [29]:
for i, row in enumerate(datasetArrayObj):
    print(i)
    # print(row)


0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
27

### For loop in batches

In [30]:
for i, batch in enumerate(datasetArrayObj.batch(10)):
    print(batch)
    # print(i)


(   CO(GT)  PT08.S1(CO)  NMHC(GT)  C6H6(GT)  PT08.S2(NMHC)  NOx(GT)  \
0     2.6       1360.0     150.0      11.9         1046.0    166.0  
1     2.0       1292.0     112.0       9.4          955.0    103.0  
2     2.2       1402.0      88.0       9.0          939.0    131.0  
3     2.2       1376.0      80.0       9.2          948.0    172.0  
4     1.6       1272.0      51.0       6.5          836.0    131.0  
5     1.2       1197.0      38.0       4.7          750.0     89.0  
6     1.2       1185.0      31.0       3.6          690.0     62.0  
7     1.0       1136.0      31.0       3.3          672.0     62.0  
8     0.9       1094.0      24.0       2.3          609.0     45.0  
9     0.6       1010.0      19.0       1.7          561.0   -200.0  

   PT08.S3(NOx)  NO2(GT)  PT08.S4(NO2)  PT08.S5(O3)  
0        1056.0    113.0        1692.0       1268.0  
1        1174.0     92.0        1559.0        972.0  
2        1140.0    114.0        1555.0       1074.0  
3        1092.0    122

## Train_Test split

In this subsection we check the `train_test_split` method.

### Not-random split

This splits the `DatasetList` object in:
- train: first `Nx` instances
- test: last `N-Nx` instances

where `N` is the number of all instances and `Nx = int(N * (1 - test_size))`.

In [31]:
train_obj, test_obj = datasetArrayObj.train_test_split()

In [32]:
len(train_obj), len(test_obj)

(7576, 1895)

### Random split

This splits the `DatasetList` object in:
- train: `Nx` random instances
- test: The rest `N-Nx` instances

where `N` is the number of all instances and `Nx = int(N * (1 - test_size))`.


In [33]:
train_obj, test_obj = datasetArrayObj.train_test_split(random_state=42)

In [34]:
train_obj.X

      CO(GT)  PT08.S1(CO)  NMHC(GT)  C6H6(GT)  PT08.S2(NMHC)  NOx(GT)  \
 774     0.7        840.0      31.0       1.6          556.0     41.0  
4625     2.1       1083.0    -200.0      10.2          984.0    251.0  
6214     3.4       1374.0    -200.0      17.4         1222.0    602.0  
6465     1.0        884.0    -200.0       3.0          651.0   -200.0  
2362  -200.0        804.0    -200.0       3.3          668.0     32.0  
 ...     ...          ...       ...       ...            ...      ...  
4783     2.5       1063.0    -200.0      11.7         1040.0    341.0  
5208  -200.0        981.0    -200.0      10.1          983.0   -200.0  
3232     2.4       1250.0    -200.0      15.5         1166.0    186.0  
5704     3.7       1341.0    -200.0      21.0         1323.0    499.0  
9129     1.2       1167.0    -200.0       6.4          829.0    202.0  

      PT08.S3(NOx)  NO2(GT)  PT08.S4(NO2)  PT08.S5(O3)  
 774        1474.0     58.0        1094.0        366.0  
4625         784.0  

In [35]:
train_obj.y

         T    RH      AH  
 774  13.7  38.3  0.5957  
4625  30.0  28.5  1.1883  
6214  13.7  66.8  1.0432  
6465  12.2  69.6   0.986  
2362  18.8  58.6  1.2581  
 ...   ...   ...     ...  
4783  15.5  52.2  0.9136  
5208  14.8  56.3  0.9459  
3232  34.2  28.5  1.5071  
5704  23.7  49.1   1.423  
9129  16.1  71.3  1.2955  

CaitsArray with shape (7576, 3)

### Unify

In [36]:
tmp1_X = datasetArrayObj.X.iloc[:, [0,3]]
tmp1_y = datasetArrayObj.y.iloc[:, [0]]
tmp1 = DatasetArray(tmp1_X, tmp1_y)

tmp2_X = datasetArrayObj.X.iloc[:, [1,2]]
tmp2_y = datasetArrayObj.y.iloc[:, [1]]
tmp2 = DatasetArray(tmp2_X, tmp2_y)

tmp3_X = datasetArrayObj.X.iloc[:, [8, 9]]
tmp3_y = datasetArrayObj.y.iloc[:, [2]]
tmp3 = DatasetArray(tmp3_X, tmp3_y)


tmp_unified = tmp1.unify([tmp2, tmp3], axis=1)
tmp_unified, tmp_unified.X.shape, tmp_unified.y.shape



(DatasetArray object with 9471 instances., (9471, 6), (9471, 1))

## Adding two DatasetList objects

In this section we check the addition of two `DatasetArray` objects. This is equivalent to:

`obj1.unify([obj2], axis=0)`

This way, the `obj2` is appended to the `obj1`, row-wise.


In [37]:
newDatasetArrayObj = train_obj + test_obj
newDatasetArrayObj

DatasetArray object with 9471 instances.

In [38]:
newDatasetArrayObj.X

                CO(GT)         PT08.S1(CO)             NMHC(GT)            C6H6(GT)      PT08.S2(NMHC)            NOx(GT)  \
   0               0.7               840.0                 31.0                 1.6              556.0               41.0  
   1               2.1              1083.0               -200.0                10.2              984.0              251.0  
   2               3.4              1374.0               -200.0                17.4             1222.0              602.0  
   3               1.0               884.0               -200.0                 3.0              651.0             -200.0  
   4            -200.0               804.0               -200.0                 3.3              668.0               32.0  
 ...               ...                 ...                  ...                 ...                ...                ...  
9466  -34.207523778989  1048.9900609169606  -159.09009297851875  1.8656834455487867  894.5952762637597  168.6169712514695  
9467  -

In [39]:
newDatasetArrayObj.y

                      T                 RH                  AH  
   0               13.7               38.3              0.5957  
   1               30.0               28.5              1.1883  
   2               13.7               66.8              1.0432  
   3               12.2               69.6               0.986  
   4               18.8               58.6              1.2581  
 ...                ...                ...                 ...  
9466  9.778305012290264  39.48537992946458  -6.837603644330447  
9467  9.778305012290264  39.48537992946458  -6.837603644330447  
9468  9.778305012290264  39.48537992946458  -6.837603644330447  
9469  9.778305012290264  39.48537992946458  -6.837603644330447  
9470  9.778305012290264  39.48537992946458  -6.837603644330447  

CaitsArray with shape (9471, 3)

## Apply method

In this subsection we test applying a method on a `DatasetArray` object.

When `DatasetArray.apply` is called, the callable method is applied to the `DatasetArray.X.values`.

We test `DatasetArray.apply` using `caits.fe.filter_butterworth`.

In [40]:
tmp_apply = datasetArrayObj.apply(filter_butterworth, fs=200, filter_type='lowpass', cutoff_freq=50)
tmp_apply

array([[ 2.60027232e+00,  1.36000482e+03,  1.50014594e+02, ...,
         1.12996021e+02,  1.69200532e+03,  1.26799358e+03],
       [-1.42059319e+00,  1.34921145e+03,  1.14210394e+02, ...,
         1.01132158e+02,  1.58281177e+03,  1.04186444e+03],
       [ 2.11795883e+00,  1.36625793e+03,  9.01143291e+01, ...,
         1.11934063e+02,  1.55452995e+03,  1.04610694e+03],
       ...,
       [-3.42075238e+01,  1.04899006e+03, -1.59090093e+02, ...,
         5.81488725e+01,  1.39147964e+03,  9.75072032e+02],
       [-3.42075238e+01,  1.04899006e+03, -1.59090093e+02, ...,
         5.81488725e+01,  1.39147964e+03,  9.75072032e+02],
       [-3.42075238e+01,  1.04899006e+03, -1.59090093e+02, ...,
         5.81488725e+01,  1.39147964e+03,  9.75072032e+02]])

## Flatten

In [41]:
data_flat = datasetArrayObj.flatten()
data_flat

DatasetArray object with 94710 instances.

In [42]:
data_flat.X

         0,CO(GT)                 2.6
    0,PT08.S1(CO)              1360.0
       0,NMHC(GT)               150.0
       0,C6H6(GT)                11.9
  0,PT08.S2(NMHC)              1046.0
              ...                 ...
     9470,NOx(GT)   168.6169712514695
9470,PT08.S3(NOx)   794.9901677888212
     9470,NO2(GT)   58.14887250187026
9470,PT08.S4(NO2)  1391.4796409105481
 9470,PT08.S5(O3)   975.0720316340708

CaitsArray with shape (94710,)

## Shuffling

In this subsection we test shuffling a `DatasetArray` object.

In [43]:
shuffled_dataset = datasetArrayObj.shuffle()

In [44]:
datasetArrayObj.X, datasetArrayObj.y

(                CO(GT)         PT08.S1(CO)             NMHC(GT)            C6H6(GT)      PT08.S2(NMHC)            NOx(GT)  \
    0               2.6              1360.0                150.0                11.9             1046.0              166.0  
    1               2.0              1292.0                112.0                 9.4              955.0              103.0  
    2               2.2              1402.0                 88.0                 9.0              939.0              131.0  
    3               2.2              1376.0                 80.0                 9.2              948.0              172.0  
    4               1.6              1272.0                 51.0                 6.5              836.0              131.0  
  ...               ...                 ...                  ...                 ...                ...                ...  
 9466  -34.207523778989  1048.9900609169606  -159.09009297851875  1.8656834455487867  894.5952762637597  168.6169712514695  

In [45]:
shuffled_dataset.X, shuffled_dataset.y

(      CO(GT)  PT08.S1(CO)  NMHC(GT)  C6H6(GT)  PT08.S2(NMHC)  NOx(GT)  \
 8074     1.4       -200.0    -200.0    -200.0         -200.0     95.0  
 6837     2.2        838.0    -200.0       3.8          699.0    152.0  
 5159     4.8       1435.0    -200.0      25.0         1429.0    479.0  
 8970     2.2       1242.0    -200.0       8.9          935.0    297.0  
 1012  -200.0       1105.0    -200.0      10.6         1000.0   -200.0  
  ...     ...          ...       ...       ...            ...      ...  
 7807     1.3        901.0    -200.0       2.2          599.0    147.0  
 8678     1.5       1022.0    -200.0       5.2          771.0    289.0  
 8983     1.9       1150.0    -200.0       5.9          807.0    276.0  
 4039     0.9        882.0    -200.0       5.0          763.0     38.0  
 2088     2.3       1124.0    -200.0      14.0         1118.0    134.0  
 
       PT08.S3(NOx)  NO2(GT)  PT08.S4(NO2)  PT08.S5(O3)  
 8074        -200.0     87.0        -200.0       -200.0  
 6837

## Conversions

In this subsection we test various conversion methods of the `DatasetArray` object.

### to_dict

This converts a `DatasetArray` object to a dictionary with keys "X", "y", where each value is the corresponding attribute of the `DatasetArray` object.

In [46]:
datasetArrayObj.to_dict()

{'X':                 CO(GT)         PT08.S1(CO)             NMHC(GT)            C6H6(GT)      PT08.S2(NMHC)            NOx(GT)  \
    0               2.6              1360.0                150.0                11.9             1046.0              166.0  
    1               2.0              1292.0                112.0                 9.4              955.0              103.0  
    2               2.2              1402.0                 88.0                 9.0              939.0              131.0  
    3               2.2              1376.0                 80.0                 9.2              948.0              172.0  
    4               1.6              1272.0                 51.0                 6.5              836.0              131.0  
  ...               ...                 ...                  ...                 ...                ...                ...  
 9466  -34.207523778989  1048.9900609169606  -159.09009297851875  1.8656834455487867  894.5952762637597  168.6169712514

In [47]:
vals = datasetArrayObj.X.values
vals

array([[   2.6       , 1360.        ,  150.        , ...,  113.        ,
        1692.        , 1268.        ],
       [   2.        , 1292.        ,  112.        , ...,   92.        ,
        1559.        ,  972.        ],
       [   2.2       , 1402.        ,   88.        , ...,  114.        ,
        1555.        , 1074.        ],
       ...,
       [ -34.20752378, 1048.99006092, -159.09009298, ...,   58.1488725 ,
        1391.47964091,  975.07203163],
       [ -34.20752378, 1048.99006092, -159.09009298, ...,   58.1488725 ,
        1391.47964091,  975.07203163],
       [ -34.20752378, 1048.99006092, -159.09009298, ...,   58.1488725 ,
        1391.47964091,  975.07203163]])

In [48]:
datasetArrayObj.y.values

array([[13.6       , 48.9       ,  0.7578    ],
       [13.3       , 47.7       ,  0.7255    ],
       [11.9       , 54.        ,  0.7502    ],
       ...,
       [ 9.77830501, 39.48537993, -6.83760364],
       [ 9.77830501, 39.48537993, -6.83760364],
       [ 9.77830501, 39.48537993, -6.83760364]])

## to_numpy

In [49]:
data_np = datasetArrayObj.to_numpy()
data_np

(array([[   2.6       , 1360.        ,  150.        , ...,  113.        ,
         1692.        , 1268.        ],
        [   2.        , 1292.        ,  112.        , ...,   92.        ,
         1559.        ,  972.        ],
        [   2.2       , 1402.        ,   88.        , ...,  114.        ,
         1555.        , 1074.        ],
        ...,
        [ -34.20752378, 1048.99006092, -159.09009298, ...,   58.1488725 ,
         1391.47964091,  975.07203163],
        [ -34.20752378, 1048.99006092, -159.09009298, ...,   58.1488725 ,
         1391.47964091,  975.07203163],
        [ -34.20752378, 1048.99006092, -159.09009298, ...,   58.1488725 ,
         1391.47964091,  975.07203163]]),
 array([[13.6       , 48.9       ,  0.7578    ],
        [13.3       , 47.7       ,  0.7255    ],
        [11.9       , 54.        ,  0.7502    ],
        ...,
        [ 9.77830501, 39.48537993, -6.83760364],
        [ 9.77830501, 39.48537993, -6.83760364],
        [ 9.77830501, 39.48537993, -6.83760

In [50]:
data_np[0].shape

(9471, 10)

In [51]:
data_np_flat = datasetArrayObj.to_numpy(flatten=True)
data_np_flat

(array([   2.6       , 1360.        ,  150.        , ...,   58.1488725 ,
        1391.47964091,  975.07203163]),
 array([[13.6       , 48.9       ,  0.7578    ],
        [13.3       , 47.7       ,  0.7255    ],
        [11.9       , 54.        ,  0.7502    ],
        ...,
        [ 9.77830501, 39.48537993, -6.83760364],
        [ 9.77830501, 39.48537993, -6.83760364],
        [ 9.77830501, 39.48537993, -6.83760364]]))

In [52]:
data_np_flat[0].shape

(94710,)

## Numpy to dataset

In [53]:
tmp = datasetArrayObj.numpy_to_dataset(
    datasetArrayObj.X.values,
    axis_names={
        "axis_1": {name: i for i, name in enumerate(datasetArrayObj.X.axis_names["axis_1"])},
    }
)
tmp

DatasetArray object with 9471 instances.

In [54]:
tmp.X

                CO(GT)         PT08.S1(CO)             NMHC(GT)            C6H6(GT)      PT08.S2(NMHC)            NOx(GT)  \
   0               2.6              1360.0                150.0                11.9             1046.0              166.0  
   1               2.0              1292.0                112.0                 9.4              955.0              103.0  
   2               2.2              1402.0                 88.0                 9.0              939.0              131.0  
   3               2.2              1376.0                 80.0                 9.2              948.0              172.0  
   4               1.6              1272.0                 51.0                 6.5              836.0              131.0  
 ...               ...                 ...                  ...                 ...                ...                ...  
9466  -34.207523778989  1048.9900609169606  -159.09009297851875  1.8656834455487867  894.5952762637597  168.6169712514695  
9467  -

In [55]:
tmp.y

                      T                 RH                  AH  
   0               13.6               48.9              0.7578  
   1               13.3               47.7              0.7255  
   2               11.9               54.0              0.7502  
   3               11.0               60.0              0.7867  
   4               11.2               59.6              0.7888  
 ...                ...                ...                 ...  
9466  9.778305012290264  39.48537992946458  -6.837603644330447  
9467  9.778305012290264  39.48537992946458  -6.837603644330447  
9468  9.778305012290264  39.48537992946458  -6.837603644330447  
9469  9.778305012290264  39.48537992946458  -6.837603644330447  
9470  9.778305012290264  39.48537992946458  -6.837603644330447  

CaitsArray with shape (9471, 3)