Merge pull request #355 from hkchekc/df_compat

Dataframe compatibility test
G-Node · Dec 13, 2018 · b2064da · b2064da
2 parents 864cbc3 + cb1cb9c
commit b2064da
Show file tree

Hide file tree

Showing 7 changed files with 177 additions and 28 deletions.
diff --git a/nixio/block.py b/nixio/block.py
@@ -15,6 +15,7 @@
 from inspect import isclass
 from six import string_types
 from collections import OrderedDict  # using it for python2.7
+import sys
 
 from .util import find as finders
 from .compression import Compression
@@ -190,7 +191,15 @@ def create_data_array(self, name, array_type, dtype=None, shape=None,
         return da
 
     def create_data_frame(self, name, type_, col_dict=None, col_names=None,
-                          col_dtypes=None, data=None, compression=Compression.No):
+                          col_dtypes=None, data=None,
+                          compression=Compression.No):
+
+        if isinstance(col_dict, dict) and not \
+                isinstance(col_dict, OrderedDict) and sys.version_info[0] < 3:
+            raise TypeError("Python 2 users should use name_list "
+                      "or OrderedDict created with LIST and TUPLES"
+                      " to create DataFrames as the order "
+                      "of the columns cannot be maintained in Py2")
 
         if data is not None:
             shape = len(data)
@@ -201,14 +210,17 @@ def create_data_frame(self, name, type_, col_dict=None, col_names=None,
         if col_dict is None:
             if col_names is not None:
                 if col_dtypes is not None:
-                    col_dict = OrderedDict((str(nam), dt) for nam, dt in zip(col_names, col_dtypes))
+                    col_dict = OrderedDict((str(nam), dt)
+                                    for nam, dt in zip(col_names, col_dtypes))
                 elif col_dtypes is None and data is not None:
                     col_dtypes = []
                     for x in data[0]:
                         col_dtypes.append(type(x))
-                    col_dict = OrderedDict((str(nam), dt) for nam, dt in zip(col_names, col_dtypes))
+                    col_dict = OrderedDict((str(nam), dt)
+                                    for nam, dt in zip(col_names, col_dtypes))
                 else:  # col_dtypes is None and data is None
-                    raise (ValueError, "The data type of each column have to be specified")
+                    raise (ValueError,
+                           "The data type of each column have to be specified")
             else:  # if col_names is None
                 if data is not None and type(data[0]) == np.void:
                     col_dtype = data[0].dtype
@@ -220,12 +232,15 @@ def create_data_frame(self, name, type_, col_dict=None, col_names=None,
                             raw_dt_list = [ele[0] for ele in raw_dt]
                             col_dict = OrderedDict(zip(cn, raw_dt_list))
 
-                else:  # data is None or type(data[0]) != np.void /data_type doesnt matter
-                    raise (ValueError, "No information about column names is provided!")
+                else:
+            # data is None or type(data[0]) != np.void /data_type doesnt matter
+                    raise (ValueError,
+                           "No information about column names is provided!")
 
         if col_dict is not None:
             for nam, dt in col_dict.items():
-                if isclass(dt) and any(issubclass(dt, st) for st in string_types):
+                if isclass(dt) and any(issubclass(dt, st)
+                                       for st in string_types):
                     col_dict[nam] = util.vlen_str_dtype
             dt_arr = list(col_dict.items())
             col_dtype = np.dtype(dt_arr)

diff --git a/nixio/data_frame.py b/nixio/data_frame.py
@@ -230,4 +230,3 @@ def df_shape(self):
         df_shape = tuple(df_shape)
         self._h5group.set_attr("df_shape", df_shape)
         return self._h5group.get_attr("df_shape")
-
diff --git a/nixio/test/test_data_frame.py b/nixio/test/test_data_frame.py
@@ -5,6 +5,7 @@
 import numpy as np
 from six import string_types
 from collections import OrderedDict
+import sys
 
 
 class TestDataFrame(unittest.TestCase):
@@ -15,24 +16,34 @@ def setUp(self):
         self.testfilename = os.path.join(self.tmpdir.path, "dataframetest.nix")
         self.file = nix.File.open(self.testfilename, nix.FileMode.Overwrite)
         self.block = self.file.create_block("test block", "recordingsession")
-        di = OrderedDict([('name', np.int64), ('id', str), ('time', float), ('sig1', np.float64), ('sig2', np.int32)])
-        arr = [(1, "a", 20.18, 5.0, 100), (2, 'b', 20.09, 5.5, 101), (2, 'c', 20.05, 5.1, 100)
-            ,(1, "d", 20.15, 5.3, 150), (2, 'e', 20.23, 5.7, 200), (2, 'f', 20.07, 5.2, 300), (1, "g", 20.12, 5.1, 39),
-               (1, "h", 20.27, 5.1, 600), (2, 'i', 20.15, 5.6, 400), (2, 'j', 20.08, 5.1, 200)]
+        di = OrderedDict([('name', np.int64), ('id', str), ('time', float),
+                                    ('sig1', np.float64), ('sig2', np.int32)])
+        arr = [(1, "a", 20.18, 5.0, 100), (2, 'b', 20.09, 5.5, 101),
+               (2, 'c', 20.05, 5.1, 100), (1, "d", 20.15, 5.3, 150),
+               (2, 'e', 20.23, 5.7, 200), (2, 'f', 20.07, 5.2, 300),
+               (1, "g", 20.12, 5.1, 39), (1, "h", 20.27, 5.1, 600),
+               (2, 'i', 20.15, 5.6, 400), (2, 'j', 20.08, 5.1, 200)]
         other_arr = np.arange(11101, 11200).reshape((33, 3))
         other_di = OrderedDict({'name': np.int64, 'id': int, 'time': float})
         self.df1 = self.block.create_data_frame("test df", "signal1",
                                                 data=arr, col_dict=di)
         self.df2 = self.block.create_data_frame("other df", "signal2",
                                                 data=arr, col_dict=di)
         self.df3 = self.block.create_data_frame("reference df", "signal3",
-                                                data=other_arr, col_dict=other_di)
+                                            data=other_arr, col_dict=other_di)
         self.dtype = self.df1._h5group.group["data"].dtype
 
     def tearDown(self):
         self.file.close()
         self.tmpdir.cleanup()
 
+    def create_with_list(self):
+        """Create a data frame from parallel column-name and dtype lists.
+
+        Helper rather than a test case: without the ``test`` prefix,
+        unittest does not collect it. Returns the frame built from a
+        333x3 integer array on the block opened in ``setUp``.
+        """
+        arr = np.arange(999).reshape((333, 3))
+        namelist = np.array(['name', 'id', 'time'])
+        dtlist = np.array([int, str, float])
+        # setUp stores the block as ``self.block``; ``self.blk`` is never set.
+        return self.block.create_data_frame('test1', 'for_test',
+                                            col_names=namelist,
+                                            col_dtypes=dtlist, data=arr)
+
     def test_data_frame_eq(self):
         assert self.df1 == self.df1
         assert not self.df1 == self.df2
@@ -41,11 +52,12 @@ def test_data_frame_eq(self):
         assert self.df2 is not None
 
     def test_create_with_list(self):
-        arr = [(1, 'a', 20.18, 5.1, 100), (2, 'b', 20.09, 5.5, 101), (2, 'c', 20.05, 5.1, 100)]
+        arr = [(1, 'a', 20.18, 5.1, 100), (2, 'b', 20.09, 5.5, 101),
+               (2, 'c', 20.05, 5.1, 100)]
         namelist = np.array(['name', 'id', 'time', 'sig1', 'sig2'])
         dtlist = np.array([np.int64, str, float, np.float64, np.int32])
-        df_li = self.block.create_data_frame("test_list", "make_of_list", data=arr,
-                                             col_names=namelist, col_dtypes=dtlist)
+        df_li = self.block.create_data_frame("test_list", "make_of_list",
+                               data=arr, col_names=namelist, col_dtypes=dtlist)
         assert df_li.column_names == self.df1.column_names
         assert df_li.dtype == self.df1.dtype
         for i in df_li[:]:
@@ -60,15 +72,15 @@ def test_data_frame_type(self):
     def test_write_row(self):
         # test write single row
         row = ["1", 'abc', 3, 4.4556356242341, 5.1111111]
-        self.assertAlmostEqual(list(self.df1[9]),
-                               [2, 'j', 20.08, 5.1, 200])
+        self.assertAlmostEqual(list(self.df1[9]), [2, 'j', 20.08, 5.1, 200])
         self.df1.write_rows([row], [9])
         assert list(self.df1[9]) == [1, 'abc', 3., 4.4556356242341, 5]
         self.assertIsInstance(self.df1[9]['name'],  np.integer)
         self.assertIsInstance(self.df1[9]['sig2'],  np.int32)
         assert self.df1[9]['sig2'] == int(5)
         # test write multiple rows
-        multi_rows = [[1775, '12355', 1777, 1778, 1779], [1785, '12355', 1787, 1788, 1789]]
+        multi_rows = [[1775, '12355', 1777, 1778, 1779],
+                      [1785, '12355', 1787, 1788, 1789]]
         self.df1.write_rows(multi_rows, [1, 2])
         assert list(self.df1[1]) == [1775, '12355', 1777, 1778, 1779]
         assert list(self.df1[2]) == [1785, '12355', 1787, 1788, 1789]
@@ -113,7 +125,8 @@ def test_read_cell(self):
         assert crcell == 'j'
         # test error raise if only one param given
         self.assertRaises(ValueError, lambda: self.df1.read_cell(row_idx=10))
-        self.assertRaises(ValueError, lambda: self.df1.read_cell(col_name='sig1'))
+        self.assertRaises(ValueError,
+                          lambda: self.df1.read_cell(col_name='sig1'))
 
     def test_write_cell(self):
         # write cell by position
@@ -123,21 +136,25 @@ def test_write_cell(self):
         self.df1.write_cell('test', col_name='id', row_idx=3)
         assert self.df1[3]['id'] == 'test'
         # test error raise
-        self.assertRaises(ValueError, lambda: self.df1.write_cell(11, col_name='sig1'))
+        self.assertRaises(ValueError,
+                          lambda: self.df1.write_cell(11, col_name='sig1'))
 
     def test_append_column(self):
         y = np.arange(start=16000, stop=16010, step=1)
         self.df1.append_column(y, name='trial_col', datatype=int)
-        assert self.df1.column_names == ('name', 'id', 'time', 'sig1', 'sig2', 'trial_col')
+        assert self.df1.column_names == \
+                            ('name', 'id', 'time', 'sig1', 'sig2', 'trial_col')
         assert len(self.df1.dtype) == 6
         k = np.array(self.df1[0:10]["trial_col"], dtype=np.int64)
         np.testing.assert_almost_equal(k, y)
         # too short coulmn
         sh_col = np.arange(start=16000, stop=16100, step=1)
-        self.assertRaises(ValueError, lambda: self.df1.append_column(sh_col, name='sh_col'))
+        self.assertRaises(ValueError, lambda:
+                        self.df1.append_column(sh_col, name='sh_col'))
         # too long column
         long = np.arange(start=16000, stop=16500, step=1)
-        self.assertRaises(ValueError, lambda: self.df1.append_column(long, name='long'))
+        self.assertRaises(ValueError, lambda:
+                        self.df1.append_column(long, name='long'))
 
     def test_append_rows(self):
         # append single row
@@ -147,25 +164,30 @@ def test_append_rows(self):
         # append multi-rows
         mrows = [[1, '2', 3, 4, 5], [6, 'testing', 8, 9, 10]]
         self.df1.append_rows(mrows)
-        assert [list(i) for i in self.df1[-2:]] == [[1, '2', 3., 4., 5], [6, 'testing', 8., 9., 10]]
+        assert [list(i) for i in self.df1[-2:]] == \
+               [[1, '2', 3., 4., 5], [6, 'testing', 8., 9., 10]]
         # append row with incorrect length
         errrow = [5, 6, 7, 8]
         self.assertRaises(ValueError, lambda: self.df1.append_rows([errrow]))
 
     def test_unit(self):
         assert self.df1.units is None
         self.df1.units = ["s", 'A', 'ms', 'Hz', 'mA']
-        np.testing.assert_array_equal(self.df1.units,  np.array(["s", 'A', 'ms', 'Hz', 'mA']))
+        np.testing.assert_array_equal(self.df1.units,
+                                      np.array(["s", 'A', 'ms', 'Hz', 'mA']))
         assert self.df2.units is None
 
     def test_df_shape(self):
         assert tuple(self.df1.df_shape) == (10, 5)
         # create df with incorrect dimension to see if Error is raised
         arr = np.arange(1000).reshape(10, 10, 10)
-        self.assertRaises(ValueError, lambda:
-                          self.block.create_data_frame('err', 'err', {'name': np.int64}, data=arr))
+        if sys.version_info[0] == 3:
+            self.assertRaises(ValueError,
+                          lambda: self.block.create_data_frame('err', 'err',
+                                                {'name': np.int64}, data=arr))
 
     def test_data_type(self):
         assert self.df1.dtype[4] == np.int32
         assert self.df1.dtype[0] != self.df1.dtype[4]
         assert self.df1.dtype[2] == self.df1.dtype[3]
+
diff --git a/nixio/test/test_nix_compatibility.py b/nixio/test/test_nix_compatibility.py
@@ -16,6 +16,7 @@
 import numpy as np
 import tempfile
 import pytest
+from collections import OrderedDict
 
 import nixio as nix
 from .xcompat.compile import maketests
@@ -125,6 +126,30 @@ def _test_data_arrays(tmpdir):
     # validate(nixfilepath)
 
 
+def _test_data_frames(tmpdir):
+    """Write seven data frames into ``frametest.nix`` under ``tmpdir``.
+
+    The leading underscore keeps pytest's default collection from picking
+    this function up.
+    """
+    nixfilepath = os.path.join(str(tmpdir), "frametest.nix")
+    nix_file = nix.File.open(nixfilepath, mode=nix.FileMode.Overwrite)
+    print(nixfilepath, nix_file)
+    blk = nix_file.create_block("testblock", "blocktype")
+    blk.create_group("testgroup", "grouptype")
+
+    for idx in range(7):
+        # Ordered pairs instead of a plain dict: create_data_frame raises
+        # TypeError for plain dicts on Python 2, where column order would
+        # not be preserved. A fresh mapping is built on every pass because
+        # create_data_frame replaces string column types in the mapping.
+        di = OrderedDict([('name', int), ('id', str), ('time', float)])
+        arr = np.arange(999).reshape((333, 3))
+        df = blk.create_data_frame("df_" + str(idx), "dataframe", col_dict=di,
+                                   data=arr)
+        df.definition = "da definition " + str(idx)
+        df.force_created_at(np.random.randint(1000000000))
+        df.label = "data label " + str(idx)
+
+    nix_file.close()
+    # validate(nixfilepath)
+
+
 def _test_tags(tmpdir):
     nixfilepath = os.path.join(str(tmpdir), "tagtest.nix")
     nix_file = nix.File.open(nixfilepath, mode=nix.FileMode.Overwrite)
@@ -480,6 +505,13 @@ def test_full_file(tmpdir):
     group = block.groups[0]
     group.data_arrays.append(da)
 
+    df = block.create_data_frame("adataframe", "4-column df",
+                                 col_dict=OrderedDict([('name', str), ('id',
+                                   int), ('time', float), ('Adjusted', bool)]),
+                                 data=[["Bob", 9, 11.28, False],
+                                       ["Jane", 10, 14.37, True]])
+    df.append_rows([["Alice", 2, 3.7, False]])
+
     featda = block.create_data_array("feat-da", "tag-feature",
                                      data=[0.4, 0.41, 0.49, 0.1, 0.1, 0.1])
 
@@ -709,6 +741,21 @@ def test_full_file_read(tmpdir):
     compare(nix.DimensionType.Set, dim.dimension_type)
     compare(["a", "b"], dim.labels)
 
+    # Data Frame
+    df = block.data_frames[0]
+    compare("table", df.name)
+    compare("filing", df.type)
+    dt = (nix.util.util.vlen_str_dtype,
+          nix.DataType.Double, nix.DataType.Int64, nix.DataType.Bool)
+    compare(dt, df.dtype)
+    col_name = ("str", "Double", "int64", "bool")
+    compare(col_name, df.column_names)
+    combine_dt = np.dtype([(n, dty) for n, dty in zip(col_name, dt)])
+    arr = np.array([(b"exp1", 42.1, 10, False),
+                    (b"exp2", 30.2, 4, True)], dtype=combine_dt)
+    compare(arr, df[:])
+    # could not test shape because it will write data
+
     # Tag
     tag = block.tags[0]
     compare("tagu", tag.name)

diff --git a/nixio/test/xcompat/readdataframes.cpp b/nixio/test/xcompat/readdataframes.cpp
@@ -0,0 +1,22 @@
+// Cross-compatibility reader: opens the NIX file named on the command line
+// read-only and compares the name, type and definition of every block in it
+// against generated expectations. The process exit status is the sum of the
+// values returned by compare() (presumably one per mismatch).
+//
+// NOTE(review): despite the file name, no data frame is inspected here yet.
+// The expected block names ("test_block<N>") and definitions
+// ("definition data_frame <N>") should be confirmed against whichever
+// writer produces the input file.
+#include "testutil.hpp"
+#include <nix.hpp>
+
+// Included directly rather than relying on the headers above to pull in
+// std::cerr and std::string.
+#include <iostream>
+#include <string>
+
+int main(int argc, char* argv[]) {
+    if (argc != 2) {
+        std::cerr << "Please specify a nix file (and nothing else)" << std::endl;
+        return 1;
+    }
+    const std::string fname = argv[1];
+    nix::File nf = nix::File::open(fname, nix::FileMode::ReadOnly);
+
+    int idx = 0;
+    int errcount = 0;
+    for (const auto &block : nf.blocks()) {
+        // Both expectations share the running block index as suffix.
+        const std::string suffix = nix::util::numToStr(idx++);
+        const std::string expname = "test_block" + suffix;
+        const std::string expdef = "definition data_frame " + suffix;
+        errcount += compare(expname, block.name());
+        errcount += compare("blocktype", block.type());
+        errcount += compare(expdef, block.definition());
+    }
+    return errcount;
+}
diff --git a/nixio/test/xcompat/readfullfile.cpp b/nixio/test/xcompat/readfullfile.cpp
@@ -132,6 +132,32 @@ int main(int argc, char* argv[]) {
     setdim = dim;
     errcount += compare({"a", "b"}, setdim.labels());
 
+    // DataFrame
+    auto df = block.getDataFrame(0);
+    size_t n = 3;
+    std::vector<float_t> dou_out(n);
+    std::vector<std::string> str_out(n);
+    errcount += compare("adataframe", df.name());
+    errcount += compare("4-column df", df.type());
+    errcount += compare(nix::ndsize_t{n}, df.rows());
+    errcount += compare(df.readRow(nix::ndsize_t{0}), {nix::Variant("Bob"),
+                                nix::Variant(int64_t(9)), nix::Variant(11.28), nix::Variant(false)});
+    df.readColumn(0, str_out);
+    errcount += compare(str_out, {"Bob", "Jane", "Alice"});
+    df.readColumn(2, dou_out);
+    errcount += compare(dou_out, {11.28, 14.37, 3.7});
+    nix::ndsize_t rown = 1;
+    std::vector<nix::Cell> cells = df.readCells(rown, {"name", "Adjusted"});
+    std::vector<nix::Cell> def_cells = {
+    {"name", nix::Variant{"Jane"}},
+    {"Adjusted", nix::Variant{true}}
+    };
+    errcount += compare(cells, def_cells);
+    std::vector<nix::Column> cols = df.columns();
+    for(size_t i = 0; i <  cols.size(); i++){
+        errcount += compare("", cols[i].unit);
+    }
+
     // Tag
     auto tag = block.getTag(0);
     errcount += compare("tagu", tag.name());

diff --git a/nixio/test/xcompat/writefullfile.cpp b/nixio/test/xcompat/writefullfile.cpp
@@ -66,6 +66,22 @@ int main(int argc, char* argv[]) {
     group = block.getGroup(0);
     group.addDataArray(da);
 
+    // Data Frame
+    std::vector<nix::Column> cols  = {{"str", "", nix::DataType::String}
+                ,{"Double", "A", nix::DataType::Double},
+                {"int64", "ms", nix::DataType::Int64},
+                {"bool", "", nix::DataType::Bool}};
+    auto df = block.createDataFrame("table", "filing", cols);
+    std::vector<nix::Variant> vals = {nix::Variant("exp1"),
+                                       nix::Variant(42.1), nix::Variant(10), nix::Variant(false)};
+    df.rows(1);
+    df.writeRow(0, vals);
+    group.addDataFrame(df);
+    df.rows(2);
+    vals = {nix::Variant("exp2"),
+                        nix::Variant(30.2), nix::Variant(4), nix::Variant(true)};
+    df.writeRow(1, vals);
+
     datadbl = {0.4, 0.41, 0.49, 0.1, 0.1, 0.1};
     auto featda = block.createDataArray("feat-da", "tag-feature", nix::DataType::Double, nix::NDSize{6});
     featda.setData(nix::DataType::Double, datadbl.data(), nix::NDSize{6}, nix::NDSize{0});
@@ -77,6 +93,8 @@ int main(int argc, char* argv[]) {
     group.addTag(tag);
     tag.createFeature(featda, nix::LinkType::Untagged);
 
+
+
     auto mtag = block.createMultiTag("mtagu", "multi tagging", block.createDataArray("tag-data", "multi-tagger", nix::DataType::Double, nix::NDSize{1, 3}));
     datadbl = {0, 0.1, 10.1};
     // MultiTag positions array