In [1]:
import h5py
import numpy as np

In [2]:
class MyDataSet:
    def __init__(self, data, faceId, label, normal):
        assert(data.shape[0] == faceId.shape[0] == label.shape[0] == normal.shape[0])
        self.contents = {
            'data': data,
            'faceId': faceId,
            'label': label,
            'normal': normal
        }
    def __len__(self):
        return self.contents['data'].shape[0]
    def __getitem__(self, args):
        if isinstance(args, str):
            return self.contents[args]
        if isinstance(args, list) or isinstance(args, slice) or isinstance(args, np.ndarray):
            return MyDataSet(
                self.contents['data'][args],
                self.contents['faceId'][args],
                self.contents['label'][args],
                self.contents['normal'][args]
            )
    def __add__(self, other):
        return MyDataSet.merge([self, other])
    def __repr__(self):
        string = f'<DataSet of {len(self)} records with {len(np.unique(self.contents["label"]))} labels>'
        return string
    
    def from_h5_file(file):
        with h5py.File(file, 'r') as f:
            args = f['data'][:], f['faceId'][:], f['label'][:], f['normal'][:]
        return MyDataSet(*args)
    def from_h5_files(files):
        return MyDataSet.merge([MyDataSet.from_h5_file(file) for file in files])
    def to_h5_file(self, file, max_size = None):
        if file[-3:] == '.h5':
            file = file[:-3]
        if not max_size or len(self) <= max_size:
            with h5py.File(file + '.h5', 'w') as f:
                f.create_dataset('data', data = self.contents['data'])
                f.create_dataset('faceId', data = self.contents['faceId'])
                f.create_dataset('label', data = self.contents['label'])
                f.create_dataset('normal', data = self.contents['normal'])
        else:
            i = 0
            while max_size * (i + 1) < len(self):
                with h5py.File(file + str(i) + '.h5', 'w') as f:
                    contents = self[i * max_size : (i + 1) * max_size]
                    f.create_dataset('data', data = contents['data'])
                    f.create_dataset('faceId', data = contents['faceId'])
                    f.create_dataset('label', data = contents['label'])
                    f.create_dataset('normal', data = contents['normal'])
                i = i + 1
            with h5py.File(file + str(i) + '.h5', 'w') as f:
                contents = self[i * max_size :]
                f.create_dataset('data', data = contents['data'])
                f.create_dataset('faceId', data = contents['faceId'])
                f.create_dataset('label', data = contents['label'])
                f.create_dataset('normal', data = contents['normal'])
    
    def count(self, sort = None):
        cnt = zip(*np.unique(self.contents['label'], return_counts = True))
        if not sort:
            return sorted(cnt, key = lambda x: x[0])
        if sort == 'asc':
            return sorted(cnt, key = lambda x: x[1])
        elif sort == 'desc':
            return sorted(cnt, key = lambda x: -x[1])
        raise Exception('sort argument should be asc or desc')
    def summary(self):
        print(f'Num of records: {len(self)}\t\tNum of unique labels: {len(np.unique(self.contents["label"]))}')
        print(f'Unique labels and their frequencies:')
        print(*self.count(sort = 'desc'))
        
    def merge(datasets):
        arg_list = [[ds.contents[w] for ds in datasets] for w in ['data','faceId','label','normal']]
        args = [np.concatenate(arg) for arg in arg_list]
        return MyDataSet(*args)
    def duplicate(self):
        return MyDataSet(*[v.copy() for v in self.contents.values()])
    def filter(self, label):
        return self[np.isin(self.contents['label'], label).reshape(-1)]
    def relabel(self, rename_dict):
        # {old: new}
        new_label = self.contents['label'].copy()
        for old, new in rename_dict.items():
            new_label[self.contents['label'] == old] = new
        return MyDataSet(
            self.contents['data'], self.contents['faceId'], new_label, self.contents['normal']
        )
    def remove(self, args):
        if isinstance(args[0][0], bool):
            return self[[not p for p in args[0]]]
        return self[[not p for p in np.isin(range(len(self)), args)]]
    def shuffle(self):
        return self[np.random.choice(range(len(self)), len(self))]
    def split(self, n_out = None, p_out = None, label = False):
        if not label:
            if not n_out:
                n_out = int(np.round(p_out * len(self)))
            out = np.random.choice(range(len(self)), n_out)
            return self.remove([out]), self[out]
        else:
            d_in, d_out = zip(*[self.filter(label).split(n_out, p_out, False) for label, num in self.count()])
            return MyDataSet.merge(d_in).shuffle(), MyDataSet.merge(d_out).shuffle()

In [3]:
# Read in a single dataset
DS0 = MyDataSet.from_h5_file('ply_data_train0.h5')
DS0

<DataSet of 2048 records with 40 labels>

In [4]:
# Summary of this dataset
DS0.summary()

Num of records: 2048		Num of unique labels: 40
Unique labels and their frequencies:
(8, 191) (0, 138) (30, 138) (22, 110) (4, 106) (37, 99) (2, 94) (33, 77) (35, 69) (36, 64) (5, 62) (26, 61) (21, 55) (7, 44) (23, 44) (14, 42) (3, 41) (25, 41) (31, 41) (12, 40) (16, 36) (15, 35) (17, 33) (11, 32) (18, 32) (20, 29) (9, 28) (34, 28) (19, 27) (28, 24) (38, 24) (27, 23) (29, 23) (32, 23) (24, 21) (1, 19) (13, 16) (39, 16) (6, 14) (10, 8)


In [5]:
# Read in another dataset
DS1 = MyDataSet.from_h5_file('ply_data_train1.h5')
DS1

<DataSet of 2048 records with 40 labels>

In [6]:
# You can feel free to combine datasets
DS_com1 = DS0 + DS1
DS_com1

<DataSet of 4096 records with 40 labels>

In [7]:
# Another way to combine datasets
DS_com2 = MyDataSet.merge([DS0, DS1])
DS_com2

<DataSet of 4096 records with 40 labels>

In [8]:
# You can even combine the datasets from reading files
DS_full = MyDataSet.from_h5_files([f'ply_data_train{i}.h5' for i in range(5)])
DS_full

<DataSet of 9840 records with 40 labels>

In [9]:
# Let's see how the full dataset looks like
DS_full.summary()

Num of records: 9840		Num of unique labels: 40
Unique labels and their frequencies:
(8, 889) (30, 680) (0, 625) (4, 572) (2, 515) (37, 475) (22, 465) (33, 392) (35, 344) (5, 335) (21, 284) (36, 267) (26, 239) (25, 231) (12, 200) (14, 200) (23, 200) (7, 197) (3, 173) (16, 171) (9, 167) (34, 163) (17, 155) (15, 149) (20, 149) (18, 145) (11, 137) (29, 128) (19, 124) (31, 124) (28, 115) (13, 109) (1, 106) (27, 104) (39, 103) (32, 90) (24, 88) (38, 87) (10, 79) (6, 64)


In [10]:
# You can obtain labels or data using bracket []
DS_full['label']

array([[30],
       [27],
       [30],
       ...,
       [35],
       [ 7],
       [ 8]], dtype=uint8)

In [11]:
# You can also get observations using bracket and index
DS_full[[1,4,8,10,13,15,19]]

<DataSet of 7 records with 7 labels>

In [12]:
# Or, with bracket and boolean values
DS_full[[i % 2 == 0 for i in range(len(DS_full))]]

<DataSet of 4920 records with 40 labels>

In [13]:
# If you want to drop out some observations, you can do this.
# Don't be afraid, this is done out of place.
# That means, it will generate a new dataset with observations removed,
# not affecting the old dataset
DS_full.remove([[1,4,8,10,13,15,19]])

<DataSet of 9833 records with 40 labels>

In [14]:
# Boolean values also work
DS_full.remove([[i % 2 == 0 for i in range(len(DS_full))]])

<DataSet of 4920 records with 40 labels>

In [15]:
# If you are really afraid of using remove, you can save the old one in advance.
DS_backup = DS_full.duplicate()

In [16]:
# To filter out only observations with labels 0 and 1, you can do:
DS_L01 = DS_full.filter([0, 1])
DS_L01

<DataSet of 731 records with 2 labels>

In [17]:
# To relabel, just use a dictionary with {old: new} as parameter
# Let's swap label 0 and label 1
# Before swapping,
DS_full.count()

[(0, 625),
 (1, 106),
 (2, 515),
 (3, 173),
 (4, 572),
 (5, 335),
 (6, 64),
 (7, 197),
 (8, 889),
 (9, 167),
 (10, 79),
 (11, 137),
 (12, 200),
 (13, 109),
 (14, 200),
 (15, 149),
 (16, 171),
 (17, 155),
 (18, 145),
 (19, 124),
 (20, 149),
 (21, 284),
 (22, 465),
 (23, 200),
 (24, 88),
 (25, 231),
 (26, 239),
 (27, 104),
 (28, 115),
 (29, 128),
 (30, 680),
 (31, 124),
 (32, 90),
 (33, 392),
 (34, 163),
 (35, 344),
 (36, 267),
 (37, 475),
 (38, 87),
 (39, 103)]

In [18]:
# After swapping
DS_full.relabel({0: 1, 1: 0}).count()

[(0, 106),
 (1, 625),
 (2, 515),
 (3, 173),
 (4, 572),
 (5, 335),
 (6, 64),
 (7, 197),
 (8, 889),
 (9, 167),
 (10, 79),
 (11, 137),
 (12, 200),
 (13, 109),
 (14, 200),
 (15, 149),
 (16, 171),
 (17, 155),
 (18, 145),
 (19, 124),
 (20, 149),
 (21, 284),
 (22, 465),
 (23, 200),
 (24, 88),
 (25, 231),
 (26, 239),
 (27, 104),
 (28, 115),
 (29, 128),
 (30, 680),
 (31, 124),
 (32, 90),
 (33, 392),
 (34, 163),
 (35, 344),
 (36, 267),
 (37, 475),
 (38, 87),
 (39, 103)]

In [19]:
# This modification does not change the original dataset.
DS_full.count()

[(0, 625),
 (1, 106),
 (2, 515),
 (3, 173),
 (4, 572),
 (5, 335),
 (6, 64),
 (7, 197),
 (8, 889),
 (9, 167),
 (10, 79),
 (11, 137),
 (12, 200),
 (13, 109),
 (14, 200),
 (15, 149),
 (16, 171),
 (17, 155),
 (18, 145),
 (19, 124),
 (20, 149),
 (21, 284),
 (22, 465),
 (23, 200),
 (24, 88),
 (25, 231),
 (26, 239),
 (27, 104),
 (28, 115),
 (29, 128),
 (30, 680),
 (31, 124),
 (32, 90),
 (33, 392),
 (34, 163),
 (35, 344),
 (36, 267),
 (37, 475),
 (38, 87),
 (39, 103)]

In [20]:
# Shuffle is nice
DS_full = DS_full.shuffle()

In [21]:
# Let's make some split between training data and test data
# Sample 1000 out as test data
DS_full.split(n_out = 1000)

(<DataSet of 8893 records with 40 labels>,
 <DataSet of 1000 records with 40 labels>)

In [22]:
# Sample 10% out as test data
DS_full.split(p_out = 0.1)

(<DataSet of 8913 records with 40 labels>,
 <DataSet of 984 records with 40 labels>)

In [23]:
# Sample 100 out in each label as test data
DS_full.split(n_out = 100, label = True)

(<DataSet of 6793 records with 40 labels>,
 <DataSet of 4000 records with 40 labels>)

In [24]:
# Sample 10% out in each label as test data
DS_full.split(p_out = 0.1, label = True)

(<DataSet of 8899 records with 40 labels>,
 <DataSet of 986 records with 40 labels>)

In [25]:
# If you use n_out and p_out together, p.out will be ignored
DS_full.split(n_out = 100, p_out = 0.99)

(<DataSet of 9740 records with 40 labels>,
 <DataSet of 100 records with 34 labels>)

In [26]:
# Let's play with a small dataset with 1000 observations.
DS_play = DS_full.shuffle()[:1000]
DS_play

<DataSet of 1000 records with 40 labels>

In [27]:
# Save it back to h5 file
DS_play.to_h5_file('play_test.h5')

In [28]:
# You can also save back without the extension
DS_play.to_h5_file('play_test_without_extension')

In [29]:
# You can limit the maximum size of each file
DS_play.to_h5_file('play_max_size.h5', max_size = 300)