In [107]:
import h5py as h5
import numpy as np
import tensorflow as tf


# gauss noise
# 0.1 ~ 1 p.e., 150 ns, 4, 4, 15 m
apply_add_gauss = True
# normal
g_add_stds = [0.03, 0.005, 0.005, 0.005, 0.00003]
# bigger
#stds_gauss = [0.03, 0.005, 0.02, 0.02, 0.0007]
apply_mult_gauss = True
q_noise_fraction = 0.1

# limiting Q vals
set_up_Q_lim = True
up_Q_lim = 100

# mult noise
class gauss_mult_noise:
    def __init__(self, Q_mean_noise, n_fraction):
        self.Q_mean_noise = Q_mean_noise
        self.n_fraction = n_fraction
    def make_noise(self, Qs):
        noises = np.random.normal( scale=self.n_fraction, size=Qs.shape )
        Qs = Qs + (Qs+self.Q_mean_noise)*noises
        return Qs

# addative noise
def add_gauss(data, g_add_stds, ev_starts):
    g_add_stds = np.broadcast_to( g_add_stds, data.shape )
    noise = np.random.normal( scale=g_add_stds, size=data.shape )
    data += noise
    print('Test inside add_gauss')
    print('Data right after noise adding: ', data[0:20,1],'\n')
    print('Data shape: ',data.shape,'\n' )
    ev_idxs_local = ev_starts - ev_starts[0]
    print('Local start idxs: ',ev_idxs_local)
    print('first 2 reindexings: ')
    print(np.argsort( data[ev_idxs_local[0]:ev_idxs_local[1],1], axis=0)+ev_idxs_local[0])
    print(np.argsort( data[ev_idxs_local[1]:ev_idxs_local[2],1], axis=0)+ev_idxs_local[1])
    sort_idxs = np.concatenate(
            [ np.argsort( data[ev_idxs_local[i]:ev_idxs_local[i+1],1], axis=0)+ev_idxs_local[i] for i in range(len(ev_starts)-1) ]
    )
    print('Idxs after sorting: ', sort_idxs[0:20],'\n')
    data = data[sort_idxs]
    print('Data after reindexing: ', data[0:20,1],'\n')
    print('Data shape: ',data.shape,'\n' )
    return data

class generator:
    def __init__(self, file, regime, batch_size, 
                 set_up_Q_lim, up_Q_lim, 
                 apply_add_gauss, g_add_stds, apply_mult_gauss, q_noise_fraction):
        
        self.file = file
        self.regime = regime
        self.batch_size = batch_size
        
        self.hf = h5.File(self.file,'r')
        hf = self.hf

        self.ev_starts = hf[regime + '/ev_starts/data']
        self.num = len(self.ev_starts[1:] - self.ev_starts[0:-1])
        self.batch_num = self.num // self.batch_size
        self.gen_num = self.batch_num * self.batch_size
        
        self.norm_zeros = (0.-hf['norm_param/mean'][:])/hf['norm_param/std'][:]
        print(self.norm_zeros)

        #For noise
        self.set_up_Q_lim = set_up_Q_lim
        self.apply_add_gauss = apply_add_gauss
        self.apply_mult_gauss = apply_mult_gauss
        self.g_add_stds = g_add_stds
        if set_up_Q_lim:
            Q_mean = hf['norm_param/mean'][0]
            Q_std = hf['norm_param/std'][0]
            self.Q_up_lim_norm = (up_Q_lim-Q_mean)/Q_std
        if apply_mult_gauss:
            Q_mean = hf['norm_param/mean'][0]
            Q_std = hf['norm_param/std'][0]
            self.mult_gauss = gauss_mult_noise(Q_mean/Q_std, q_noise_fraction)
        self.batch_num = self.num // self.batch_size
        
    def step(self, start, stop, ev_starts):
        hf = self.hf
        data_start = ev_starts[0]
        data_stop = ev_starts[-1]
        data = hf[self.regime + '/data/data'][data_start : data_stop]
        print('Считали данные: ', data[0:20,1],'\n')
        print('Форма данных: ',data.shape,'\n')
        print('От и до: ', data_start, data_stop, '\n')
        labels = np.zeros((self.batch_size, 2))
        ids = hf[self.regime + '/ev_ids_corr/data'][start:stop]  # id of event - starting with 'nu' or 'mu'
        ids = np.array([i[0] for i in ids]).reshape(ids.shape[0],1)
        labels[:] = np.where(ids == 110, [0,1], [1,0])  # 110 - byte code for letter 'n'
        if self.set_up_Q_lim:
            data[:,0:1] = np.where( data[:,0:1]>self.Q_up_lim_norm, self.Q_up_lim_norm, data[:,0:1] )
        
        print('Данные после Q_limit: ', data[0:20,1],'\n')
        print('Форма данных: ',data.shape,'\n')
        
        # apply noise
        if self.apply_add_gauss:
            data = add_gauss(data, self.g_add_stds, ev_starts)
        print('Данные после добавления add noise: ', data[0:20,1],'\n')
        print('Форма данных: ',data.shape,'\n')
        if self.apply_mult_gauss:
            data[:,0] = self.mult_gauss.make_noise(data[:,0])
        print('Данные после добавления mult noise: ', data[0:20,1],'\n')
        print('Форма данных: ',data.shape,'\n')
        
        print('Начала событий глобально: ', ev_starts,'\n')
        
        data = tf.RaggedTensor.from_row_starts(values=data, row_starts=ev_starts[0:-1]- ev_starts[0])
        print('Начала событий локально:',ev_starts[0:-1]- ev_starts[0])
        
        print('Test after ragged')
        print('Форма данных: ',data.shape)
        print('Данные: ', data[0,:,1],'\n',data[1,:,1],'\n')
        data = data.to_tensor(default_value=self.norm_zeros)
        print('Test after to_tensor')
        print('Форма данных: ',data.shape)
        print('Данные: ', data[0,:,1],'\n',data[1,:,1],'\n')
        mask = tf.where(tf.not_equal(data[:,:,0:1], self.norm_zeros[0:1]),1.,0.)
        data = tf.concat([data, mask], axis = -1)
        print('Test after concat')
        print('Форма данных: ',data.shape)
        print('Данные: ', data[0,:,1],'\n',data[1,:,1],'\n')
        
        
        return (data, labels)

    def __call__(self):
        start = 0
        stop = self.batch_size
        for i in range(self.batch_num):
            ev_starts = self.hf[self.regime+'/ev_starts/data'][start:stop+1]
            out_data = self.step(start, stop, ev_starts)
            yield out_data
            start+=self.batch_size
            stop+=self.batch_size

def make_dataset(h5f, regime, batch_size, shape):
    bs = batch_size
    gen = generator(h5f, regime, bs, set_up_Q_lim, up_Q_lim, 
                             apply_add_gauss, g_add_stds, apply_mult_gauss, q_noise_fraction)
    dataset = tf.data.Dataset.from_generator(gen,
                                                   output_signature=(tf.TensorSpec(shape=(bs, shape[0], shape[1])),
                                                                     tf.TensorSpec(shape=(bs, 2)))
                                                  )
    dataset = dataset.repeat(-1).prefetch(tf.data.AUTOTUNE)
    return dataset

In [108]:
name = 'baikal_multi_0523_flat_pureMC_h5s2_norm.h5'
path_to_h5 = '../data/' + name
regime = 'train'

In [109]:
bs = 5
tr_generator = generator(path_to_h5, 'train', bs, 
                             set_up_Q_lim, up_Q_lim, 
                             apply_add_gauss, g_add_stds, apply_mult_gauss, q_noise_fraction)
dir(tr_generator)

[-0.1067038   0.09729419 -0.05384526 -0.00647046 -0.16614042]


['Q_up_lim_norm',
 '__call__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 'apply_add_gauss',
 'apply_mult_gauss',
 'batch_num',
 'batch_size',
 'ev_starts',
 'file',
 'g_add_stds',
 'gen_num',
 'hf',
 'mult_gauss',
 'norm_zeros',
 'num',
 'regime',
 'set_up_Q_lim',
 'step']

In [110]:
gen = tr_generator()

In [111]:
example = next(gen)

Считали данные:  [ 0.02469319  0.09148708  0.4491755   0.46763283  0.4699991  -1.7807195
 -1.0911261  -0.5558291  -0.4391692   0.40692773  0.5532758   0.70569307
  0.84487706  0.86309075  1.0060765   1.3627939   3.3416753   3.4745114
  4.116899    4.1696987 ] 

Форма данных:  (73, 5) 

От и до:  0 73 

Данные после Q_limit:  [ 0.02469319  0.09148708  0.4491755   0.46763283  0.4699991  -1.7807195
 -1.0911261  -0.5558291  -0.4391692   0.40692773  0.5532758   0.70569307
  0.84487706  0.86309075  1.0060765   1.3627939   3.3416753   3.4745114
  4.116899    4.1696987 ] 

Форма данных:  (73, 5) 

Test inside add_gauss
Data right after noise adding:  [ 0.02006764  0.09489588  0.4519045   0.46615368  0.46690357 -1.7838593
 -1.0793904  -0.55047953 -0.4419314   0.39730674  0.5505153   0.7111094
  0.84005034  0.8625408   1.0122515   1.36019     3.3409069   3.4683244
  4.1267424   4.1799583 ] 

Data shape:  (73, 5) 

Local start idxs:  [ 0  5 16 23 29 73]
first 2 reindexings: 
[0 1 2 3 4]
[ 5  6  7

In [61]:
data = h5.File(path_to_h5,'r')['train/data/data'][0 : 100]

In [73]:
v = data[0:73]
print(v.shape)
rt = tf.RaggedTensor.from_row_starts(
    values=v,
    row_starts=[0,5,16,23,29,73])
rt.shape, rt[0,:,1], rt[-1,:,1]

(73, 5)


(TensorShape([6, None, 5]),
 <tf.Tensor: shape=(5,), dtype=float32, numpy=
 array([0.02469319, 0.09148708, 0.4491755 , 0.46763283, 0.4699991 ],
       dtype=float32)>,
 <tf.Tensor: shape=(0,), dtype=float32, numpy=array([], dtype=float32)>)

In [59]:
data[0:20,1]

array([ 0.5532758 ,  0.70569307,  0.84487706,  0.86309075,  1.0060765 ,
        1.3627939 ,  3.3416753 ,  3.4745114 ,  4.116899  ,  4.1696987 ,
        4.177723  ,  4.1896605 ,  4.290546  , -1.2187362 , -1.1855425 ,
       -1.1366465 , -1.0479275 , -0.20614454,  0.03989232, -1.7055093 ],
      dtype=float32)

In [60]:
data[20:40,1]

array([-1.7015259, -1.6303004, -1.6229694, -1.6003773, -1.5790453,
       -1.573774 , -1.5218332, -1.4792712, -1.469747 , -1.4313707,
       -1.3983456, -1.3816693, -1.3383052, -1.3106796, -1.2600961,
       -1.1889544, -1.1739247, -1.1454203, -1.1401889, -1.1317968],
      dtype=float32)

In [10]:
import h5py as h5
import numpy as np

name = 'baikal_multi_0523_flat_pureMC_h5s2_norm.h5'
path_to_h5 = '../data/' + name
with h5.File(path_to_h5,'r') as hf:
    print(np.std(hf['train/data/data'][0:10**6,0]))

0.90816045


In [37]:
example[0][0,:,1]

<tf.Tensor: shape=(646,), dtype=float32, numpy=
array([0.03187098, 0.08750127, 0.44391227, 0.4669332 , 0.46878722,
       0.09729419, 0.09729419, 0.09729419, 0.09729419, 0.09729419,
       0.09729419, 0.09729419, 0.09729419, 0.09729419, 0.09729419,
       0.09729419, 0.09729419, 0.09729419, 0.09729419, 0.09729419,
       0.09729419, 0.09729419, 0.09729419, 0.09729419, 0.09729419,
       0.09729419, 0.09729419, 0.09729419, 0.09729419, 0.09729419,
       0.09729419, 0.09729419, 0.09729419, 0.09729419, 0.09729419,
       0.09729419, 0.09729419, 0.09729419, 0.09729419, 0.09729419,
       0.09729419, 0.09729419, 0.09729419, 0.09729419, 0.09729419,
       0.09729419, 0.09729419, 0.09729419, 0.09729419, 0.09729419,
       0.09729419, 0.09729419, 0.09729419, 0.09729419, 0.09729419,
       0.09729419, 0.09729419, 0.09729419, 0.09729419, 0.09729419,
       0.09729419, 0.09729419, 0.09729419, 0.09729419, 0.09729419,
       0.09729419, 0.09729419, 0.09729419, 0.09729419, 0.09729419,
       0.09729