In [9]:
# import libraries
import torch
from torch.utils.data import Dataset
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
import torch.optim as optim
import numpy as np
import re
import math
import time
import glob
import gzip
import sys
import matplotlib.pyplot as plt
import os
import random
import h5py
# Seconds passed to time.sleep() after a timed batch in the benchmark loops; 0 means no pause
delay=0
n = 300 # number of batches fetched and timed in each benchmark loop

# Default Dataloader

In [10]:
class EventsData(Dataset):
    """Map-style dataset that reads one event at a time from HDF5 files on disk.

    Every file is expected to contain an ``'X1'`` dataset (inputs) and a ``'y'``
    dataset (targets) indexed by event along the first axis.  Nothing is cached:
    each ``__getitem__`` call opens the file holding the requested event, reads a
    single row and closes the file again.

    Attributes:
        data_dir: Path prefix that was searched for ``*.hdf5`` files.
        names: Sorted paths of the files in use.
        size_dir: Number of files in use.
        size_file: Number of events in the first file (used for reporting only).
        total_events: Number of events summed over all files in use.
        file_starts: Global index of the first event of every file in ``names``.
    """

    def __init__(self,data_dir,per=100):
        """Scan ``data_dir`` and index the events of the selected files.

        Args:
            data_dir: Path prefix of the data files.  The search pattern is built
                as ``str(data_dir)+'*.hdf5'``, so a directory must end with a
                path separator.
            per: Percentage (0-100] of the files to use, rounded up to whole files.

        Raises:
            FileNotFoundError: If no ``*.hdf5`` file matches ``data_dir``.
            ValueError: If ``per`` selects no file at all.
        """
        # Save the directory of the data
        self.data_dir = data_dir

        # glob returns matches in arbitrary order; sort them so that `per`
        # always selects the same subset of files.
        self.names=sorted(glob.glob(str(data_dir)+'*.hdf5'))
        if not self.names:
            raise FileNotFoundError("No .hdf5 files found matching "+str(data_dir)+"*.hdf5")

        # Keep only the requested percentage of the files
        self.size_dir=math.ceil(len(self.names)*(per/100))
        self.names=self.names[0:self.size_dir]
        if not self.names:
            raise ValueError("per="+str(per)+" selects no files")
        #random.shuffle(self.names)

        # Show one sample file name (the second one when available, as before),
        # without failing when a single file is selected.
        print(self.names[min(1,len(self.names)-1)])

        # Record where each file starts in the global event index.  Files are
        # closed right away so no handle outlives the constructor.
        self.file_starts=[]
        self.total_events=0
        file_sizes=[]
        for name in self.names:
            with h5py.File(name,'r') as f:
                n_events=f['y'].shape[0]
            self.file_starts.append(self.total_events)
            file_sizes.append(n_events)
            self.total_events+=n_events

        # Number of events of the first file (reporting only)
        self.size_file=file_sizes[0]

        print("There are "+str(self.total_events)+" events.")
        print("In "+str(self.size_dir)+" separate files.")
        print("Each file containing "+str(self.size_file)+" events.")

    def __len__(self):
        """Return the total number of events over all files in use."""
        return self.total_events

    def __getitem__(self, idx):
        """Read a single event from disk.

        Args:
            idx: Global event index; negative values count from the end.

        Returns:
            ``(data, target)`` as float32 tensors.  ``target`` is the arccosine
            of ``y[event, 0]`` in radians (no degree conversion is applied).

        Raises:
            IndexError: If ``idx`` is outside ``[-len(self), len(self))``.
        """
        idx=int(idx)
        if idx < 0:
            idx+=self.total_events
        if not 0 <= idx < self.total_events:
            raise IndexError("event index out of range")

        # Locate the file through the per-file start offsets so that files of
        # different sizes are handled correctly.
        file=int(np.searchsorted(self.file_starts,idx,side='right'))-1
        new_idx=idx-self.file_starts[file]

        # Open, read one event and close again: no file handle is leaked.
        with h5py.File(self.names[file],'r') as f:
            data=torch.tensor(f['X1'][new_idx,:,:]).unsqueeze(dim=3)
            target=torch.tensor(f['y'][new_idx,0])

        # Only for conv with modulus: reorder the axes to (0, 3, 1, 2).
        # NOTE(review): the resulting layout depends on the rank of 'X1' on
        # disk - confirm it is the one the model expects.
        data=data.transpose(1,3)
        data=data.transpose(2,3)

        target=torch.arccos(target)

        return data.float(),target.float()

In [11]:
# Build the per-event dataset on half of the files and wrap it in a
# shuffling DataLoader that yields batches of 50 events.
test_data = EventsData(
    data_dir='Mod_full_dist/train_data_fixed/',
    per=50,
)
test_dataloader = DataLoader(dataset=test_data, batch_size=50, shuffle=True)

Mod_full_dist/train_data_fixed/train_dataset_42.hdf5
There are 431000 events.
In 431 separate files.
Each file containing 1000 events.


In [12]:

# Benchmark: average wall-clock time to fetch one batch from the DataLoader.
times = np.zeros([n,1])

# Create the iterator once.  Calling iter(test_dataloader) inside the loop would
# build a new iterator (and a new shuffle order) for every sample, so the
# measurement would always be "setup + first batch" instead of a steady-state batch.
batch_iter = iter(test_dataloader)

for i in range(n):
    # perf_counter is the clock intended for measuring short durations
    start = time.perf_counter()
    try:
        a,b=next(batch_iter)
    except StopIteration:
        # Epoch exhausted: start a new one and restart the clock
        batch_iter = iter(test_dataloader)
        start = time.perf_counter()
        a,b=next(batch_iter)
    stop = time.perf_counter()
    times[i]=stop-start

print("It took "+str(np.sum(times)/n)+" seconds per batch")

It took 0.0870932674407959 seconds per batch


# New dataloader (reads whole batches from disk, dataset not held in RAM)

In [13]:
class EventsData(Dataset):
    """Sequential batch reader that slices whole batches out of HDF5 files on disk.

    Every file is expected to contain an ``'X1'`` dataset (inputs) and a ``'y'``
    dataset (targets) indexed by event along the first axis, and all files are
    assumed to hold ``size_file`` events (the size of the first file).  The file
    used by the last ``get_batch`` call is kept open so that consecutive batches
    from the same file do not reopen it.

    Batches never straddle two files: each file contributes
    ``size_file // batch_size`` full batches and any remainder is skipped.

    Attributes:
        names: Sorted paths of the files in use.
        size_dir: Number of files in use.
        size_file: Number of events in the first file.
        total_events: Number of events summed over all files in use.
        batch_size: Number of events per batch.
        iters_per_file: Number of full batches served from each file.
        iters: Total number of batches that ``get_batch`` can serve.
        real_events: Number of events covered by those batches.
    """

    def __init__(self,data_dir,per=100,batch_size=500):
        """Scan ``data_dir`` and work out how many batches can be served.

        Args:
            data_dir: Path prefix of the data files.  The search pattern is built
                as ``str(data_dir)+'*.hdf5'``, so a directory must end with a
                path separator.
            per: Percentage (0-100] of the files to use, rounded up to whole files.
            batch_size: Number of events returned by each ``get_batch`` call.

        Raises:
            FileNotFoundError: If no ``*.hdf5`` file matches ``data_dir``.
            ValueError: If ``per`` selects no file, or ``batch_size`` is larger
                than the number of events in a file.
        """
        # Save the directory of the data
        self.data_dir = data_dir

        # glob returns matches in arbitrary order; sort them so that `per`
        # always selects the same subset of files.
        self.names=sorted(glob.glob(str(data_dir)+'*.hdf5'))
        if not self.names:
            raise FileNotFoundError("No .hdf5 files found matching "+str(data_dir)+"*.hdf5")

        # Keep only the requested percentage of the files
        self.size_dir=math.ceil(len(self.names)*(per/100))
        self.names=self.names[0:self.size_dir]
        if not self.names:
            raise ValueError("per="+str(per)+" selects no files")
        #random.shuffle(self.names)

        # Show one sample file name (the second one when available, as before),
        # without failing when a single file is selected.
        print(self.names[min(1,len(self.names)-1)])

        # Number of events per file, taken from the first file
        with h5py.File(self.names[0],'r') as f:
            self.size_file=f['y'].shape[0]

        # Total number of events; every file is closed as soon as it is counted
        self.total_events=0
        for name in self.names:
            with h5py.File(name,'r') as f:
                self.total_events+=f['y'].shape[0]

        # Number of iterations needed to go through the dataset.  Counting per
        # file guarantees that no batch crosses a file boundary; when batch_size
        # divides size_file this equals total_events // batch_size.
        self.batch_size=batch_size
        self.iters_per_file=self.size_file//batch_size
        if self.iters_per_file == 0:
            raise ValueError("batch_size="+str(batch_size)+" is larger than the "
                             +str(self.size_file)+" events stored per file")
        self.iters=self.iters_per_file*self.size_dir
        self.real_events=self.batch_size*self.iters

        print("There are "+str(self.total_events)+" events.")
        print("In "+str(self.size_dir)+" separate files.")
        print("Each file containing "+str(self.size_file)+" events.")
        print("In "+str(self.iters)+" iterations")
        print("The real number of events is: "+str(self.real_events))

        # Index and handle of the currently opened file (none yet)
        self.idx_file=-1
        self.f=None

    def close(self):
        """Close the cached file handle, if any."""
        handle=getattr(self,'f',None)
        if handle is not None:
            handle.close()
        self.f=None
        self.idx_file=-1

    def get_len(self):
        """Return the number of events covered by the ``iters`` full batches."""
        return self.real_events

    def get_iter(self):
        """Return the number of ``get_batch`` calls needed to finish the dataset."""
        return self.iters

    def get_batch(self, idx):
        """Read batch number ``idx`` from disk.

        Args:
            idx: Batch index; negative values count from the end.

        Returns:
            ``(data, target)`` as float32 tensors with ``batch_size`` rows.
            ``target`` has shape ``(batch_size, 1)`` and holds the arccosine of
            ``y[:, 0]`` converted to degrees.

        Raises:
            IndexError: If ``idx`` is outside ``[-iters, iters)``.
        """
        if idx < 0:
            idx+=self.iters
        if not 0 <= idx < self.iters:
            raise IndexError("batch index out of range")

        # File holding this batch and the position of the batch inside it
        idx_file,slot=divmod(idx,self.iters_per_file)
        if idx_file != self.idx_file:
            # Release the previous file before opening the next one
            self.close()
            self.f = h5py.File(self.names[idx_file],'r')
            self.idx_file=idx_file

        # Rows of the file that make up the batch
        ind=slot*self.batch_size
        ind2=ind+self.batch_size

        data=torch.tensor(self.f['X1'][ind:ind2,:,:]).unsqueeze(dim=3)

        # Only for conv with modulus: reorder the axes to (0, 3, 1, 2, ...).
        # NOTE(review): the resulting layout depends on the rank of 'X1' on
        # disk - confirm it is the one the model expects.
        data=data.transpose(1,3)
        data=data.transpose(2,3)

        # Column vector of targets, cast to float32 before the arccosine
        target=torch.tensor(self.f['y'][ind:ind2,0]).float().unsqueeze(dim=1)
        target=torch.arccos(target)*(180.0/math.pi)

        return data.float(),target.float()

In [14]:
# Instantiate the reader under test on half of the files, 50 events per batch.
# The constructor prints its own summary between the two print calls.
print("Test dataset:")
test = EventsData(
    data_dir='Mod_full_dist/train_data_fixed/',
    per=50,
    batch_size=50,
)
print()
    

Test dataset:
Mod_full_dist/train_data_fixed/train_dataset_42.hdf5
There are 431000 events.
In 431 separate files.
Each file containing 1000 events.
In 8620 iterations
The real number of events is: 431000



In [15]:

# Benchmark: average wall-clock time per batch when reading batches from disk.
times = np.zeros([n,1])

for i in range(n):
    # perf_counter is the clock intended for measuring short durations;
    # time.time() can be too coarse for millisecond-scale reads.
    start = time.perf_counter()
    a,b=test.get_batch(i)
    stop = time.perf_counter()
    times[i]=stop-start

print("It took "+str(np.sum(times)/n)+" seconds per batch")

It took 0.0025989882151285807 seconds per batch


# RAM dataloader

In [8]:
class EventsData(Dataset):
    """Batch reader that loads every selected HDF5 file into RAM up front.

    Every file is expected to contain a four-dimensional ``'X1'`` dataset
    (inputs, read as ``X1[:, :, :, 0]``) and a ``'y'`` dataset (targets, read as
    ``y[:, 0]``), both indexed by event along the first axis.  After
    construction no file is touched again: ``get_batch`` only slices the
    in-memory tensors.

    Attributes:
        names: Sorted paths of the files that were loaded.
        size_dir: Number of files loaded.
        size_file: Number of events in the first file.
        total_events: Number of events summed over all files.
        data_big: float32 tensor with all inputs, shape ``(total_events, H, W)``.
        target_big: float32 tensor with all raw targets, shape ``(total_events,)``.
        batch_size: Number of events per batch.
        iters: Number of full batches (``total_events // batch_size``).
        iters_per_file: Number of full batches that fit in one file.
        real_events: Number of events covered by the ``iters`` full batches.
    """

    def __init__(self,data_dir,per=100,batch_size=500):
        """Scan ``data_dir`` and load the selected files into memory.

        Args:
            data_dir: Path prefix of the data files.  The search pattern is built
                as ``str(data_dir)+'*.hdf5'``, so a directory must end with a
                path separator.
            per: Percentage (0-100] of the files to use, rounded up to whole files.
            batch_size: Number of events returned by each ``get_batch`` call.

        Raises:
            FileNotFoundError: If no ``*.hdf5`` file matches ``data_dir``.
            ValueError: If ``per`` selects no file at all.
        """
        # Save the directory of the data
        self.data_dir = data_dir

        # glob returns matches in arbitrary order; sort them so that `per`
        # always selects the same subset of files.
        self.names=sorted(glob.glob(str(data_dir)+'*.hdf5'))
        if not self.names:
            raise FileNotFoundError("No .hdf5 files found matching "+str(data_dir)+"*.hdf5")

        # Keep only the requested percentage of the files
        self.size_dir=math.ceil(len(self.names)*(per/100))
        self.names=self.names[0:self.size_dir]
        if not self.names:
            raise ValueError("per="+str(per)+" selects no files")
        #random.shuffle(self.names)

        # Show one sample file name (the second one when available, as before),
        # without failing when a single file is selected.
        print(self.names[min(1,len(self.names)-1)])

        # First pass: number of events of every file and the per-event shape,
        # both read from the files instead of being hard-coded.
        file_sizes=[]
        sample_shape=None
        for name in self.names:
            with h5py.File(name,'r') as f:
                file_sizes.append(f['y'].shape[0])
                if sample_shape is None:
                    sample_shape=tuple(f['X1'].shape[1:3])
        self.size_file=file_sizes[0]
        self.total_events=sum(file_sizes)

        # Load the whole dataset into the RAM
        self.data_big = torch.zeros(self.total_events,*sample_shape)
        self.target_big = torch.zeros(self.total_events)

        print("Reading "+str(self.data_dir)+" with "+str(self.size_dir)+" files.")
        # Second pass: copy each file at its running offset, so files of any
        # size are placed correctly.
        offset=0
        for name,count in zip(self.names,file_sizes):
            with h5py.File(name,'r') as f:
                self.data_big[offset:offset+count]=torch.tensor(f['X1'][:,:,:,0])
                self.target_big[offset:offset+count]=torch.tensor(f['y'][:,0])
            offset+=count

        # Number of iterations to finish the dataset
        self.batch_size=batch_size
        self.iters=math.floor(self.total_events/batch_size)
        self.iters_per_file= math.floor(self.size_file/batch_size)
        self.real_events=self.batch_size*self.iters

        print("There are "+str(self.total_events)+" events.")
        print("In "+str(self.size_dir)+" separate files.")
        print("Each file containing "+str(self.size_file)+" events.")
        print("In "+str(self.iters)+" iterations")
        print("The real number of events is: "+str(self.real_events))

    def get_len(self):
        """Return the number of events covered by the ``iters`` full batches."""
        return self.real_events

    def get_iter(self):
        """Return the number of ``get_batch`` calls needed to finish the dataset."""
        return self.iters

    def get_batch(self, idx):
        """Slice batch number ``idx`` out of the in-memory tensors.

        Args:
            idx: Batch index.  It is not range-checked: an index past the end
                yields a shorter (possibly empty) batch, as plain slicing does.

        Returns:
            ``(data, target)`` as float32 tensors.  ``data`` has shape
            ``(batch, 1, H, W)`` and is a view of ``data_big``, so it should not
            be modified in place.  ``target`` has shape ``(batch, 1)`` and holds
            the arccosine of the stored targets converted to degrees.
        """
        # Rows of the cached tensors that make up the batch
        ind1=idx*self.batch_size
        ind2=((idx+1)*self.batch_size)

        data=self.data_big[ind1:ind2,:,:]
        target=self.target_big[ind1:ind2]

        data=data.unsqueeze(dim=3)
        target=target.unsqueeze(dim=1)

        # Only for conv with modulus: (batch, H, W, 1) -> (batch, 1, H, W)
        data=data.transpose(1,3)
        data=data.transpose(2,3)

        target=torch.arccos(target)*(180.0/math.pi)

        return data.float(),target.float()

In [9]:
# Instantiate the RAM-backed reader on half of the files, 50 events per batch.
# The constructor prints its own summary between the two print calls.
print("Test dataset:")
test = EventsData(
    data_dir='Mod_full_dist/train_data_fixed/',
    per=50,
    batch_size=50,
)
print()
    

Test dataset:
Mod_full_dist/test_data_fixed/test_dataset_56.hdf5
Reading Mod_full_dist/test_data_fixed/ with 60 files.
There are 60000 events.
In 60 separate files.
Each file containing 1000 events.
In 1200 iterations
The real number of events is: 60000



In [10]:

# Benchmark: average wall-clock time per batch when slicing batches from RAM.
times = np.zeros([n,1])

for i in range(n):
    # perf_counter is the clock intended for measuring short durations;
    # these fetches take only tens of microseconds.
    start = time.perf_counter()
    # Fetch batch i (the loop index, not the constant n) so that a different
    # batch is timed on every iteration, as in the other benchmarks.
    a,b=test.get_batch(i)
    stop = time.perf_counter()
    times[i]=stop-start
    # Optional pause between samples; kept outside the timed region
    time.sleep(delay)

print("It took "+str(np.sum(times)/n)+" seconds per batch")

It took 2.4814605712890625e-05 seconds per batch
