## Get the input data and net data

In [None]:
# Copy the archive from the Drive folder into the working directory and unpack it.
# NOTE(review): presumably this archive provides the Mod_full_dist/ data read further down - confirm.
!cp drive/MyDrive/NN_Energy/Dist_for_Collab.tar.gz .
!tar -zxvf Dist_for_Collab.tar.gz

In [3]:
# Copy the Conv7_MDN folder locally; the checkpoint loaded later is read from Conv7_MDN/.
!cp -r drive/MyDrive/NN_Energy/Conv7_MDN/ .

In [1]:

# NOTE(review): test_interval is not referenced by any other cell visible in this notebook.
test_interval=5 

# import libraries
import torch
from torch.utils.data import Dataset
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import shutil
import re
import math
import time
import glob
import gzip
import sys
import matplotlib.pyplot as plt
import os
import random
import h5py


In [2]:
class EventsData(Dataset):
    """Loads every ``*.hdf5`` file of a directory into RAM and serves fixed-size batches.

    Each file must provide a dataset ``X1`` whose first three axes are
    (events, 25, 161) (only index 0 of the fourth axis is used) and a dataset
    ``y`` from which column 0 is taken as the target.

    Args:
        data_dir: Directory prefix; it is concatenated with ``'*.hdf5'`` as-is,
            so it has to end with a path separator.
        per: Percentage (0-100) of the alphabetically sorted files to load.
        batch_size: Number of events returned by each ``get_batch`` call.

    Raises:
        IndexError: If no file matches ``data_dir + '*.hdf5'``.
    """

    def __init__(self,data_dir,per=100,batch_size=500):
        # Save the directory of the data
        self.data_dir = data_dir

        # Sort BEFORE truncating: glob returns files in arbitrary order, so
        # slicing first would make the subset selected by `per` non-reproducible.
        all_names = sorted(glob.glob(str(data_dir)+'*.hdf5'))
        self.size_dir = math.ceil(len(all_names)*(per/100))
        self.names = all_names[0:self.size_dir]

        print(self.names[0])

        # Read the real number of events of every file (files are closed right away)
        events_per_file = []
        for name in self.names:
            with h5py.File(name,'r') as f:
                events_per_file.append(f['y'].shape[0])

        # Number of events in the first file (only used for reporting / iters_per_file)
        self.size_file = events_per_file[0]
        # Total number of events over all selected files
        self.total_events = sum(events_per_file)

        # Load the whole dataset into the RAM
        self.data_big = torch.zeros(self.total_events,25,161)
        self.target_big = torch.zeros(self.total_events)

        print("Reading "+str(self.data_dir)+" with "+str(self.size_dir)+" files.")
        # Use a running offset instead of assuming 1000 events per file, so
        # files of any size land in the right rows of the buffers.
        start = 0
        for name, n_events in zip(self.names, events_per_file):
            stop = start + n_events
            with h5py.File(name,'r') as f:
                self.data_big[start:stop] = torch.tensor(f['X1'][:,:,:,0])
                self.target_big[start:stop] = torch.tensor(f['y'][:,0])
            start = stop

        # Number of iterations to finish the dataset
        self.batch_size = batch_size
        self.iters = math.floor(self.total_events/batch_size)
        self.iters_per_file = math.floor(self.size_file/batch_size)
        # Events actually served: the incomplete trailing batch is dropped
        self.real_events = self.batch_size*self.iters

        print("There are "+str(self.total_events)+" events.")
        print("In "+str(self.size_dir)+" separate files.")
        print("Each file containing "+str(self.size_file)+" events.")
        print("In "+str(self.iters)+" iterations")
        print("The real number of events is: "+str(self.real_events))

    def get_len(self):
        """Return the number of events covered by the complete batches."""
        return self.real_events

    def get_iter(self):
        """Return the number of ``get_batch`` calls needed to traverse the dataset."""
        return self.iters

    def get_batch(self, idx):
        """Return batch number ``idx`` as ``(data, target)`` float tensors.

        ``data`` has shape (batch, 1, 25, 161) (a channel axis is inserted for
        the convolutional network) and ``target`` has shape (batch, 1) holding
        ``arccos`` of the stored target.
        """
        ind1 = idx*self.batch_size
        ind2 = (idx+1)*self.batch_size

        # Inserting the channel axis at position 1 is equivalent to the former
        # unsqueeze(3) + transpose(1,3) + transpose(2,3) sequence.
        data = self.data_big[ind1:ind2].unsqueeze(dim=1)
        target = self.target_big[ind1:ind2].unsqueeze(dim=1)

        # NOTE(review): presumably the stored target is cos(zenith), so this
        # yields the zenith angle in radians - confirm against the data files.
        target = torch.arccos(target)

        return data.float(),target.float()

In [3]:
def init_data(percentage,batch,data_root='Mod_full_dist/'):
    """Build the training, validation and test datasets.

    Args:
        percentage: Percentage of the files of each directory to load
            (forwarded as ``per`` to ``EventsData``).
        batch: Batch size forwarded to ``EventsData``.
        data_root: Directory that contains the ``train_data_fixed/``,
            ``validation_data_fixed/`` and ``test_data_fixed/`` folders.

    Returns:
        Tuple ``(training_data, validation_data, test_data)``.
    """
    # EventsData receives the directory as a prefix, so it must end with '/'
    root = data_root
    if root and not root.endswith('/'):
        root += '/'

    print("Train dataset:")
    training_data = EventsData(data_dir=root+'train_data_fixed/', per=percentage, batch_size=batch)
    print()

    print("Validation dataset:")
    validation_data = EventsData(data_dir=root+'validation_data_fixed/', per=percentage, batch_size=batch)
    print()

    print("Test dataset:")
    test_data = EventsData(data_dir=root+'test_data_fixed/', per=percentage, batch_size=batch)
    print()

    return training_data,validation_data,test_data

In [4]:
# Events handed to the network per batch; the files listed in the output
# below hold 1000 events each.
batch_size_used = 1000 # Has to be fixed
# Percentage of the files of every directory to load (forwarded as `per` to EventsData)
percentage_used = 100

# Load the three datasets into RAM
train,val,test = init_data(percentage_used,batch_size_used)

Train dataset:
Mod_full_dist/train_data_fixed/train_dataset_0.hdf5
Reading Mod_full_dist/train_data_fixed/ with 862 files.
There are 862000 events.
In 862 separate files.
Each file containing 1000 events.
In 862 iterations
The real number of events is: 862000

Validation dataset:
Mod_full_dist/validation_data_fixed/validation_dataset_0.hdf5
Reading Mod_full_dist/validation_data_fixed/ with 247 files.
There are 247000 events.
In 247 separate files.
Each file containing 1000 events.
In 247 iterations
The real number of events is: 247000

Test dataset:
Mod_full_dist/test_data_fixed/test_dataset_0.hdf5
Reading Mod_full_dist/test_data_fixed/ with 120 files.
There are 120000 events.
In 120 separate files.
Each file containing 1000 events.
In 120 iterations
The real number of events is: 120000



## Declaramos la arquitectura de nuestra red; tiene que ser exactamente igual a la que vamos a evaluar

In [5]:
# This class contains the network architecture; the caller moves it to the target device
class Net(nn.Module):
    """Convolutional network with two scalar outputs per event: ``mu`` and ``sigma``.

    Expects input of shape (batch, 1, 25, 161). Four convolutions (kernel
    (2, 10), padding 1), the last three followed by 2x2 max pooling, reduce
    that to 128 x 4 x 13 = 6656 features, which feed three fully connected
    layers with dropout and two linear output heads.

    Attribute names are part of the checkpoint format (``state_dict`` keys)
    and must not be renamed.
    """

    def __init__(self):
        super().__init__()

        kernel_size = (2, 10)

        # Convolutional stack; the first layer sees a 1 x 25 x 161 tensor
        self.conv1 = nn.Conv2d(1, 16, kernel_size, padding=1)
        self.conv2 = nn.Conv2d(16, 32, kernel_size, padding=1)
        self.conv3 = nn.Conv2d(32, 64, kernel_size, padding=1)
        self.conv4 = nn.Conv2d(64, 128, kernel_size, padding=1)

        self.pool = nn.MaxPool2d((2, 2))

        # 128 channels x 4 x 13 left after the conv/pool stack for a 25 x 161 input
        input_flatten = 6656
        hidden_1 = 600
        hidden_2 = 200
        hidden_3 = 50

        # Fully connected layers: input_flatten -> hidden_1 -> hidden_2 -> hidden_3
        self.fc1 = nn.Linear(input_flatten, hidden_1)
        self.fc2 = nn.Linear(hidden_1, hidden_2)
        self.fc3 = nn.Linear(hidden_2, hidden_3)

        # Output heads
        self.mu = nn.Linear(hidden_3, 1)
        self.sigma = nn.Linear(hidden_3, 1)

        # Dropout (p=0.25) applied after every hidden fully connected layer
        self.dropout = nn.Dropout(0.25)

    def forward(self, x):
        """Return ``(mu, sigma)``, each of shape (batch, 1); ``sigma`` is strictly positive."""
        # Convolutions with ReLU; max pooling after conv2, conv3 and conv4
        x = F.relu(self.conv1(x))
        x = self.pool(F.relu(self.conv2(x)))
        x = self.pool(F.relu(self.conv3(x)))
        x = self.pool(F.relu(self.conv4(x)))

        # Flatten per sample. Unlike reshape(-1, 6656) this never regroups
        # elements across the batch axis: a wrongly sized input fails in fc1
        # instead of silently producing a different batch size.
        x = torch.flatten(x, 1)

        # Hidden layers with ReLU activation and dropout
        x = self.dropout(F.relu(self.fc1(x)))
        x = self.dropout(F.relu(self.fc2(x)))
        x = self.dropout(F.relu(self.fc3(x)))

        mu = self.mu(x)
        # The exponential keeps sigma positive
        sigma = torch.exp(self.sigma(x))
        return mu,sigma

## Pasamos los datos a la red, podemos seleccionar:
* La carpeta donde está el archivo
* El nombre del archivo que vamos a cargar
* El dispositivo donde vamos a correr la red: cuda o cpu

También declaramos las variables en RAM donde guardaremos los resultados de los diferentes datasets

In [6]:
# Use the GPU when available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
net=Net()
print(net)
# Move the network to the selected device (instead of an unconditional .cuda())
net.to(device)
# Load the checkpoint here; map_location lets a GPU-saved file load on a CPU-only host
net.load_state_dict(torch.load('Conv7_MDN/Conv7_MDNH_1000_Rad_exp_2.pt', map_location=device))
# Evaluation mode: disables dropout
net.eval()

# This is where the values will be stored: one row per event, two columns per event
results_train=np.zeros([train.get_len(),2])
results_validation=np.zeros([val.get_len(),2])
results_test=np.zeros([test.get_len(),2])

Net(
  (conv1): Conv2d(1, 16, kernel_size=(2, 10), stride=(1, 1), padding=(1, 1))
  (conv2): Conv2d(16, 32, kernel_size=(2, 10), stride=(1, 1), padding=(1, 1))
  (conv3): Conv2d(32, 64, kernel_size=(2, 10), stride=(1, 1), padding=(1, 1))
  (conv4): Conv2d(64, 128, kernel_size=(2, 10), stride=(1, 1), padding=(1, 1))
  (pool): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=6656, out_features=600, bias=True)
  (fc2): Linear(in_features=600, out_features=200, bias=True)
  (fc3): Linear(in_features=200, out_features=50, bias=True)
  (mu): Linear(in_features=50, out_features=1, bias=True)
  (sigma): Linear(in_features=50, out_features=1, bias=True)
  (dropout): Dropout(p=0.25, inplace=False)
)


## Sacamos los resultados de todos los datasets y los guardamos en sus correspondientes variables

In [11]:
# Run the network over the training set and store, per event,
# column 0: cos(mu) and column 1: the uncertainty propagated through the cosine.
rows_per_batch = train.batch_size
with torch.no_grad():  # inference only: do not build the autograd graph
    for batch in range(train.get_iter()):
        # Get the data (the target is not needed for inference)
        data,target = train.get_batch(batch)
        # Move the batch to the device selected for the network
        data = data.to(device)
        # Forward pass
        mu,sigma = net(data)

        # Bring the results to the CPU as flat numpy arrays
        output = mu.cpu().numpy().reshape(-1)
        sigma = sigma.cpu().numpy().reshape(-1)

        # Get Z components from the zenith angle (in radians)
        zeta = np.cos(output)
        # |d cos(theta)/d theta| * sigma; the absolute value keeps the
        # uncertainty non-negative when mu falls outside [0, pi]
        zigma = sigma*np.abs(np.sin(output))

        # Store in the rows that belong to this batch
        start = batch*rows_per_batch
        stop = start+rows_per_batch
        results_train[start:stop,0]=zeta
        results_train[start:stop,1]=zigma


In [12]:
# Run the network over the validation set and store, per event,
# column 0: cos(mu) and column 1: the uncertainty propagated through the cosine.
rows_per_batch = val.batch_size
with torch.no_grad():  # inference only: do not build the autograd graph
    for batch in range(val.get_iter()):
        # Get the data (the target is not needed for inference)
        data,target = val.get_batch(batch)
        # Move the batch to the device selected for the network
        data = data.to(device)
        # Forward pass
        mu,sigma = net(data)

        # Bring the results to the CPU as flat numpy arrays
        output = mu.cpu().numpy().reshape(-1)
        sigma = sigma.cpu().numpy().reshape(-1)

        # Get Z components from the zenith angle (in radians)
        zeta = np.cos(output)
        # |d cos(theta)/d theta| * sigma; the absolute value keeps the
        # uncertainty non-negative when mu falls outside [0, pi]
        zigma = sigma*np.abs(np.sin(output))

        # Store in the rows that belong to this batch
        start = batch*rows_per_batch
        stop = start+rows_per_batch
        results_validation[start:stop,0]=zeta
        results_validation[start:stop,1]=zigma

In [13]:
# Run the network over the test set and store, per event,
# column 0: cos(mu) and column 1: the uncertainty propagated through the cosine.
rows_per_batch = test.batch_size
with torch.no_grad():  # inference only: do not build the autograd graph
    for batch in range(test.get_iter()):
        # Get the data (the target is not needed for inference)
        data,target = test.get_batch(batch)
        # Move the batch to the device selected for the network
        data = data.to(device)
        # Forward pass
        mu,sigma = net(data)

        # Bring the results to the CPU as flat numpy arrays
        output = mu.cpu().numpy().reshape(-1)
        sigma = sigma.cpu().numpy().reshape(-1)

        # Get Z components from the zenith angle (in radians)
        zeta = np.cos(output)
        # |d cos(theta)/d theta| * sigma; the absolute value keeps the
        # uncertainty non-negative when mu falls outside [0, pi]
        zigma = sigma*np.abs(np.sin(output))

        # Store in the rows that belong to this batch
        start = batch*rows_per_batch
        stop = start+rows_per_batch
        results_test[start:stop,0]=zeta
        results_test[start:stop,1]=zigma


## Guardamos los resultados en un archivo .hdf5 

In [15]:
# Open the output file; the context manager closes it even if a write fails
with h5py.File("Z_results_zenith.hdf5", "w") as f:
    # Store one dataset per split
    f.create_dataset('Train_Z', data=results_train)
    f.create_dataset('Val_Z', data=results_validation)
    f.create_dataset('Test_Z', data=results_test)