# Creation of CaloChallenge 2022 datasets

In [2]:
import pandas as pd
import numpy as np
import h5py

## Dataset 1

### Photons

In [2]:
""" Based on high-stats data of https://opendata.cern.ch/record/15012#
    steps:
        - load csv, take first N events (size of actual GAN training sets)
        - check for nans
        - clip negative entries to 0
        - create dataset in hdf5 file of that energy

"""
folder = '../../../../ML_source/CaloChallenge/photons_samples_highStat/'

# number of events to keep per energy; keys are the energies that appear in the csv file names
num_events = {256: 10000, 512: 10000, 1024: 10000, 2048: 10000, 4096: 10000, 8192: 10000,
              16384: 10000, 32768: 10000, 65536: 10000, 131072: 10000, 262144: 10000, 
              524288: 5000, 1048576: 3000, 2097152: 2000, 4194304: 1000}

# the context manager closes the hdf5 file even if a csv is missing or the NaN check raises
with h5py.File(folder+'dataset_1_photons.hdf5', 'w') as dataset_file:
    for n in range(8,23):
        energy = 2**n
        file_name = folder+'pid22_E'+str(energy)+'_eta_20_25_voxalisation.csv'
        loaded_array = pd.read_csv(file_name, header=None).to_numpy()
        # keep only the first N events for this energy
        events = loaded_array[:num_events[energy]]
        if np.isnan(events).any():
            raise ValueError("Dataset contains NaNs!")
        # negative entries are set to 0 before writing
        dataset_file.create_dataset('data_'+str(energy), data=events.clip(min=0.),
                                    compression='gzip')


### Pions

In [3]:
""" Based on high-stats data of https://opendata.cern.ch/record/15012#
    steps:
        - load csv, take all given events until high-stats are ready (only events at 4 TeV are missing)
        - check for nans
        - clip negative entries to 0
        - create dataset in hdf5 file of that energy

"""
folder = '../../../../ML_source/CaloChallenge/pion_samples/'

# not used for now (all available events are written):
num_events = {256: 10000, 512: 10000, 1024: 10000, 2048: 10000, 4096: 10000, 8192: 10000,
             16384: 10000, 32768: 10000, 65536: 10000, 131072: 10000, 262144: 10000, 
             524288: 5000, 1048576: 3000, 2097152: 2000, 4194304: 1000}

# the context manager closes the hdf5 file even if a csv is missing or the NaN check raises
with h5py.File(folder+'dataset_1_pions.hdf5', 'w') as dataset_file:
    for n in range(8,23):
        energy = 2**n
        file_name = folder+'pid211_E'+str(energy)+'_eta_20_25_voxalisation.csv'
        loaded_array = pd.read_csv(file_name, header=None).to_numpy()
        if np.isnan(loaded_array).any():
            raise ValueError("Dataset contains NaNs!")
        # negative entries are set to 0 before writing
        dataset_file.create_dataset('data_'+str(energy), data=loaded_array.clip(min=0.),
                                    compression='gzip')

## Dataset 2

### Electrons

In [2]:
""" Based on dataset 2 of Dalila, taken from https://cernbox.cern.ch/index.php/s/KwFvdbub9QNP6qA
    steps:
        - load existing hdf5 file
        - transform shape (num_events, r_bins, alpha_bins, layer_id) to 
          (num_events, layer_id, alpha_bins, r_bins) as in dataset 1, then flatten last dimensions
        - clip negative entries to 0
        - write to new hdf5 file

"""
folder = '../../../../ML_source/CaloChallenge/Dataset2/'

# both files are closed by the context manager, also when a key is missing or writing fails
with h5py.File(folder+'SiW_LowGran.hdf5', 'r') as data_source, \
        h5py.File(folder + 'dataset_2.hdf5', 'w') as dataset_file:
    for n in range(0, 11):
        energy = 2**n
        data = data_source[str(energy)]["data"][:]
        # (num_events, r_bins, alpha_bins, layer_id) -> (num_events, layer_id, r_bins, alpha_bins)
        data = np.moveaxis(data, 3, 1)
        # -> (num_events, layer_id, alpha_bins, r_bins)
        data = np.moveaxis(data, 3, 2)
        # one flat row per event; source keys are scaled by 1000 for the dataset name
        # (presumably GeV -> MeV to match dataset 1 -- TODO confirm)
        dataset_file.create_dataset('data_'+str(energy*1000),
                                    data=data.clip(min=0.).reshape(len(data), -1),
                                    compression='gzip')

## Dataset 3

### Electrons

In [10]:
""" Based on dataset 3 of Dalila, taken from https://cernbox.cern.ch/index.php/s/KwFvdbub9QNP6qA Dataset3/HDF5/SiW
    steps:
        - load existing hdf5 files (1 per incident angle)
        - transform shape (num_events, r_bins, alpha_bins, layer_id) to 
          (num_events, layer_id, alpha_bins, r_bins) as in dataset 1, then flatten last dimensions
        - clip negative entries to 0
        - write to new hdf5 file (1 per incident angle)
"""

folder = '../../../../ML_source/CaloChallenge/Dataset3/'
angles = ['50', '60', '70', '80', '90']
for angle in angles:
    # both files are closed by the context manager, also when a key is missing or writing fails
    with h5py.File(folder+'SiW_angle_{}.h5'.format(angle), 'r') as data_source, \
            h5py.File(folder + 'dataset_3_{}.hdf5'.format(angle), 'w') as dataset_file:
        for n in range(0, 11):
            energy = 2**n
            # read the dataset into memory explicitly instead of relying on the implicit
            # array conversion inside np.moveaxis
            data = data_source[str(energy)][:]
            # (num_events, r_bins, alpha_bins, layer_id) -> (num_events, layer_id, r_bins, alpha_bins)
            data = np.moveaxis(data, 3, 1)
            # -> (num_events, layer_id, alpha_bins, r_bins)
            data = np.moveaxis(data, 3, 2)
            # one flat row per event; source keys are scaled by 1000 for the dataset name
            dataset_file.create_dataset('data_'+str(energy*1000),
                                        data=data.clip(min=0.).reshape(len(data), -1),
                                        compression='gzip')
    print("Done with angle {}".format(angle))


Done with angle 50
Done with angle 60
Done with angle 70
Done with angle 80
Done with angle 90


In [None]:
# missing events in (columns appear to be: incident angle, energy key, array shape):
# 50 2 (8000, 18, 50, 45)
# 50 64 (8000, 18, 50, 45)
# 70 256 (8000, 18, 50, 45)
# 70 512 (8000, 18, 50, 45)
