# Creation of CaloChallenge 2022 datasets

In [2]:
import glob
import os

import h5py
import numpy as np
import pandas as pd

## Dataset 1

### Photons

In [3]:
""" Based on high-stats data of https://opendata.cern.ch/record/15012#
    steps:
        - load csv
        - check that the source holds enough events and that none of the used events contain NaNs
        - save incident energies and showers to an array
        - create m hdf5 files, each with N events (size of actual GAN training sets)
        - have the samples shuffled before saving

"""
folder = '../../../../ML_source/CaloChallenge/photons_samples_highStat/'

# number of events in the training dataset of ATLAS
# even though we have more events in the high-stats dataset, we only use this reduced amount
num_events = {256: 10000, 512: 10000, 1024: 10000, 2048: 10000, 4096: 10000, 8192: 10000,
              16384: 10000, 32768: 10000, 65536: 10000, 131072: 10000, 262144: 10000, 
              524288: 5000, 1048576: 3000, 2097152: 2000, 4194304: 1000}

# number of output files generated with event numbers as specified above.
# Each output file takes its own consecutive slice of the source events, so using several
# files provides additional data for evaluating the generative models.
num_datasets = 3

# One list of per-energy chunks for every output file; they are concatenated once after the
# loop instead of re-copying the growing arrays with np.append for every energy.
energy_chunks = [[] for _ in range(num_datasets)]
shower_chunks = [[] for _ in range(num_datasets)]

for n in range(8, 23):
    energy = float(2**n)
    # the float energy finds the integer key because equal numbers hash equally
    events_per_file = num_events[energy]
    events_needed = num_datasets*events_per_file
    file_name = folder+'pid22_E'+str(2**n)+'_eta_20_25_voxalisation.csv'
    loaded_array = pd.read_csv(file_name, header=None).to_numpy()
    if events_needed > len(loaded_array):
        raise ValueError("Not enough events in source file for E = {} MeV!".format(energy))
    # check every row that ends up in one of the output files, not only the first slice
    if np.isnan(loaded_array[:events_needed]).any():
        raise ValueError("Dataset contains NaNs!")
    for dataset_nr in range(num_datasets):
        first_event = events_per_file*dataset_nr
        energy_chunks[dataset_nr].append(energy*np.ones(events_per_file))
        # copy the slice so that the full source array can be released before the next file is read
        shower_chunks[dataset_nr].append(loaded_array[first_event:first_event+events_per_file].copy())
    print("Done with energy {}.".format(energy))

energies = [np.concatenate(chunks) for chunks in energy_chunks]
showers = [np.concatenate(chunks, axis=0) for chunks in shower_chunks]

for i in range(num_datasets):
    # the same random permutation is applied to energies and showers to keep them aligned
    shuffled_idx = np.arange(len(energies[i]))
    np.random.shuffle(shuffled_idx)

    # the context manager closes the file even if writing one of the datasets fails
    with h5py.File(folder+'dataset_1_photons_{}.hdf5'.format(int(i+1)), 'w') as dataset_file:
        dataset_file.create_dataset('incident_energies', 
                                    data=energies[i].clip(min=0.).reshape(len(energies[i]), -1)[shuffled_idx], 
                                    compression='gzip')
        dataset_file.create_dataset('showers', 
                                    data=showers[i].clip(min=0.).reshape(len(showers[i]), -1)[shuffled_idx], 
                                    compression='gzip')
    print("Done with writing file {}".format(i+1))


Done with energy 256.0.
Done with energy 512.0.
Done with energy 1024.0.
Done with energy 2048.0.
Done with energy 4096.0.
Done with energy 8192.0.
Done with energy 16384.0.
Done with energy 32768.0.
Done with energy 65536.0.
Done with energy 131072.0.
Done with energy 262144.0.
Done with energy 524288.0.
Done with energy 1048576.0.
Done with energy 2097152.0.
Done with energy 4194304.0.
Done with writing file 1
Done with writing file 2
Done with writing file 3


### Pions

In [14]:
""" Based on high-stats data of https://opendata.cern.ch/record/15012#
    steps:
        - load csv
        - check that the source holds enough events and that none of the used events contain NaNs
        - save incident energies and showers to an array
        - create m hdf5 files, each with N events (size of actual GAN training sets)
        - have the samples shuffled before saving

"""
folder = '../../../../ML_source/CaloChallenge/pion_samples/'

# number of events in the training dataset of ATLAS
# even though we have more events in the high-stats dataset, we only use this reduced amount

# NOT USED FOR NOW AS PIONS ARE NOT YET HIGH_STAT
# (kept under its own name for reference instead of being overwritten right away)
num_events_high_stat = {256: 10000, 512: 10000, 1024: 10000, 2048: 10000, 4096: 10000, 8192: 10000,
                        16384: 10000, 32768: 10000, 65536: 10000, 131072: 10000, 262144: 10000, 
                        524288: 5000, 1048576: 3000, 2097152: 2000, 4194304: 1000}

# USED NOW, LOW STAT DATA:
num_events = {256: 10000, 512: 10000, 1024: 10000, 2048: 10000, 4096: 10000, 8192: 10000,
              16384: 10000, 32768: 10000, 65536: 10000, 131072: 10000, 262144: 10000, 
              524288: 5000, 1048576: 3000, 2097152: 2000, 4194304: 230}


# number of output files generated with event numbers as specified above.
# Each output file takes its own consecutive slice of the source events; more than one file
# is only possible if the source holds num_datasets times the event numbers given above.
num_datasets = 1

# One list of per-energy chunks for every output file; they are concatenated once after the
# loop instead of re-copying the growing arrays with np.append for every energy.
energy_chunks = [[] for _ in range(num_datasets)]
shower_chunks = [[] for _ in range(num_datasets)]

for n in range(8, 23):
    energy = float(2**n)
    # the float energy finds the integer key because equal numbers hash equally
    events_per_file = num_events[energy]
    events_needed = num_datasets*events_per_file
    file_name = folder+'pid211_E'+str(2**n)+'_eta_20_25_voxalisation.csv'
    loaded_array = pd.read_csv(file_name, header=None).to_numpy()
    if events_needed > len(loaded_array):
        raise ValueError("Not enough events in source file for E = {} MeV!".format(energy))
    # check every row that ends up in one of the output files, not only the first slice
    if np.isnan(loaded_array[:events_needed]).any():
        raise ValueError("Dataset contains NaNs!")
    for dataset_nr in range(num_datasets):
        first_event = events_per_file*dataset_nr
        energy_chunks[dataset_nr].append(energy*np.ones(events_per_file))
        # copy the slice so that the full source array can be released before the next file is read
        shower_chunks[dataset_nr].append(loaded_array[first_event:first_event+events_per_file].copy())
    print("Done with energy {}.".format(energy))

energies = [np.concatenate(chunks) for chunks in energy_chunks]
showers = [np.concatenate(chunks, axis=0) for chunks in shower_chunks]

for i in range(num_datasets):
    # the same random permutation is applied to energies and showers to keep them aligned
    shuffled_idx = np.arange(len(energies[i]))
    np.random.shuffle(shuffled_idx)

    # the context manager closes the file even if writing one of the datasets fails
    with h5py.File(folder+'dataset_1_pions_{}.hdf5'.format(int(i+1)), 'w') as dataset_file:
        dataset_file.create_dataset('incident_energies', 
                                    data=energies[i].clip(min=0.).reshape(len(energies[i]), -1)[shuffled_idx], 
                                    compression='gzip')
        dataset_file.create_dataset('showers', 
                                    data=showers[i].clip(min=0.).reshape(len(showers[i]), -1)[shuffled_idx], 
                                    compression='gzip')
    print("Done with writing file {}".format(i+1))


Done with energy 256.0.
Done with energy 512.0.
Done with energy 1024.0.
Done with energy 2048.0.
Done with energy 4096.0.
Done with energy 8192.0.
Done with energy 16384.0.
Done with energy 32768.0.
Done with energy 65536.0.
Done with energy 131072.0.
Done with energy 262144.0.
Done with energy 524288.0.
Done with energy 1048576.0.
Done with energy 2097152.0.
Done with energy 4194304.0.
Done with writing file 1


## Dataset 2

### Electrons

In [2]:
""" Based on CaloChallenge_Dataset/Dataset2 of Dalila, taken from https://cernbox.cern.ch/index.php/s/KwFvdbub9QNP6qA
    steps:
        - load existing hdf5 files
        - read out energy and shower
        - concatenate to list of 150k showers
        - transform shape (num_events, r_bins, alpha_bins, layer_id) to 
          (num_events, layer_id, alpha_bins, r_bins) as in dataset 1, then flatten last dimensions
        - rescale by sampling_fraction, as given by Anna
        - write to new hdf5 files

"""
folder = '../../../../ML_source/CaloChallenge/Dataset2_cont_energy/'
# factor the showers are multiplied with before they are written
sampling_fraction = 1./0.033

# number of source files that are merged into one output file
files_per_output = 40
output_prefix = 'dataset_2_'

# The output files are written into the source folder. They must not be read back as input
# when this cell is run again, so they are filtered out by name. glob returns the files in
# arbitrary order; sorting makes the assignment of source files to output files reproducible.
source_files = sorted(path for path in glob.glob(folder+'*')
                      if not os.path.basename(path).startswith(output_prefix))
num_source_files = len(source_files)

energy = []
shower = []

output_nr = 1

for idx, source_file in enumerate(source_files):
    # the context manager closes the source file even if reading fails
    with h5py.File(source_file, 'r') as data_source:
        angle_group = data_source["Angle_90"]
        # the name of each entry is the incident energy, the entry itself holds the shower
        for key in angle_group.keys():
            energy.append(float(key))
            shower.append(angle_group[key][:])
    print("Done with reading file {}/{}".format(idx+1, num_source_files))
    if idx % files_per_output == files_per_output - 1:
        energy = np.array(energy)
        print(len(energy))
        shower = np.array(shower)
        print(shower.shape)
        # (events, r, alpha, layer) -> (events, layer, r, alpha)
        shower = np.moveaxis(shower, 3, 1)
        print(shower.shape)
        # (events, layer, r, alpha) -> (events, layer, alpha, r)
        shower = np.moveaxis(shower, 3, 2)
        print(shower.shape)
        
        # the same random permutation is applied to energies and showers to keep them aligned
        shuffled_idx = np.arange(len(energy))
        np.random.shuffle(shuffled_idx)
        
        with h5py.File(folder + output_prefix + '{}.hdf5'.format(output_nr), 'w') as dataset_file:
            dataset_file.create_dataset('incident_energies', data=energy.clip(min=0.).reshape(len(energy), -1)[shuffled_idx], compression='gzip')
            dataset_file.create_dataset('showers', data=sampling_fraction*shower.clip(min=0.).reshape(len(shower), -1)[shuffled_idx], compression='gzip')
        print("Done with writing file {}".format(output_nr))
        output_nr += 1
        energy = []
        shower = []

# Source files beyond the last complete batch are read but never written; report them
# instead of dropping them silently.
if len(energy) > 0:
    print("Warning: {} source file(s) with {} showers were left over and not written to any output file".format(
        num_source_files % files_per_output, len(energy)))


Done with reading file 1/124
Done with reading file 2/124
Done with reading file 3/124
Done with reading file 4/124
Done with reading file 5/124
Done with reading file 6/124
Done with reading file 7/124
Done with reading file 8/124
Done with reading file 9/124
Done with reading file 10/124
Done with reading file 11/124
Done with reading file 12/124
Done with reading file 13/124
Done with reading file 14/124
Done with reading file 15/124
Done with reading file 16/124
Done with reading file 17/124
Done with reading file 18/124
Done with reading file 19/124
Done with reading file 20/124
Done with reading file 21/124
Done with reading file 22/124
Done with reading file 23/124
Done with reading file 24/124
Done with reading file 25/124
Done with reading file 26/124
Done with reading file 27/124
Done with reading file 28/124
Done with reading file 29/124
Done with reading file 30/124
Done with reading file 31/124
Done with reading file 32/124
Done with reading file 33/124
Done with reading f

## Dataset 3

### Electrons

In [2]:
""" Based on CaloChallenge_Dataset/Dataset3 of Dalila, taken from https://cernbox.cern.ch/index.php/s/KwFvdbub9QNP6qA
    steps:
        - load existing hdf5 files
        - read out energy and shower
        - concatenate to list of 50k showers
        - transform shape (num_events, r_bins, alpha_bins, layer_id) to 
          (num_events, layer_id, alpha_bins, r_bins) as in dataset 1, then flatten last dimensions
        - rescale by sampling_fraction, as given by Anna
        - write to new hdf5 files

"""

folder = '../../../../ML_source/CaloChallenge/Dataset3_cont_energy/'
# factor the showers are multiplied with before they are written
sampling_fraction = 1./0.033

# number of source files that are merged into one output file
files_per_output = 20
output_prefix = 'dataset_3_'

# The output files are written into the source folder. They must not be read back as input
# when this cell is run again, so they are filtered out by name. glob returns the files in
# arbitrary order; sorting makes the assignment of source files to output files reproducible.
source_files = sorted(path for path in glob.glob(folder+'*')
                      if not os.path.basename(path).startswith(output_prefix))
num_source_files = len(source_files)

energy = []
shower = []

output_nr = 1

for idx, source_file in enumerate(source_files):
    # the context manager closes the source file even if reading fails
    with h5py.File(source_file, 'r') as data_source:
        angle_group = data_source["Angle_90"]
        # the name of each entry is the incident energy, the entry itself holds the shower
        for key in angle_group.keys():
            energy.append(float(key))
            shower.append(angle_group[key][:])
    print("Done with reading file {}/{}".format(idx+1, num_source_files))
    if idx % files_per_output == files_per_output - 1:
        energy = np.array(energy)
        print(len(energy))
        shower = np.array(shower)
        print(shower.shape)
        # (events, r, alpha, layer) -> (events, layer, r, alpha)
        shower = np.moveaxis(shower, 3, 1)
        print(shower.shape)
        # (events, layer, r, alpha) -> (events, layer, alpha, r)
        shower = np.moveaxis(shower, 3, 2)
        print(shower.shape)
        
        # the same random permutation is applied to energies and showers to keep them aligned
        shuffled_idx = np.arange(len(energy))
        np.random.shuffle(shuffled_idx)

        with h5py.File(folder + output_prefix + '{}.hdf5'.format(output_nr), 'w') as dataset_file:
            dataset_file.create_dataset('incident_energies', data=energy.clip(min=0.).reshape(len(energy), -1)[shuffled_idx], compression='gzip')
            dataset_file.create_dataset('showers', data=sampling_fraction*shower.clip(min=0.).reshape(len(shower), -1)[shuffled_idx], compression='gzip')
        print("Done with writing file {}".format(output_nr))
        output_nr += 1
        energy = []
        shower = []

# Source files beyond the last complete batch are read but never written; report them
# instead of dropping them silently.
if len(energy) > 0:
    print("Warning: {} source file(s) with {} showers were left over and not written to any output file".format(
        num_source_files % files_per_output, len(energy)))


Done with reading file 1/127
Done with reading file 2/127
Done with reading file 3/127
Done with reading file 4/127
Done with reading file 5/127
Done with reading file 6/127
Done with reading file 7/127
Done with reading file 8/127
Done with reading file 9/127
Done with reading file 10/127
Done with reading file 11/127
Done with reading file 12/127
Done with reading file 13/127
Done with reading file 14/127
Done with reading file 15/127
Done with reading file 16/127
Done with reading file 17/127
Done with reading file 18/127
Done with reading file 19/127
Done with reading file 20/127
50000
(50000, 18, 50, 45)
(50000, 45, 18, 50)
(50000, 45, 50, 18)
Done with writing file 1
Done with reading file 21/128
Done with reading file 22/128
Done with reading file 23/128
Done with reading file 24/128
Done with reading file 25/128
Done with reading file 26/128
Done with reading file 27/128
Done with reading file 28/128
Done with reading file 29/128
Done with reading file 30/128
Done with reading 