# Notebook for creation of the images dataset (env: Scikit-HEP)

### Load the packages we are going to use:

In [1]:
import os

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pyjet import cluster, DTYPE_PTEPM

In [2]:
## Turn off matplotlib's interactive mode to avoid filling up all your memory:
mpl.interactive(False)

### Load the data we want to use to create the images:

#### Instead of loading the full, large h5 file at once, we are going to make a generator that loads the file in more manageable batches:

In [None]:
def generator(filename, chunksize=512, total_size=1100000):
    """Yield consecutive row chunks of an HDF5 file, cycling forever.

    Rows ``[0, total_size)`` are read with ``pd.read_hdf`` in slices of at
    most ``chunksize`` rows. The last slice of a pass may be shorter than
    ``chunksize`` so that no trailing rows are skipped. After the last slice
    the generator wraps around and starts again from row 0, so it never
    raises ``StopIteration`` on its own: callers that want a single pass
    must stop iterating themselves.

    Arguments:
        filename = path of the HDF5 file to read
        chunksize = maximum number of rows per yielded chunk
        total_size = number of rows to cover in one pass over the file

    Yields:
        whatever ``pd.read_hdf(filename, start=..., stop=...)`` returns for
        the current slice.

    Raises:
        ValueError: if ``chunksize`` or ``total_size`` is not positive
            (raised when the first chunk is requested).
    """
    if chunksize <= 0:
        raise ValueError("chunksize must be a positive integer")
    if total_size <= 0:
        raise ValueError("total_size must be a positive integer")

    start = 0

    while True:
        # Clamp the end of the slice so the final, partial chunk is still
        # returned and nothing past total_size is requested.
        stop = min(start + chunksize, total_size)

        yield pd.read_hdf(filename, start=start, stop=stop)

        # Continue after this slice, or wrap around once the pass is done.
        start = stop if stop < total_size else 0

#### I'm going to create a class that takes as arguments the size of the images, the size of the pixels (bins), the unclustered events from my dataframe, and the weights of each particle (i.e. each pixel in our images), which will be used as the color scale, and generates the images (.png) we are going to use in our classifier:

- later I'm going to move this code to a .py file with all functions and classes we will use.

In [7]:
class DetectorImage:
    '''
    Class to generate pixelated jet images from pyjet pseudojet arrays.

    Each event is binned into a 2D (eta, phi) histogram, drawn with
    matplotlib and saved as "<idx>.png" inside `path`.

    arguments:
        pixel_size = number of histogram bins (pixels) along each of the
                     eta and phi axes
        image_size = side length, in pixels, of the square image that is
                     saved (the figure is image_size/10 inches at dpi=10)
        eta_min = min pseudo-rapidity
        eta_max = max pseudo-rapidity
        path = directory where the images are saved; '' means the current
               working directory
    '''

    def __init__(self, pixel_size=200, image_size=200,
                 eta_min= -5., eta_max=5., path=''):
        self.eta_min, self.eta_max = eta_min, eta_max
        # (left, right, bottom, top) of the drawn image in data coordinates.
        self.extent = self.eta_min, self.eta_max, -np.pi, np.pi
        self.image_size = image_size
        self.bins = pixel_size

        # Bin edges used by the 2D histogram.
        self.eta_edges = np.linspace(self.eta_min, self.eta_max, self.bins + 1)
        self.phi_edges = np.linspace(-np.pi, np.pi, self.bins + 1)
        # Bin centres: lower edge of every bin plus half a bin width.
        self.eta = self.eta_edges[:-1] + (self.eta_max - self.eta_min) / (2 * self.bins)
        self.phi = self.phi_edges[:-1] + (np.pi / self.bins)

        self.phi_min, self.phi_max = np.min(self.phi), np.max(self.phi)
        self.path = path

    def _image_path(self, idx):
        '''Return the file name of the image for event `idx` inside `path`.'''
        # os.path.join avoids the doubled '/' for paths ending in a slash and
        # keeps an empty `path` relative instead of pointing at '/'.
        return os.path.join(self.path, '{}.png'.format(idx))

    def pseudojets_images(self, pseudojet, weight, idx):
        '''
        Function to draw and save the jet image of one event.
        arg:
            pseudojet = unclustered pseudojet array with 'eta' and 'phi' fields
            weight = per-particle weights passed to histogram2d; they set the
                     colour of the pixels
            idx = event index, used as the file name of the image
        '''
        # NOTE(review): the y-limits are the outermost phi bin centres while
        # the x-limits are the eta edges -- confirm this asymmetry is intended.
        fig, ax = plt.subplots(subplot_kw=dict(xlim=(self.eta_min, self.eta_max),
                                               ylim=(self.phi_min, self.phi_max)),
                               figsize=(self.image_size/10, self.image_size/10), dpi=10)
        try:
            ax.axis('off')

            hadrons, _, _ = np.histogram2d(pseudojet['eta'], pseudojet['phi'],
                                           bins=(self.eta_edges, self.phi_edges),
                                           weights=weight)
            # Empty bins are masked so they are not drawn in the image.
            ax.imshow(np.ma.masked_where(hadrons == 0, hadrons).T,
                      extent=self.extent, aspect=(self.eta_max - self.eta_min) / (2*np.pi),
                      interpolation='none', origin='lower')

            fig.tight_layout()
            if self.path:
                # Create the output directory on first use instead of failing.
                os.makedirs(self.path, exist_ok=True)
            fig.savefig(self._image_path(idx), dpi=10)
        finally:
            # Always release this figure (and only this one), even when
            # histogramming or saving fails, so figures do not accumulate.
            plt.close(fig)

## Now we can loop through some events and, for each signal or background event, create the respective image in its designated folder

### I'm using the generator to load the events in batches of 512 * 100 rows at a time. We can increase this number to process things a bit faster; however, pyjet and matplotlib will always process everything at the same speed, so processing all the data will take some time.

In [None]:
# fn is the path to the HDF5 events file; fb is the batch generator over it,
# created with a chunksize of 512*100 rows:
fn = '../data/resized_events_anomalydetection.h5'
fb = generator(fn,chunksize=512*100)

#### I'm modifying the loop through the events to work on the file batches, changing the range to accommodate the batch indexes:

In [8]:
# Render one image per event: signal events go to ../data/signal/ and all
# other events to ../data/background/.
# NOTE(review): tqdm is not imported in the cells shown here -- confirm it is
# imported elsewhere before running this cell.

n_particles = 700   # 700 due to the way the jets are padded
label_column = 2100  # column holding the signal flag

# Each particle j occupies columns (3j, 3j+1, 3j+2) = (pT, eta, phi).
pt_columns = [3 * j for j in range(n_particles)]
eta_columns = [3 * j + 1 for j in range(n_particles)]
phi_columns = [3 * j + 2 for j in range(n_particles)]

# The image settings do not change between events, so build each writer once.
signal_images = DetectorImage(path='../data/signal/', image_size=400, pixel_size=100)
background_images = DetectorImage(path='../data/background/', image_size=400, pixel_size=100)

first_index = None
for batch in fb:
    if batch.empty:
        break
    # The generator cycles over the file forever; stop as soon as it wraps
    # around to the first batch again so every event is rendered only once.
    if first_index is None:
        first_index = batch.index[0]
    elif batch.index[0] == first_index:
        break

    # Pull the columns out once per batch instead of once per value.
    labels = batch[label_column].to_numpy()
    pt = batch[pt_columns].to_numpy()
    eta = batch[eta_columns].to_numpy()
    phi = batch[phi_columns].to_numpy()

    for row, i in enumerate(tqdm(batch.index, total=len(batch))):
        pseudojets_input = np.zeros(n_particles, dtype=DTYPE_PTEPM)
        pseudojets_input['pT'] = pt[row]
        pseudojets_input['eta'] = eta[row]
        pseudojets_input['phi'] = phi[row]

        # Normalise the weights to the hardest particle; an event without any
        # pT keeps all-zero weights instead of producing 0/0 = NaN.
        max_pt = np.max(pseudojets_input['pT'])
        if max_pt > 0:
            weight_pt = pseudojets_input['pT'] / max_pt
        else:
            weight_pt = pseudojets_input['pT']

        if labels[row] == 1:
            writer = signal_images
        else:
            writer = background_images
        writer.pseudojets_images(pseudojets_input, weight=weight_pt, idx=i)