In [1]:
#We can go into our root file and see what Trees are available
%matplotlib inline
import sys, os
if __package__ is None:
    import sys, os
    sys.path.append(os.path.realpath("/data/shared/Software/"))
    sys.path.append(os.path.realpath("../../"))
import numpy as np
import pandas as pd
import ntpath
import glob
import deepconfig

#from keras.utils.visualize_util import plot
#from IPython.display import Image, display

from CMS_Deep_Learning.preprocessing.preprocessing import *
from CMS_Deep_Learning.callbacks import OverfitStopping, SmartCheckpoint
from CMS_Deep_Learning.storage.batch import batchAssertArchived, batchExecuteAndTestTrials
from CMS_Deep_Learning.storage.archiving import *
from CMS_Deep_Learning.postprocessing.analysistools import findsubsets
from CMS_Deep_Learning.layers.lorentz import Lorentz, _lorentz
from CMS_Deep_Learning.layers.slice import Slice

from keras.models import Sequential, Model, model_from_json
from keras.layers import Dense, Flatten, Reshape, Activation, Dropout, Convolution2D, merge, Input, Flatten, Lambda, LSTM, Masking
from keras.engine.topology import Layer
from keras.callbacks import EarlyStopping
from keras.utils.visualize_util import plot
from keras.layers.advanced_activations import LeakyReLU


# Configure the session through deepconfig, requesting device 'gpu0' and the
# 'theano' backend (the cell output echoes "using gpu0" / "using theano").
# NOTE(review): the keras imports above execute before this call -- confirm
# that deepconfig still takes effect when invoked after keras is imported.
dc = deepconfig.deepconfig(gpu='gpu0', backend='theano')




Using Theano backend.
Using gpu device 0: GeForce GTX TITAN X (CNMeM is disabled, cuDNN 5005)

Couldn't import dot_parser, loading of dot files will not be possible.
using gpu0
using theano





In [2]:
# Per-particle observables (taken from the table); their order fixes the
# column order of every particle's feature vector.
observ_types = [
    'E/c', 'Px', 'Py', 'Pz', 'PT_ET', 'Eta', 'Phi', 'Charge', 'X', 'Y', 'Z',
    'Dxy', 'Ehad', 'Eem', 'MuIso', 'EleIso', 'ChHadIso', 'NeuHadIso', 'GammaIso',
    "ObjType",
]

# Width of a single particle's feature vector.
vecsize = len(observ_types)

# Training hyper-parameters.
epochs = 60
batch_size = 100

# (class label, directory holding that process' pandas .h5 files)
label_dir_pairs = [
    ("ttbar", "/data/shared/Delphes/ttbar_lepFilter_13TeV/pandas_h5/"),
    ("wjet", "/data/shared/Delphes/wjets_lepFilter_13TeV/pandas_h5/"),
    ("qcd", "/data/shared/Delphes/qcd_lepFilter_13TeV/pandas_h5/"),
]


In [3]:
#The groups of (label, directory) pairs to train a classifier on.
#To sweep over every subset of label_dir_pairs (stored as sorted lists), use instead:
#ldpsubsets = [sorted(list(s)) for s in findsubsets(label_dir_pairs)]
#Here only the full set is used, i.e. a single 3-way classification.
ldpsubsets = [label_dir_pairs]


In [4]:
def genModel(name,object_profiles,out_dim, depth, lstm_activation="relu", lstm_dropout = 0.0, dropout=0.0):
    """Build a (stacked) LSTM classifier over the concatenated particle inputs.

    One ``Input`` is created per object profile, the inputs are concatenated
    along axis 1, and the result is passed through ``depth`` stages of
    Masking -> LSTM (-> Dropout) followed by a softmax ``Dense`` layer.

    Parameters
    ----------
    name : str
        Name given to the resulting ``Model``.
    object_profiles : sequence
        Objects exposing ``max_size`` and ``punctuation``; each one yields an
        input named ``input_<i>`` with ``max_size`` rows, plus one extra row
        when ``punctuation`` is not None.
    out_dim : int
        Number of units of the softmax output layer.
    depth : int
        Number of Masking/LSTM stages to stack.
    lstm_activation : str
        Activation passed to every LSTM.
    lstm_dropout : float
        Value used for both ``dropout_W`` and ``dropout_U`` of every LSTM.
    dropout : float
        If > 0, a ``Dropout`` layer with this rate follows every LSTM.

    Returns
    -------
    The assembled (uncompiled) ``Model``.
    """
    inputs = []
    for i, profile in enumerate(object_profiles):
        # Reserve one additional row when the profile carries a punctuation marker.
        num_rows = profile.max_size + int(profile.punctuation is not None)
        inputs.append(Input(shape=(num_rows, vecsize), name="input_"+str(i)))
    a = merge(list(inputs),mode='concat',concat_axis=1, name="merge")
    for i in range(depth):
        # Entries equal to 0.0 are treated as padding by the Masking layer.
        a = Masking(mask_value=0.0)(a)
        a = LSTM(vecsize,
                 input_shape=(None,vecsize),
                 dropout_W=lstm_dropout,
                 dropout_U=lstm_dropout,
                 activation=lstm_activation,
                 # Every LSTM except the last must emit the full sequence,
                 # otherwise the next Masking/LSTM stage receives a 2D tensor
                 # and stacking (depth > 1) fails.
                 return_sequences=(i < depth - 1),
                 name = "lstm_" +str(i))(a)
        if(dropout > 0.0):
            a =  Dropout(dropout, name="dropout_"+str(i))(a)
    dense_out = Dense(out_dim, activation='softmax', name='main_output')(a)
    model = Model(input=inputs, output=dense_out, name=name)
    return model

In [5]:
def symlinkFolderFromDPS(dps, folder):
    """Fill ``folder`` with numbered symlinks to the archives of ``dps``.

    For the i-th element of ``dps`` a link ``<folder>/<i as 3 digits>.h5`` is
    created that points at ``<dp.get_path()>/archive.h5``. The folder is
    created when missing and an already existing link of the same name is
    replaced.

    Parameters
    ----------
    dps : iterable
        Objects exposing ``get_path()``, which returns their archive directory.
    folder : str
        Destination directory, with or without a trailing separator.
    """
    if not os.path.isdir(folder):
        os.makedirs(folder)
    for i, dp in enumerate(dps):
        # os.path.join keeps the link inside `folder` even without a trailing slash.
        path = os.path.join(folder, "%03d" % i + ".h5")
        # lexists also detects dangling links; only a stale entry is removed,
        # so genuine failures (e.g. permissions) are no longer swallowed.
        if os.path.lexists(path):
            os.unlink(path)
        os.symlink(os.path.join(dp.get_path(), "archive.h5"), path)

In [6]:
import h5py
def readH5(filepath):
    """Load the 'X' and 'Y' groups of an HDF5 file into memory.

    Parameters
    ----------
    filepath : str
        Path of the HDF5 file; it is opened read-only.

    Returns
    -------
    tuple
        ``(X, Y)`` where each element is a list holding the fully read
        datasets of the corresponding group, ordered by sorted dataset key.

    Raises
    ------
    KeyError
        Propagated from the lookup if the file lacks an 'X' or 'Y' group.
    """
    def _read_group(group):
        # Sorting the keys makes the order of the returned arrays deterministic.
        return [group[key][:] for key in sorted(group.keys())]

    # The context manager closes the file even if a lookup above raises.
    with h5py.File(filepath, 'r') as h5f:
        X = _read_group(h5f['X'])
        Y = _read_group(h5f['Y'])
    return (X, Y)

In [7]:
# Driver cell: for every group of (label, directory) pairs, archive the data,
# build the LSTM model, dump it as JSON and expose the archived chunks through
# folders of symbolic links, then read the first chunk back as a sanity check.
archive_dir = "/data/shared/Delphes/keras_archive/"
dustin_dir = "/data/shared/Delphes/dustin_MPI_files/"
patience = 8
# NOTE(review): earlyStopping is created here but not used within this cell.
earlyStopping = EarlyStopping(verbose=1, patience=patience)
#trial_tups = []
#Loop over all subsets
for ldp in ldpsubsets:
    labels = [x[0] for x in ldp]
    #for sort_on in ["PT_ET", "Phi", "Eta"]:
    for sort_on in ["Phi"]:
        #Use object maxes from Find_Maxes_From Query
        object_profiles = [ObjectProfile("Electron",-1, pre_sort_columns=["PT_ET"], pre_sort_ascending=False, sort_columns=[sort_on], sort_ascending=False, addColumns={"ObjType":1}),
                            ObjectProfile("MuonTight", -1, pre_sort_columns=["PT_ET"], pre_sort_ascending=False, sort_columns=[sort_on], sort_ascending=False, addColumns={"ObjType":2}),
                            ObjectProfile("Photon", -1, pre_sort_columns=["PT_ET"], pre_sort_ascending=False, sort_columns=[sort_on], sort_ascending=False, addColumns={"ObjType":3}),
                            ObjectProfile("MissingET", 1, addColumns={"ObjType":4}),
                            ObjectProfile("EFlowPhoton",100, pre_sort_columns=["PT_ET"], pre_sort_ascending=False, sort_columns=[sort_on], sort_ascending=False, addColumns={"ObjType":5}),
                            ObjectProfile("EFlowNeutralHadron",100, pre_sort_columns=["PT_ET"], pre_sort_ascending=False, sort_columns=[sort_on], sort_ascending=False, addColumns={"ObjType":6}),
                            ObjectProfile("EFlowTrack",100, pre_sort_columns=["PT_ET"], pre_sort_ascending=False, sort_columns=[sort_on], sort_ascending=False, addColumns={"ObjType":7})]
        #This will replace the -1's used in the previous lines with the actual maximum number of particles for each object type
        resolveProfileMaxes(object_profiles, ldp)

        #This outputs generators for training and validation with 75,000 events per process and 20,000 events per process respectively in files of roughly 100MB
        #NOTE(review): the meaning of the 115000 argument is not visible from this notebook -- confirm against getGensDefaultFormat
        dps, l = getGensDefaultFormat(archive_dir, (75000,20000), 115000, \
                             object_profiles,ldp,observ_types,megabytes=100, verbose=0)

        #Don't worry about dependencies, this is for systems like CSCS where things are run in parallel batches, but on titans it does nothing
        #batchAssertArchived is very important though, it makes sure that each 100MB chunk of data is actually archived.
        dependencies = batchAssertArchived(dps)
        #l holds the (generator, sample count) pairs for training and validation, followed by the max queue size
        train, num_train = l[0]
        val,   num_val   = l[1]
        max_q_size = l[2]
        print("MAXQ: ",max_q_size)

        #Generate the model (one output class per label); `name` is the single place to rename it
        name = 'LSTM'
        model = genModel(name,object_profiles, len(labels), 1, 'tanh', 0.0, 0.0)
        model.summary()

        #Write it to a .json file; write_json_obj presumably comes from the star import of
        #CMS_Deep_Learning.storage.archiving (an older note pointed at CMS_SURF_2016.utils.archiving) -- TODO confirm
        write_json_obj(model.to_json(), dustin_dir, "model.json")

        #Get the archived DataProcedures and make folders with symbolic links to their content
        trainfolder = dustin_dir + "train/"
        train_dps = train.args[0]
        symlinkFolderFromDPS(train_dps, trainfolder)
        valfolder = dustin_dir + "val/"
        val_dps = val.args[0]
        symlinkFolderFromDPS(val_dps, valfolder)

        #Read the first linked chunk of each set back as a sanity check
        (t_X,t_Y)  = readH5(trainfolder + "000.h5")
        (v_X,v_Y)  = readH5(valfolder + "000.h5")

        print("here are the shapes of our inputs (number of events, number of particles, number of observables)")
        print([x.shape for x in t_X])
        print("here are the shapes of our targets (number of events, number of classes/processes)")
        print([y.shape for y in t_Y])
                                


    

Starting batchAssertArchived...


KeyboardInterrupt: 