In [13]:
#We can go into our root file and see what Trees are available
%matplotlib inline
import sys, os
if __package__ is None:
    import sys, os
    sys.path.append(os.path.realpath("/data/shared/Software/"))
import numpy as np
import pandas as pd
import ntpath
import glob
import deepconfig

from keras.utils.visualize_util import plot
from IPython.display import Image, display

from CMS_Deep_Learning.utils.preprocessing import ObjectProfile, preprocessFromPandas_label_dir_pairs, label_dir_pairs_args_decoder, resolveProfileMaxes
from CMS_Deep_Learning.utils.metrics import plot_history, print_accuracy_m
from CMS_Deep_Learning.utils.callbacks import OverfitStopping, SmartCheckpoint
from CMS_Deep_Learning.utils.archiving import *

from keras.models import Sequential, Model
from keras.layers import Dense, Flatten, Reshape, Activation, Dropout, Convolution2D, merge, Input, Flatten
from keras.callbacks import EarlyStopping
import time
import threading

# Print the current time.clock() reading as a rough reference point for this session.
print(time.clock())
# Choose the GPU and the backend through the local deepconfig helper.
# NOTE(review): presumably this has to run before any Keras model is built - confirm.
dc = deepconfig.deepconfig(gpu='gpu0', backend='theano')

(True, True)
120.812519
using gpu0
using theano


In [14]:
#Set up all of our preprocessing parameters
# Names of the per-object observables; the number of entries fixes the
# width of every model input (see vecsize).
observ_types = ['E/c', 'Px', 'Py', 'Pz', 'Charge', "PT_ET", "Eta", "Phi", "Dxy_Ehad_Eem"]
vecsize = len(observ_types)
# NOTE(review): in the visible cells sample_start and samples_per_label are only
# referenced by commented-out code; the generators below take their ranges from
# the train/val/test splits instead.
sample_start = 0
#NOT USING VERY MANY SAMPLES, JUST TO ILLUSTRATE
samples_per_label = 1000

# Directory handed to DataProcedure as its archive location.
archive_dir = "/data/shared/Delphes/keras_archive/"


# One ObjectProfile per object type: (name, max_size). The EFlow* profiles are
# additionally sorted by PT_ET in descending order.
# The trailing "max ~N" notes are the original author's; presumably the largest
# multiplicity observed per event - confirm.
object_profiles = [ObjectProfile("Electron",5),
                    ObjectProfile("MuonTight", 5),
                    ObjectProfile("Photon", 25),
                    ObjectProfile("MissingET", 1),
                    ObjectProfile("EFlowPhoton",100, sort_columns=["PT_ET"], sort_ascending=False),  #max ~1300
                    ObjectProfile("EFlowNeutralHadron",100, sort_columns=["PT_ET"], sort_ascending=False),  #max ~1000
                    ObjectProfile("EFlowTrack",100, sort_columns=["PT_ET"], sort_ascending=False)]  #max ~1050

# (label, directory) pairs; each directory is globbed for "*.h5" files.
# The "qcd" sample is currently disabled.
label_dir_pairs = \
            [   ("ttbar", "/data/shared/Delphes/ttbar_lepFilter_13TeV/pandas_unjoined/"),
                ("wjet", "/data/shared/Delphes/wjets_lepFilter_13TeV/pandas_unjoined/")#,#
#                ("qcd", "/data/shared/Delphes/qcd_lepFilter_13TeV/pandas_unjoined/")
            ]

In [15]:
#object_profiles = [ObjectProfile("Electron",-1),
#                    ObjectProfile("MuonTight", -1),
#                    ObjectProfile("Photon", -1),
#                    ObjectProfile("MissingET", -1),
#                    ObjectProfile("EFlowPhoton",-1, sort_columns=["PT_ET"], sort_ascending=False),  #max ~1300
#                    ObjectProfile("EFlowNeutralHadron",-1, sort_columns=["PT_ET"], sort_ascending=False),  #max ~1000
#                    ObjectProfile("EFlowTrack",-1, sort_columns=["PT_ET"], sort_ascending=False)]  #max ~1050


#resolveProfileMaxes(object_profiles, label_dir_pairs)

In [16]:
# Show the name and max_size of every configured ObjectProfile as a sanity check.
for profile in object_profiles:
    print(profile.name , profile.max_size)

('Electron', 5)
('MuonTight', 5)
('Photon', 25)
('MissingET', 1)
('EFlowPhoton', 100)
('EFlowNeutralHadron', 100)
('EFlowTrack', 100)


In [17]:
import numpy as np
import pandas as pd
import glob

def maxMutualLength(label_dir_pairs, object_profiles):
    """Return the number of entries that every label is able to supply.

    For each ``(label, data_dir)`` pair, every ``*.h5`` file in ``data_dir``
    is opened as a pandas HDFStore and the row count of its ``/NumValues``
    frame is added to that label's total. The result is the smallest of
    those per-label totals.

    Parameters
    ----------
    label_dir_pairs : sequence of (label, data_dir)
        ``data_dir`` is concatenated directly with ``"*.h5"``, so it is
        expected to end with a path separator.
    object_profiles : sequence or None
        Objects with a ``name`` attribute. When given, each store must
        contain the key ``"/" + name`` for every profile. ``None`` skips
        that check.

    Returns
    -------
    int
        The minimum, over all labels, of the summed ``/NumValues`` lengths.

    Raises
    ------
    KeyError
        If a store lacks a requested key or has no ``/NumValues`` entry.
    ValueError
        If ``label_dir_pairs`` is empty (``min`` of an empty sequence).
    """
    # The required keys do not depend on the label, so build them once.
    keys = None
    if object_profiles is not None:
        keys = ["/" + o.name for o in object_profiles]

    label_totals = {}
    for (label, data_dir) in label_dir_pairs:
        files = glob.glob(data_dir + "*.h5")
        files.sort()

        label_totals[label] = 0
        # Loop the files associated with the current label
        for f in files:
            # Get the HDF Store for the file
            store = pd.HDFStore(f)
            try:
                if keys is not None:
                    store_keys = store.keys()
                    if not set(keys).issubset(set(store_keys)):
                        raise KeyError('File: ' + f + ' may be corrupted:' + os.linesep +
                                       'Requested keys: ' + str(keys) + os.linesep +
                                       'But found keys: ' + str(store_keys))

                # Get file_total_entries
                try:
                    num_val_frame = store.get('/NumValues')
                except KeyError as e:
                    # Re-raise with the file name so the bad file can be found.
                    raise KeyError(str(e) + " " + f)
                label_totals[label] += len(num_val_frame.index)
            finally:
                # Always release the file handle, including on the error paths.
                store.close()

    # Compare the totals themselves. Taking min() over items() would compare
    # (label, total) tuples and therefore pick the alphabetically first
    # label rather than the smallest total.
    return min(label_totals.values())

def start_num_fromSplits(splits, length):
    """Turn fractional splits into consecutive ``(start, count)`` pairs.

    Each fraction is multiplied by ``length`` and truncated with ``int``;
    the pairs are laid out back to back starting at 0. Because of the
    truncation the counts may add up to slightly less than ``length``.

    Raises ValueError when the fractions do not sum to 1.0 (checked with
    ``np.isclose``) or when any fraction is negative.
    """
    if not np.isclose(sum(splits), 1.0):
        raise ValueError("Sum of splits %r must equal 1.0" % sum(splits))
    if any(fraction < 0.0 for fraction in splits):
        raise ValueError("Splits cannot be negative")

    pairs = []
    offset = 0
    for fraction in splits:
        count = int(fraction * length)
        pairs.append((offset, count))
        offset += count
    return pairs

def procsFrom_label_dir_pairs(start, samples_per_label, stride, archive_dir,label_dir_pairs, object_profiles, observ_types, verbose=1):
    """Build one DataProcedure per ``stride``-sized chunk of a sample range.

    The range ``[start, start + samples_per_label)`` is walked in steps of
    ``stride``. Each chunk becomes a DataProcedure wrapping
    ``preprocessFromPandas_label_dir_pairs`` with that chunk's start and
    length; the last chunk is shortened so it does not run past the end of
    the range. With ``verbose >= 1`` a summary line is printed per chunk.

    Returns the list of DataProcedure objects in chunk order.
    """
    stop = start + samples_per_label
    chatty = verbose >= 1
    if chatty:
        print("Generating DataProcedure in range(%r,%r):" % (start, stop))

    procedures = []
    for chunk_start in range(start, stop, stride):
        # The final chunk may be shorter than a full stride.
        chunk_len = min(stride, stop - chunk_start)
        procedures.append(DataProcedure(archive_dir,
                                        True,
                                        preprocessFromPandas_label_dir_pairs,
                                        label_dir_pairs,
                                        chunk_start,
                                        chunk_len,
                                        object_profiles,
                                        observ_types))
        if chatty:
            n_labels = len(label_dir_pairs)
            print("   From %r labels in range(%r,%r) for %rx%r = %r Samples"
                     % (n_labels, chunk_start, chunk_start + chunk_len,
                        n_labels, chunk_len, n_labels * chunk_len))
    return procedures

class dataFetchThread(threading.Thread):
    """Thread that calls ``proc.getData()`` and keeps the result.

    After the thread has finished, the two values returned by
    ``proc.getData()`` are available as ``self.X`` and ``self.Y``; both are
    ``None`` until then (and stay ``None`` if ``getData`` raises).

    ``args`` and ``kwargs`` are only stored on the instance; they are not
    forwarded to ``threading.Thread`` and ``run`` does not use ``target``.
    """

    def __init__(self, proc, group=None, target=None, name=None,
                 args=(), kwargs=None, verbose=None):
        thread_kwargs = dict(group=group, target=target, name=name)
        # threading.Thread only accepts ``verbose`` on Python 2. Forward it
        # only when the caller actually set it, so the default construction
        # works on Python 3 as well.
        if verbose is not None:
            thread_kwargs['verbose'] = verbose
        threading.Thread.__init__(self, **thread_kwargs)
        self.proc = proc
        self.args = args
        self.kwargs = kwargs
        self.X = None
        self.Y = None

    def run(self):
        """Fetch the data and publish it on ``self.X`` / ``self.Y``."""
        self.X, self.Y = self.proc.getData()

def genFromPPs(pps, batch_size, threading=False):
    """Endlessly yield ``(X_batch, Y_batch)`` pairs from DataProcedures.

    The procedures are visited in order, over and over. For each one the
    data from ``getData()`` is cut into consecutive slices of at most
    ``batch_size`` rows. X and Y are wrapped in a list when they are not
    lists already, so each yielded item is a pair of lists of slices.

    Parameters
    ----------
    pps : sequence of DataProcedure
        Must be non-empty and support ``len`` and indexing.
    batch_size : int
        Maximum number of rows per yielded batch.
    threading : bool
        When ``True``, the next procedure is loaded by a ``dataFetchThread``
        while the current one is being consumed. Note that inside this
        function the name ``threading`` is this flag, not the module.

    Raises (on first iteration, since this is a generator)
    ------
    ValueError
        If ``pps`` is empty. Without this check the loop would spin forever
        without yielding anything.
    TypeError
        If an element of ``pps`` is not a DataProcedure.
    RuntimeError
        If a background fetch finished without producing any data.
    """
    n_procs = len(pps)
    if n_procs == 0:
        raise ValueError("genFromPPs requires at least one DataProcedure")
    for pp in pps:
        if not isinstance(pp, DataProcedure):
            raise TypeError("Only takes DataProcedure got %r" % type(pp))

    prefetch = (threading == True)
    datafetch = None
    if prefetch:
        # Start the first fetch exactly once. The wrap-around at the end of
        # each pass (below) already schedules pps[0] for the next pass.
        datafetch = dataFetchThread(pps[0])
        datafetch.start()

    while True:
        for i in range(n_procs):
            if prefetch:
                # Wait for the data to come in (blocks instead of spinning).
                datafetch.join()
                X, Y = datafetch.X, datafetch.Y
                if X is None and Y is None:
                    raise RuntimeError("Background fetch for DataProcedure at index %r "
                                       "finished without data" % i)

                # Start the next dataFetch, wrapping back to the first one.
                datafetch = dataFetchThread(pps[(i + 1) % n_procs])
                datafetch.start()
            else:
                X, Y = pps[i].getData()

            if not isinstance(X, list): X = [X]
            if not isinstance(Y, list): Y = [Y]
            tot = Y[0].shape[0]
            assert tot == X[0].shape[0]
            for start in range(0, tot, batch_size):
                end = min(start + batch_size, tot)
                yield [x[start:end] for x in X], [y[start:end] for y in Y]
                
def genFromPPs_noThreading(pps, batch_size):
    """Endlessly yield ``(X_batch, Y_batch)`` pairs, loading data inline.

    Visits the DataProcedures in ``pps`` in order, over and over, calling
    ``getData()`` on each and slicing the result into consecutive batches
    of at most ``batch_size`` rows. X and Y are wrapped in a list when they
    are not lists already. This is the same iteration that ``genFromPPs``
    performs with ``threading=False``.

    Raises (on first iteration, since this is a generator)
    ------
    ValueError
        If ``pps`` is empty. Without this check the loop would spin forever
        without yielding anything.
    TypeError
        If an element of ``pps`` is not a DataProcedure.
    """
    if len(pps) == 0:
        raise ValueError("genFromPPs_noThreading requires at least one DataProcedure")
    for pp in pps:
        if not isinstance(pp, DataProcedure):
            raise TypeError("Only takes DataProcedure got %r" % type(pp))

    while True:
        for pp in pps:
            X, Y = pp.getData()
            if not isinstance(X, list): X = [X]
            if not isinstance(Y, list): Y = [Y]
            tot = Y[0].shape[0]
            assert tot == X[0].shape[0]
            for start in range(0, tot, batch_size):
                end = min(start + batch_size, tot)
                yield [x[start:end] for x in X], [y[start:end] for y in Y]

def genFrom_label_dir_pairs(start, samples_per_label, stride, batch_size, archive_dir,label_dir_pairs, object_profiles, observ_types):
    """Create a batch generator for one sample range plus its sample count.

    The range is split into DataProcedures by ``procsFrom_label_dir_pairs``
    and those are fed to ``genFromPPs`` without background loading.

    Returns a tuple ``(generator, total)`` where ``total`` is
    ``len(label_dir_pairs) * samples_per_label``.
    """
    procedures = procsFrom_label_dir_pairs(start=start,
                                           samples_per_label=samples_per_label,
                                           stride=stride,
                                           archive_dir=archive_dir,
                                           label_dir_pairs=label_dir_pairs,
                                           object_profiles=object_profiles,
                                           observ_types=observ_types)
    batches = genFromPPs(procedures, batch_size, threading=False)
    total_samples = len(label_dir_pairs) * samples_per_label
    return (batches, total_samples)

keras_archive = "/data/shared/Delphes/keras_archive/"

# (start, count) pairs for a 70% / 15% / 15% train / validation / test split
# of the number of entries that every label can provide.
SNs = start_num_fromSplits(
    (.7, .15, .15),
    maxMutualLength(label_dir_pairs, object_profiles))
print(SNs)


def gen_lambda(s):
    """Build the (generator, sample count) pair for one (start, count) split."""
    return genFrom_label_dir_pairs(s[0],
                                   s[1],
                                   10000,
                                   100,
                                   archive_dir,
                                   label_dir_pairs,
                                   object_profiles,
                                   observ_types)


train, val, test = [gen_lambda(s) for s in SNs]

# The DataProcedures covering the training split, kept for direct access.
train_procs = procsFrom_label_dir_pairs(SNs[0][0],
                                        SNs[0][1],
                                        10000,
                                        archive_dir,
                                        label_dir_pairs,
                                        object_profiles,
                                        observ_types)
import threading



#print("SOMTHEING")
#print(train_gen,val_gen,test_gen)
#for X,Y in train[0]:
    #print('\r' + str(X[0].shape) + ',' + str(Y[0].shape),)
#   pass
#for X,Y in train_gen:
#    pass
    #print('\r' + str(X[0].shape) + ',' + str(Y[0].shape),)
    #sys.stdout.flush()
#for X,Y in val_gen:
#    print('\r' + str(X[0].shape) + ',' + str(Y[0].shape),)
#    sys.stdout.flush()
#for X,Y in test_gen:
#    print('\r' + str(X[0].shape) + ',' + str(Y[0].shape),)
#    sys.stdout.flush()

[(0, 97996), (97996, 20999), (118995, 20999)]
Generating DataProcedure in range(0,97996):
   From 2 labels in range(0,10000) for 2x10000 = 20000 Samples
   From 2 labels in range(10000,20000) for 2x10000 = 20000 Samples
   From 2 labels in range(20000,30000) for 2x10000 = 20000 Samples
   From 2 labels in range(30000,40000) for 2x10000 = 20000 Samples
   From 2 labels in range(40000,50000) for 2x10000 = 20000 Samples
   From 2 labels in range(50000,60000) for 2x10000 = 20000 Samples
   From 2 labels in range(60000,70000) for 2x10000 = 20000 Samples
   From 2 labels in range(70000,80000) for 2x10000 = 20000 Samples
   From 2 labels in range(80000,90000) for 2x10000 = 20000 Samples
   From 2 labels in range(90000,97996) for 2x7996 = 15992 Samples
Generating DataProcedure in range(97996,118995):
   From 2 labels in range(97996,107996) for 2x10000 = 20000 Samples
   From 2 labels in range(107996,117996) for 2x10000 = 20000 Samples
   From 2 labels in range(117996,118995) for 2x999 = 1998 S

In [None]:
#X,Y = preprocessFromPandas_label_dir_pairs(label_dir_pairs, 128995, 301, object_profiles, observ_types)

In [None]:
def genModel(depth, width):
    """Build a fully connected classifier over all object profiles.

    One ``Input`` of shape ``(profile.max_size, vecsize)`` is created per
    entry of the module-level ``object_profiles`` and named after the
    profile. Every input is flattened, the results are concatenated, and
    ``depth`` ReLU ``Dense`` layers of ``width`` units follow. The output
    layer has one sigmoid unit per entry of ``label_dir_pairs``.

    Returns the (uncompiled) Keras ``Model`` named "dense".
    """
    input_layers = []
    flattened = []
    #Build Inputs
    for profile in object_profiles:
        key = profile.name
        layer_in = Input(shape=(profile.max_size, vecsize), name=key)
        input_layers.append(layer_in)
        flattened.append(Flatten()(layer_in))

    #Merge inputs
    hidden = merge(flattened, mode='concat', name="merge")

    #Stack the hidden layers
    for j in range(depth):
        hidden = Dense(width, activation='relu', name="dense_2_"+str(j))(hidden)

    #Create dense sigmoid layer for classification
    dense_out = Dense(len(label_dir_pairs), activation='sigmoid', name='main_output')(hidden)
    return Model(input=input_layers, output=dense_out, name="dense")

#Get the training data
#X_train, Y_train = preprocessFromPandas_label_dir_pairs(label_dir_pairs,
#                                                        sample_start,
#                                                        samples_per_label,
#                                                        object_profiles,
#                                                        observ_types)

#Generate the model from our function above: 3 hidden layers of 100 units each
model = genModel(3,100)
model.compile(loss='binary_crossentropy',
          optimizer='rmsprop',
          metrics=['accuracy']
              )

#VERY FEW EPOCHS FOR ILLUSTRATIVE REASONS
epochs = 10
# NOTE: batch_size is not passed to fit_generator below; in this cell it is only
# used by the commented-out model.fit call. The generators carry their own
# batch size, fixed when they were built.
batch_size = 100

# NOTE(review): patience equals the number of epochs, so early stopping most
# likely never fires in this run - confirm and raise epochs as suggested below.
earlystopping = EarlyStopping(patience=10, verbose=1)

# Time the training and plotting that follow.
start_time = time.clock()
# train and val are (generator, sample count) pairs: element 0 feeds the
# batches, element 1 tells Keras how many samples make up one pass.
history = model.fit_generator(generator=train[0],
                samples_per_epoch=train[1],
                nb_epoch=epochs,
                max_q_size=100,
                #nb_workers=2,
                validation_data=val[0],
                nb_val_samples=val[1],
                #YOU SHOULD REALLY USE CALLBACKS AND INCREASE THE # of EPOCHS TO LIKE 100
                callbacks=[earlystopping])

# Alternative kept for reference: fit on each training DataProcedure in memory.
#for proc in train_procs:
#    X,Y = proc.get_XY()
#    history = model.fit(X,Y,batch_size=batch_size,
#                 nb_epoch=epochs,
#                 validation_split=.15)

# Plot the training history, then render the model graph to a PNG and show it.
plot_history([("MySimpleModel", history)])
dot = plot(model, to_file="MySimpleModel.png", show_shapes=True, show_layer_names=False)
display(Image("MySimpleModel.png"))
# Elapsed time covers training plus the plotting above.
print("ElapseTime:", time.clock()-start_time)

Epoch 1/10
DataProcedure results '44f7c73ddbda7e3cbb436c464571f84bfeca3562' read from archive
 10900/195992 [>.............................] - ETA: 26s - loss: 1.6455 - acc: 0.6626Reading 10000 samples from '/data/shared/Delphes/ttbar_lepFilter_13TeV/pandas_unjoined/ttbar_lepFilter_13TeV_1.h5':
Mapping 5 Values/Sample from 'Electron'
 17100/195992 [=>............................] - ETA: 57s - loss: 1.2425 - acc: 0.7038Mapping 5 Values/Sample from 'MuonTight'
 20000/195992 [==>...........................] - ETA: 63s - loss: 1.1315 - acc: 0.7177

In [None]:
# Evaluate on the held-out test split: test[0] is the batch generator and
# test[1] the number of samples to draw from it.
model.evaluate_generator(test[0], test[1])