In [183]:
import sys,os
import numpy as np
import pandas as pd
import glob
import threading
#from .archiving import DataProcedure, KerasTrial
#from .meta import msgpack_assertMeta
#from ..layers.lorentz import Lorentz
#from ..layers.slice import Slice
import os
import re
import sys
import socket
import time
if not "/data/shared/Software/" in sys.path:
    sys.path.append("/data/shared/Software/")
from CMS_SURF_2016.utils.preprocessing import getFiles_StoreType,getNumValFrame
DEFAULT_PROFILE = {
                        "name" : " ",
                        "max_size" : 100,
                        "pre_sort_columns" : None,
                        "pre_sort_ascending" :True,
                        "sort_columns" : None,
                        "sort_ascending" : True,
                        "query" : None,
                        "shuffle" : False,
                        "addColumns" : None}
class ObjectProfile():
    


    def __init__(self, *args, **kargs):
        ''' An object containing processing instructions for each observable object type
            #unkeyed Arguements:
                name       -- The name of the data type (i.e. Electron, Photon, EFlowTrack, etc.)
                max_size   -- The maximum number of objects to use in training
            #keyed Arguements
                pre_sort_columns -- What columns to sort before cutting on max_size (See pandas.DataFrame.sort)
                pre_sort_ascending -- Whether each column will be sorted ascending or decending before cutting on max_size (See pandas.DataFrame.sort)
                sort_columns -- What columns to sort on after processing (See pandas.DataFrame.sort)
                sort_ascending -- Whether each column will be sorted ascending or decending after processing (See pandas.DataFrame.sort)
                query        -- A selection query string to use before truncating the data (See pands.DataFrame.query)
                shuffle     -- Whether or not to shuffle the data
                addColumns -- A dictionary with single constant floats or integers to fill an additional column in the table.
                             This column should be in observ_types if it is used with preprocessFromPandas_label_dir_pairs
        '''
        d = {}
        if(isinstance(args[0], dict)):
            d = args[0]
        elif(isinstance(args[0], str)):
            d["name"] = args[0]
            if(isinstance(args[1], int)):
                d["max_size"] = args[1]
        if(len(args) > 2):
            raise ValueError("Please explicitly name arguements with values %r" % args[2:])

        for key, value in DEFAULT_PROFILE.items():
            # print(kargs.get(key, "Nope"),d.get(key, "Nope"), value)
            setattr(self, key, kargs.get(key, d.get(key, value)))

        if(self.max_size < -1):
            raise ValueError("max_size cannot be less than -1. Got %r" % self.max_size)
        if(self.addColumns != None and not isinstance(self.addColumns, dict)):
            raise ValueError("arguement addColumns must be a dictionary, but got %r" % type(self.addColumns))
       
        self.class_name = self.__class__.__name__



    def __str__(self):
        main_clause = 'name:%r max_size=%r ' % (self.name, self.max_size)
        sort_clause = ''
        query_clause = ''
        if(self.pre_sort_columns != None):
            sort_clause = 'pre_sort_columns=%r pre_sort_ascending=%r ' % (self.pre_sort_columns, self.pre_sort_ascending)
        if(self.sort_columns != None):
            sort_clause = 'sort_columns=%r sort_ascending=%r ' % (self.sort_columns, self.sort_ascending)
        if(self.query != None):
            query_clause = 'query=%r ' % (self.query)
        shuffle_clause = 'shuffle=%r' % self.shuffle

        return main_clause + sort_clause + query_clause + shuffle_clause
    
    __repr__ = __str__

#def padItem(x,max_size, vecsize, shuffle=False):
#    '''A helper function that pads a numpy array up to MAX_SIZE or trucates it down to MAX_SIZE. If shuffle==True,
#        shuffles the padded output before returning'''
#    if(len(x) > max_size):
#        out = x[:max_size]
#    else:
#        out = np.append(x ,np.array(np.zeros((max_size - len(x), vecsize))), axis=0)
#    if(shuffle == True): np.random.shuffle(out)
#    return out
def _getStore(f, storeType):
    '''Helper Function - Gets the HDFStore or frames for the file and storeType'''
    frames = None
    if(storeType == "hdf5"):
        store = pd.HDFStore(f)
    elif(storeType == "msgpack"):
        print("Bulk reading .msg. Be patient, reading in slices not supported.")
        sys.stdout.flush()
        #Need to check for latin encodings due to weird pandas default
        try:
            frames = pd.read_msgpack(f)
        except UnicodeDecodeError as e:
            frames = pd.read_msgpack(f, encoding='latin-1')
    return store,frames
def _getFrame(store, storeType, key, select_start, select_stop,
              samples_to_read, file_total_entries, frames):
    '''Helper Function - gets '''
    if(storeType == "hdf5"):
        #If we are reading all the samples use get since it might be faster
        #TODO: check if it is actually faster
        if(samples_to_read == file_total_entries):
            frame = store.get('/'+key)
        else:
            frame = store.select('/'+key, start=select_start, stop=select_stop)
    elif(storeType == "msgpack"):
        frame = frames[key]
        frame = frame[select_start:select_stop]
    return frame

def _groupsByEntry(f, storeType, samples_per_label, samples_to_read, file_total_entries, num_val_frame,file_start_read,object_profiles):
    '''Helper Function - produces dict keyed by object type and filled with groupBy objects w.r.t Entry'''
    store, frames = _getStore(f, storeType)

    #Get information about how many rows there are for each entry for the rows we want to skip and read
    skip_val_frame = num_val_frame[:file_start_read]
    num_val_frame = num_val_frame[file_start_read : file_start_read+samples_to_read]


    groupBys = {}
    #Loop over every profile and read the corresponding tables in the pandas file
    for index, profile in enumerate(object_profiles):
        key = profile.name                
        #Where to start reading the table based on the sum of the selection start 
        select_start = int(skip_val_frame[key].sum())
        select_stop = select_start + int(num_val_frame[key].sum())

        frame = _getFrame(store, storeType, key, select_start, select_stop,
                          samples_to_read, file_total_entries,frames)
        #Group by Entry
        groupBys[key] = frame.groupby(["Entry"], group_keys=True)
    return groupBys, store

def _applyCuts(df, profile,vecsize):
    '''Helper Function - presorts, applies queries, adds columns, and makes cuts'''
    if(profile.pre_sort_columns != None):
        df = df.sort(profile.pre_sort_columns, ascending=profile.pre_sort_ascending
    if(profile.query != None):
        df = df.query(profile.query)
    #Add any additional columns
    if(profile.addColumns != None):
        for key, value in profile.addColumns.items():
            df[key] = value
    #Make cut, preserving only profile.max_size of top of table
    df = df.head(profile.max_size)
    #Only use observable columns
    df = df[observ_types]
    return df
    
def _padAndSort(df, profile,vecsize):
    '''Helper Function - pads the data and sorts it'''
    if(isinstance(df, type(None))):
        #If a DataFrame does not exist for this entry then just inject zeros 
        x = np.array(np.zeros((profile.max_size, vecsize)))
    else:
        #Find sort_locs before we convert to np array
        sort_locs = None
        if(profile.sort_columns != None):
            sort_locs = [df.columns.get_loc(s) for s in profile.sort_columns]
        
        #x is an np array not a DataFrame
        x = df.values
        
        if(sort_locs != None):
            for loc in reversed(sort_locs):
                if(profile.sort_ascending == True):
                    x = x[x[:,loc].argsort()]
                else:
                    x = x[x[:,loc].argsort()[::-1]]
    
        #pad the array
        x = np.append(x ,np.array(np.zeros((profile.max_size - len(x), vecsize))), axis=0)
    return x    

def _initializeXY(single_list, label_dir_pairs, num_object_profiles, samples_per_label, num_labels):
    '''Helper Function - Generates the initial data structures for the X (data) and Y (target)'''
    label_vecs = {}
    for i, (label, data_dir) in enumerate(label_dir_pairs):
        arr = np.zeros((num_labels,))
        arr[i] = 1
        label_vecs[label] = arr
        
    if(single_list):
        X_train = [None] * (samples_per_label * num_labels)
        #global_profile = ObjectProfile("list", max_size="")
    else:
        X_train = [None] * (num_object_profiles)
        #Prefill the arrays so that we don't waste time resizing lists
        for index in range(num_object_profiles):
            X_train[index] = [None] * (samples_per_label * num_labels)
            
    y_train = [None] * (samples_per_label * num_labels)
    return X_train, y_train, label_vecs
   
def _check_Object_Profiles(object_profiles):
    '''Helper Function - Makes sure that all ObjectProfiles are correctly formatted,
        makes formatting corrections if necessary'''
    for i,profile in enumerate(object_profiles):
        if(isinstance(profile, dict) and profile.get('class_name', None) == "ObjectProfile"):
            profile = ObjectProfile(profile)
            object_profiles[i] = profile
        if(profile.max_size == -1 or profile.max_size == None):
            raise ValueError("ObjectProfile max_sizes must be resolved before preprocessing. \
                         Please first use: utils.preprocessing.resolveProfileMaxes(object_profiles, label_dir_pairs)")
        if(profile.addColumns != None):
            for key, value in profile.addColumns.items():
                if(not key in observ_types):
                    raise ValueError("addColumn Key %r must be in observ_types" % key)
    return object_profiles

def _check_inputs(label_dir_pairs, observ_types):
    '''Helper Function - Makes sure that label_dir_pairs, and observ_types are correctly formatted'''
    labels = [x[0] for x in label_dir_pairs]
    duplicates = list(set([x for x in labels if labels.count(x) > 1]))
    if(len(duplicates) != 0):
        raise ValueError("Cannot have duplicate labels %r" % duplicates)
    if("Entry" in observ_types):
        raise ValueError("Using Entry in observ_types can result in skewed training results. Just don't.")
        
def preprocessFromPandas_label_dir_pairs(label_dir_pairs,start, samples_per_label, object_profiles, observ_types,
                                         single_list=False, sort_columns=None, sort_ascending=True,verbose=1):
    '''Gets training data from folders of pandas tables
        #Arguements:
            label_dir_pairs -- a list of tuples of the form (label, directory) where the directory contains
                                tables containing data of all the same event types.
            start             --    Where to start reading (as if all of the files are part of one long list)
            samples_per_label -- The number of samples to read for each label
            object_profiles -- A list of ObjectProfile(s) corresponding to each type of observable object and
                                its preprocessing steps. 
            observ_types    -- The column headers for the data to be read from the panadas table
            single_list -- If True all object types are joined into a single list.
            sort_columns -- If single_list the columns to sort by.
            sort_ascending -- If True sort in ascending order, false decending  
        #Returns:
            Training data with its correspoinding labels
            (X_train, Y_train)
    '''
    _check_inputs(label_dir_pairs, observ_types)
    #Make sure that all the profile are proper objects and have resolved max_sizes
    object_profiles = _check_Object_Profiles(object_profiles)
    
    vecsize = len(observ_types)
    num_labels = len(label_dir_pairs)
    
    #Build vectors in the form [1,0,0], [0,1,0], [0, 0, 1] corresponding to each label
    X_train, y_train, label_vecs = _initializeXY(single_list, label_dir_pairs, len(object_profiles), samples_per_label, num_labels)
    X_train_index = 0
    
    #Loop over label dir pairs and get the file list for each directory
    y_train_start = 0
    for (label,data_dir) in label_dir_pairs:
        files, storeType = getFiles_StoreType(data_dir)
        files.sort()
        samples_read = 0
        location = 0
        
         #Loop the files associated with the current label
        for f in files:
            num_val_frame = getNumValFrame(f,storeType)

            file_total_entries = len(num_val_frame.index)

            assert file_total_entries > 0, "num_val_frame has zero values"
            
            if(location + file_total_entries <= start):
                location += file_total_entries
                continue
            
            #Determine what row to start reading the num_val table which contains
            #information about how many rows there are for each entry
            file_start_read = start-location if start > location else 0
            
            #How many rows we will read from this table each corresponds to one entry
            samples_to_read = min(samples_per_label-samples_read, file_total_entries-file_start_read)
            assert samples_to_read >= 0
            
            if(verbose >= 1): print("Reading %r samples from %r:" % (samples_to_read,f))
            
            groupBys,store = _groupsByEntry(f, storeType, samples_per_label,samples_to_read, file_total_entries,
                                            num_val_frame,file_start_read,object_profiles)
                
            if(verbose >= 1): print("Values/Sample from: %r" % {p.name: p.max_size for p in object_profiles})
            
            cut_tables = [None] * len(object_profiles)
            last_time = time.clock()-1.0
            prev_entry = file_start_read
            for entry in range(file_start_read, file_start_read+samples_to_read):
                #Make a pretty progress bar in the terminal
                if(verbose >= 1):      
                    c = time.clock() 
                    if(c > last_time + .25):
                        percent = float(entry-file_start_read)/float(samples_to_read)
                        sys.stdout.write('\r')
                        sys.stdout.write("[%-20s] %r/%r  %r(Entry/sec)" % ('='*int(20*percent), entry, int(samples_to_read), 4 * (entry-prev_entry)))
                        sys.stdout.flush()
                        last_time = c
                        prev_entry = entry
                        
                for index, profile in enumerate(object_profiles):
                        #print(groupBys.keys())
                        groupBy = groupBys[profile.name]
                        if(entry in groupBy.groups):
                            df = _applyCuts(groupBy.get_group(entry), profile, vecsize)
                            cut_tables[index] = df
                        else:
                            cut_tables[index] = None
                if(single_list):
                    df = pd.concat(cut_tables)
                    list_profile = ObjectProfile("single_list",
                                                sum([profile.max_size for profile in object_profiles]),
                                                sort_columns=sort_columns,
                                                sort_ascending=sort_ascending)    
                                                    
                    x  = _padAndSort(df,list_profile,vecsize)
                    X_train[X_train_index + entry - file_start_read] = x
                else:
                    for index, profile in enumerate(object_profiles):
                        arr = X_train[index]
                        df = cut_tables[index]
                        x  = _padAndSort(df,profile, vecsize)
                        arr[X_train_index + entry - file_start_read] = x
            
            X_train_index += samples_to_read
            
            #Free this (probably not necessary)
            num_val_frame = None
            if(storeType == "hdf5"):
                store.close()
            location     += file_total_entries
            samples_read += samples_to_read
            if(verbose >= 1): print("*Read %r Samples of %r in range(%r, %r)" % (samples_read, samples_per_label, start, samples_per_label+start))
            if(samples_read >= samples_per_label):
                if(verbose >= 1): print('-' * 50)
                assert samples_read == samples_per_label
                break
        if(samples_read != samples_per_label):
            raise IOError("Not enough data in %r to read in range(%r, %r)" % (data_dir, start, samples_per_label+start))
        
        #Generate the target data as vectors like [1,0,0], [0,1,0], [0,0,1]
        for i in range(samples_per_label):
            y_train[y_train_start+i] = label_vecs[label]
        y_train_start += samples_per_label
    
    #Turn everything into numpy arrays and shuffle them just in case.
    #Although, we probably don't need to shuffle since keras shuffles by default.
    y_train = np.array(y_train)
    
    indices = np.arange(len(y_train))
    np.random.shuffle(indices)
    if(single_list):
        X_train = np.array(X_train)
    else:
        for index in range(len(X_train)):
            X_train[index] = np.array(X_train[index])[indices]

    y_train = y_train[indices]
    return X_train, y_train

SyntaxError: invalid syntax (<ipython-input-183-ff8cd1375e3c>, line 151)

In [163]:
#from CMS_SURF_2016.utils.preprocessing import *



#The observables taken from the table
observ_types = ['E/c', 'Px', 'Py', 'Pz', 'PT_ET','Eta', 'Phi', 'Charge', 'X', 'Y', 'Z',\
                     'Dxy', 'Ehad', 'Eem', 'MuIso', 'EleIso', 'ChHadIso','NeuHadIso','GammaIso', "ObjType"]
vecsize = len(observ_types)
epochs = 30
batch_size = 100

label_dir_pairs = \
            [   ("ttbar", "/data/shared/Delphes/ttbar_lepFilter_13TeV/pandas_h5/"),
                ("wjet", "/data/shared/Delphes/wjets_lepFilter_13TeV/pandas_h5/"),
                ("qcd", "/data/shared/Delphes/qcd_lepFilter_13TeV/pandas_h5/")
            ]
object_profiles = [ObjectProfile("EFlowPhoton",100, pre_sort_columns=["PT_ET"],
                                 pre_sort_ascending=False, sort_columns=["Phi"],
                                 addColumns={"ObjType":1}),
                   ObjectProfile("Electron",100, pre_sort_columns=["PT_ET"],
                                 pre_sort_ascending=False, sort_columns=["Phi"],
                                 addColumns={"ObjType":1})
                  
                  ] 
X1, Y1 = preprocessFromPandas_label_dir_pairs(label_dir_pairs,0, 1000, object_profiles, observ_types, verbose=1)
X2, Y2 = preprocessFromPandas_label_dir_pairs(label_dir_pairs,0, 1000, object_profiles, observ_types, single_list=True,verbose=1)

Reading 1000 samples from '/data/shared/Delphes/ttbar_lepFilter_13TeV/pandas_h5/ttbar_lepFilter_13TeV_0.h5':
Values/Sample from: {'EFlowPhoton': 100, 'Electron': 100}
--------------------------------------------------
Reading 1000 samples from '/data/shared/Delphes/wjets_lepFilter_13TeV/pandas_h5/wjets_lepFilter_13TeV_1.h5':
Values/Sample from: {'EFlowPhoton': 100, 'Electron': 100}
--------------------------------------------------
Reading 999 samples from '/data/shared/Delphes/qcd_lepFilter_13TeV/pandas_h5/qcd_lepFilter_13TeV_0.h5':
Values/Sample from: {'EFlowPhoton': 100, 'Electron': 100}
Reading 1 samples from '/data/shared/Delphes/qcd_lepFilter_13TeV/pandas_h5/qcd_lepFilter_13TeV_1.h5':
Values/Sample from: {'EFlowPhoton': 100, 'Electron': 100}
[                    ] 0/1  0(Entry/sec)*Read 1000 Samples of 1000 in range(0, 1000)
--------------------------------------------------
Reading 1000 samples from '/data/shared/Delphes/ttbar_lepFilter_13TeV/pandas_h5/ttbar_lepFilter_13TeV_0.h5

KeyboardInterrupt: 

In [164]:
print(X1[0][0])

[[  1.64259722e+01  -1.21117621e+00  -1.98987888e-01 ...,   0.00000000e+00
   -9.62650660e-01   1.00000000e+00]
 [  4.85319116e+01  -1.44196159e+00  -3.84855252e-01 ...,   1.55532244e-01
   -8.66347893e-01   1.00000000e+00]
 [  6.36782964e+01  -9.73977580e-01  -3.98754945e-01 ...,   5.82509306e-02
   -8.61783705e-01   1.00000000e+00]
 ..., 
 [  7.67499237e+01  -1.32991701e+00   1.19275905e-01 ...,   1.39568590e-01
   -9.07536685e-01   1.00000000e+00]
 [  3.07067674e+01  -1.11861911e+00   7.69236240e-02 ...,   2.16747297e-01
   -7.87645092e-01   1.00000000e+00]
 [  1.98119325e+01  -2.02595219e+00   1.06812292e-01 ...,   1.17052374e-01
   -9.57811825e-01   1.00000000e+00]]


In [165]:
print(X2[0], X2.shape)

NameError: name 'X2' is not defined

In [222]:
import unittest
import tempfile
gen_observ_types = ['PT_ET','Eta', 'Phi']
observ_types = gen_observ_types + ["ObjType"]

obs_pl_t = ["Entry"] + gen_observ_types
vecsize = len(obs_pl_t)
RANDOM_SEED = 7
np.random.seed(seed=RANDOM_SEED)

object_profiles1 = [ObjectProfile("EFlowPhoton",20, pre_sort_columns=["PT_ET"],
                                 pre_sort_ascending=False,
                                 addColumns={"ObjType":1}),
                   ObjectProfile("EFlowTracks",20, pre_sort_columns=["PT_ET"],
                                 pre_sort_ascending=True,
                                 addColumns={"ObjType":2}),
                   ObjectProfile("EFlowNeutralHadron",20, pre_sort_columns=["PT_ET"],
                                 pre_sort_ascending=False, sort_columns=["Phi"], sort_ascending=True,
                                 addColumns={"ObjType":3}),
                   ObjectProfile("MET",1, pre_sort_columns=["PT_ET"],
                                 pre_sort_ascending=False, sort_columns=["Phi"],
                                 addColumns={"ObjType":4}),
                   ObjectProfile("MuonTight",10, pre_sort_columns=["PT_ET"],
                                 pre_sort_ascending=False, sort_columns=["Phi"], sort_ascending=False,
                                 addColumns={"ObjType":5}),
                   ObjectProfile("Electron",10, #pre_sort_columns=["PT_ET"],
                                 pre_sort_ascending=False, sort_columns=["Phi"],
                                 addColumns={"ObjType":6})
                  ]
def rand_pl_entry(entry, a,b):
    out = np.concatenate([np.full((a,1),entry, dtype='float64'), np.random.randn(a,b)], axis = 1)
    return out
def norm_uint(mean, std):
    return max(int(np.random.normal(mean, std)),0)
def fake_frames(N,object_profiles):
    vecsize = len(obs_pl_t)
    frames = {profile.name:pd.DataFrame(columns=obs_pl_t) for profile in object_profiles}
    #print(frames.values()[0].shape, (1,1, vecsize))
    num_val_dict = {key:[None]*N for key, frame in frames.items()}
    for profile in object_profiles:
        frames[profile.name] = pd.DataFrame(columns=obs_pl_t)
    for entry in range(N):
        #n = norm_uint(100,35)
        n = norm_uint(3,1)
        num_val_dict["EFlowPhoton"][entry] = n       
        frames["EFlowPhoton"] = pd.concat([frames["EFlowPhoton"],pd.DataFrame(rand_pl_entry(entry,n, vecsize-1), columns=obs_pl_t)])
       
        #n = norm_uint(120, 23)
        n = norm_uint(3,1)
        num_val_dict["EFlowTracks"][entry] = n  
        frames["EFlowTracks"] = pd.concat([frames["EFlowTracks"] ,pd.DataFrame(rand_pl_entry(entry,n, vecsize-1), columns=obs_pl_t)])
        
        #n = norm_uint(90, 27)
        n = norm_uint(3,1)
        num_val_dict["EFlowNeutralHadron"][entry] = n
        frames["EFlowNeutralHadron"] = pd.concat([frames["EFlowNeutralHadron"] ,pd.DataFrame(rand_pl_entry(entry,n, vecsize-1), columns=obs_pl_t)])

        n = 1
        num_val_dict["MET"][entry] = n
        frames["MET"] = pd.concat([frames["MET"] ,pd.DataFrame(rand_pl_entry(entry,n, vecsize-1), columns=obs_pl_t)])

        n = int(np.random.uniform(0, 5))
        num_val_dict["MuonTight"][entry] = n
        frames["MuonTight"] = pd.concat([frames["MuonTight"] ,pd.DataFrame(rand_pl_entry(entry,n, vecsize-1), columns=obs_pl_t)])

        n = int(np.random.uniform(0, 5))
        num_val_dict["Electron"][entry] = n
        frames["Electron"] = pd.concat([frames["Electron"] ,pd.DataFrame(rand_pl_entry(entry,n, vecsize-1), columns=obs_pl_t)])
    frames["NumValues"] = pd.DataFrame(num_val_dict)
    #print([f.shape for f in frames.values()])
    #raise ValueError()
    return frames
def store_frames(frames, filepath):
    store = pd.HDFStore(filepath)
    for key,frame in frames.items():
        store.put(key, frame, format='table')
        #print(filepath,key, frame.shape)
    store.close()
    #print(frames.keys())
    store = pd.HDFStore(filepath)
    #print(store.keys())
def store_fake(directory, size, num, object_profiles):
    if not os.path.exists(directory):
        os.makedirs(directory)
    frames_list = [None]* num
    for i in range(num):
        frames = fake_frames(size, object_profiles)
        store_frames(frames, directory+"%03i.h5" % i)
        frames_list[i] = frames
    return frames_list

temp_dir = tempfile.gettempdir() + "/fake_delphes/"
ttbar_dir = temp_dir + "ttbar/"
wjet_dir = temp_dir + "wjet/"
qcd_dir = temp_dir + "qcd/"
frame_lists = {}
label_dir_pairs = [("ttbar", ttbar_dir), ("wjet", wjet_dir), ("qcd", qcd_dir)]

import operator

def justCheckSize(X, Y,sizes):
    if(not np.array_equal(
            np.array([x.shape for x in X]),sizes)):
        print("Failed: data is wrong size")
    if(True in [x.dtype == np.dtype(object) for x in X]):
        print("Failed: data is not square")

def checkGeneralSanity(X, Y, frame_lists, sizes,  NUM, label_dir_pairs):
    is_single_list = False
    if(not isinstance(X, list)):
        is_single_list = True
        X = [X]
    justCheckSize(X, Y,sizes)
    
    all_values_by_label = {tup[0]:[None] * NUM for tup in label_dir_pairs}
    
    for entry in range(NUM):
        for label, frame_list in frame_lists.items():
            for f in frame_list:
                f = {k: df.query("Entry == %r" % entry) for k, df in f.items() if k != "NumValues"}
                all_values_by_label[label][entry] =pd.concat([df for df in f.values()]) 
                
    tn = [0]*5
    tn[4] = {p[0]: 0 for p in label_dir_pairs}
    z = np.zeros(vecsize)
    for i in range(1,len(sizes)+1):
        x = X[i-1]
        for s in x:
            was_zero = False
            for row in s:
                iszero = np.array_equal(row,z)
                if(not iszero):
                    if(was_zero):
                        tn[0] = 1 
                    #print(row[vecsize-1])
                    if (not row[vecsize-1] == i):
                        tn[1] = 1
                if(iszero): 
                    was_zero = True
                else:
                    row = row[:-1]
                    ok = 0
                    for label, frame in all_values_by_label.items():
                        frame = pd.concat(frame)
                        frame = frame.drop("Entry", axis=1)
                        if((frame == row).all(1).any()):
                            ok += 1
                            tn[4][label] = True
                    if(ok == 0):
                        tn[2] = 3
                    if(ok > 1):
                        tn[3] = 3
                
    if(tn[0] != 0):        
        print("Failed: padding not at end")
    if(tn[1] != 0 ^ is_single_list):       
        print("Failed: Add column value incorrect")
    if(tn[2] != 0):
        print("Failed: Data does not come from table")
    if(tn[3] != 0):
        print("Failed: Row persists between samples of different classes")
    if(min(tn[4].values()) == 0):
        print("Failed: Not all labels are used")
def checkCutsAndSorts(X, Y, frame_lists, sizes,  NUM, label_dir_pairs, object_profiles, observ_types, sort_columns=None, sort_ascending=True):
    first = lambda x: x if not isinstance(x, list) else x[0]
    is_single_list = False
    if(not isinstance(X, list)):
        is_single_list = True
        if(sort_columns != None):
            object_profiles = [ObjectProfile(" ", 0, sort_columns=sort_columns, sort_ascending=sort_ascending)]
        else:
            return
        X = [X]
        
    tn = [0]* 2
    z = np.zeros(vecsize)
    
    for index, profile in enumerate(object_profiles):
        x = X[index]
        sort_index = None
        ascending = False
        try:
            if(profile.sort_columns == None):
                sort_index = observ_types.index(first(profile.pre_sort_columns))
                ascending = profile.pre_sort_ascending
                ti = 0
            else:
                sort_index = observ_types.index(first(profile.sort_columns))
                ascending = profile.sort_ascending
                ti = 1
        except ValueError:
            pass
        if(sort_index != None):
            for s in x:
                prev_row = None
                for row in s:
                    iszero = np.array_equal(row,z)
                    if(not isinstance(prev_row, type(None)) and not iszero):
                        if(ascending):
                            if(row[sort_index] < prev_row[sort_index]):
                                tn[ti] = 1
                        else:
                            if(row[sort_index] > prev_row[sort_index]):
                                tn[ti] = 1    
                    prev_row = row
        if(tn[0] != 0):
            print("Failed: presorting incorrect.")
        if(tn[1] != 0):
            print("Failed: sorting incorrect.")
def checkDuplicates(X, Y, object_profiles):
    is_single_list = False
    if(not isinstance(X, list)):
        is_single_list = True
        X = [X]
    z = np.zeros(vecsize)
    rows = []
    for index, profile in enumerate(object_profiles):
        x = X[index]
        for s in x:
            for row in s:
                iszero = np.array_equal(row,z)
                if(not iszero):
                    rows.append(tuple(row))
    if(len(set(rows)) != len(rows)):
        print("Duplicate Found")
                                       
NUM = 5
frame_lists = {l:store_fake(d,NUM, 1, object_profiles1) for l, d in label_dir_pairs}
OPS = object_profiles1

X, Y = preprocessFromPandas_label_dir_pairs(label_dir_pairs,0, NUM, OPS, observ_types, verbose=1)
sizes = np.array([[len(label_dir_pairs)*NUM, p.max_size, vecsize] for p in OPS])
checkGeneralSanity(X, Y, frame_lists, sizes,  NUM, label_dir_pairs)
checkCutsAndSorts(X, Y, frame_lists, sizes,  NUM, label_dir_pairs, OPS, observ_types)

X, Y = preprocessFromPandas_label_dir_pairs(label_dir_pairs,0, NUM, OPS, observ_types, verbose=1, single_list=True)
sizes = np.array([[len(label_dir_pairs)*NUM, sum([p.max_size for p in OPS]), vecsize]])
checkGeneralSanity(X, Y, frame_lists, sizes,  NUM, label_dir_pairs)
checkCutsAndSorts(X, Y, frame_lists, sizes,  NUM, label_dir_pairs, OPS, observ_types)

X, Y = preprocessFromPandas_label_dir_pairs(label_dir_pairs,0, NUM, OPS, observ_types, verbose=1, single_list=True,
                                            sort_columns=["Eta"], sort_ascending=False)
checkCutsAndSorts(X, Y, frame_lists, sizes,  NUM, label_dir_pairs, OPS, observ_types,
                 sort_columns=["Eta"], sort_ascending=False)

X, Y = preprocessFromPandas_label_dir_pairs(label_dir_pairs,0, NUM, OPS, observ_types, verbose=1, single_list=True,
                                            sort_columns=["Phi"], sort_ascending=True)
checkCutsAndSorts(X, Y, frame_lists, sizes,  NUM, label_dir_pairs, OPS, observ_types,
                  sort_columns=["Phi"], sort_ascending=True)

NUM = 15
frame_lists = {l:store_fake(d,5, 4, object_profiles1) for l, d in label_dir_pairs}

X, Y = preprocessFromPandas_label_dir_pairs(label_dir_pairs,5, NUM, OPS, observ_types, verbose=1)
sizes = np.array([[len(label_dir_pairs)*NUM, p.max_size, vecsize] for p in OPS])
justCheckSize(X,Y, sizes)
checkDuplicates(X,Y,object_profiles)

((0, 4), (1, 1, 4))
((0, 4), (1, 1, 4))
((0, 4), (1, 1, 4))
Reading 5 samples from '/tmp/fake_delphes/ttbar/000.h5':
Values/Sample from: {'EFlowPhoton': 20, 'MuonTight': 10, 'Electron': 10, 'MET': 1, 'EFlowTracks': 20, 'EFlowNeutralHadron': 20}
--------------------------------------------------
Reading 5 samples from '/tmp/fake_delphes/wjet/000.h5':
Values/Sample from: {'EFlowPhoton': 20, 'MuonTight': 10, 'Electron': 10, 'MET': 1, 'EFlowTracks': 20, 'EFlowNeutralHadron': 20}
--------------------------------------------------
Reading 5 samples from '/tmp/fake_delphes/qcd/000.h5':
Values/Sample from: {'EFlowPhoton': 20, 'MuonTight': 10, 'Electron': 10, 'MET': 1, 'EFlowTracks': 20, 'EFlowNeutralHadron': 20}
--------------------------------------------------
Reading 5 samples from '/tmp/fake_delphes/ttbar/000.h5':
Values/Sample from: {'EFlowPhoton': 20, 'MuonTight': 10, 'Electron': 10, 'MET': 1, 'EFlowTracks': 20, 'EFlowNeutralHadron': 20}
--------------------------------------------------

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [None]:
import sys,os
sys.path.append(os.path.realpath("../"))

In [32]:
import sys,os
from six import string_types
from types import NoneType

#if __package__ is None:

    #sys.path.append(os.path.realpath(__file__+"/../../../"))


import time, math,re,h5py,shutil
import argparse
from multiprocessing import Process
from time import sleep
from CMS_Deep_Learning.io import size_from_meta,get_sizes_meta_dict

PARTICLE_OBSERVS = ['Energy', 'Px', 'Py', 'Pz', 'Pt', 'Eta', 'Phi', 'Charge',
                    'ChPFIso', 'GammaPFIso', 'NeuPFIso',
                    'isChHad', 'isEle', 'isGamma', 'isMu', 'isNeuHad',
                    'vtxX', 'vtxY', 'vtxZ']
HLF_OBSERVS = ['HT', 'MET', 'MT', 'PhiMET', 'bJets', 'nJets']
#ROWS PER EVENT
DEFAULT_RPE = {"Particles": 801, "HLF": 1}
DEFAULT_OBSERVS = {"Particles": PARTICLE_OBSERVS, "HLF": HLF_OBSERVS }

import pandas as pd
import numpy as np
import numpy.ma as ma

#----------------------------IO-----------------------------
def numpy_from_h5(f, file_start_read, samples_to_read, file_total_events=-1, format='numpy', observ_types=DEFAULT_OBSERVS, rows_per_event=DEFAULT_RPE):
    '''Helper Function - Gets a numpy array from a pandas or numpy .h5 file
    
        :param f: The filepath of the pandas file
        :type f: str
        :param file_start_read: what samples to start reading with
        :type file_start_read: uint
        :param samples_to_read: the number of samples to read
        :type samples_to_read: uint
        :param file_total_events: the total events in the file (if you know it)
        :type file_total_events: uint
        :param observ_types: A dictionary of the features (ordered) to get from pandas for each data type
        :type observ_types: dict
        :param rows_per_event: A dictionary of the number of rows per event for each data type
        :type rows_per_event: dict
        :returns: the numpy array
    '''
    if(format == "pandas"):
        store = pd.HDFStore(f)
    else:
        store = h5py.File(f)
    
    
    values = {}
    for key, rpe in rows_per_event.items():
        # Where to start reading the table based on the sum of the selection start 
        select_start = file_start_read * rpe
        select_stop = select_start + samples_to_read * rpe
        
        if(format == 'pandas'):
            if (samples_to_read == file_total_events):
                frame = store.get('/' + key)
            else:
                frame = store.select('/' + key, start=select_start, stop=select_stop)
            columns = list(frame.columns)
            x = frame.values
        else:
            if (samples_to_read == file_total_events):
                x = store[key][:]
            else:
                x = store[key][select_start:select_stop]
            columns = ["EvtId"] + DEFAULT_OBSERVS[key]

        
        if (observ_types != None):
            evtIDS = x[:,columns.index("EvtId")]
            x = np.take(x, [columns.index(o) for o in observ_types[key]], axis=-1)
        if (rpe > 1):
            n_rows, n_columns = x.shape
            x = x.reshape((n_rows / rpe, rpe, n_columns))
            assert np.array([len(np.unique(y))==1 for y in evtIDS.reshape((n_rows / rpe, rpe))]).all(), "FAIL, reshape does not correctly group event ids"
        values['sources'] = np.array([[f]*len(evtIDS), np.array(evtIDS,dtype='int')]).T
        values[key] = x
    store.close()
    return values
#------------------------------------------------------------


#---------------------------HELPERS---------------------------
def _gen_label_vecs(data_dirs):
    num_labels = len(data_dirs)
    label_vecs = {}
    for i, data_dir in enumerate(data_dirs):
        arr = np.zeros((num_labels,))
        arr[i] = 1
        label_vecs[data_dir] = arr
    return label_vecs


def _initializeArrays(data_dirs, samples_per_class):
    '''Helper Function - Generates the initial data structures for the X (data) and Y (target)'''
    num_classes = len(data_dirs)
    X_train = [None] * (samples_per_class * num_classes)
    y_train = [None] * (samples_per_class * num_classes)
    HLF_train = [None] * (samples_per_class * num_classes)
    sources_train = [None] * (samples_per_class * num_classes)
    return X_train, y_train, HLF_train, sources_train
#-------------------------------------------------------------


def _check_inputs(data_dirs, observ_types):
    '''Helper Function - Makes sure that data_dirs, and observ_types are correctly formatted'''
    if (len(set(data_dirs)) != len(data_dirs)):
        raise ValueError("Cannot have duplicate directories %r" % data_dirs)
    for x in observ_types.values():
        if ("EvtId" in x):
            raise ValueError("Using EvtId in observ_types can result in skewed training results. Just don't.")



#--------------------SORTING UTILS--------------------------------
def maxLepPtEtaPhi(X, locs):
    for x in X:
        if (x[locs['isEle']] or x[locs["isMu"]]):
            return x[locs['Pt']], x[locs['Eta']], x[locs['Phi']]
        
def assertZerosBack(sort_slice, x, locs, sort_ascending):
    from numpy import inf
    sort_slice[np.all(x == 0.0, axis=1)] = inf if sort_ascending else -inf
    return sort_slice

def resolveMetric(s, locs, sort_ascending):
    if s in SORT_METRICS:
        return lambda x: assertZerosBack(SORT_METRICS[s](x, locs), x, locs, sort_ascending)
    else:
        raise ValueError("Unrecognized sorting metric %r" % s)


def _sortBy(x, sorts, sort_ascending):  
    if (sorts != None):
        for s in reversed(sorts):
            if (isinstance(s, int)):
                sort_slice = x[:, s]
            else:
                sort_slice = s(x)
            if (sort_ascending == True):
                x = x[sort_slice.argsort()]
            else:
                x = x[sort_slice.argsort()[::-1]]
    return x


def sort_numpy(x, sort_columns, sort_ascending, observ_types):
    '''Helper Function - pads the data and sorts it'''
    sort_locs = None
    assert not isinstance(sort_columns, string_types), "sort_columns improperly stored"
    if (sort_columns != None):
        if (True in [c in sort_columns for c in ["shuffle", "random"]]):
            np.random.shuffle(x)
        elif (not None in sort_columns):
            assert not False in [isinstance(s, string_types) for s in sort_columns], \
                "Type should be string got %s" % (",".join([str(type(s)) for s in sort_columns]))
            locs = {t: s for s, t in enumerate(observ_types)}
            sorts = [locs[s] if s in observ_types else resolveMetric(s, locs, sort_ascending)
                     for s in sort_columns]
            # KLUGE FIX
            x[x[:, locs["Energy"]] == 0] = 0.0
            # Sort
            x = _sortBy(x, sorts, sort_ascending)  

    return x
#------------------------------------------------------------------

#-------------------------SORTINGS---------------------------------
def MaxLepDeltaPhi(X, locs, mlpep=None):
    maxLepPt, maxLepEta, maxLepPhi = maxLepPtEtaPhi(X, locs) if isinstance(mlpep, type(None)) else mlpep
    out = maxLepPhi - X[:, locs["Phi"]]

    tooLarge = -2.0 * math.pi * (out > math.pi)
    tooSmall = 2.0 * math.pi * (out < -math.pi)
    out = out + tooLarge + tooSmall
    return out


def MaxLepDeltaEta(X, locs, mlpep=None):
    maxLepPt, maxLepEta, maxLepPhi = maxLepPtEtaPhi(X, locs) if isinstance(mlpep, type(None)) else mlpep
    return maxLepEta - X[:, locs["Eta"]]


def MaxLepDeltaR(X, locs, mlpep=None):
    mlpep = maxLepPtEtaPhi(X, locs) if isinstance(mlpep, type(None)) else mlpep
    return np.sqrt(MaxLepDeltaPhi(X, locs, mlpep) ** 2 + MaxLepDeltaEta(X, locs, mlpep) ** 2)


def MaxLepKt(X, locs):
    mlpep = maxLepPtEtaPhi(X, locs)
    maxLepPt, maxLepEta, maxLepPhi = mlpep
    return np.minimum(X[:, locs["Pt"]] ** 2, maxLepPt ** 2) * MaxLepDeltaR(X, locs, mlpep) ** 2


def MaxLepAntiKt(X, locs):
    mlpep = maxLepPtEtaPhi(X, locs)
    maxLepPt, maxLepEta, maxLepPhi = mlpep
    return np.minimum(X[:, locs["Pt"]] ** -2, maxLepPt ** -2) * MaxLepDeltaR(X, locs, mlpep) ** 2


SORT_METRICS = {f.__name__: f for f in
                [MaxLepDeltaPhi, MaxLepDeltaEta, MaxLepDeltaR, MaxLepKt, MaxLepAntiKt]}
#---------------------------------------------------------------------





import glob


def pandas_to_numpy(data_dirs, start, samples_per_class,
                    observ_types=DEFAULT_OBSERVS, sort_columns=None, sort_ascending=True, particle_mean=None, particle_std=None, hlf_mean=None, hlf_std=None, verbose=1):
    '''Builds a trainable (particle level) sorted and (event level) shuffled numpy array from directories of pandas .h5 files.
    
        :param data_dirs: 
            A list of pandas directories containing pandas .h5 files, tuples of ('label','dir'),
            or dictionary with .values() equal to such a list. The order indicates which
            files correspond to which output (i.e. the first directory corresponds to
            [1,0,...,0] and the second to [0,1,...,0], etc.). For dictionaries the 
            order defaults to the alphabetical order of the directory names.
        :param start:        Where to start reading (as if all of the files in a given directory are part of one long list)
        :param samples_per_class: The number of samples to read for each label. Every directory must have enough data starting
                            from 'start'.
        :param observ_types: The column headers for the data to be read from the panadas table. Also indicated the order of the columns.
        :param sort_columns: The columns to sort by, or special quantities including [MaxLepDeltaPhi,
                            MaxLepDeltaEta,MaxLepDeltaR,MaxLepKt,MaxLepAntiKt]
        :param sort_ascending: If True sort in ascending order, false decending  
        :param particle_mean: The mean of the particle features to be used for centering the data. Default None indicates no centering
        :param particle_std: The std of the particle features to be used for standardizing the data.Default None indicates no standardization
        :param hlf_mean: The mean of the HLF features to be used for centering the data. Default None indicates no centering
        :param hlf_std: The std of the HLF features to be used for standardizing the data. Default None indicates no standardization
        :returns: (X_train, Y_train, HFL_train) 
    '''
    if (isinstance(data_dirs, dict)): data_dirs = sorted(data_dirs.values(), key=lambda x: x.join(x.split("/")[::-1]))
    if (isinstance(data_dirs[0], tuple)): data_dirs = [x[1] for x in data_dirs]
    _check_inputs(data_dirs, observ_types)

    label_vecs = _gen_label_vecs(data_dirs)
    X_train, y_train, HLF_train,sources_train = _initializeArrays(data_dirs, samples_per_class)
    X_train_index = 0

    y_train_start = 0
    for data_dir in data_dirs:
        files = glob.glob(os.path.abspath(data_dir) + "/*.h5")
        files.sort()
        samples_read, location = 0, 0

        sizesDict = get_sizes_meta_dict(data_dir)

        last_time = time.clock() - 1.0
        count,last_count = 0,0
        # Loop the files associated with the current label
        for f in files:
            file_total_events = size_from_meta(f, sizesDict=sizesDict)  # len(num_val_frame.index)
            if (file_total_events == None or file_total_events == 0):
                print("Skipping %r" % f)
                continue

            if (location + file_total_events <= start):
                location += file_total_events
                continue

            # Determine what row to start reading the num_val table which contains
            # information about how many rows there are for each entry
            file_start_read = start - location if start > location else 0

            # How many rows we will read from this table each corresponds to one entry
            samples_to_read = min(samples_per_class - samples_read, file_total_events - file_start_read)
            assert samples_to_read >= 0

            d = numpy_from_h5(f, file_start_read=file_start_read,
                                       samples_to_read=samples_to_read,
                                       file_total_events=file_total_events,
                                       rows_per_event=DEFAULT_RPE,
                                       observ_types=observ_types)
            Particles, HLF,sources = d["Particles"], d["HLF"],d["sources"]

            for s, (particles, hlf,source) in enumerate(zip(Particles, HLF,sources)):
                # ----------pretty progress bar---------------
                
                if (verbose >= 1):
                    c = time.clock()
                    if (c > last_time + .25):
                        prog = X_train_index + s
                        percent = float(prog) / (samples_per_class * len(data_dirs))
                        sys.stdout.write('\r')
                        sys.stdout.write("[%-20s] %r/%r  %r(Event/sec)" % ('=' * int(20 * percent), prog,
                                                                           int(samples_per_class) * len(data_dirs),
                                                                           4 * (count-last_count)))
                        sys.stdout.flush()
                        #prev_sample = s
                        last_time = c
                        last_count = count

                count += 1
                # ------------------------------------------
                particles = sort_numpy(particles, sort_columns, sort_ascending, observ_types["Particles"])
                
                #-----------------STANDARDIZATION --------------------------------------
                #Mask out the padding so that it is not altered during standarization
                if(not isinstance(particle_mean,NoneType) or not isinstance(particle_std,NoneType)):
                    mask = (particles == 0.0).all(axis=-1)
                    mask_extended = np.expand_dims(mask, axis=-1)
                    mask_extruded = np.repeat(mask_extended, len(PARTICLE_OBSERVS), axis=-1)
                    particles = ma.masked_array(particles, mask=mask_extruded)
                
                #Apply standardization
                if(not isinstance(particle_mean,NoneType)):
                    particles = particles - particle_mean.reshape(1,len(PARTICLE_OBSERVS))
                if(not isinstance(particle_std,NoneType)):
                    particles = particles / particle_std.reshape(1,len(PARTICLE_OBSERVS))
                if (not isinstance(hlf_mean, NoneType)):
                    hlf = hlf - hlf_mean
                if (not isinstance(hlf_std, NoneType)):
                    hlf = hlf / hlf_std
                #------------------------------------------------------------------------

                X_train[X_train_index + s] = np.array(particles)
                HLF_train[X_train_index + s] = hlf
                sources_train[X_train_index + s] = source

            X_train_index += samples_to_read

            location += file_total_events
            samples_read += samples_to_read
            if (samples_read >= samples_per_class):
                assert samples_read == samples_per_class
                break
        if (samples_read != samples_per_class):
            raise IOError(
                "Not enough data in %r to read in range(%r, %r)" % (data_dir, start, samples_per_class + start))

        # Generate the target data as vectors like [1,0,0], [0,1,0], [0,0,1]
        for i in range(samples_per_class):
            y_train[y_train_start + i] = label_vecs[data_dir]
        y_train_start += samples_per_class

    # Turn everything into numpy arrays and shuffle them just in case.
    # Although, we probably don't need to shuffle since keras shuffles by default.
    y_train = np.array(y_train)

    indices = np.arange(len(y_train))
    np.random.shuffle(indices)

    X_train = np.array(X_train)[indices]
    HLF_train = np.array(HLF_train)[indices]
    y_train = y_train[indices]
    sources_train = np.array(sources_train)[indices]
    

    return X_train, y_train, HLF_train,sources_train

In [33]:
dirs = ["/bigdata/shared/Delphes/REDUCED_IsoLep_NEW/qcd_lepFilter_13TeV/",
        "/bigdata/shared/Delphes/REDUCED_IsoLep_NEW/wjets_lepFilter/"
       ]

a,b,c,d = pandas_to_numpy(dirs,0,10)



In [35]:
print(d,b)

(array([[ '/bigdata/shared/Delphes/REDUCED_IsoLep_NEW/qcd_lepFilter_13TeV/qcd_lepFilter_13TeV_1000_0_TO_998.h5',
        '4'],
       [ '/bigdata/shared/Delphes/REDUCED_IsoLep_NEW/wjets_lepFilter/wjets_lepFilter_475_0_TO_99.h5',
        '9'],
       [ '/bigdata/shared/Delphes/REDUCED_IsoLep_NEW/qcd_lepFilter_13TeV/qcd_lepFilter_13TeV_1000_0_TO_998.h5',
        '1'],
       [ '/bigdata/shared/Delphes/REDUCED_IsoLep_NEW/wjets_lepFilter/wjets_lepFilter_475_0_TO_99.h5',
        '0'],
       [ '/bigdata/shared/Delphes/REDUCED_IsoLep_NEW/qcd_lepFilter_13TeV/qcd_lepFilter_13TeV_1000_0_TO_998.h5',
        '8'],
       [ '/bigdata/shared/Delphes/REDUCED_IsoLep_NEW/wjets_lepFilter/wjets_lepFilter_475_0_TO_99.h5',
        '8'],
       [ '/bigdata/shared/Delphes/REDUCED_IsoLep_NEW/wjets_lepFilter/wjets_lepFilter_475_0_TO_99.h5',
        '5'],
       [ '/bigdata/shared/Delphes/REDUCED_IsoLep_NEW/wjets_lepFilter/wjets_lepFilter_475_0_TO_99.h5',
        '6'],
       [ '/bigdata/shared/Delphes/REDUCED

In [27]:
d = numpy_from_h5("/bigdata/shared/Delphes/REDUCED_IsoLep_NEW/qcd_lepFilter_13TeV/qcd_lepFilter_13TeV_4115_0_TO_998.h5",0,10)

In [23]:
d['source'].shape

(10, 2)

In [24]:
d['source']

array([[ '/bigdata/shared/Delphes/REDUCED_IsoLep_NEW/qcd_lepFilter_13TeV/qcd_lepFilter_13TeV_4115_0_TO_998.h5',
        '0'],
       [ '/bigdata/shared/Delphes/REDUCED_IsoLep_NEW/qcd_lepFilter_13TeV/qcd_lepFilter_13TeV_4115_0_TO_998.h5',
        '1'],
       [ '/bigdata/shared/Delphes/REDUCED_IsoLep_NEW/qcd_lepFilter_13TeV/qcd_lepFilter_13TeV_4115_0_TO_998.h5',
        '2'],
       [ '/bigdata/shared/Delphes/REDUCED_IsoLep_NEW/qcd_lepFilter_13TeV/qcd_lepFilter_13TeV_4115_0_TO_998.h5',
        '3'],
       [ '/bigdata/shared/Delphes/REDUCED_IsoLep_NEW/qcd_lepFilter_13TeV/qcd_lepFilter_13TeV_4115_0_TO_998.h5',
        '4'],
       [ '/bigdata/shared/Delphes/REDUCED_IsoLep_NEW/qcd_lepFilter_13TeV/qcd_lepFilter_13TeV_4115_0_TO_998.h5',
        '5'],
       [ '/bigdata/shared/Delphes/REDUCED_IsoLep_NEW/qcd_lepFilter_13TeV/qcd_lepFilter_13TeV_4115_0_TO_998.h5',
        '6'],
       [ '/bigdata/shared/Delphes/REDUCED_IsoLep_NEW/qcd_lepFilter_13TeV/qcd_lepFilter_13TeV_4115_0_TO_998.h5',
      

KeyError: 'sources'