In [1]:
import tensorflow.keras as keras
import tensorflow as tf
import numpy as np
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import uproot
import pandas as pd
import awkward as ak
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import tensorflow_decision_forests as tfdf
import tensorflow_docs as tfdocs
import tensorflow_docs.modeling
import tensorflow_docs.plots
from keras.callbacks import LearningRateScheduler
from keras.models import Sequential
from tensorflow import python as tf_python

2022-10-25 14:02:42.508899: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-10-25 14:02:43.774888: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /home/benk/root/root-6.26.06-install/lib
2022-10-25 14:02:43.774962: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2022-10-25 14:02:43.932624: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-10-2

In [2]:
#loads in files for signal and background
file_sig = uproot.open("mc16e_signal.root")
file_back = uproot.open("mc16e_ttbar.root")

#Sets trees of files to variables
tree_sig = file_sig["nominal"]
tree_back = file_back["nominal"]

#Prints number of entries for each tree
print(f'Signal tree entries: {tree_sig.num_entries}')
print(f'Background tree entries: {tree_back.num_entries}')

Signal tree entries: 3319
Background tree entries: 3531


In [3]:
#Shows contents of each tree
#tree_sig.show()
#tree_back.show()

In [4]:
muonStats_sig = tree_sig.arrays(['mu_pt', 'mu_eta', 'mu_phi'])
jetStats_sig = tree_sig.arrays(['jet_pt', 'jet_eta', 'jet_phi'])
muonStats_back = tree_back.arrays(['mu_pt', 'mu_eta', 'mu_phi'])
jetStats_back = tree_back.arrays(['jet_pt', 'jet_eta', 'jet_phi'])

In [5]:
print(np.concatenate(np.array(muonStats_sig['mu_pt']), axis = 0))

[109464.52   65954.625 148726.    ...  53041.37   58756.44   56119.04 ]


In [6]:
#plt.hist(np.concatenate(muonStats_sig['mu_pt'], axis = 0),bins=np.linspace(0,450000,101),label='Signal', histtype='step')
#plt.hist(np.concatenate(muonStats_back['mu_pt'], axis = 0),bins=np.linspace(0,450000,101),label='Background', histtype='step')
#plt.xlabel(r'Muon $p_{T}$ [GeV]')
#plt.ylabel('Count')
#plt.legend()
#plt.show()

In [7]:
print(tree_sig.keys(filter_name="/(ljet|mu)_(pt|eta|phi)/"))
allStats_sig = tree_sig.arrays(filter_name="/(ljet|mu)_(pt|eta|phi)/", library = 'pd')
allStats_back = tree_back.arrays(filter_name="/(ljet|mu)_(pt|eta|phi)/", library = 'pd')
muonStats_sig = allStats_sig[0];
jetStats_sig = allStats_sig[1];
muonStats_back = allStats_back[0];
jetStats_back = allStats_back[1];

print(allStats_sig);

['mu_pt', 'mu_eta', 'mu_phi', 'ljet_pt', 'ljet_eta', 'ljet_phi', 'ljet_pt_cand', 'ljet_eta_cand', 'ljet_phi_cand']
(                        mu_pt    mu_eta    mu_phi   ljet_pt_cand  \
entry subentry                                                     
0     0         109464.523438  0.752634 -0.603566  274842.875000   
1     0          65954.625000  0.954476  0.651902  212455.109375   
2     0         148726.000000  1.510869 -0.182065  371398.625000   
3     0          71516.101562  0.254257  2.440928  331628.312500   
4     0         222168.671875  0.672102  0.522216  341838.843750   
...                       ...       ...       ...            ...   
3314  0          52193.957031  0.362391  2.417538  231281.109375   
3315  0         146224.984375 -0.150667  0.455322  268928.281250   
3316  0          53041.371094 -1.209407  1.332303  219864.437500   
3317  0          58756.441406  0.259072  1.774282  221806.578125   
3318  0          56119.039062 -0.593785  1.690351  293187.750000   


In [15]:
#ENSURE WEIGHTS ARE THE LAST ENTRY IN THE VAR ARRAY
class DataGenerator(tf.keras.utils.Sequence):

    def __init__(self, list_IDs, labelsFunc, batch_size=32, dim=(14), n_channels=1, n_classes=2, shuffle=True):
        self.dim = dim
        self.batch_size = batch_size
        self.labelsFunc = labelsFunc
        self.list_IDs = list_IDs
        self.n_channels = n_channels
        self.n_classes = n_classes
        self.shuffle = shuffle
        self.on_epoch_end()

    def __len__(self):
        return int(np.floor(len(self.list_IDs) / self.batch_size))

    def __getitem__(self, index):
        # Generate indexes of the batch
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]

        # Find list of IDs
        list_IDs_temp = [self.list_IDs[k] for k in indexes]

        # Generate data
        X, y = self.__data_generation(list_IDs_temp)

        return X[:, :-1], y, X[:, -1]

    def on_epoch_end(self):
        self.indexes = np.arange(len(self.list_IDs))
        if self.shuffle == True:
            np.random.shuffle(self.indexes)

    def __data_generation(self, list_IDs_temp):
        # Initialization
        X = np.empty((self.batch_size, self.dim,))
        y = np.empty((self.batch_size), dtype=int)
        
        # Generate data
        for i, ID in enumerate(list_IDs_temp):
            # Store sample
            X[i] = np.load('data/' + str(ID) + '.npy')

            # Store class
            y[i] = self.labelsFunc(ID)

        return X, keras.utils.to_categorical(y, num_classes=self.n_classes), 

#ENSURE WEIGHTS ARE THE LAST ENTRY IN THE VAR ARRAY
def getList_ID(fileName, tree, varNames, max_entry = 10000):
    df = pd.DataFrame()
    opFile = uproot.open(fileName + ':' + tree)
    for var in varNames:
        varDf = opFile[var].array(entry_stop = max_entry, library = 'pd')
        if(varDf.index.nlevels == 2):
            df[var] = varDf.reset_index(level=1, drop=True)
        else:
            df[var] = varDf
    saveArr = df.to_numpy()
    savePrefix = fileName[:fileName.find('.')]
    saveStrings = []
    for i in range(saveArr.shape[0]):
        saveString = savePrefix + str(i)
        saveStrings.append(saveString)
        np.save('data/' + saveString + '.npy', saveArr[i])
    return saveStrings

def create_heat_map(df):
    corr = df.corr()
    sns.heatmap(corr, 
    cmap='RdYlGn', 
    xticklabels=corr.columns.values,
    yticklabels=corr.columns.values)
    plt.show()

def plot_roc_curve(fpr, tpr, label=None):
    plt.plot(fpr, tpr, linewidth=2, label=label)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.axis([0, 1, 0, 1])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.show()
    
def plot_loss(fit):
    plt.plot(fit.history['loss'])
    plt.plot(fit.history['val_loss'])
    plt.title('model loss')
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.legend(['train', 'test'], loc='upper left')
    plt.show()

def plot_accuracy(fit):
    plt.plot(fit.history['accuracy'])
    plt.plot(fit.history['val_accuracy'])
    plt.title('model accuracy')
    plt.ylabel('accuracy')
    plt.xlabel('epoch')
    plt.legend(['train', 'test'], loc='upper left')
    plt.show()
    
def custom_LearningRate_schedular(epoch):
    if epoch < 5:
        return 0.01
    else:
        return 0.01 * tf.math.exp(0.1 * (10 - epoch))

    
def get_model(inputShape):
    model = keras.Sequential([
#    keras.layers.Dense(14, activation='relu', input_shape=inputShape),
#    keras.layers.Flatten(),
#    keras.layers.Dropout(0.2),
#    # keras.layers.Dense(32, activation='relu'),
#    keras.layers.Dense(16, activation='relu'),
#    # keras.layers.Dense(4, activation='relu'),
#    keras.layers.Dense(1, activation=tf.nn.sigmoid)
    tf.keras.layers.InputLayer(input_shape=inputShape),
    #tf.keras.layers.Dense(128, activation='relu'),
    #tf.keras.layers.Dense(64, activation='relu'),
    #tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(16, activation='relu'),
    #tf.keras.layers.Dense(8, activation='relu'),
    #tf.keras.layers.Dense(4, activation='relu'),
    tf.keras.layers.Dense(units=2, activation='softmax')
    ])
    #model.compile(optimizer=tf.optimizers.SGD(learning_rate=0.0001),
    #            loss=tf.keras.losses.BinaryCrossentropy(
    #                    name='binary_crossentropy'),
    #            metrics=['accuracy', 
    #                    keras.metrics.AUC(name='auc'),
    #                    keras.metrics.AUC(name='prc', curve='PR')])
    model.compile(optimizer = keras.optimizers.Adam(learning_rate=1e-2), loss = 'binary_crossentropy', metrics = 'accuracy')
    return model

def boosted_decision_tree():
    model = tfdf.keras.GradientBoostedTreesModel()
    return model

In [16]:
max_entries = 10000

train_variables = ['mupt_cand', 'mueta_cand', 'muphi_cand', 'ljet_pt_cand', 'ljet_eta_cand',\
                   'ljet_phi_cand', 'ljet_mass_cand', 'dR_values_cand', 'pt_higgs',\
                   'mass_T', 'met_met', 'met_phi', 'mass_mj', 'weight']

signalMatrix = getList_ID('mc16e_signal.root', 'nominal', train_variables, max_entry = max_entries)
signalLabels = np.ones(shape = len(signalMatrix))

ttbarMatrix = getList_ID('mc16e_ttbar.root', 'nominal', train_variables, max_entry = max_entries)
ttbarLabels = np.zeros(shape = len(ttbarMatrix))

mixedMatrix = np.concatenate((signalMatrix, ttbarMatrix))
mixedLabels = np.concatenate((signalLabels, ttbarLabels))

trainMatrix1, valMatrix, trainLabels1, valLabels = train_test_split(mixedMatrix, mixedLabels, test_size = 0.1)
trainMatrix, testMatrix, trainLabels, testLabels = train_test_split(trainMatrix1, trainLabels1, test_size = 0.1)

In [17]:
print(trainMatrix.__getitem__(1))
print(trainMatrix[0])
#print(trainLabels)
#print(testMatrix)
#print(testLabels)

mc16e_signal2458
mc16e_ttbar1815


In [18]:
def getLabel(ID):
    if("mc16e_signal" in ID):
        return 1
    elif("mc16e_ttbar" in ID):
        return 0

trainGenerator = DataGenerator(mixedMatrix, getLabel, batch_size=256, dim=14, n_channels=1, n_classes=2, shuffle=True)

valGenerator = DataGenerator(valMatrix, getLabel, batch_size=256, dim=14, n_channels=1, n_classes=2, shuffle=True)

testGenerator = DataGenerator(testMatrix, getLabel, batch_size=256, dim=14, n_channels=1, n_classes=2, shuffle=True)

In [19]:
print(trainGenerator[0])

(array([[ 3.62343008e+04,  8.72936100e-02,  1.92481875e-01, ...,
         3.75471367e+04, -5.06908774e-01,  1.33178922e+05],
       [ 5.98010664e+04, -1.98087931e-01,  1.74881482e+00, ...,
         2.99754150e+03,  2.16092229e+00,  1.18696859e+05],
       [ 3.46061719e+04,  1.42934370e+00, -1.19033062e+00, ...,
         1.25533496e+04, -8.23347047e-02,  1.15611148e+05],
       ...,
       [ 8.04241328e+04, -1.21859312e-01, -1.46989179e+00, ...,
         7.55425547e+04, -5.64514697e-01,  1.46771203e+05],
       [ 3.74561367e+04, -7.94552445e-01, -1.39853999e-01, ...,
         2.82609922e+04, -5.38902640e-01,  8.45607266e+04],
       [ 7.26159922e+04, -8.08387935e-01, -1.96164346e+00, ...,
         8.50403984e+04, -2.48496485e+00,  1.09365477e+05]]), array([[1., 0.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
   

In [20]:
nn_model = get_model((13,))
nn_model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_2 (Dense)             (None, 16)                224       
                                                                 
 dense_3 (Dense)             (None, 1)                 17        
                                                                 
Total params: 241
Trainable params: 241
Non-trainable params: 0
_________________________________________________________________


In [21]:
nnfit = nn_model.fit(trainGenerator, epochs = 100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
 5/26 [====>.........................] - ETA: 3s - loss: 23.2013 - accuracy: 0.5000

In [None]:
print(testGenerator[0])

(array([[ 6.14487773e+04,  6.56550109e-01, -7.68729866e-01, ...,
         2.33677000e+05, -9.93868887e-01,  9.95781875e+04],
       [ 1.24750375e+05, -2.56126970e-02,  1.23471844e+00, ...,
         1.17055508e+04,  1.62572753e+00,  9.45575703e+04],
       [ 4.25528555e+04,  2.86719501e-01, -2.10796341e-01, ...,
         5.65752500e+04, -2.88516045e+00,  8.67482656e+04],
       ...,
       [ 8.39709688e+04, -5.91560483e-01,  2.42386413e+00, ...,
         1.64780469e+05,  2.34537148e+00,  1.06327969e+05],
       [ 2.87045449e+04,  9.68313992e-01, -7.95157731e-01, ...,
         6.61717109e+04,  1.60240789e-03,  1.10869570e+05],
       [ 5.54113555e+04, -2.30690956e-01,  2.49521661e+00, ...,
         1.21073877e+04,  5.77773094e-01,  1.11561617e+05]]), array([1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 1, 

In [None]:
probability_model = tf.keras.Sequential([nn_model, 
                                         tf.keras.layers.Softmax()])
predictions = probability_model.predict(testGenerator[0][0])
print(predictions)

[[0.99999994]
 [0.99999994]
 [0.99999994]
 [0.99999994]
 [0.99999994]
 [0.99999994]
 [0.99999994]
 [0.99999994]
 [0.99999994]
 [0.99999994]
 [0.99999994]
 [0.99999994]
 [0.99999994]
 [0.99999994]
 [0.99999994]
 [0.99999994]
 [0.99999994]
 [0.99999994]
 [0.99999994]
 [0.99999994]
 [0.99999994]
 [0.99999994]
 [0.99999994]
 [0.99999994]
 [0.99999994]
 [0.99999994]
 [0.99999994]
 [0.99999994]
 [0.99999994]
 [0.99999994]
 [0.99999994]
 [0.99999994]
 [0.99999994]
 [0.99999994]
 [0.99999994]
 [0.99999994]
 [0.99999994]
 [0.99999994]
 [0.99999994]
 [0.99999994]
 [0.99999994]
 [0.99999994]
 [0.99999994]
 [0.99999994]
 [0.99999994]
 [0.99999994]
 [0.99999994]
 [0.99999994]
 [0.99999994]
 [0.99999994]
 [0.99999994]
 [0.99999994]
 [0.99999994]
 [0.99999994]
 [0.99999994]
 [0.99999994]
 [0.99999994]
 [0.99999994]
 [0.99999994]
 [0.99999994]
 [0.99999994]
 [0.99999994]
 [0.99999994]
 [0.99999994]
 [0.99999994]
 [0.99999994]
 [0.99999994]
 [0.99999994]
 [0.99999994]
 [0.99999994]
 [0.99999994]
 [0.99

In [None]:
#nn_model = get_model((14,))
# #fit the model to train on all but the last column
#print("MATT, FITTING MODEL")
#callback = LearningRateScheduler(custom_LearningRate_schedular)
# print(train_dataset[:,train_dataset.shape[1]-1 : train_dataset.shape[1]])
#nn_fit = nn_model.fit(train_dataset[:, 0:train_dataset.shape[1]-1], train_output, epochs=500, batch_size = 500, validation_data=(val_dataset[:, 0:train_dataset.shape[1]-1], val_output), sample_weight=train_dataset[:,train_dataset.shape[1]-1 : train_dataset.shape[1]], shuffle=True)
# validation_data=(val_dataset[:, 0:train_dataset.shape[1]-1], val_output),
# print(train_dataset[:,0:train_dataset.shape[1]-1])
# nn_fit = nn_model.fit(train_dataset[:,0:train_dataset.shape[1]-1], train_output[:,0:0:train_dataset.shape[1]-1], epochs=70, batch_size=500, verbose=1, shuffle=True, validation_data=(val_dataset[:,0:train_dataset.shape[1]-1], val_output[:,0:train_dataset.shape[1]-1]), sample_weight=train_dataset[:,train_dataset.shape[1]-1:train_dataset.shape[1]])
#print("MATT, MODEL FITTED")
#print("MATT, PREDICTING")
#y_scores = nn_model.predict(test_dataset[:, 0:train_dataset.shape[1]-1])



#bdt_model = boosted_decision_tree()
#print("MATT, FITTING MODEL")
#bdt_fit = bdt_model.fit(train_dataset[:, 0:train_dataset.shape[1]-1], train_output, sample_weight=train_dataset[:,train_dataset.shape[1]-1 : train_dataset.shape[1]])
#print("MATT, MODEL FITTED")
#print("MATT, PREDICTING")
#bdt_y_scores = bdt_model.predict(test_dataset[:, 0:train_dataset.shape[1]-1])


In [None]:
from skimage.io import imread
from skimage.transform import resize
import numpy as np
import math

class CIFAR10Sequence(tf.keras.utils.Sequence):

    def __init__(self, x_set, y_set, batch_size):
        self.x, self.y = x_set, y_set
        self.batch_size = batch_size

    def __len__(self):
        return math.ceil(len(self.x) / self.batch_size)

    def __getitem__(self, idx):
        batch_x = self.x[idx * self.batch_size:(idx + 1) *
        self.batch_size]
        batch_y = self.y[idx * self.batch_size:(idx + 1) *
        self.batch_size]

        return np.array([
            resize(imread(file_name), (200, 200))
               for file_name in batch_x]), np.array(batch_y)

In [None]:
genObj = CIFAR10Sequence([1, 2, 3, 4, 5], [1, 4, 9, 16, 25], 2)

In [None]:
testingModel = tf.keras.models.Sequential([
  tf.keras.layers.InputLayer(input_shape=(1,)),
  tf.keras.layers.Dense(128, activation='relu'),
  tf.keras.layers.Dense(62, activation='softmax')
])

