In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib as mpl
from keras.models import Sequential, load_model
from keras.layers import LSTM, Dense, Dropout, BatchNormalization
from sklearn.utils import shuffle
from PIL import ImageFont
import datetime

In [2]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

import tensorflow as tf
%load_ext tensorboard

In [3]:
import warnings
warnings.filterwarnings('ignore')

In [4]:
!rm -rf ./logs/mcc
!rm -rf /home2/kalp_shah/tmp/backup/mcc

In [5]:
files = ['ttbar','wmp','wpwm','zwpm','n2n2']

cs_lo_k = {
            'ttbar':988.57,
            'wmp'  :1.95*1e5,
            'wpwm' :124.31,
            'zwpm' :51.82,
            'n2n2' :1
          }

br_ratio = {
            'ttbar':0.67*(1-0.67)*2,
            'wmp'  :(1-0.67),
            'wpwm' :(1-0.67)*0.67*2,
            'zwpm' :0.7*(1-0.67),
            'n2n2' :1
          }

cs_nmg = {
         'ttbar':393.30,
         'wmp'  :7.865*1e4,
         'wpwm' :74.96,
         'zwpm' :14.28,
         'n2n2' :1
         }

cs_mg = {'ttbar':5.883,
          'wmp':111.5,
          'wpwm':0.944,
          'zwpm':0.2381,
          'n2n2':3.99*1e-4
        }

cs_pb = []
for f in files:
    cs_pb.append((cs_lo_k[f]*br_ratio[f]*cs_mg[f])/cs_nmg[f])

cs = [i*1e3 for i in cs_pb]
#k_f = [1.954,1.356,1.92,2.09,1.0]

cs_corr = {files[i] : cs[i] for i in range(len(files))}

In [7]:
no_of_files = {'ttbar':0,
              'wmp':0,
              'wpwm':0,
              'zwpm':0,
              'n2n2':0
              }

red_merging = {'ttbar':98159/1e5,
               'wmp':96494/1e5,
               'wpwm':97633/1e5,
               'zwpm':0,
               'n2n2'1
              }

In [8]:
def get_res(x):
    res = np.zeros(shape=(x.shape[0],5))
    #print(x.shape[0],5)
    for i in range(len(x)):
        #print(i.x[i])
        res[i,x[i]] = 1
    
    return res

In [9]:
df = []
for f in range(len(files)):
    con_df = []
    
    for i in range(1,53):
        try:
            con_df.append(pd.read_csv('~/neutrino/datasets/csvdata/' + files[f] + str(i) + '.csv'))
            no_of_files[files[f]] += 1
        except:
            pass
            #print("Not Here : ",files[f],i)
    
    df.append(pd.concat(con_df,ignore_index=True))
    df[-1]['type'] = f
    
    if files[f] == "n2n2":
        df[-1]['tag'] = 1
    else:
        df[-1]['tag'] = 0

In [12]:
total_signal = no_of_files['n2n2']*1e5

total_background = 0
for f in range(len(files)-1):
    total_background += no_of_files[files[f]]*1e5
    
total_signal,total_background

(3900000.0, 15300000.0)

In [13]:
df[-1].head()

Unnamed: 0,ptl,etal,energyl,ptj,etaj,energyj,massj,mjj,rjj,rjl,met,n21_1,n21_2,n32_1,n32_2,infl,drfl,type,tag
0,576.562,0.232894,592.269,459.681,0.248122,475.279,36.1316,379.243,1.14509,2.25305,889.067,0.190804,0.258472,0.517955,0.670891,769.506,1.86471,4,1
1,823.599,-0.4188,896.887,339.247,-0.696095,431.789,77.3122,1052.58,2.67178,2.07278,1930.33,0.197178,0.222316,0.468612,0.544453,1113.56,1.81712,4,1
2,354.758,-1.72193,1024.2,883.807,-0.817714,1200.2,98.775,1764.85,2.82048,0.922735,3256.57,0.328865,0.361102,0.40928,0.760725,1015.05,2.09248,4,1
3,858.275,0.386685,923.245,1258.81,0.184948,1285.29,112.015,1546.48,1.12283,2.55146,388.116,0.216487,0.234383,0.308864,0.722296,510.332,2.04593,4,1
4,590.04,-0.006999,590.055,811.602,0.44512,897.952,90.8938,818.864,1.16291,1.4955,1133.33,0.172456,0.329166,0.461283,0.696238,351.882,3.21196,4,1


In [14]:
dtset = pd.concat(df,ignore_index=True)
dtset = shuffle(dtset)
dtset['met'] = np.fabs(dtset['met'])

In [15]:
## Analysis Level Cuts
dtset = dtset[dtset['ptl'] >= 120.0][dtset['ptj'] >= 120.0][dtset['etaj'] <= 2.0][dtset['etaj'] >= -2.0]

In [16]:
train_len = int(0.8*len(dtset))
x_train = dtset.T[:-2].T[:train_len]
y_train = get_res(dtset['type'][:train_len].values)

x_test = dtset.T[:-2].T[train_len:]
y_test = get_res(dtset['type'][train_len:].values)

print('Shapes : ',x_train.shape,y_train.shape,x_test.shape,y_test.shape)

Shapes :  (9034704, 17) (9034704, 5) (2258676, 17) (2258676, 5)


In [17]:
# Callbacks
backup_callback = tf.keras.callbacks.experimental.BackupAndRestore(backup_dir="/home2/kalp_shah/tmp/backup/mcc")

log_dir = "/home2/kalp_shah/logs/mcc/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

In [26]:
class sig_callback(tf.keras.callbacks.Callback):
    
    def __init__(self,tot_sig,tot_back):
        self.tot_sig = tot_sig
        self.tot_back = tot_back
    
    def on_epoch_end(self, epoch, logs=None):
        ns_t = logs['Ns']
        nb_t = logs['Nb']
        
        s_train = (ns_t/(self.tot_sig*(0.8*0.6)))/np.sqrt((nb_t/(self.tot_back*(0.8*0.6))))
        
        ns_v = logs['val_Ns']
        nb_v = logs['val_Nb']
        
        s_val = (ns_v/(self.tot_sig*(0.8*0.2)))/np.sqrt((nb_v/(self.tot_back*(0.8*0.2))))
        print()
        print('The training significance is : ',s_train)
        print('The validation significance is : ',s_val)
        print('The luminosity required for 5 sigma (training) : ',np.square(5/s_train))
        print('The luminosity required for 5 sigma (validation) : ',np.square(5/s_val))
        
    #def on_test_end(self, epoch, logs=None):
    #    ns_t = logs['Ns']
    #    nb_t = logs['Nb']
    #    
    #    s_train = (ns_t/(self.tot_sig))/(nb_t/(self.tot_back))
    #    print('The testing significance is : ',s_train)

In [19]:
class NSignal(tf.keras.metrics.Metric):

    def __init__(self,cross_section,name='Ns',**kwargs):
        super(NSignal, self).__init__(name=name, **kwargs)
        self.ns = self.add_weight(name='tp', initializer='zeros')
        self.cs = cross_section
        
    def update_state(self, y_true, y_pred, sample_weight=None):
        y_true = tf.cast(tf.argmax(y_true,axis=1),self.dtype)
        y_pred = tf.cast(tf.argmax(y_pred,axis=1),self.dtype)
        
        total = tf.equal(y_true, tf.cast(4,self.dtype))
        total = tf.cast(total, tf.bool)
        
        prediction = tf.equal(y_pred, tf.cast(4,self.dtype))
        prediction = tf.cast(prediction, tf.bool)
        
        signal = tf.logical_and(prediction,total)
        signal = tf.cast(signal,self.dtype)
        #signal = tf.multiply(signal,self.cs[-1])
        self.ns.assign_add(tf.reduce_sum(signal)*self.cs[-1])

    def result(self):
        return self.ns

In [20]:
class NBack(tf.keras.metrics.Metric):

    def __init__(self,cross_section,name='Nb',**kwargs):
        super(NBack, self).__init__(name=name, **kwargs)
        self.nb = self.add_weight(name='tp', initializer='zeros')
        self.cs = cross_section
        
    def update_state(self, y_true, y_pred, sample_weight=None):
        y_true = tf.cast(tf.argmax(y_true,axis=1),self.dtype)
        y_pred = tf.cast(tf.argmax(y_pred,axis=1),self.dtype)
        
        comp_back = tf.cast(0,self.dtype)
        for i in range(len(self.cs) - 1):
            total = tf.equal(y_true, tf.cast(i,self.dtype))
            total = tf.cast(total, tf.bool)
        
            prediction = tf.equal(y_pred, tf.cast(4,self.dtype))
            prediction = tf.cast(prediction, tf.bool)
        
            back = tf.logical_and(prediction,total)
            back = tf.cast(back,self.dtype)
            #back = tf.multiply(back,self.cs[i])
        
            comp_back += tf.reduce_sum(back)*self.cs[i]

        self.nb.assign_add(tf.cast(comp_back,self.dtype))

    def result(self):
        return self.nb

In [24]:
model = Sequential()
input_shape = x_train.shape

from keras.layers.normalization.batch_normalization import BatchNormalization
model.add(Dense(10,activation = 'relu',input_dim = input_shape[1]))
model.add(BatchNormalization())
model.add(Dense(25,activation = 'relu',input_dim = 10))
model.add(BatchNormalization())
model.add(Dense(40,activation = 'relu',input_dim = 25))
model.add(BatchNormalization())
model.add(Dense(20,activation = 'relu',input_dim = 40))
model.add(BatchNormalization())
model.add(Dense(12,activation = 'relu',input_dim = 20))
model.add(BatchNormalization())
model.add(Dense(8,activation = 'relu',input_dim = 12))
model.add(BatchNormalization())
model.add(Dense(5,activation = 'softmax',input_dim = 8))

model.compile(optimizer='adam', loss='binary_crossentropy',metrics=['accuracy',NSignal(cs_list),NBack(cs_list)])

In [27]:
try:
    model.fit(x_train,y_train,epochs=100,batch_size=1024,validation_split=0.4,
              class_weight={0:5,1:7,2:4,3:4,4:.005},
          callbacks=[backup_callback,tensorboard_callback,sig_callback(total_signal,total_background)])

except:
    pass

Epoch 10/100
The training significance is :  0.1365521182514594
The validation significance is :  0.24441267121469631
The luminosity required for 5 sigma (training) :  1340.7355522682067
The luminosity required for 5 sigma (validation) :  418.49721638498414
Epoch 11/100
The training significance is :  0.1662127293240047
The validation significance is :  0.23642724837591425
The luminosity required for 5 sigma (training) :  904.9226252298762
The luminosity required for 5 sigma (validation) :  447.2444429306809
Epoch 12/100
The training significance is :  0.14103898523072966
The validation significance is :  0.20524963476144287
The luminosity required for 5 sigma (training) :  1256.7869413776118
The luminosity required for 5 sigma (validation) :  593.4378229053115
Epoch 13/100
The training significance is :  0.12973382445197595
The validation significance is :  0.20080431128507234
The luminosity required for 5 sigma (training) :  1485.3663012937286
The luminosity required for 5 sigma (val

In [30]:
%tensorboard --logdir /home2/kalp_shah/logs/mcc/ --port 8009

In [31]:
tot_pred = model.evaluate(dtset.T[:-2].T,get_res(dtset['type'].values),batch_size=1024)



In [32]:
tot_pred[2],tot_pred[3]

(175067.3125, 2298663.0)

In [52]:
sig = ((tot_pred[2]/total_signal)/np.sqrt((tot_pred[3]/total_background)))*np.sqrt(3000)
sig

6.343210853052492

In [53]:
total_pred = model.predict(dtset.T[:-2].T)

In [54]:
def get_back_ax(x):
    return x.argmax(axis=1)

In [55]:
sol = get_back_ax(total_pred)

In [56]:
sol.shape

(11293380,)

In [57]:
sol[2:50]

array([3, 1, 1, 3, 1, 1, 3, 0, 0, 0, 0, 3, 4, 1, 1, 3, 1, 3, 3, 2, 3, 3,
       1, 1, 3, 3, 0, 3, 1, 3, 1, 0, 4, 0, 1, 1, 0, 1, 1, 0, 3, 0, 1, 3,
       3, 1, 3, 3])

In [58]:
pred_set = dtset.copy()
pred_set['pred'] = sol

In [59]:
cor_pred = len(pred_set[train_len:][pred_set['pred'] == 4][pred_set['tag'] == 1]) + len(pred_set[train_len:][pred_set['pred'] != 4][pred_set['tag'] == 0])
print('The accuracy of the test set is : ',cor_pred/(len(pred_set[train_len:])))

The accuracy of the test set is :  0.7539642693330075


In [60]:
import warnings
warnings.filterwarnings('ignore')

In [61]:
print('Correctly identified signal (True Positive)     : ',len(pred_set[pred_set['pred'] == 4][pred_set['tag'] == 1]))
print('Falsely identified signal (False Positive)      : ',len(pred_set[pred_set['pred'] == 4][pred_set['tag'] == 0]))
print('Correctly identified background (True Negative) : ',len(pred_set[pred_set['pred'] != 4][pred_set['tag'] == 0]))
print('Falsely identified background (False Negative)  : ',len(pred_set[pred_set['pred'] != 4][pred_set['tag'] == 1]))

Correctly identified signal (True Positive)     :  438763
Falsely identified signal (False Positive)      :  246
Correctly identified background (True Negative) :  8073260
Falsely identified background (False Negative)  :  2781111


In [62]:
print('The amount of signal left is     :', len(pred_set[pred_set['pred'] == 4][pred_set['tag'] == 1])/len(df[-1]))
print('The amount of background left is :', len(pred_set[pred_set['pred'] == 4][pred_set['tag'] == 0])/np.sum([len(i) for i in df[:-1]]))

The amount of signal left is     : 0.13187479374040836
The amount of background left is : 2.5549843538366672e-05


In [63]:
print('Thus, the rate of correct signal prediction is : ',len(pred_set[pred_set['pred'] == 4][dtset['tag'] == 1])/(len(pred_set[pred_set['pred'] == 4])))

Thus, the rate of correct signal prediction is :  0.9994396470231818


In [64]:
L = 3000

In [65]:
ns = cs_corr['n2n2']*(len(pred_set[pred_set['pred'] == 4][pred_set['tag'] == 1])/total_signal)*L
print('n2n2',(len(pred_set[pred_set['pred'] == 4][pred_set['tag'] == 1])))
nb = 0

for i in range(len(files)-1):
    nb += cs_corr[files[i]]*(len(pred_set[pred_set['pred'] == 4][pred_set['type'] == i])/total_background)*L
    print(files[i],(len(pred_set[pred_set['pred'] == 4][pred_set['type'] == i])))

n2n2 438763
ttbar 2
wmp 24
wpwm 106
zwpm 114


In [66]:
print('The number of signal is :', ns)
print('The number of background is :', nb)
print('The significance is :',ns/np.sqrt(nb))

The number of signal is : 134.66649
The number of background is : 450.718590121684
The significance is : 6.343176634246552


In [70]:
model.save('/home2/kalp_shah/datasets/Models/s5')

INFO:tensorflow:Assets written to: /home2/kalp_shah/Datasets/Models/s5/assets
