In [2]:
import os
import glob
import numpy as np
import awkward

In [3]:
import matplotlib
%matplotlib notebook
import matplotlib.pyplot as plt

In [4]:
from sklearn.metrics import roc_auc_score, roc_curve, auc

In [5]:
# v14 8gev (updated for full 2e14 pn sample)
presel_eff = {1: 0.9952855229150378, 10: 0.9976172400798192, 100: 0.9979411114121182, 1000: 0.9981519444725636, 0: 0.03282988102560554}

In [6]:
sig_filelist = glob.glob('/home/zwan/LDMX/LDMX-scripts/GraphNet/temp_plot/eddp_ecal_full_1reg/v14_8gev_1*.parquet') + \
            glob.glob('/home/zwan/LDMX/LDMX-scripts/GraphNet/temp_plot/eddp_ecal_full_1reg/v14_8gev_0*.parquet')
bkg_filelist = glob.glob('/home/zwan/LDMX/LDMX-scripts/GraphNet/temp_plot/eddp_ecal_full_1reg/*pn*.parquet')

In [7]:
sig_tables = [awkward.from_parquet(f) for f in sig_filelist]
#print(sig_tables[0]['ParticleNet_extra_label'])
bkg_tables = [awkward.from_parquet(f) for f in bkg_filelist]

In [9]:
load_branches = [
    'discValue_',
    'recoilX_',
    'recoilY_',
     
    'ParticleNet_extra_label',
    'ParticleNet_disc',
    'TargetSPRecoilE_pt', # use this for plotting: this is the recoil electron pT at TargetSP
    'maxPE'
]

In [10]:
def load_dict(sig_tables, bkg_tables):
    a = {}
    for k in load_branches:
        #print("Loading "+k)
        arrs = []
        for tab in sig_tables + bkg_tables:
            #print(awkward.type(tab))
            #print(tab)
            #print(tab['TargetSPRecoilE_pt'])
            #print(awkward.fields(tab))
            arr = tab[k] if k in awkward.fields(tab) else np.zeros_like(tab['ParticleNet_disc'])
            arrs.append(arr)
        a[k] = awkward.concatenate(arrs)
        if k.startswith('EcalVeto'):
            #a[k] = a[k].regular()
            a[k] = awkward.to_regular(a[k])
            if a[k].ndim==2 and len(a[k][0]) == 1:  #a[k].shape[1]==1:
                a[k] = a[k][:,0]
        print("   Found {} events".format(len(a[k])),k)
    return a

a = load_dict(sig_tables, bkg_tables)

print("Done")

   Found 13755268 events discValue_
   Found 13755268 events recoilX_
   Found 13755268 events recoilY_
   Found 13755268 events ParticleNet_extra_label
   Found 13755268 events ParticleNet_disc
   Found 13755268 events TargetSPRecoilE_pt
   Found 13755268 events maxPE
Done


In [11]:
for k in a.keys():
    print(k, awkward.type(a[k]))

discValue_ 13755268 * var * float64
recoilX_ 13755268 * var * float64
recoilY_ 13755268 * var * float64
ParticleNet_extra_label 13755268 * int64
ParticleNet_disc 13755268 * float64
TargetSPRecoilE_pt 13755268 * var * float64
maxPE 13755268 * var * float64


In [12]:
def to_categorical(y, num_classes=None):
    """Converts a class vector (integers) to binary class matrix.
    E.g. for use with categorical_crossentropy.
    # Arguments
        y: class vector to be converted into a matrix
            (integers from 0 to num_classes).
        num_classes: total number of classes.
    # Returns
        A binary matrix representation of the input.
    """
    y = np.array(y, dtype='int').ravel()
    if not num_classes:
        num_classes = np.max(y) + 1
    n = y.shape[0]
    categorical = np.zeros((n, num_classes), dtype='int')
    categorical[np.arange(n), y] = 1
    return categorical

def plotROC(y_preds, y_truth, sample_weight=None, output=None, labels=['signal'], sig_eff=1, bkg_eff=1, energy=0, **kwargs):
    from sklearn.metrics import auc, roc_curve, accuracy_score

    fpr = dict()
    tpr = dict()
    thresholds= dict()
    roc_auc = dict()
    outputs = {}

    plt.figure()

    for label, pred in zip(labels, y_preds):
        print("Label: ", label)
        fpr[label], tpr[label], thresholds[label] = roc_curve(y_truth, pred, sample_weight=sample_weight)
        roc_auc[label] = auc(fpr[label], tpr[label])
        fpr[label] *= bkg_eff
        tpr[label] *= sig_eff

        legend = '%s (auc* = %0.6f)' % (label, roc_auc[label])
        print(legend)
        eff = get_signal_effs(fpr[label], tpr[label], thresholds[label])
        outputs[label] = eff
        print(eff)
        plt.plot(fpr[label], tpr[label], label=legend)
#     plt.plot([0, 1], [1, 0], 'k--')
    plt.xlim(kwargs.get('xlim', [0, 1]))
    plt.ylim(kwargs.get('ylim', [0, 1]))
    plt.xlabel('False positive rate ($\epsilon_{B}$)')
    plt.ylabel('True positive rate ($\epsilon_{S}$)')
#     plt.title('Receiver operating characteristic example')
    plt.legend(loc='best')
    if kwargs.get('logy', False):
        plt.yscale('log')
    if kwargs.get('logx', False):
        plt.xscale('log')
    plt.grid()
    # TEMPORARY TITLE
    plt.title(str(k)+" MeV", fontdict = {'fontsize' : 15})
    #plt.title("All events, "+str(k)+" MeV", fontdict = {'fontsize' : 15})
    if output:
        plt.savefig(output)
#     return {'fpr':fpr, 'tpr':tpr, 'thresholds':thresholds}
    return outputs


def plotROC_multi(y_preds_, y_truth_, sample_weight_=None, output=None, labels=['signal'], sig_eff=1, bkg_eff=1, energy=0, **kwargs):
    from sklearn.metrics import auc, roc_curve, accuracy_score

    # y_preds, etc are now tuples of (a, b, c) (1reg, 2reg, 3reg)
    plt.figure()
    
    for i in range(1):  # 1, 2, 3 regions
        
        fpr = dict()
        tpr = dict()
        thresholds= dict()
        roc_auc = dict()
        outputs = {}

        #plt.figure()

        y_preds = y_preds_[i]
        y_truth = y_truth_[i]
        sample_weight = sample_weight_[i]
        print("Y_TRUTH:")
        print(y_truth[:10])
        for label, pred in zip(labels, y_preds):
            #if label == 'BDT' and i < 1:  continue
            print("Label: ", label)
            fpr[label], tpr[label], thresholds[label] = roc_curve(y_truth, pred, sample_weight=sample_weight)
            roc_auc[label] = auc(fpr[label], tpr[label])
            fpr[label] *= bkg_eff
            tpr[label] *= sig_eff
            
            if label == 'BDT':
                legend = '%s\n(auc* = %0.6f)' % (label+' (old Gabriel)', roc_auc[label])
            #elif i == 0:
                #legend = '(1,1)-reg SplitNetX\n(auc* = %0.6f)' % (roc_auc[label])
                #legend = '1-reg SplitNet\n(auc* = %0.6f)' % (roc_auc[label])
            else:
                legend = '%i-reg %s, PN \n(auc* = %0.6f)' % (i+1, label, roc_auc[label])
            #else:
                #legend = '1-reg SplitNet\n(auc* = %0.6f)' % (roc_auc[label])
            print(legend)
            eff = get_signal_effs(fpr[label], tpr[label], thresholds[label])
            outputs[label] = eff
            print(eff)
            print(len(eff))
            #print(tpr[label][:10])
            #print(len(fpr[label]))
            plt.plot(fpr[label], tpr[label], label=legend)
#     plt.plot([0, 1], [1, 0], 'k--')
    plt.xlim(kwargs.get('xlim', [0, 1]))
    plt.ylim(kwargs.get('ylim', [0, 1]))
    plt.xlabel('False positive rate ($\epsilon_{B}$)', fontsize=12)
    plt.ylabel('True positive rate ($\epsilon_{S}$)', fontsize=12)
#     plt.title('Receiver operating characteristic example')
    plt.legend(loc='best')
    if kwargs.get('logy', False):
        plt.yscale('log')
    if kwargs.get('logx', False):
        plt.xscale('log')
    plt.grid()
    # TEMPORARY TITLE
    plt.title(str(k)+" MeV w/ PN Background", fontdict = {'fontsize' : 15})
    #plt.title("All events, "+str(k)+" MeV", fontdict = {'fontsize' : 15})
    if output:
        print("yes output")
        plt.savefig(output, facecolor='w', dpi=250)
#     return {'fpr':fpr, 'tpr':tpr, 'thresholds':thresholds}
    else:
        print("no output")
    return outputs


mistags=[1e-3, 1e-4, 1e-5, 1e-6, 1e-7, 1e-8, 1e-9]
def get_signal_effs(fpr, tpr, thresholds, mistags=mistags):
    outputs = []
    for m in mistags:
        idx = next(idx for idx, v in enumerate(fpr) if v > m)
        outputs.append((fpr[idx], tpr[idx], thresholds[idx]))
    return outputs

In [13]:
#pT = np.array(a['TargetSPRecoilE_pt']).T[0]
test_extra_labels = a['ParticleNet_extra_label']#[pT != -999]
test_labels = test_extra_labels>0
print(len(test_extra_labels))
print(test_extra_labels[-20:])

13755268
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [12]:
roc_info = {}
#y_preds_ = [[a['ParticleNet_disc'], a['discValue_']], [b['ParticleNet_disc'], b['discValue_']], [c['ParticleNet_disc'], c['discValue_']]]
y_preds_ = [[a['ParticleNet_disc'], a['discValue_']]]
#test_labels_ = [test_labels, test_labels_b, test_labels_c]
test_labels_ = [test_labels]
#print(a['ParticleNet_disc'][:10])
for k in presel_eff:
    if k > 0:
        mass = '%d MeV' % k
        print(mass)
        weights = [np.logical_or(test_extra_labels==0, test_extra_labels==k)]
                  #np.logical_or(test_extra_labels_c==0, test_extra_labels_c==k)]
        print(type(weights))
        roc_info[k] = plotROC_multi(y_preds_, test_labels_, 
                sample_weight_=weights,
                sig_eff=presel_eff[k], bkg_eff=presel_eff[0],
                labels=['SplitNet', 'BDT'], xlim=[1e-7, 0.1], ylim=[0, 1], logx=True, energy=k,output=f'v14_pnx_ROC_{mass}')
# NOTE:  add 'BDT' to labels list if desired

1 MeV


<class 'list'>


<IPython.core.display.Javascript object>

Y_TRUTH:
[True, True, True, True, True, True, True, True, True, True]
Label:  SplitNet
1-reg SplitNet, PN 
(auc* = 0.996587)
[(0.0010000930237995993, 0.9919376318427768, 0.4831149528560138), (0.00010000341960958693, 0.6221397654603269, 0.8518890820249218), (1.0000709634107006e-05, 0.1808631625207049, 0.9435696790214237), (1.0000709634107006e-06, 0.017925101045001432, 0.9978342574347435), (6.029839632329225e-07, 3.1554110011884757e-06, 1.0), (6.029839632329225e-07, 3.1554110011884757e-06, 1.0), (6.029839632329225e-07, 3.1554110011884757e-06, 1.0)]
7
Label:  BDT
BDT (old Gabriel)
(auc* = 0.981619)
[(0.0010000011055125212, 0.8404034886762853, 0.02136063575744629), (0.00010000341960958693, 0.4676203405357944, 0.7935066819190979), (1.0000709634107006e-05, 0.19021448597364368, 0.9970032572746277), (1.0000709634107006e-06, 0.052763205056373094, 0.9999043941497803), (1.838365741563788e-07, 0.016768117011232323, 0.9999752044677734), (1.1030194449382727e-08, 0.0007533543765337485, 0.999984145164

<IPython.core.display.Javascript object>

Y_TRUTH:
[True, True, True, True, True, True, True, True, True, True]
Label:  SplitNet
1-reg SplitNet, PN 
(auc* = 0.998201)
[(0.0010008246933647417, 0.9956712317298823, 0.4828129807518565), (0.00010000341960958693, 0.8239900506354461, 0.8518890820249218), (1.0000709634107006e-05, 0.46333589877844233, 0.9435696790214237), (1.0000709634107006e-06, 0.11984723802064876, 0.9978342574347435), (6.029839632329225e-07, 1.5155828270536873e-05, 1.0), (6.029839632329225e-07, 1.5155828270536873e-05, 1.0), (6.029839632329225e-07, 1.5155828270536873e-05, 1.0)]
7
Label:  BDT
BDT (old Gabriel)
(auc* = 0.989766)
[(0.0010000011055125212, 0.9152059082759479, 0.02136063575744629), (0.00010001077307255318, 0.6864761687941081, 0.7934941053390503), (1.0000709634107006e-05, 0.40963374727317337, 0.9970032572746277), (1.0000709634107006e-06, 0.187436169775935, 0.9999043941497803), (1.838365741563788e-07, 0.10392351445107134, 0.9999752044677734), (1.1030194449382727e-08, 0.004132489175099721, 0.9999841451644897)

<IPython.core.display.Javascript object>

Y_TRUTH:
[True, True, True, True, True, True, True, True, True, True]
Label:  SplitNet




1-reg SplitNet, PN 
(auc* = nan)
[(0.03282988102560554, nan, 2.810319679440199e-250), (0.03282988102560554, nan, 2.810319679440199e-250), (0.03282988102560554, nan, 2.810319679440199e-250), (0.03282988102560554, nan, 2.810319679440199e-250), (6.029839632329225e-07, nan, 1.0), (6.029839632329225e-07, nan, 1.0), (6.029839632329225e-07, nan, 1.0)]
7
Label:  BDT




BDT (old Gabriel)
(auc* = nan)
[(0.0010000121357069706, nan, 0.021359801292419434), (0.00010143366815652356, nan, 0.7881415486335754), (1.0096304652668323e-05, nan, 0.9969411492347717), (1.0147778893432107e-06, nan, 0.9999022483825684), (1.838365741563788e-07, nan, 0.9999752044677734), (6.985789817942394e-08, nan, 0.9999756813049316), (3.6767314831275756e-09, nan, 0.9999903440475464)]
7
yes output
1000 MeV
<class 'list'>


<IPython.core.display.Javascript object>

Y_TRUTH:
[True, True, True, True, True, True, True, True, True, True]
Label:  SplitNet
Y_TRUTH:
[True, True, True, True, True, True, True, True, True, True]
Label:  SplitNet




1-reg SplitNet, PN 
(auc* = nan)
[(0.03282988102560554, nan, 2.810319679440199e-250), (0.03282988102560554, nan, 2.810319679440199e-250), (0.03282988102560554, nan, 2.810319679440199e-250), (0.03282988102560554, nan, 2.810319679440199e-250), (6.029839632329225e-07, nan, 1.0), (6.029839632329225e-07, nan, 1.0), (6.029839632329225e-07, nan, 1.0)]
7
Label:  BDT




BDT (old Gabriel)
(auc* = nan)
[(0.0010000121357069706, nan, 0.021359801292419434), (0.00010143366815652356, nan, 0.7881415486335754), (1.0096304652668323e-05, nan, 0.9969411492347717), (1.0147778893432107e-06, nan, 0.9999022483825684), (1.838365741563788e-07, nan, 0.9999752044677734), (6.985789817942394e-08, nan, 0.9999756813049316), (3.6767314831275756e-09, nan, 0.9999903440475464)]
7
yes output


In [15]:
disc_threshold = 0.65

bkg_pt = np.array(a['TargetSPRecoilE_pt'][test_extra_labels==0])
bkg_disc_value = a['ParticleNet_disc'][test_extra_labels==0]
bkg_maxPE = a['maxPE'][test_extra_labels==0]

nPass = np.sum( (bkg_disc_value > disc_threshold) * (bkg_pt.T[0] != -999) )
nPass_veto = np.sum( (bkg_disc_value > disc_threshold) * (bkg_pt.T[0] != -999) * (bkg_maxPE < 8) )

print(nPass)
print(nPass_veto)

170408
3


In [17]:
masses = [0, 1, 10] #100, 1000]
pT = {}
nEvents = {}
for m in masses:
    pT[m] = np.array(a['TargetSPRecoilE_pt'][test_extra_labels==m]).T[0]
    print(len(pT[m]))
    pT[m] = pT[m][pT[m] != -999]
    nEvents[m] = len(pT[m])
    print(len(pT[m]))
print(nEvents)

8929094
8929088
3785062
3783143
1041112
1032569
{0: 8929088, 1: 3783143, 10: 1032569}


In [18]:
# threshold = [474896 ->10, 74805 -> 1, 1k -> 0, 100 -> 0]   # 10k -> 0
#thresholds = [0.222477, 0.7335808, 0.94611, 0.98472]  # 0.94611
#thresholds = [0.98472, 0.94611, 0.7335808, 0.222477]
thresholds = [0.94611, 0.87439, 0.7335808, 0.222477]
pT_pass = {}
for m in masses:
    disc_value = np.array(a['ParticleNet_disc'][test_extra_labels==m])
    print(disc_value.shape)
    pt = np.array(a['TargetSPRecoilE_pt'][test_extra_labels==m])
    print(pt.T[0])
    print(pt.shape)
    disc_value = disc_value[pt.T[0]!=-999]
    pT_pass[m] = [pT[m][disc_value > threshold] for threshold in thresholds]
    print(len(pT_pass[m]))

(8929094,)
[6.43670654 2.3927443  1.98252368 ... 0.29802188 3.56029201 6.38674259]
(8929094, 1)
4
(3785062,)
[2.18705893 6.13530159 5.38155508 ... 4.40631628 9.31886864 6.45992804]
(3785062, 1)
4
(1041112,)
[ 0.83014333  2.53539848 20.0387001  ...  5.46140957  2.61757088
  1.04124641]
(1041112, 1)
4


In [16]:
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle

#colors = ['#4A7DFF', '#14AD0C', '#FF212E', '#FF8F13', '#871EFE']
colors = ['#FF212E', '#FF8F13', '#4A7DFF', '#14AD0C', '#871EFE']

for m in masses:
    fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, gridspec_kw={'height_ratios': [2, 1]})
    #if m == 1000:
        #ax1.set_ylim(100, 1300)
    vals = [pT[m], pT_pass[m]]
    bins = np.linspace(-50, 200, 51)
    #labels = ['N bkg ~ $5 \\times 10^{5} \\to 10$', 'N bkg ~ $7.5 \\times 10^{4} \\to 1$', 
               #'N bkg ~ $1 \\times 10^{3} \\to 0$', 'N bkg ~ $1 \\times 10^{2} \\to 0$']
    labels = ['N bkg ~ $1 \\times 10^{3} \\to 0$', 'N bkg ~ $1 \\times 10^{4} \\to 0$', 
             'N bkg ~ $7.5 \\times 10^{4} \\to 1$', 'N bkg ~ $5 \\times 10^{5} \\to 10$']
    n1, bins, _ = ax1.hist(vals[0], bins=bins, range=(0, 250), density=False, stacked=False, histtype='step', color=colors[-1],
                log=True, label='Inclusive')
    n2, bins, _ = ax1.hist(vals[1], bins=bins, range=(0, 250), density=False, stacked=False, histtype='step', color=colors[:-1],
                    log=True, label=labels)
    #n1, bins, _ = ax1.hist(vals[0], bins=bins, range=(0, 250), density=False, stacked=False, histtype='step', color=colors[0], 
                 #log=True, label='Inclusive')
    #n2, bins, _ = ax1.hist(vals[1], bins=bins, range=(0, 250), density=False, stacked=False, histtype='step', color=colors[3], 
                 #log=True, label=f'ParticleNetX_disc > {disc_threshold}')
    ax1.legend()
    
    ratio_arrs = []
    errs = []
    for val in n2:
        ratio_arr = val / n1
        err = (val / n1) * ( (1/np.sqrt(val)) + (1/np.sqrt(n1)) ) 
        #print(type(ratio_arr))
        #print(len(ratio_arr))       
        #print(len(bins[:-1]))
        ratio_arrs.append(ratio_arr)
        errs.append(err)
    
    for i, ratio_arr in enumerate(ratio_arrs):

        ax2.errorbar(bins[:-1],     
            ratio_arr,
            yerr=errs[i],
            fmt='o',
            alpha=0.8,
            color=colors[i])

    '''
    ax2.errorbar(bins[:-1],     # this is what makes it comparable
        n2 / n1, # maybe check for div-by-zero!
        alpha=0.8,
        yerr = (n2 / n1) * ( (1/np.sqrt(n2)) + (1/np.sqrt(n1)) ),
        fmt='o',
        color=colors[1]        
                            )
    '''
    
    ax1.set_xlim(-10, 210)
    ax2.set_xlim(-10, 210)
    ax2.set_ylim(-0.05, 1.05)
    ax2.grid()
    ax2.set_xlabel("pT [MeV]", fontsize=14)
    
    ax1.set_ylabel('Number of Events', fontsize=13)
    
    ax1.set_title(f'{m} MeV TargetSP Recoil $e^{{-}}$ Transverse Momentum' if m!=0 else 'PN Bkg TargetSP Recoil $e^{{-}}$ Transverse Momentum', fontsize=15)
    ax2.set_ylabel('Ratio', fontsize=13)
    plt.tight_layout()
    plt.savefig(f'pT_bias_{m}', facecolor='w', dpi=250)

<IPython.core.display.Javascript object>

  ratio_arr = val / n1
  err = (val / n1) * ( (1/np.sqrt(val)) + (1/np.sqrt(n1)) )
  err = (val / n1) * ( (1/np.sqrt(val)) + (1/np.sqrt(n1)) )
  err = (val / n1) * ( (1/np.sqrt(val)) + (1/np.sqrt(n1)) )


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [19]:
pn_sig_disc = {}

for m in masses[1:]:
    pn_sig_disc[m] = a['ParticleNet_disc'][test_extra_labels==m]

pn_sig_pass = {}
for m in masses[1:]:
    pn_sig_pass[m] = np.sum( (pn_sig_disc[m] > disc_threshold) * (a['TargetSPRecoilE_pt'][test_extra_labels==m] != -999) * (a['maxPE'][test_extra_labels==m] < 8) )

pn_pass={}
pn_eff={}
for m in masses[1:]:
    pn_pass[m] = np.sum( (pn_sig_disc[m] > disc_threshold) * (a['TargetSPRecoilE_pt'][test_extra_labels==m] != -999) )
    pn_eff[m] = (pn_pass[m] / nEvents[m]) * presel_eff[m]

print(pn_eff)

pn_sig_eff = {}
for m in masses[1:]:
    pn_sig_eff[m] = (pn_sig_pass[m] / nEvents[m]) * presel_eff[m]
    
print(pn_sig_eff)

{1: 0.9835740622755416, 10: 0.9912609347651283}
{1: 0.9339361060557949, 10: 0.9263472029694972}
