In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pylorentz import Momentum4

In [2]:
def gen_Files():
    """
    Pre-process the raw CSV exports into scale-factor, data and background files.

    For every entry of ``in_files`` (read from ``./../CSVfiles/``) this function:

    1. reads the CSV using the fixed ``col_names`` header and drops the two
       ``m_y*_is_isolated`` columns;
    2. inserts a ``photon_sep`` column computed from the eta/phi of both photons;
    3. splits off the ``mcweight`` column (divided by 48.5 for ``ggF_atlas.csv``)
       and writes it to the matching ``sf_files`` entry;
    4. for the two simulated samples adds a ``label`` column
       (1 for ``ggF_atlas.csv``, 0 for ``yyjj_p1.csv``);
    5. removes the rows flagged by the selection below, drops the tight/cone
       columns, divides ``p_mass`` by 1000 and writes the result to the matching
       ``data_files`` entry;
    6. for the four ``data*`` inputs additionally writes the flagged rows,
       labelled 0, to the matching ``data_bkgs`` entry.

    Takes no arguments and returns nothing. Progress messages and the fraction
    of rows flagged by each cut are printed.
    """

    #Input files to read, scale factor files to write, data files to write, and files for background data from the ATLAS data
    #The four lists are index-aligned; data_bkgs only covers the first four (raw data) inputs
    in_files = ['data15.csv', 'data16_p1.csv', 'data16_p2.csv', 'data16_p3.csv','ggF_atlas.csv','yyjj_p1.csv']
    sf_files = ['data15_sf.csv', 'data16_p1_sf.csv', 'data16_p2_sf.csv', 'data16_p3_sf.csv','ggF_sf.csv','yyjj_sf.csv']
    data_files = ['data15_data.csv','data16_p1_data.csv','data16_p2_data.csv','data16_p3_data.csv','ggF_data.csv','yyjj_data.csv']
    data_bkgs = ['data15_bkg.csv','data16_p1_bkg.csv','data16_p2_bkg.csv','data16_p3_bkg.csv']
    
    #Enough column names to deal with ragged CSV file (some fields will be NaN)
    col_names = ['mcweight', 'g1_pt','g1_eta',
                 'g1_phi','g1_E','g1_tight','g1_ptcone','g1_etcone','m_y1_is_isolated','g2_pt','g2_eta',
                 'g2_phi','g2_E','g2_tight','g2_ptcone','g2_etcone','m_y2_is_isolated', 'p_mass', 'jet_n',
                 'j1_pt','j1_eta','j1_phi','j1_E','j1_BTAG','j2_pt','j2_eta','j2_phi','j2_E','j2_BTAG']
    
    #Parse through each input file, process it, then write to the new files
    for i in range(len(in_files)):

        #Read CSV
        df = pd.read_csv('./../CSVfiles/{}'.format(in_files[i]), names = col_names)
        print('Starting {}'.format(in_files[i]))
        
        #Drop unused columns
        df.drop(['m_y1_is_isolated', 'm_y2_is_isolated'], axis=1, inplace = True)
            
        #Calculate photon separation: sqrt((eta1-eta2)^2 + (phi1-phi2)^2)
        #NOTE(review): the phi difference is used as-is (not wrapped into [-pi, pi]) - confirm this is intended
        del_r = np.sqrt((df['g1_eta']-df['g2_eta'])**2 + (df['g1_phi']-df['g2_phi'])**2)

        #Add the new column to the start of the dataframe
        df.insert(loc=0, column='photon_sep', value=del_r)
            
        #Isolate scale factor data, then remove this column to be left with useful data
        #NOTE(review): sf_df is never filtered by the row selection below, so the
        #scale-factor file has one row per input row while the data file only has
        #the rows that pass - confirm downstream code accounts for this
        if (in_files[i] == 'ggF_atlas.csv'):
            sf_df = df['mcweight']/48.5 #Scaling by cross section
        else:
            sf_df = df['mcweight']
        df.drop(['mcweight'], axis=1, inplace = True)
        
        #Generate a label column for signal/background
        if (in_files[i] == 'ggF_atlas.csv'):
            labels = np.ones(len(df))
            df.insert(loc=0, column='label', value=labels)
            #Move mass column to sit right after the label (p_mass is at positional index 16 here)
            #Resulting order: label, p_mass, photon_sep, ..., g2_etcone, jet_n, ...
            cols = df.columns.tolist()
            cols = [cols[0]] + [cols[16]]  + cols[1:16] + cols[17:]
            df = df[cols]
        elif (in_files[i] == 'yyjj_p1.csv'):
            labels = np.zeros(len(df))
            df.insert(loc=0, column='label', value=labels)
            #Move mass column to sit right after the label (p_mass is at positional index 16 here)
            cols = df.columns.tolist()
            cols = [cols[0]] + [cols[16]]  + cols[1:16] + cols[17:]
            df = df[cols]
        else:
            #Move mass column to the start of the dataframe (p_mass is at positional index 15 here)
            #The index differs from the branches above because the raw data files get no label column
            cols = df.columns.tolist()
            cols =  [cols[15]] + [cols[0]] + cols[1:15] + cols[16:]
            df = df[cols]
            
            
        #Rows are flagged for removal if any of the following holds:
        # - either photon is not tight (g*_tight == 0)
        # - either photon has ptcone/pt > 0.05
        # - either photon has etcone/pt > 0.065
        # - p_mass is outside 105000 - 165000 (105 - 165 GeV given the /1000 conversion to GeV below)
        # - g1_pt/p_mass < 0.35 AND g2_pt/p_mass < 0.25
        #NOTE(review): the previous comment described the Et cut as etCone/eta and the
        #last cut as pt/mass > 0.35 (0.25); the code divides etcone by pt and only
        #flags a row when BOTH pt/mass ratios are below threshold ('&') - confirm
        #which of the comment or the code is intended
        pt1Ratio = df['g1_ptcone']/df['g1_pt']
        pt2Ratio = df['g2_ptcone']/df['g2_pt']
        et1Ratio = df['g1_etcone']/df['g1_pt']
        et2Ratio = df['g2_etcone']/df['g2_pt']
        pt_massRatio1 = df['g1_pt']/df['p_mass']
        pt_massRatio2 = df['g2_pt']/df['p_mass']
        
        #These are for viewing the fraction of entries each condition flags on its own
        remove_tight = df[(df['g1_tight'] == 0) | (df['g2_tight'] == 0)].index
        print('Tight condition: %.2f'%(len(remove_tight)/len(df)))
        remove_ptratio = df[(pt1Ratio > 0.05) | (pt2Ratio > 0.05)].index
        print('Ptratio condition: %.2f'%(len(remove_ptratio)/len(df)))
        remove_etratio = df[(et1Ratio > 0.065) | (et2Ratio > 0.065)].index
        print('Etratio condition: %.2f'%(len(remove_etratio)/len(df)))
        remove_mass = df[(df['p_mass'] < 105000) | (df['p_mass'] > 165000)].index
        print('Mass condition: %.2f'%(len(remove_mass)/len(df)))
        remove_mass_ratio = df[(pt_massRatio1 < 0.35) & (pt_massRatio2 < 0.25)].index
        print('Pt/Mass condition: %.2f'%(len(remove_mass_ratio)/len(df)))
            
        #Free the per-condition indices (remove_mass_ratio is not included in this del)
        del remove_tight,remove_ptratio,remove_etratio,remove_mass
        
        #This list is now used to actually remove entries from the dataframe (but are the same conditions)
        inds_to_remove = df[((df['g1_tight'] == 0) | (df['g2_tight'] == 0)) | ((pt1Ratio > 0.05) 
                        | (pt2Ratio > 0.05)) | ((et1Ratio > 0.065) | (et2Ratio > 0.065)) 
                        | ((df['p_mass'] < 105000) | (df['p_mass'] > 165000))
                        | ((pt_massRatio1 < 0.35) & (pt_massRatio2 < 0.25))].index
        print('Overall: %.2f'%(len(inds_to_remove)/len(df)))
        
        #Perform the removal of rows, but if we are processing one of the raw data files
        #then we create a 'data_bkg' file which converts the excluded rows into a 
        #background dataset for optional use in training
        if (in_files[i] == 'ggF_atlas.csv')|(in_files[i] == 'yyjj_p1.csv'):
            df.drop(inds_to_remove, axis = 0, inplace = True)
            #These columns are removed to stop the classifier from cheating
            df.drop(['g1_ptcone','g2_ptcone','g1_etcone', 'g2_etcone', 'g1_tight', 'g2_tight'], axis = 1, inplace = True)
        else:
            #inds_to_remove holds index labels while .iloc is positional; the two agree
            #here because no rows have been dropped from df since it was read
            #NOTE(review): .loc would express the intent more directly
            bkg_df = df.iloc[inds_to_remove]
            bkg_df.insert(loc=0, column='label', value=np.zeros(len(bkg_df))) #Adds labels
            #NOTE(review): assigning into a frame obtained by indexing may emit a pandas SettingWithCopyWarning
            bkg_df['p_mass'] = bkg_df['p_mass']/1000
            #bkg_df was taken before this column drop, which is applied to df only
            df.drop(['g1_ptcone','g2_ptcone','g1_etcone', 'g2_etcone', 'g1_tight', 'g2_tight'], axis = 1, inplace = True)
            #This branch only runs for the first four inputs, so data_bkgs[i] is always in range
            bkg_df.to_csv("./../CSVfiles/{}".format(data_bkgs[i]), index = False, header = True) #Write the background data
            df.drop(inds_to_remove, axis = 0, inplace = True) #Drop rows so that leftover is data passing conditions
        
            
        #Make masses in GeV
        df['p_mass'] = df['p_mass']/1000

        #Write scale factor data
        sf_df.to_csv("./../CSVfiles/{}".format(sf_files[i]), index = False, header = True)
        #Write all remaining data passing conditions
        df.to_csv("./../CSVfiles/{}".format(data_files[i]), index = False, header = True)
                
        print('Done {}'.format(in_files[i]))

In [3]:
#Calculates truth values upon a threshold cut
def calc_Truths(y_pred, y_val, m, sf_val, threshold):
    
    """
    Sort events into the four confusion-matrix categories for a threshold cut.

    y_pred    -- indexable of classifier outputs, compared against `threshold`
    y_val     -- indexable of true labels (1 = positive class, 0 = negative class)
    m         -- indexable of event masses
    sf_val    -- indexable of event scale factors
    threshold -- an event counts as predicted positive when y_pred >= threshold

    All inputs are indexed with 0..len(y_val)-1. Events whose label is neither
    0 nor 1 are skipped.

    Returns (true_pos, false_pos, true_neg, false_neg). Each is a list of
    [mass, scale factor] pairs, in input order.
    """

    true_pos, false_pos, true_neg, false_neg = [], [], [], []

    for idx in range(len(y_val)):

        if y_val[idx] == 1:
            #Positive-class event: accepted -> true positive, rejected -> false negative
            if y_pred[idx] >= threshold:
                true_pos.append([m[idx], sf_val[idx]])
            elif y_pred[idx] < threshold:
                false_neg.append([m[idx], sf_val[idx]])
        elif y_val[idx] == 0:
            #Negative-class event: accepted -> false positive, rejected -> true negative
            if y_pred[idx] >= threshold:
                false_pos.append([m[idx], sf_val[idx]])
            elif y_pred[idx] < threshold:
                true_neg.append([m[idx], sf_val[idx]])

    return true_pos, false_pos, true_neg, false_neg

In [4]:
def Sensitivity(true_pos, false_pos, sf_trim, SF_band):
    
    """
    Compute the sensitivity from the events accepted by the classifier.

    true_pos  -- 2D list, one [mass, scale factor] entry per correctly accepted event
    false_pos -- 2D list, one [mass, scale factor] entry per accepted background event
    sf_trim   -- indexable; [0] scales the signal sum and [1] the background sum
                 (the scale factors resulting from the train test split)
    SF_band   -- side band scale factor applied to the background sum

    Only events with 121 < mass < 129 contribute to the signal (s) and
    background (b) sums. With n = s + b the returned value is
    sqrt(2*(n*ln(n/b) + b - n)).
    """

    def in_window(mass):
        #Strictly inside the 121 - 129 GeV window
        return (mass > 121) and (mass < 129)

    #Add up scale factors of true positives which are between 121 - 129 GeV
    s = sum(event[1] for event in true_pos if in_window(event[0]))

    b, band_bkg = 0, 0
    for event in false_pos:
        mass, weight = event[0], event[1]
        if in_window(mass):
            #Add up scale factors of false positives which are between 121 - 129 GeV
            b += weight
        elif (mass <= 121) or (mass >= 129):
            #Add up scale factors of false positives which are outside 121 - 129 GeV
            #(this sum does not enter the returned value)
            band_bkg += weight

    #Multiply by train test split factor and (num expected events)/(num simulated events)
    s = s*sf_trim[0]*(2.00419*(10**-3))
    #Multiply by train test split factor and side band scale factor
    b = b*sf_trim[1]*SF_band
    n = (s+b)

    return np.sqrt(2*(n*np.log(n/b)+b-n))

In [5]:
#Used to turn a list into PyLorentz quantities
def lorentzify(lst):
    """
    Convert a flat sequence describing two photons into two Momentum4 objects.

    `lst` is split into two equal halves with np.split. Within each half,
    element 0 is passed as pt, 1 as eta, 2 as phi and 3 as E to
    Momentum4.e_eta_phi_pt.

    Returns a list with the two Momentum4 objects, first photon first.
    """

    #Separate each photon
    first_half, second_half = np.split(lst, 2)

    #Change each gamma into a PyLorentz object (argument order: E, eta, phi, pt)
    return [
        Momentum4.e_eta_phi_pt(photon[3], photon[1], photon[2], photon[0])
        for photon in (first_half, second_half)
    ]

In [6]:
#Use PyLorentz to calculate parent particle quantities
def parent_Quantities(lst):
    """
    Compute parent-particle kinematics for every entry of `lst`.

    Each lst[i] is passed to lorentzify to obtain two four-momenta, which are
    added to form the parent. The parent's m, p_t, e, eta and phi are stored.

    Returns a tuple of five arrays of length len(lst):
    (inv_masses, trans_momenta, energies, etas, phis).
    """

    n_events = len(lst)

    #Set memory placeholders for each array to avoid appends
    inv_masses, trans_momenta, energies, etas, phis = (np.zeros(n_events) for _ in range(5))

    for idx in range(n_events):

        #Turn the entry into PyLorentz objects and sum them into the parent
        first_gamma, second_gamma = lorentzify(lst[idx])[:2]
        parent = first_gamma + second_gamma

        #Calculate quantities
        inv_masses[idx] = parent.m
        trans_momenta[idx] = parent.p_t
        energies[idx] = parent.e
        etas[idx] = parent.eta
        phis[idx] = parent.phi

    return inv_masses, trans_momenta, energies, etas, phis

In [7]:
def profile (m, ys, labels=None, bins=np.linspace(100,160,60,endpoint=True), ax=None):
    """
    Plot the mean of one or more observables in bins of mass.

    m      -- array of masses; must support element-wise comparison and, when
              `bins` is an int, .min()/.max()
    ys     -- one observable or a list of observables, each indexable by a
              boolean mask over `m` with the result exposing .mean()
    labels -- None, a single string, or one legend label per observable
    bins   -- array of bin edges, or an int giving the number of equal-width
              bins spanning m.min()..m.max()
    ax     -- matplotlib axes to draw on; a new 6x5 figure is created if None

    Each bin is half-open: lower edge <= m < upper edge. Sets the global
    matplotlib font size to 20 as a side effect.

    Returns the axes that were drawn on.
    """
    plt.rcParams.update({'font.size': 20})

    # Normalise the inputs
    if isinstance(bins, int):
        bins = np.linspace(m.min(), m.max(), bins + 1, endpoint=True)

    if not isinstance(ys, list):
        ys = [ys]

    n_obs = len(ys)
    centres = bins[:-1] + 0.5 * np.diff(bins)

    if labels is None:
        labels = [None] * n_obs
    elif isinstance(labels, str):
        labels = [labels]

    assert len(labels) == n_obs, "[profile] Number of observables ({}) and associated labels ({}) do not match.".format(n_obs, len(labels))

    # Mean of every observable within each mass bin
    bin_means = [[] for _ in range(n_obs)]
    for lower, upper in zip(bins[:-1], bins[1:]):
        in_bin = (m >= lower) & (m < upper)
        for ix, observable in enumerate(ys):
            bin_means[ix].append(observable[in_bin].mean())

    # Ensure axes exist
    if ax is None:
        _, ax = plt.subplots(figsize=(6,5))

    # Plot profile(s)
    for ix, series in enumerate(bin_means):
        ax.plot(centres, series, '.-', label=labels[ix])

    # Decorations
    ax.set_xlabel('Mass [GeV]')
    ax.set_ylabel('Average Value')
    ax.set_ylim((0,1))
    ax.set_xlim(bins[0], bins[-1])
    ax.legend()

    return ax

In [8]:
def get_Data_Bkg(nrows):
    
    """
    Import the background data which comes from raw data events that
    did not pass our conditions.

    Reads the first int(nrows/4) rows from each of the four '*_bkg.csv'
    files in ./../CSVfiles/ and returns them concatenated in the order
    data15, data16_p1, data16_p2, data16_p3. Row indices are not reset.
    """
    
    bkg_paths = [
        './../CSVfiles/data15_bkg.csv',
        './../CSVfiles/data16_p1_bkg.csv',
        './../CSVfiles/data16_p2_bkg.csv',
        './../CSVfiles/data16_p3_bkg.csv',
    ]
    frames = [pd.read_csv(path, nrows=int(nrows/4)) for path in bkg_paths]

    #Join the files pairwise, then join the two halves
    first_half = pd.concat(frames[:2])
    second_half = pd.concat(frames[2:])
    del frames

    return pd.concat([first_half, second_half])

In [9]:
def get_Data_Sf(nrows):
    
    """
    Import the background data which comes from raw data events that
    did not pass our initial conditions.

    Reads the first int(nrows/4) rows from each of the four '*_bkg.csv'
    files in ./../CSVfiles/ and returns them concatenated in the order
    data15, data16_p1, data16_p2, data16_p3. Row indices are not reset.

    NOTE(review): this reads exactly the same '*_bkg.csv' files as
    get_Data_Bkg, although the name suggests scale factors - confirm
    whether other files were intended.
    """

    def read_part(file_name):
        #One quarter of the requested rows is taken from each file
        return pd.read_csv('./../CSVfiles/' + file_name, nrows=int(nrows/4))

    early_parts = pd.concat([read_part('data15_bkg.csv'), read_part('data16_p1_bkg.csv')])
    late_parts = pd.concat([read_part('data16_p2_bkg.csv'), read_part('data16_p3_bkg.csv')])

    combined = pd.concat([early_parts, late_parts])
    del early_parts, late_parts

    return combined