# Training a multivariate discriminator

Below we prepare a simple setup to train a multivariate discriminator to separate the signal from the background.
It's meant just as an appetizer for you to explore. After you go through it, it may be a good occasion to put into practice what you learned in the short exercise about Machine Learning!

In [None]:
#!pip install --user mplhep #uncomment if loading of mplhep fails

import numpy as np
import pandas as pd
import h5py
import matplotlib.pyplot as plt
import mplhep
from matplotlib.colors import LogNorm

In [None]:
# Global configurables (check the Data-Inspection notebook for consistency).
# proton_selection picks the branch ("SingleRP" or "MultiRP") taken by the selection code below.
proton_selection = "MultiRP"
# Directory prefix of the input HDF5 summary files read by GetData.
PATH='/eos/user/c/cmsdas/short-exercises/pps-protons-tutorial/data'

## Collect the data and simulation

The following cell is used to load the training data to memory. Further pre-selection will be applied below.

FIXME: It would be good to explain how the data/background was prepared.

In [None]:
def GetData(flist,chunk_size=None):

    """
    Open one summary file (or a list of them) and convert the content to pandas objects.

    Parameters
    ----------
    flist : str or list of str
        Path(s) of the HDF5 summary file(s). Each file is expected to hold the
        datasets 'protons', 'columns', 'selections' and 'event_counts'.
    chunk_size : int, optional
        If given, the 'protons' dataset is read in slices of at most this many
        rows instead of in a single read.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        The events of all files merged in one frame (restricted to the columns
        listed in `keep_columns`, with a unique 0..N-1 index), and the
        element-wise sum of the per-file selection counters indexed by the
        selection name.
    """

    flist=flist if isinstance(flist,list) else [flist]

    # columns kept in the output and the ones that have to be cast to integers
    keep_columns = ['Run', 'LumiSection', 'EventNum', 'CrossingAngle',
                    'MultiRP', 'Arm', 'RPId1', 'RPId2', 'TrackX1', 'TrackY1', 'TrackX2', 'TrackY2',
                    'Xi', 'T', 'ThX', 'ThY', 'Time',
                    'Muon0Pt', 'Muon1Pt', 'InvMass', 'ExtraPfCands', 'Acopl', 'XiMuMuPlus', 'XiMuMuMinus']
    int_columns = { "Run": "int64", "LumiSection": "int64", "EventNum": "int64",
                    "MultiRP": "int32", "Arm": "int32", "RPId1": "int32", "RPId2": "int32",
                    "ExtraPfCands": "int32" }

    df,df_counts=[],[]

    for filename in flist:

        with h5py.File(filename, 'r') as f:

            print('Collecting data from',filename)

            dset           = f['protons']
            columns_str    = [ item.decode("utf-8") for item in f['columns'] ]
            selections_str = [ item.decode("utf-8") for item in f['selections'] ]

            #define the [start,stop) row ranges to read
            entries = dset.shape[0]
            if chunk_size is None:
                start = [0]
            else:
                start = list( range( 0, entries, chunk_size ) )
            stop = start[1:] + [entries]

            #read the data
            for first, last in zip( start, stop ):
                print('\tCollecting events',first, last )
                chunk = pd.DataFrame( dset[first:last], columns=columns_str )
                df.append( chunk[ keep_columns ].astype( int_columns ) )

            #read the selection counters
            df_counts.append( pd.Series( list( f['event_counts'] ), index=selections_str ) )

    n=len( df )
    print('Returning the result of %d merged datasets'%n)

    #merge the data: every chunk starts its own index at 0, so build a fresh
    #index to avoid duplicated labels in the merged frame
    df_final=pd.concat(df, ignore_index=True)

    #merge the counts
    df_counts_final = df_counts[0]
    for counts in df_counts[1:]:
        df_counts_final = df_counts_final.add( counts )

    return df_final,df_counts_final
    
# Load the signal simulation and print how many events passed each selection step.
print('[Signal simulation]')
df_signal,df_counts_signal = GetData(PATH+'/output-MC2017-Elastic-Non3+3-PreSel.h5')
print('Selection counts')
print(df_counts_signal)


# Load the data files used as background, reading them in chunks of 1M rows.
print('\n')
print('[Data (to be used as background)]')
#eras=['B','C1','D','F1'] #uncomment to use all data
eras=['B']
data_files = [PATH+'/output-UL2017{}-PreSel-Rnd-Res20.h5'.format(era) for era in eras]
df_bkg,df_counts_bkg = GetData(data_files,chunk_size=1000000)
print('Selection counts')
# print the counters of the background sample (the signal ones were printed above)
print(df_counts_bkg)

## Prepare the data

We apply the following selection to the signal and data

* require $m_{ll}>110$ GeV
* set the dilepton xi according to the arm where a proton was reconstructed 
* use the proton reconstruction algorithm as required at the start of the notebook

In [None]:
def PrepareData(df):

    """
    Apply the baseline selection cuts and return the selected events.

    The column "XiMuMu" is added to `df` itself (in place): it holds
    "XiMuMuPlus" for protons in the first arm, "XiMuMuMinus" for protons in the
    second arm and NaN otherwise. The columns defining the arm and the required
    value of the "MultiRP" flag depend on the global `proton_selection`.

    Parameters
    ----------
    df : pandas.DataFrame
        Events with at least the columns "InvMass", "MultiRP", "XiMuMuPlus",
        "XiMuMuMinus" and "RPId1" (SingleRP) or "Arm" (MultiRP).

    Returns
    -------
    pandas.DataFrame
        A copy of the rows with InvMass >= 110, the required "MultiRP" flag and
        a proton in one of the two arms.

    Raises
    ------
    ValueError
        If `proton_selection` is neither "SingleRP" nor "MultiRP".
    """

    if proton_selection == "SingleRP":
        # Single-RP in pixel stations
        msk1_arm = ( df["RPId1"] == 23 )
        msk2_arm = ( df["RPId1"] == 123 )
        multiRP=0
    elif proton_selection == "MultiRP":
        # Multi-RP
        msk1_arm = ( df["Arm"] == 0 )
        msk2_arm = ( df["Arm"] == 1 )
        multiRP=1
    else:
        raise ValueError( "proton_selection must be 'SingleRP' or 'MultiRP', got %r" % (proton_selection,) )

    # Single explicit column assignment: a chained df[col].where(..., inplace=True)
    # acts on a temporary object when pandas Copy-on-Write is active and would
    # leave the column filled with NaN. The second arm is tested first so that it
    # takes precedence, as in the sequential replacement used previously.
    df[ "XiMuMu" ] = np.where( msk2_arm, df[ "XiMuMuMinus" ],
                               np.where( msk1_arm, df[ "XiMuMuPlus" ], np.nan ) )

    msk = ( df["InvMass"] >= 110. ) & ( df["MultiRP"] == multiRP )

    return df[ msk & ( msk1_arm | msk2_arm ) ].copy()

# Apply the baseline selection to both samples; PrepareData also adds the
# "XiMuMu" column to the frame it is given.
df_signal_prep = PrepareData(df_signal)
df_bkg_prep    = PrepareData(df_bkg)

In [None]:
print('Signal prepared',df_signal_prep.shape)

# variables given to the classifier; this list is reused by the cells below
train_vars=['Xi', 'Muon0Pt', 'Muon1Pt', 'InvMass', 'ExtraPfCands', 'Acopl', 'XiMuMu']


#draw the correlation matrix for the training variables
print(train_vars)
fig=plt.figure(figsize=(10, 10))
plt.matshow(df_signal_prep[train_vars].corr(), fignum=fig.number)
cb = plt.colorbar()
# fix the colour scale to the interval [-1,1]
plt.clim(-1,1)
plt.title('Signal correlation Matrix', fontsize=16)
plt.show()

# display the first 10 rows of the training variables
df_signal_prep[train_vars].head(10)

In [None]:
print('Background prepared',df_bkg_prep.shape)

#draw the correlation matrix for these variables
print(train_vars)
fig=plt.figure(figsize=(10, 10))
plt.matshow(df_bkg_prep[train_vars].corr(), fignum=fig.number)
cb = plt.colorbar()
# fix the colour scale to the interval [-1,1]
plt.clim(-1,1)
plt.title('Background correlation Matrix', fontsize=16)
plt.show()

# display the first 10 rows of the training variables
df_bkg_prep[train_vars].head(10)

## Model building

Having the data prepared we now turn to building a simple model using the same training variables inspected above.
`sklearn` is used to define the training and test sample and to instantiate a [DecisionTreeClassifier](https://scikit-learn.org/stable/modules/tree.html#tree).
The [AdaBoostClassifier](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html) is used to improve the base training of the DecisionTreeClassifier.

In [None]:
# feature tables: one independent copy of the training variables per sample
X_sig, X_bkg = ( sample[train_vars].copy() for sample in ( df_signal_prep, df_bkg_prep ) )

# target labels: 1 for every signal event, 0 for every background event
y_sig, y_bkg = np.ones( len(X_sig) ), np.zeros( len(X_bkg) )

# stack signal first and background second, for features and labels alike
X = pd.concat( [X_sig, X_bkg] )
y = np.concatenate( [y_sig, y_bkg] )

In [None]:
# Divide into training and testing datasets: 20% of the events are held out
# for testing; the fixed random_state makes the shuffled split reproducible.

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, shuffle=True, random_state=42 )

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier

#fit the parameters of the BDT
# The boosting `algorithm` is left at the library default: "SAMME.R" is the
# default in the scikit-learn releases that support it, while passing it
# explicitly is deprecated (1.4) and rejected (1.6 and later).
ada_clf = AdaBoostClassifier( DecisionTreeClassifier( max_depth=4 ),
                              n_estimators = 200,
                              learning_rate = 0.5)
ada_clf.fit( X_train, y_train )
# `clf` is the name used by the inspection cells below
clf = ada_clf
print ( clf )

## Inspection of the training result

In [None]:
#check the accuracy of the predictions
from sklearn.metrics import accuracy_score

# class predictions for the held-out test sample and for the complete signal and
# background samples (the complete samples include the events used for training)
y_test_pred, y_sig_pred, y_bkg_pred = ( clf.predict( sample ) for sample in ( X_test, X_sig, X_bkg ) )

print('Accuracy on the test dataset is:',accuracy_score( y_test, y_test_pred ))
print ('Accuracy on the full signal data is', accuracy_score( y_sig, y_sig_pred ) )
print ('Accuracy on the full background data is', accuracy_score( y_bkg, y_bkg_pred ) )

In [None]:
def Plot2D(x,y,data_sig,data_bkg,msk_sig,msk_bkg,xran=(0.,0.15),yran=(0.,0.15)):

    """
    A simple routine to plot the signal and background components.

    Parameters
    ----------
    x, y : str
        Names of the columns drawn on the horizontal and vertical axis.
    data_sig, data_bkg : pandas.DataFrame
        Tables from which the signal (blue) and background (red) points are taken.
    msk_sig, msk_bkg : boolean mask
        Selection applied to `data_sig` and `data_bkg`, respectively.
    xran, yran : pair of float
        Lower and upper limit of each axis.
    """

    plt.figure( figsize=(10,10) )
    plt.plot( data_bkg[x][ msk_bkg ], data_bkg[y][ msk_bkg ], 'ro', label='Background' )
    plt.plot( data_sig[x][ msk_sig ], data_sig[y][ msk_sig ], 'bo', label='Signal' )

    if 'Xi' in x and 'Xi' in y:
        # Guide lines for xi-vs-xi plots: the line joining the lower and upper
        # corners of the axis ranges (the diagonal when both ranges coincide)
        # and the same line with the vertical coordinate scaled by 0.90 / 1.10.
        plt.plot( xran,yran, 'k--', linewidth=1 )
        plt.plot( xran, [j*0.90 for j in yran], 'k:', linewidth=1 )
        plt.plot( xran, [j*1.10 for j in yran], 'k:', linewidth=1 )

    plt.xlim(*xran)
    plt.xlabel(x)
    plt.ylim(*yran)
    plt.ylabel(y)
    plt.legend(loc='best')

Test dataset: the events classified as signal correlate more strongly in the reconstructed $\xi$ plane. You can compare the result to the one obtained in the Data-Inspection notebook.

In [None]:
# split the test sample according to the predicted class (0 = background, 1 = signal)
msk_bkg = ( y_test_pred == 0 )
msk_sig = ( y_test_pred == 1 )
# both components are drawn from the same table (X_test), separated by the masks
Plot2D('Xi','XiMuMu',X_test,X_test,msk_sig,msk_bkg)

Separate the correlation per arm, keeping only the correct predictions in the signal and background datasets.

In [None]:
# The masks stay None when proton_selection matches none of the known options.
msk_bkg_1 = msk_bkg_2 = msk_sig_1 = msk_sig_2 = None
if proton_selection == "MultiRP":
    # arms are identified by the "Arm" column (0 and 1)
    msk_bkg_1, msk_bkg_2 = ( ( df_bkg_prep[ "Arm" ] == arm ) & ( y_bkg_pred == 0 ) for arm in ( 0, 1 ) )
    msk_sig_1, msk_sig_2 = ( ( df_signal_prep[ "Arm" ] == arm ) & ( y_sig_pred == 1 ) for arm in ( 0, 1 ) )
elif proton_selection == "SingleRP":
    # arms are identified by the "RPId1" column (23 and 123)
    msk_bkg_1, msk_bkg_2 = ( ( df_bkg_prep[ "RPId1" ] == rp ) & ( y_bkg_pred == 0 ) for rp in ( 23, 123 ) )
    msk_sig_1, msk_sig_2 = ( ( df_signal_prep[ "RPId1" ] == rp ) & ( y_sig_pred == 1 ) for rp in ( 23, 123 ) )

# correctly classified signal and background, one figure per arm
Plot2D('Xi','XiMuMuPlus',df_signal_prep,df_bkg_prep,msk_sig_1,msk_bkg_1)
Plot2D('Xi','XiMuMuMinus',df_signal_prep,df_bkg_prep,msk_sig_2,msk_bkg_2)

Analogous to the previous set of plots, but now we plot the background events that were classified as signal in the prediction.

In [None]:
# The masks stay None when proton_selection matches none of the known options.
msk_bkgerr_1 = msk_bkgerr_2 = None
if proton_selection == "MultiRP":
    msk_bkgerr_1, msk_bkgerr_2 = ( ( df_bkg_prep[ "Arm" ] == arm ) & ( y_bkg_pred == 1 ) for arm in ( 0, 1 ) )
elif proton_selection == "SingleRP":
    msk_bkgerr_1, msk_bkgerr_2 = ( ( df_bkg_prep[ "RPId1" ] == rp ) & ( y_bkg_pred == 1 ) for rp in ( 23, 123 ) )

# background events predicted as signal are drawn with the 'Signal' style,
# correctly predicted background events with the 'Background' style
Plot2D('Xi','XiMuMuPlus', df_bkg_prep,df_bkg_prep, msk_bkgerr_1, msk_bkg_1)
Plot2D('Xi','XiMuMuMinus',df_bkg_prep,df_bkg_prep, msk_bkgerr_2, msk_bkg_2)

Analogous distributions, but now using the signal events that were misclassified as background.

In [None]:
# The masks stay None when proton_selection matches none of the known options.
msk_sigerr_1 = msk_sigerr_2 = None
if proton_selection == "MultiRP":
    msk_sigerr_1, msk_sigerr_2 = ( ( df_signal_prep[ "Arm" ] == arm ) & ( y_sig_pred == 0 ) for arm in ( 0, 1 ) )
elif proton_selection == "SingleRP":
    msk_sigerr_1, msk_sigerr_2 = ( ( df_signal_prep[ "RPId1" ] == rp ) & ( y_sig_pred == 0 ) for rp in ( 23, 123 ) )

# signal events predicted as background are passed in the 'Signal' slot of
# Plot2D, correctly predicted signal events in the 'Background' slot
Plot2D('Xi','XiMuMuPlus', df_signal_prep,df_signal_prep, msk_sigerr_1, msk_sig_1)
Plot2D('Xi','XiMuMuMinus',df_signal_prep,df_signal_prep, msk_sigerr_2, msk_sig_2)

Compare the training variable distributions for events classified as signal or background in the test sample.

In [None]:
def showDist(x,data,mask_sig, mask_bkg,nbins):

    """
    A simple function to compare signal-like and background-like distributions.

    Parameters
    ----------
    x : str
        Name of the column to histogram.
    data : pandas.DataFrame
        Table holding the column `x`.
    mask_sig, mask_bkg : boolean mask
        Selection of the events shown as 'Signal' and as 'Background'.
    nbins : int
        Number of bins; the bin edges are computed from the full column so that
        both histograms share the same binning.
    """

    plt.figure( figsize=(6,6) )
    _,bins=np.histogram(data[x],bins=nbins)
    plt.hist( data[ x ][ mask_sig ], histtype='step', bins=bins, density=True, label='Signal' )
    plt.hist( data[ x ][ mask_bkg ], histtype='step', bins=bins, density=True, label='Background' )
    plt.xlabel(x)
    # without a legend the labels given above are never displayed
    plt.legend(loc='best')
    plt.show()
    
# one figure per training variable, splitting the test sample by predicted class, with 50 bins
for x in train_vars:
    showDist(x,X_test,(y_test_pred==1),(y_test_pred==0),50)

In [None]:
# NOTE: the next line is not valid Python. According to its own text it was
# left here on purpose so that execution stops at this cell.
pfs - i stopped here this will crash on purpose

# per-class probabilities predicted for the test sample
y_test_probs = clf.predict_proba( X_test )

In [None]:
# Distributions of the predicted probabilities for the test sample.
# NOTE(review): columns 0 and 1 presumably correspond to the labels 0 (background)
# and 1 (signal), following the order of clf.classes_ — confirm.
fig = plt.figure( figsize=(10,10) )
# all test events: column 0 (orange) and column 1 (sky blue)
plt.hist( y_test_probs[:,0], color='orange', bins=30, range=(0.,1.) )
plt.hist( y_test_probs[:,1], color='skyblue', bins=30, range=(0.,1.) )
# only events with true label 0 (red, column 0) or true label 1 (blue, column 1)
plt.hist( y_test_probs[:,0][ y_test == 0 ], color='red', bins=30, range=(0.,1.) )
plt.hist( y_test_probs[:,1][ y_test == 1 ], color='blue', bins=30, range=(0.,1.) )

## Data

In [None]:
# Read the data files chunk by chunk, printing the file content along the way.
# The results are collected in df_data_list (one DataFrame per chunk) and
# df_counts_data_list (one Series of selection counters per file).
# NOTE(review): this repeats the steps of GetData with additional printouts.
df_data_list = []
df_counts_data_list = []

# NOTE(review): these are relative paths, unlike the PATH-based names used for
# the samples loaded above — confirm the files are available at this location.
fileNames = [
    'output/output-UL2017B-PreSel.h5',
    'output/output-UL2017C1-PreSel.h5',
    'output/output-UL2017D-PreSel.h5',
    'output/output-UL2017F1-PreSel.h5'
]

for file_ in fileNames:
    with h5py.File( file_, 'r' ) as f:
        print ( f )
        print ( list(f.keys()) )
        dset = f['protons']
        print ( dset.shape )
        # NOTE(review): dset[:,:] requests the complete dataset just for this
        # printout, although the rows are read in chunks below.
        print ( dset[:,:] )
        
        # column names are decoded from bytes to str
        dset_columns = f['columns']
        print ( dset_columns.shape )
        columns = list( dset_columns )
        print ( columns )
        columns_str = [ item.decode("utf-8") for item in columns ]
        print ( columns_str )

        # names of the selection steps, used as index of the counters
        dset_selections = f['selections']
        selections_ = [ item.decode("utf-8") for item in dset_selections ]
        print ( selections_ )  
        
        dset_counts = f['event_counts']
        df_counts_data_list.append( pd.Series( dset_counts, index=selections_ ) )
        print ( df_counts_data_list[-1] )
        
        # [start,stop) row ranges of at most chunk_size rows each
        chunk_size = 1000000
        entries = dset.shape[0]
        start_ = list( range( 0, entries, chunk_size ) )
        stop_ = start_[1:]
        stop_.append( entries )
        print ( start_ )
        print ( stop_ )
        for idx in range( len( start_ ) ):
            print ( start_[idx], stop_[idx] )
            #print ( dset[ start_[idx] : stop_[idx] ] )
            df_ = pd.DataFrame( dset[ start_[idx] : stop_[idx] ], columns=columns_str )
            # keep the columns of interest and cast the listed ones to integers
            df_ = df_[ ['Run', 'LumiSection', 'EventNum', 'CrossingAngle', 
                        'MultiRP', 'Arm', 'RPId1', 'RPId2', 'TrackX1', 'TrackY1', 'TrackX2', 'TrackY2',
                        'Xi', 'T', 'ThX', 'ThY', 'Time',
                        'Muon0Pt', 'Muon1Pt', 'InvMass', 'ExtraPfCands', 'Acopl', 'XiMuMuPlus', 'XiMuMuMinus'] ].astype( { "Run": "int64", "LumiSection": "int64", "EventNum": "int64", "MultiRP": "int32", "Arm": "int32", "RPId1": "int32", "RPId2": "int32", "ExtraPfCands": "int32" } )
            df_data_list.append( df_ )
            print ( df_data_list[-1].head() )
            print ( len( df_data_list[-1] ) )
        

In [None]:
# add up the selection counters of all files, starting from the first one
df_counts_data = df_counts_data_list[0]
for file_counts in df_counts_data_list[1:]:
    df_counts_data = df_counts_data.add( file_counts )
df_counts_data

In [None]:
# Merge all chunks in a single table.
# NOTE(review): every chunk was built with its own default index, so the index
# labels repeat across chunks; consider ignore_index=True if unique labels are needed.
df_data = pd.concat( df_data_list )

In [None]:
# Baseline selection for the data, mirroring the one applied to the training samples.
msk_data = ( df_data["InvMass"] >= 110. )

# The "XiMuMu" column is filled with one explicit assignment: a chained
# df[col].where(..., inplace=True) acts on a temporary object when pandas
# Copy-on-Write is active and would leave the column filled with NaN.
# The second arm is tested first so that it takes precedence, as in the
# sequential replacement used previously.
msk1_data = None
msk2_data = None
if proton_selection == "SingleRP":
    # Single-RP in pixel stations
    msk1_arm = ( df_data["RPId1"] == 23 )
    msk2_arm = ( df_data["RPId1"] == 123 )
    df_data[ "XiMuMu" ] = np.where( msk2_arm, df_data[ "XiMuMuMinus" ],
                                    np.where( msk1_arm, df_data[ "XiMuMuPlus" ], np.nan ) )
    msk1_data = msk_data & ( df_data["MultiRP"] == 0) & msk1_arm
    msk2_data = msk_data & ( df_data["MultiRP"] == 0) & msk2_arm
elif proton_selection == "MultiRP":
    # Multi-RP
    msk1_arm = ( df_data["Arm"] == 0 )
    msk2_arm = ( df_data["Arm"] == 1 )
    df_data[ "XiMuMu" ] = np.where( msk2_arm, df_data[ "XiMuMuMinus" ],
                                    np.where( msk1_arm, df_data[ "XiMuMuPlus" ], np.nan ) )
    msk1_data = msk_data & ( df_data["MultiRP"] == 1 ) & msk1_arm
    msk2_data = msk_data & ( df_data["MultiRP"] == 1 ) & msk2_arm

# keep an independent copy of the selected rows so that later column
# assignments do not act on a slice of the full table
df_data = df_data[ msk1_data | msk2_data ].copy()
df_data[:20]

In [None]:
# Use the same variables, in the same order, as the ones the classifier was
# trained with (train_vars) instead of repeating the list by hand.
X_data = df_data[ train_vars ]
X_data[:20]

In [None]:
# classify the selected data events with the trained classifier
# (labels used in training: 1 = signal, 0 = background)
y_data = clf.predict( X_data )
y_data

In [None]:
fig, axes = plt.subplots( 1, 2, figsize=(20,10) )

# Per-arm masks of the data events predicted as background (0) or signal (1);
# they stay None when proton_selection matches none of the known options.
msk_bkg_1 = msk_bkg_2 = msk_sig_1 = msk_sig_2 = None
if proton_selection == "MultiRP":
    msk_bkg_1, msk_bkg_2 = ( ( df_data[ "Arm" ] == arm ) & ( y_data == 0 ) for arm in ( 0, 1 ) )
    msk_sig_1, msk_sig_2 = ( ( df_data[ "Arm" ] == arm ) & ( y_data == 1 ) for arm in ( 0, 1 ) )
elif proton_selection == "SingleRP":
    msk_bkg_1, msk_bkg_2 = ( ( df_data[ "RPId1" ] == rp ) & ( y_data == 0 ) for rp in ( 23, 123 ) )
    msk_sig_1, msk_sig_2 = ( ( df_data[ "RPId1" ] == rp ) & ( y_data == 1 ) for rp in ( 23, 123 ) )

# left panel: first arm vs XiMuMuPlus; right panel: second arm vs XiMuMuMinus
panels = ( ( axes[0], "XiMuMuPlus",  msk_bkg_1, msk_sig_1 ),
           ( axes[1], "XiMuMuMinus", msk_bkg_2, msk_sig_2 ) )
for panel, xi_mumu, bkg_like, sig_like in panels:
    panel.plot( df_data[ "Xi" ][ bkg_like ], df_data[ xi_mumu ][ bkg_like ], 'ro' )
    panel.plot( df_data[ "Xi" ][ sig_like ], df_data[ xi_mumu ][ sig_like ], 'bo' )
    # diagonal and its -10% / +10% variations
    panel.plot( (0.,0.15), (0.,0.15), 'k--', linewidth=1 )
    panel.plot( (0.,0.15), (0.,0.90*0.15), 'k:', linewidth=1 )
    panel.plot( (0.,0.15), (0.,1.10*0.15), 'k:', linewidth=1 )
    panel.set_xlim(0.,0.15)
    panel.set_ylim(0.,0.15)

In [None]:
# normalised "Acopl" distribution of the data events predicted as
# signal (sky blue) and as background (orange)
fig = plt.figure( figsize=(10,10) )
plt.hist( df_data[ "Acopl" ][ y_data == 1 ], histtype='step', color='skyblue', bins=50, range=(0.,0.02), density=True )
plt.hist( df_data[ "Acopl" ][ y_data == 0 ], histtype='step', color='orange', bins=50, range=(0.,0.02), density=True )

In [None]:
# normalised "ExtraPfCands" distribution of the data events predicted as
# signal (sky blue) and as background (orange)
fig = plt.figure( figsize=(10,10) )
plt.hist( df_data[ "ExtraPfCands" ][ y_data == 1 ], histtype='step', color='skyblue', bins=50, range=(0,50), density=True )
plt.hist( df_data[ "ExtraPfCands" ][ y_data == 0 ], histtype='step', color='orange', bins=50, range=(0,50), density=True )

In [None]:
fig, axes = plt.subplots( 1, 2, figsize=(20,10) )

# Distributions that can be inspected: variable -> ( number of bins, range ).
# Change `var_` to look at a different one.
plot_settings = {
    "Xi":           ( 10, (0.,0.2) ),
    "ThX":          ( 10, (-0.0005,0.0005) ),
    "ThY":          ( 10, (-0.0005,0.0005) ),
    "T":            ( 10, (-4.,0.) ),
    "Time":         ( 20, (-0.5,0.5) ),
    "Muon0Pt":      ( 10, (50.,200.) ),
    "InvMass":      ( 10, (110.,500.) ),
    "Acopl":        ( 10, (0.,0.01) ),
    "ExtraPfCands": ( 20, (0,20) ),
}
var_ = "Xi"
bins_, range_ = plot_settings[ var_ ]

# Factor by which the background sample is scaled down.
# NOTE(review): 20 is inferred from the "Res20" suffix of the background file
# names — confirm the value used to produce those files.
resample_factor = 20

# data events classified as signal, per arm
msk_data_1 = None
msk_data_2 = None
if proton_selection == "SingleRP":
    msk_data_1 = ( df_data[ "RPId1" ] == 23 ) & ( y_data == 1 )
    msk_data_2 = ( df_data[ "RPId1" ] == 123 ) & ( y_data == 1 )
elif proton_selection == "MultiRP":
    msk_data_1 = ( df_data[ "Arm" ] == 0 ) & ( y_data == 1 )
    msk_data_2 = ( df_data[ "Arm" ] == 1 ) & ( y_data == 1 )

# data points with sqrt(N) uncertainties
counts_1, bin_edges_1 = np.histogram( df_data[ var_ ][ msk_data_1 ], bins=bins_, range=range_ )
errors_1 = np.sqrt( counts_1 )
bin_centres_1 = ( bin_edges_1[:-1] + bin_edges_1[1:] ) / 2.
axes[0].errorbar(bin_centres_1, counts_1, yerr=errors_1, fmt='o')

counts_2, bin_edges_2 = np.histogram( df_data[ var_ ][ msk_data_2 ], bins=bins_, range=range_ )
errors_2 = np.sqrt( counts_2 )
bin_centres_2 = ( bin_edges_2[:-1] + bin_edges_2[1:] ) / 2.
axes[1].errorbar(bin_centres_2, counts_2, yerr=errors_2, fmt='o')

# common vertical range: highest bin content plus twice its uncertainty
counts_all = np.concatenate( [counts_1, counts_2] )
errors_all = np.concatenate( [errors_1, errors_2] )
idx_ymax_ = np.argmax( counts_all )
y_max = counts_all[idx_ymax_] + 2*errors_all[idx_ymax_]
print ( "y max. = {}".format(y_max) )

# Background events classified as signal, per arm. y_bkg_pred was computed for
# the prepared background (df_bkg_prep), so the masks and the histogrammed
# values must come from that table and not from the unfiltered df_bkg.
msk_bkg_pred_1 = None
msk_bkg_pred_2 = None
if proton_selection == "SingleRP":
    msk_bkg_pred_1 = ( df_bkg_prep[ "RPId1" ] == 23 ) & ( y_bkg_pred == 1 )
    msk_bkg_pred_2 = ( df_bkg_prep[ "RPId1" ] == 123 ) & ( y_bkg_pred == 1 )
elif proton_selection == "MultiRP":
    msk_bkg_pred_1 = ( df_bkg_prep[ "Arm" ] == 0 ) & ( y_bkg_pred == 1 )
    msk_bkg_pred_2 = ( df_bkg_prep[ "Arm" ] == 1 ) & ( y_bkg_pred == 1 )

# every background event is weighted by 1/resample_factor
weights_1 = None
weights_2 = None
if resample_factor > 1:
    weights_1 = np.full_like( df_bkg_prep[ "Xi" ][ msk_bkg_pred_1 ], ( 1./resample_factor ) )
    weights_2 = np.full_like( df_bkg_prep[ "Xi" ][ msk_bkg_pred_2 ], ( 1./resample_factor ) )

axes[0].hist( df_bkg_prep[ var_ ][ msk_bkg_pred_1 ], bins=bins_, range=range_, weights=weights_1 )
axes[1].hist( df_bkg_prep[ var_ ][ msk_bkg_pred_2 ], bins=bins_, range=range_, weights=weights_2 )

axes[0].set_ylim( top=y_max )
axes[1].set_ylim( top=y_max )

In [None]:
# Distribution of 1 - Xi/XiMuMu for the data events classified as signal
# (points) compared to the background events classified as signal (histogram).
# The masks msk_data_*, msk_bkg_pred_* and the weights_* come from the previous cell.
fig, axes = plt.subplots( 1, 2, figsize=(20,10) )

bins_ = 20
range_ = (-1.,1.)

vals_data = ( 1. - df_data[ "Xi" ] / df_data[ "XiMuMu" ] )

# data points with sqrt(N) uncertainties
counts_1, bin_edges_1 = np.histogram( vals_data[ msk_data_1 ], bins=bins_, range=range_ )
errors_1 = np.sqrt( counts_1 )
bin_centres_1 = ( bin_edges_1[:-1] + bin_edges_1[1:] ) / 2.
axes[0].errorbar(bin_centres_1, counts_1, yerr=errors_1, fmt='o')

counts_2, bin_edges_2 = np.histogram( vals_data[ msk_data_2 ], bins=bins_, range=range_ )
errors_2 = np.sqrt( counts_2 )
bin_centres_2 = ( bin_edges_2[:-1] + bin_edges_2[1:] ) / 2.
axes[1].errorbar(bin_centres_2, counts_2, yerr=errors_2, fmt='o')

# common vertical range: highest bin content plus twice its uncertainty
counts_all = np.concatenate( [counts_1, counts_2] )
errors_all = np.concatenate( [errors_1, errors_2] )
idx_ymax_ = np.argmax( counts_all )
y_max = counts_all[idx_ymax_] + 2*errors_all[idx_ymax_]
print ( "y max. = {}".format(y_max) )

# The background prediction (y_bkg_pred) refers to the prepared background, so
# the values are taken from df_bkg_prep rather than from the unfiltered df_bkg.
vals_bkg = ( 1. - df_bkg_prep[ "Xi" ] / df_bkg_prep[ "XiMuMu" ] )

axes[0].hist( vals_bkg[ msk_bkg_pred_1 ], bins=bins_, range=range_, weights=weights_1 )
axes[1].hist( vals_bkg[ msk_bkg_pred_2 ], bins=bins_, range=range_, weights=weights_2 )

axes[0].set_ylim( top=y_max )
axes[1].set_ylim( top=y_max )