In [None]:
import uproot

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import seaborn as sns

import sys

In [None]:
# Font sizes applied to every figure drawn in this notebook.
FONT_SIZES = {
    "axes.labelsize": 14,
    "xtick.labelsize": 12,
    "ytick.labelsize": 12,
    "legend.fontsize": 12,
    "figure.titlesize": 18,
}
for rc_key, rc_value in FONT_SIZES.items():
    plt.rcParams[rc_key] = rc_value

In [None]:
# Put the project's Python-function directory on the import path.
# The membership check keeps the cell idempotent: re-running it no longer
# appends a duplicate entry to sys.path each time.
HELPER_DIR = "/home/belle2/amubarak/Ds2D0enue_Analysis/07-Python_Functions/"
if HELPER_DIR not in sys.path:
    sys.path.append(HELPER_DIR)

# Prep-Work

### Import Data

In [None]:
# Samples handled in this notebook: the signal sample plus the generic ones.
# More background samples can be added to this list later.
samples = ["Signal","All","ccbar"]

DataFrames = {}  # sample name -> pandas DataFrame read from the "Dstree" tree
Date = "0406"
Attempt = "0"

# Signal: three files from the simulated-events directory.
signal_dir = "/home/belle2/amubarak/C01-Simulated_Events/"
signal_files = {
    "Signal": "Ds2D0enu-Signal.root",
    "True": "Ds2D0enu-Signal_True.root",
    "False": "Ds2D0enu-Signal_False.root",
}
for sample_key, file_name in signal_files.items():
    DataFrames[sample_key] = uproot.concatenate(f"{signal_dir}{file_name}:Dstree", library='pd')

# Background: every entry of `samples` after the first, read from the grid output.
generic_prefix = "/group/belle2/users2022/amubarak/02-Grid/Completed/Ds2D0e-Generic_Ds_"
for s in samples[1:]:
    DataFrames[s] = uproot.concatenate(f"{generic_prefix}{Date}25_{Attempt}_{s}.root:Dstree", library='pd')

In [None]:
# Raise pandas' display limits so long/wide frames are not truncated.
for display_option in ("display.max_rows", "display.max_columns"):
    pd.set_option(display_option, 200000)

The cell below lists the available variables (the column names of the signal DataFrame).

In [None]:
# Show every column name of the signal sample as a plain Python list.
list(DataFrames["Signal"].columns)

### Setup
The code below is used to apply cuts to the data.  
The range of each plot is set separately in its own plotting cell.

In [None]:
# Electron ID
#-------------------
# DataFrames["Signal"] = DataFrames["Signal"][DataFrames["Signal"]['e_electronID']>=0.95]
# DataFrames["ccbar"] = DataFrames["ccbar"][DataFrames["ccbar"]['e_electronID']>=0.95]
# DataFrames["Signal"] = DataFrames["Signal"][DataFrames["Signal"]['Ds_gammaveto_em_electronID']>=0.95]
# DataFrames["ccbar"] = DataFrames["ccbar"][DataFrames["ccbar"]['Ds_gammaveto_em_electronID']>=0.95]

# Fake D0 Suppression
#------------------------
# DataFrames["Signal"] = DataFrames["Signal"][DataFrames["Signal"]['Ds_extraInfo_FastBDT']>=0.58]
# DataFrames["All"] = DataFrames["All"][DataFrames["All"]['Ds_extraInfo_FastBDT']>=0.58]

# Peaking Background Removal
#----------------------------
# DataFrames["ccbar"] = DataFrames["ccbar"][(DataFrames["ccbar"]['Ds_diff_D0pi']>=0.15)]
# DataFrames["Signal"] = DataFrames["Signal"][(DataFrames["Signal"]['Ds_diff_D0pi']>=0.15)]

# Photon Conversion
#-------------------
# DataFrames["Signal"] = DataFrames["Signal"][DataFrames["Signal"]['Ds_gammaveto_M_Correction']<=0.1]
# DataFrames["All"] = DataFrames["All"][DataFrames["All"]['Ds_gammaveto_M_Correction']<=0.1]

# # Vertex Fitting
# #----------------
# DataFrames["Signal"] = DataFrames["Signal"][DataFrames["Signal"]['Ds_chiProb']>=0.01]
# DataFrames["ccbar"] = DataFrames["ccbar"][DataFrames["ccbar"]['Ds_chiProb']>=0.01]

# Vertex Fit
#----------------
# DataFrames["Signal"] = DataFrames["Signal"][DataFrames["Signal"]['Ds_chiProb_Ds_rank']==1]
# DataFrames["All"] = DataFrames["All"][DataFrames["All"]['Ds_chiProb_Ds_rank']==1]

# Single Variable  
The cells below take a closer look at individual variables.

In [None]:
# Histogram configuration
Stacked = True
Density = False
Bins = 5
var = '__ncandidates__'
Range = [0.0, 5.0]
BD = -1

# `var` is an integer count, so one bin is centred on each integer by placing
# the edges on half-integers (0.5, 1.5, ..., 5.5). With edges on the integers
# the last bin is closed on both sides, which merges events with 4 and 5
# candidates, and the bar for n candidates is drawn between n and n+1.
bin_edges = np.linspace(Range[0] + 0.5, Range[1] + 0.5, Bins + 1)
bin_centers = 0.5 * (bin_edges[:-1] + bin_edges[1:])
perBin = (bin_edges[-1] - bin_edges[0]) / Bins
# The x-axis is a candidate count, so the width is reported in candidates, not MeV.
print("Width Per Bin: {width:.0f} candidate".format(width = perBin))

label1= r'$isSignal(D_s^{+}) \neq 1$'
label2= r'$isSignal(D_s^{+})=1$'

labels=[label1,label2]
colors=["#d62728","#1f77b4"]

# One series per legend entry, in the same order as `labels`.
data = [
    DataFrames["False"][var],
    DataFrames["True"][var],
       ]

# Explicit bin edges replace the (bins, range) pair used previously.
plt.hist(data, color=colors, label=labels, alpha=1, linewidth=2, stacked=Stacked, density=Density, bins=bin_edges, histtype='step')
plt.xticks(bin_centers)  # one tick per integer candidate count

# Title
#---------
plt.title(r'$2M\;Events$', loc = "left")
plt.title(r'$\bf Signal\;Events$', loc = "right")
# Label
#---------
plt.ylabel(r'$Entries$')
plt.xlabel(r'# of Candidates Per Event')
# plt.yscale("log")
plt.legend()
plt.show()

In [None]:
# --- Histogram configuration ---------------------------------------------
Stacked = False
Density = True
Bins = 50
var = 'Ds_MminusMtrue_D0e_emass'
Range = [-0.1, 0.1]
BD = -1
bin_width = (Range[1] - Range[0])/Bins   # width in the units of `Range`
perBin = bin_width*1000                  # same width scaled by 1000, printed as MeV
print(f"Width Per Bin: {perBin:.2f} MeV")

# --- Legend entries and line colours -------------------------------------
label1 = r'Best Candidate'
label2 = r'Rejected Candidate'
labels = [label1, label2]
colors = ["#1f77b4", "#9467bd"]

# --- Data: events with >= 2 candidates, split on the random rank ----------
frame = DataFrames["True"]
multi_candidate = frame['__ncandidates__'] >= 2
random_rank = frame['Ds_random_Ds_rank']
data = [
    frame.loc[multi_candidate & (random_rank == 1), var],   # rank 1 -> "Best Candidate"
    frame.loc[multi_candidate & (random_rank > 1), var],    # rank > 1 -> "Rejected Candidate"
]

# --- Draw -----------------------------------------------------------------
fig, ax = plt.subplots()
ax.hist(data, bins=Bins, range=Range, density=Density, stacked=Stacked,
        histtype='step', linewidth=2, alpha=1, color=colors, label=labels)

# Left title: selection shown; right title: sample.
ax.set_title(r'BCS: random, isSignal($D_s^{+}$)=1', loc="left")
ax.set_title(r'$\bf Signal\;Events$', loc="right")
# NOTE(review): with density=True the y-axis is a normalised density while the
# label reads "Entries/(...)" - confirm which one is intended.
ax.set_ylabel(rf'$Entries/(\; {perBin:.2f}\;MeV/c^2)$')
ax.set_xlabel(r'$m(D^{0} e^{+}) - m_{True}(D^{0} e^{+}) [GeV/c^{2}]$')
# Optional: ax.set_yscale("log")
ax.legend()
plt.show()

In [None]:
# --- Histogram configuration ---------------------------------------------
Stacked = False
Density = True
Bins = 50
var = 'Ds_MminusMtrue_D0e_emass'
Range = [-0.1, 0.1]
BD = -1
bin_width = (Range[1] - Range[0])/Bins   # width in the units of `Range`
perBin = bin_width*1000                  # same width scaled by 1000, printed as MeV
print(f"Width Per Bin: {perBin:.2f} MeV")

# --- Legend entries and line colours -------------------------------------
label1 = r'Best Candidate'
label2 = r'Rejected Candidate'
labels = [label1, label2]
colors = ["#1f77b4", "#9467bd"]

# --- Data: events with >= 2 candidates, split on the random rank ----------
frame = DataFrames["Signal"]
multi_candidate = frame['__ncandidates__'] >= 2
random_rank = frame['Ds_random_Ds_rank']
data = [
    frame.loc[multi_candidate & (random_rank == 1), var],   # rank 1 -> "Best Candidate"
    frame.loc[multi_candidate & (random_rank > 1), var],    # rank > 1 -> "Rejected Candidate"
]

# --- Draw -----------------------------------------------------------------
fig, ax = plt.subplots()
ax.hist(data, bins=Bins, range=Range, density=Density, stacked=Stacked,
        histtype='step', linewidth=2, alpha=1, color=colors, label=labels)

# Left title: selection shown; right title: sample.
ax.set_title(r'BCS: random', loc="left")
ax.set_title(r'$\bf Signal\;Events$', loc="right")
# NOTE(review): with density=True the y-axis is a normalised density while the
# label reads "Entries/(...)" - confirm which one is intended.
ax.set_ylabel(rf'$Entries/(\; {perBin:.2f}\;MeV/c^2)$')
ax.set_xlabel(r'$m(D^{0} e^{+}) - m_{True}(D^{0} e^{+}) [GeV/c^{2}]$')
# Optional: ax.set_yscale("log")
ax.legend()
plt.show()

In [None]:
# --- Histogram configuration ---------------------------------------------
Stacked = False
Density = True
Bins = 50
var = 'Ds_MminusMtrue_D0e_emass'
Range = [-0.1, 0.1]
BD = -1
bin_width = (Range[1] - Range[0])/Bins   # width in the units of `Range`
perBin = bin_width*1000                  # same width scaled by 1000, printed as MeV
print(f"Width Per Bin: {perBin:.2f} MeV")

# --- Legend entries and line colours -------------------------------------
label1 = r'isCurl($e^{+}$)=1'
label2 = r'isCurl($e^{+}$)=0'
labels = [label1, label2]
colors = ["#1f77b4", "#9467bd"]

# --- Data: events with >= 2 candidates, split on the e_isCurl flag --------
frame = DataFrames["Signal"]
multi_candidate = frame['__ncandidates__'] >= 2
curl_flag = frame['e_isCurl']
data = [
    frame.loc[multi_candidate & (curl_flag == 1), var],
    frame.loc[multi_candidate & (curl_flag == 0), var],
]

# --- Draw -----------------------------------------------------------------
fig, ax = plt.subplots()
ax.hist(data, bins=Bins, range=Range, density=Density, stacked=Stacked,
        histtype='step', linewidth=2, alpha=1, color=colors, label=labels)

# Only the right-hand (sample) title is set for this figure.
ax.set_title(r'$\bf Signal\;Events$', loc="right")
# NOTE(review): with density=True the y-axis is a normalised density while the
# label reads "Entries/(...)" - confirm which one is intended.
ax.set_ylabel(rf'$Entries/(\; {perBin:.2f}\;MeV/c^2)$')
ax.set_xlabel(r'$m(D^{0} e^{+}) - m_{True}(D^{0} e^{+}) [GeV/c^{2}]$')
# Optional: ax.set_yscale("log")
ax.legend()
plt.show()

In [None]:
# --- Histogram configuration ---------------------------------------------
Stacked = False
Density = False
Bins = 4
var = 'e_isCurl'
Range = [0.0, 1.0]
BD = -1
bin_width = (Range[1] - Range[0])/Bins
perBin = bin_width*1000
# NOTE(review): the message below reports the width in MeV although `var` is
# the e_isCurl flag - confirm whether this print is still wanted here.
print(f"Width Per Bin: {perBin:.2f} MeV")

# --- Legend entry and line colour (the legend itself is not drawn) --------
label1 = r'Best Candidate'
labels = [label1]
colors = ["#1f77b4"]

# --- Data: the flag for every row of the signal sample --------------------
data = [DataFrames["Signal"][var]]

# --- Draw -----------------------------------------------------------------
fig, ax = plt.subplots()
ax.hist(data, bins=Bins, range=Range, density=Density, stacked=Stacked,
        histtype='step', linewidth=2, alpha=1, color=colors, label=labels)

ax.set_title(r'$\bf Signal\;Events$', loc="right")
ax.set_ylabel(r'$Entries$')
ax.set_xlabel(r'$isCurl(e^{+})$')
ax.set_yscale("log")
plt.show()

In [None]:
# --- Histogram configuration ---------------------------------------------
Stacked = False
Density = False
Bins = 4
var = 'e_isCurl'
Range = [0.0, 1.0]
BD = -1
bin_width = (Range[1] - Range[0])/Bins
perBin = bin_width*1000
# NOTE(review): the message below reports the width in MeV although `var` is
# the e_isCurl flag - confirm whether this print is still wanted here.
print(f"Width Per Bin: {perBin:.2f} MeV")

# --- Legend entries and line colour (the legend itself is not drawn) ------
label1 = r'Best Candidate'
label2 = r'Rejected Candidate'   # defined but not used by this figure
labels = [label1]
colors = ["#d62728"]

# --- Data: the flag for every row of the "All" sample ---------------------
data = [DataFrames["All"][var]]

# --- Draw -----------------------------------------------------------------
fig, ax = plt.subplots()
ax.hist(data, bins=Bins, range=Range, density=Density, stacked=Stacked,
        histtype='step', linewidth=2, alpha=1, color=colors, label=labels)

# Left title: integrated luminosity; right title: sample.
ax.set_title(r'$\int\mathcal{L}dt\approx\;200$ fb$^{-1}$', loc="left")
ax.set_title(r'$\bf Generic\;Events$', loc="right")
ax.set_ylabel(r'$Entries$')
ax.set_xlabel(r'$isCurl(e^{+})$')
ax.set_yscale("log")
plt.show()

In [None]:
# Row counts per e_isCurl value, then per (__ncandidates__, e_isCurl) pair,
# computed on absolute values and keeping NaN as its own category.
for count_columns in (['e_isCurl'], ['__ncandidates__','e_isCurl']):
    print(DataFrames["Signal"][count_columns].abs().value_counts(normalize=False, dropna=False))

In [None]:
# Row counts per e_isCurl value, then per (Ds_random_Ds_rank, e_isCurl) pair,
# computed on absolute values and keeping NaN as its own category.
for count_columns in (['e_isCurl'], ['Ds_random_Ds_rank','e_isCurl']):
    print(DataFrames["Signal"][count_columns].abs().value_counts(normalize=False, dropna=False))

In [None]:
# Row counts per (Ds_random_Ds_rank, e_isCurl) pair in the "All" sample,
# computed on absolute values and keeping NaN as its own category.
rank_and_curl = DataFrames["All"][['Ds_random_Ds_rank','e_isCurl']].abs()
print(rank_and_curl.value_counts(normalize=False, dropna=False))

# Accuracy

In [None]:
def compute_selection_accuracy_signal(
    df,
    is_signal_column='Ds_ifNANgiveX_isSignal_0',
    event_column='__event__',
    rank_column=None,
    selection_variable=None,
    pick_max=True,
    n_candidate_column='__ncandidates__'  # Optional helper
):
    """
    Compute how often the chosen ("best") candidate of an event is a signal one.

    Only events with >= 2 candidates and at least one signal candidate
    (is_signal_column == 1) are counted.

    Parameters:
    - df: DataFrame with one row per candidate
    - is_signal_column: column holding the signal flag (1 = signal)
    - event_column: column (or list of columns) used to group rows into events
    - rank_column: if given, the candidate with value == 1 in this column is the
      chosen one; takes precedence over selection_variable
    - selection_variable: if given (and rank_column is not), the candidate with
      the highest/lowest value of this column is the chosen one; NaN values
      are ignored
    - pick_max: if True, choose the maximum of selection_variable, else the minimum
    - n_candidate_column: column storing the number of candidates per event; when
      it is not present in df, the number of rows of the event is used instead

    Returns:
    - accuracy (float): correct_picks / total_signal_events, 0.0 if no event counted
    - total_signal_events (int)
    - correct_picks (int)

    An event for which no candidate can be chosen (no row with rank 1, or every
    value of selection_variable is NaN) still counts in total_signal_events but
    never as a correct pick.
    """
    assert rank_column or selection_variable, "Must provide either rank_column or selection_variable."

    # Loop-invariant: every group shares the columns of df.
    has_count_column = n_candidate_column in df.columns

    correct_picks = 0
    total_signal_events = 0

    for _, group in df.groupby(event_column):
        # Require at least 2 candidates
        n_cands = group[n_candidate_column].iloc[0] if has_count_column else len(group)
        if n_cands < 2:
            continue

        # Require at least one signal candidate
        is_signal = group[is_signal_column] == 1
        if not is_signal.any():
            continue

        total_signal_events += 1

        # Choose best candidate
        if rank_column:
            chosen = is_signal[group[rank_column] == 1]
        else:
            # Drop NaN first: idxmax/idxmin on an all-NaN series yields NaN or
            # raises (depending on the pandas version), which used to abort the loop.
            scores = group[selection_variable].dropna()
            if scores.empty:
                continue  # no valid best candidate
            best_idx = scores.idxmax() if pick_max else scores.idxmin()
            chosen = is_signal.loc[[best_idx]]

        if chosen.empty:
            continue  # no valid best candidate

        # On ties (several rows selected) the first row decides, as before.
        if chosen.iloc[0]:
            correct_picks += 1

    accuracy = correct_picks / total_signal_events if total_signal_events > 0 else 0.0
    return accuracy, total_signal_events, correct_picks

def report_selection_accuracy(df, **kwargs):
    """
    Print a short report of the best-candidate selection accuracy.

    Parameters:
    - df: DataFrame of candidates, forwarded to compute_selection_accuracy_signal
    - **kwargs: keyword arguments forwarded unchanged to
      compute_selection_accuracy_signal (e.g. rank_column, selection_variable,
      pick_max)

    Returns:
    - None; the report is written to standard output.
    """
    accuracy, total, correct = compute_selection_accuracy_signal(df, **kwargs)

    # Describe the selection method; rank_column wins when both are given,
    # mirroring the precedence inside compute_selection_accuracy_signal.
    method_desc = ""
    if kwargs.get("rank_column"):
        method_desc = f"using rank column '{kwargs['rank_column']}'"
    elif kwargs.get("selection_variable"):
        direction = "maximum" if kwargs.get("pick_max", True) else "minimum"
        method_desc = f"using {direction} of variable '{kwargs['selection_variable']}'"

    # The icons below were stored as mis-decoded UTF-8 (mojibake); the intended
    # characters are restored here.
    print(f"\n🔎 Selection Accuracy Report ({method_desc}):")
    print("--------------------------------------------")
    print(f"  ✔️  Correct Picks       : {correct}")
    print(f"  📦  Signal Events Total : {total}")
    print(f"  📈  Selection Accuracy  : {accuracy:.4f} ({accuracy*100:.2f}%)\n")


In [None]:
# Selection accuracy on the signal sample, choosing the candidate whose
# 'Ds_Electron_p_Ds_rank' equals 1.
# Other rank columns that can be used instead:
#   'Ds_chiProb_Ds_rank', 'Ds_Electron_p_Ds_rank', 'Ds_random_Ds_rank'
# or pass selection_variable='Ds_chiProb' in place of rank_column.
accuracy_settings = dict(
    is_signal_column='Ds_ifNANgiveX_isSignal_0',
    event_column='__event__',
    rank_column='Ds_Electron_p_Ds_rank',
    pick_max=True,
    n_candidate_column='__ncandidates__',  # Optional helper
)
report_selection_accuracy(df=DataFrames["Signal"], **accuracy_settings)
