In [None]:
# !pip uninstall -y scikit-learn
# !pip install scikit-learn==1.3.1

In [None]:
# ! pip install --upgrade pip
# ! pip install torch
# ! pip install hiddenlayer
# ! pip install --user bayesian-optimization

In [None]:
# import mplhep
import sys

import seaborn as sns
import pydot

import numpy as np
import pandas as pd
import uproot
from matplotlib import pyplot as plt

from sklearn.datasets import make_classification,make_regression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import auc,roc_curve,confusion_matrix,classification_report,precision_recall_curve,mean_squared_error,accuracy_score,roc_auc_score
from sklearn.model_selection import GridSearchCV, cross_validate, validation_curve,train_test_split,KFold,learning_curve,cross_val_score
from scipy.stats import ks_2samp
from sklearn.preprocessing import StandardScaler

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
from sklearn.utils import class_weight
import numpy as np

In [None]:
# Global matplotlib font sizes shared by the figures below.
for rc_key, rc_value in (
    ("axes.labelsize", 16),
    ("xtick.labelsize", 12),
    ("ytick.labelsize", 12),
    ("legend.fontsize", 12),
    ("figure.titlesize", 18),
):
    plt.rcParams[rc_key] = rc_value

In [None]:
# Make the analysis helper directory importable from this notebook.
# NOTE(review): this is an absolute, user-specific path, and no module from it
# is imported in the cells visible here — confirm it is still required.
sys.path.append("/home/belle2/amubarak/Ds2D0enue_Analysis/07-Python_Functions/")

# Prep-Work

### Import Data

In [None]:
import os
import uproot
import pandas as pd

# --- Branch list: one name per line; blank lines and '#' comments are skipped,
# --- and surrounding commas / quotes are stripped from each entry.
variables_to_load = []
with open("/home/belle2/amubarak/Ds2D0enue_Analysis/03-Grid/Save_var.txt") as f:
    for raw_line in f:
        entry = raw_line.strip()
        if not entry or entry.startswith("#"):
            continue
        variables_to_load.append(entry.strip(",").strip('"').strip("'"))

# The BDT output branch is always loaded, even if the list file omits it.
if "Ds_FakeD0BDT" not in variables_to_load:
    variables_to_load += ["Ds_FakeD0BDT"]

# --- Samples ---
samples = ["Signal", "BB", "ccbar", "ddbar", "ssbar", "taupair", "uubar"]
GenEvents = list(samples)

# --- Input configuration ---
Date = "0530"
Attempt = "0"
input_dir = "/group/belle2/users2022/amubarak/03-ML/FakeD0/"

# --- Read the 'Dstree' tree of every sample into a pandas DataFrame ---
DataFrames = {}
for s in samples:
    # The signal file has a fixed name; generic files are keyed by date/attempt/sample.
    file_name = (
        "Ds2D0enu-Signal_withBDT.root"
        if s == "Signal"
        else f"Ds2D0e-Generic_Ds_{Date}25_{Attempt}_{s}_withBDT.root"
    )
    file_path = os.path.join(input_dir, file_name)

    print(f"Loading: {file_path}")
    DataFrames[s] = uproot.concatenate(
        f"{file_path}:Dstree", filter_name=variables_to_load, library="pd"
    )

# --- Combined background: every sample except the signal ---
background_samples = [name for name in samples if name != "Signal"]
DataFrames["All"] = pd.concat(
    [DataFrames[name] for name in background_samples], ignore_index=True
)

# --- Light-quark (uds) continuum merged into one frame for convenience ---
DataFrames["uds"] = pd.concat(
    [DataFrames[name] for name in ("uubar", "ddbar", "ssbar")], ignore_index=True
)

In [None]:
# Effectively remove pandas' row/column truncation when displaying frames.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, 200000)

The cell below lists the variables (columns) available in the combined background sample.

In [None]:
# Column names of the combined background frame, as a plain list.
list(DataFrames["All"].columns)

### Setup
The cell below holds the optional selection cuts that can be applied to the data; they are all currently commented out.  
The plot ranges are set in the individual plotting cells.

In [None]:
# Electron ID
#-------------------
# DataFrames["Signal"] = DataFrames["Signal"][DataFrames["Signal"]['e_electronID']>=0.95]
# DataFrames["ccbar"] = DataFrames["ccbar"][DataFrames["ccbar"]['e_electronID']>=0.95]
# DataFrames["Signal"] = DataFrames["Signal"][DataFrames["Signal"]['Ds_gammaveto_em_electronID']>=0.95]
# DataFrames["ccbar"] = DataFrames["ccbar"][DataFrames["ccbar"]['Ds_gammaveto_em_electronID']>=0.95]

# Photon Conversion
#-------------------
# DataFrames[samples[0]] = DataFrames[samples[0]][DataFrames[samples[0]]['Ds_gammaveto_M_Correction']>=0.1]
# DataFrames[samples[1]] = DataFrames[samples[1]][DataFrames[samples[1]]['Ds_gammaveto_M_Correction']>=0.1]

# Peaking Background Removal
#----------------------------
# DataFrames["ccbar"] = DataFrames["ccbar"][(DataFrames["ccbar"]['Ds_diff_D0pi']>=0.15)]
# DataFrames["Signal"] = DataFrames["Signal"][(DataFrames["Signal"]['Ds_diff_D0pi']>=0.15)]

# # Vertex Fitting
# #----------------
# DataFrames["Signal"] = DataFrames["Signal"][DataFrames["Signal"]['Ds_chiProb']>=0.01]
# DataFrames["ccbar"] = DataFrames["ccbar"][DataFrames["ccbar"]['Ds_chiProb']>=0.01]

# Dalitz Removal
#----------------------------
# DataFrames["ccbar"] = DataFrames["ccbar"][(DataFrames["ccbar"]['Ds_pi0veto_M_Correction']<=0.08) | (DataFrames["ccbar"]['Ds_pi0veto_M_Correction']>=0.16)]
# DataFrames["Signal"] = DataFrames["Signal"][(DataFrames["Signal"]['Ds_pi0veto_M_Correction']<=0.08) | (DataFrames["Signal"]['Ds_pi0veto_M_Correction']>=0.16)]

# Vertex Fit
#----------------
# DataFrames[samples[0]] = DataFrames[samples[0]][DataFrames[samples[0]]['Ds_chiProb_rank']==1]
# DataFrames[samples[1]] = DataFrames[samples[1]][DataFrames[samples[1]]['Ds_chiProb_rank']==1]

# D0 Invariant Mass
#-----------------------
# DataFrames[samples[0]] = DataFrames[samples[0]][(DataFrames[samples[0]]['Ds_D0_sideband']==1)]
# DataFrames[samples[1]] = DataFrames[samples[1]][(DataFrames[samples[1]]['Ds_D0_sideband']==1)]

# $D^{*+}$ Suppression

In [None]:
# --- Histogram configuration ---
Stacked, Density = True, False
Bins = 50
Range = [0.0, 0.25]
perBin = ((Range[1] - Range[0]) / Bins) * 1000
print(f"Width Per Bin: {perBin:.2f} MeV")

# --- D*+ veto window on 'Ds_diff_D0pi' (events inside are removed) ---
cut_low = 0.142
cut_high = 0.15

# Variable shown after the veto
var = 'Ds_massDifference_0'

# Legend entries, in the same order as the samples collected below
labels = [
    r'$c \bar{c}$',
    r'$u \bar{u}, \; d \bar{d}, \;s \bar{s}$',
    r'$BB$',
    r'$\tau^{+} \tau^{-}$'
]

# --- Keep only events outside the veto window, sample by sample ---
sideband_query = "Ds_diff_D0pi <= @cut_low or Ds_diff_D0pi >= @cut_high"
data = []
for sample_key in ("ccbar", "uds", "BB", "taupair"):
    data.append(DataFrames[sample_key].query(sideband_query)[var])

# --- Stacked step histogram; lists are reversed so ccbar is stacked last ---
plt.hist(
    list(reversed(data)),
    label=list(reversed(labels)),
    bins=Bins,
    range=Range,
    density=Density,
    stacked=Stacked,
    histtype='step',
    linewidth=2,
)

# --- Titles ---
decay_title = r'$D_s^{+} \rightarrow [D^{0} \rightarrow K^{-} \pi^{+}] e^{+} \nu_{e}$'
veto_title = r'$\Delta m_{\pi}(D_s^{+} - D^{0}) \notin [0.142,\; 0.15] \; \mathrm{GeV}/c^{2}$'
plt.title('\n'.join([decay_title, veto_title]), loc="left")
plt.title(r'$\int\mathcal{L}dt\approx\;1444$ fb$^{-1}$', loc="right")

# --- Axis labels ---
plt.xlabel(r'$\Delta m_{e}(D_s^{+} - D^{0})\;[GeV/c^{2}]$')
plt.ylabel(rf'$Entries/(\;{perBin:.2f}\;MeV/c^2)$')
plt.legend()
plt.show()

In [None]:
# --- Histogram configuration ---
Stacked, Density = True, False
Bins = 50
Range = [0.0, 0.25]
perBin = ((Range[1] - Range[0]) / Bins) * 1000
print(f"Width Per Bin: {perBin:.2f} MeV")

# --- Input frame (no veto applied in this cell) and column names ---
df_cut = DataFrames["All"]
cut_var = "Ds_diff_D0pi"
plot_var = 'Ds_massDifference_0'
pdg_var = 'Ds_mcPDG'

# --- Split by the absolute value of the truth PDG code stored in Ds_mcPDG ---
abs_pdg = df_cut[pdg_var].abs()
is_dstar_plus = abs_pdg == 413
is_dstar_zero = abs_pdg == 423
dstar_plus = df_cut.loc[is_dstar_plus, plot_var]
dstar_zero = df_cut.loc[is_dstar_zero, plot_var]
other = df_cut.loc[~(is_dstar_plus | is_dstar_zero), plot_var]

# --- Stacked step histogram of the three categories ---
plt.hist(
    [other, dstar_zero, dstar_plus],
    label=["Other", r"$D^{*0}$", r"$D^{*+}$"],
    color=["#2E2E2E", "#4C6EB1", "#007C91"],
    bins=Bins,
    range=Range,
    density=Density,
    stacked=Stacked,
    histtype='step',
    linewidth=2,
)

# --- Titles and axis labels ---
plt.title(r'$D_s^{+} \rightarrow [D^{0} \rightarrow K^{-} \pi^{+}] e^{+} \nu_{e}$', loc="left")
plt.title(r'$\int\mathcal{L}dt\approx\;1444$ fb$^{-1}$', loc="right")
plt.xlabel(r'$\Delta m_{e}(D_s^{+} - D^{0})\;[GeV/c^{2}]$')
plt.ylabel(rf'$Entries/(\;{perBin:.2f}\;MeV/c^2)$')
plt.legend()
plt.show()

In [None]:
# --- Histogram configuration ---
Stacked, Density = True, False
Bins = 50
Range = [0.0, 0.25]
perBin = ((Range[1] - Range[0]) / Bins) * 1000
print(f"Width Per Bin: {perBin:.2f} MeV")

# --- Input frame and column names ---
df = DataFrames["All"]
cut_var = "Ds_diff_D0pi"
plot_var = 'Ds_massDifference_0'
pdg_var = 'Ds_mcPDG'

# --- D*+ veto: drop events with cut_low < cut_var < cut_high ---
cut_low = 0.142
cut_high = 0.15
veto_expr = f"{cut_var} <= @cut_low or {cut_var} >= @cut_high"
df_cut = df.query(veto_expr)

# --- Split by the absolute value of the truth PDG code stored in Ds_mcPDG ---
abs_pdg = df_cut[pdg_var].abs()
is_dstar_plus = abs_pdg == 413
is_dstar_zero = abs_pdg == 423
dstar_plus = df_cut.loc[is_dstar_plus, plot_var]
dstar_zero = df_cut.loc[is_dstar_zero, plot_var]
other = df_cut.loc[~(is_dstar_plus | is_dstar_zero), plot_var]

# --- Stacked step histogram of the three categories ---
plt.hist(
    [other, dstar_zero, dstar_plus],
    label=["Other", r"$D^{*0}$", r"$D^{*+}$"],
    color=["#2E2E2E", "#4C6EB1", "#007C91"],
    bins=Bins,
    range=Range,
    density=Density,
    stacked=Stacked,
    histtype='step',
    linewidth=2,
)

# --- Titles and axis labels ---
decay_title = r'$D_s^{+} \rightarrow [D^{0} \rightarrow K^{-} \pi^{+}] e^{+} \nu_{e}$'
veto_title = r'$\Delta m_{\pi}(D_s^{+} - D^{0}) \notin [0.142,\; 0.15] \; \mathrm{GeV}/c^{2}$'
plt.title('\n'.join([decay_title, veto_title]), loc="left")
plt.title(r'$\int\mathcal{L}dt\approx\;1444$ fb$^{-1}$', loc="right")
plt.xlabel(r'$\Delta m_{e}(D_s^{+} - D^{0})\;[GeV/c^{2}]$')
plt.ylabel(rf'$Entries/(\;{perBin:.2f}\;MeV/c^2)$')
plt.legend()
plt.show()


In [None]:
# --- Histogram configuration ---
Stacked, Density = True, False
Bins = 50
Range = [0.0, 0.25]
perBin = ((Range[1] - Range[0]) / Bins) * 1000
print(f"Width Per Bin: {perBin:.2f} MeV")

# --- Input frame and column names ---
df = DataFrames["All"]
cut_var = "Ds_diff_D0pi"
plot_var = 'Ds_massDifference_0'
pdg_var = 'Ds_mcPDG'

# --- D*+ veto: drop events with cut_low < cut_var < cut_high ---
cut_low = 0.142
cut_high = 0.15
veto_expr = f"{cut_var} <= @cut_low or {cut_var} >= @cut_high"
df_cut = df.query(veto_expr)

# --- Split by the absolute value of the truth PDG code stored in Ds_mcPDG ---
abs_pdg = df_cut[pdg_var].abs()
is_dstar_plus = abs_pdg == 413
is_dstar_zero = abs_pdg == 423
dstar_plus = df_cut.loc[is_dstar_plus, plot_var]
dstar_zero = df_cut.loc[is_dstar_zero, plot_var]
other = df_cut.loc[~(is_dstar_plus | is_dstar_zero), plot_var]

# --- Same plot as the previous cell, but with D*+ stacked in the middle ---
plt.hist(
    [other, dstar_plus, dstar_zero],
    label=["Other", r"$D^{*+}$", r"$D^{*0}$"],
    color=["#2E2E2E", "#007C91", "#4C6EB1"],
    bins=Bins,
    range=Range,
    density=Density,
    stacked=Stacked,
    histtype='step',
    linewidth=2,
)

# --- Titles and axis labels ---
decay_title = r'$D_s^{+} \rightarrow [D^{0} \rightarrow K^{-} \pi^{+}] e^{+} \nu_{e}$'
veto_title = r'$\Delta m_{\pi}(D_s^{+} - D^{0}) \notin [0.142,\; 0.15] \; \mathrm{GeV}/c^{2}$'
plt.title('\n'.join([decay_title, veto_title]), loc="left")
plt.title(r'$\int\mathcal{L}dt\approx\;1444$ fb$^{-1}$', loc="right")
plt.xlabel(r'$\Delta m_{e}(D_s^{+} - D^{0})\;[GeV/c^{2}]$')
plt.ylabel(rf'$Entries/(\;{perBin:.2f}\;MeV/c^2)$')
plt.legend()
plt.show()

In [None]:
def plot_2d_with_marginals(df, xvar, yvar, xrange=None, yrange=None, bins=50, title=None,
                           xlabel=None, ylabel=None):
    """Draw a 2D histogram of two columns together with their 1D marginals.

    The figure has a central 2D histogram, a histogram of ``xvar`` above it,
    a horizontal histogram of ``yvar`` to its right, and a colour bar in the
    top-right corner. The figure is shown with ``plt.show()``; nothing is
    returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both columns. Its index may be arbitrary (e.g. a
        filtered frame); rows are selected positionally.
    xvar, yvar : str
        Column names plotted on the x and y axes.
    xrange, yrange : (low, high) or None
        Inclusive bounds applied to the respective column before plotting.
        ``None`` applies no cut on that column.
    bins : int
        Number of bins, used for the 2D histogram and both marginals.
    title : str or None
        Optional figure-level title.
    xlabel, ylabel : str or None
        Axis labels of the central panel. When ``None`` the original
        hard-coded labels (electron- and pion-hypothesis mass differences)
        are used, so existing calls render exactly as before.
    """
    import matplotlib.pyplot as plt
    import matplotlib.gridspec as gridspec
    import numpy as np
    from matplotlib.ticker import FuncFormatter

    # Extract variables
    x = df[xvar]
    y = df[yvar]

    # Range cuts. The mask is a plain positional boolean array: a Series built
    # on a fresh RangeIndex would be index-aligned against x/y and break (or
    # silently mis-select) whenever df carries a filtered index.
    keep = np.ones(len(df), dtype=bool)
    if xrange is not None:
        keep &= ((x >= xrange[0]) & (x <= xrange[1])).to_numpy()
    if yrange is not None:
        keep &= ((y >= yrange[0]) & (y <= yrange[1])).to_numpy()
    x = x[keep]
    y = y[keep]

    # === Layout: 6x6 grid, main panel bottom-left, marginals on top/right ===
    fig = plt.figure(figsize=(7, 7))
    gs = gridspec.GridSpec(6, 6, hspace=0.05, wspace=0.05)

    ax_main = fig.add_subplot(gs[2:6, 0:4])
    ax_xhist = fig.add_subplot(gs[0:2, 0:4], sharex=ax_main)
    ax_yhist = fig.add_subplot(gs[2:6, 4:6], sharey=ax_main)
    ax_cbar = fig.add_subplot(gs[0:2, 4:6])

    # hist2d needs both ranges or none; with only one given it auto-ranges
    # on the (already cut) data.
    hist_range = [xrange, yrange] if (xrange is not None and yrange is not None) else None

    # 2D histogram
    counts, xedges, yedges, im = ax_main.hist2d(x, y, bins=bins, range=hist_range, cmap="viridis")
    cbar = fig.colorbar(im, cax=ax_cbar)
    cbar.set_label("Entries")

    # Marginal histograms
    ax_xhist.hist(x, bins=bins, range=xrange, color="#2E2E2E", histtype='step', linewidth=1.5)
    ax_yhist.hist(y, bins=bins, range=yrange, orientation="horizontal", color="#007C91",
                  histtype='step', linewidth=1.5)

    # Labels (defaults reproduce the original hard-coded text)
    if xlabel is None:
        xlabel = r'$\Delta m_{e}(D_s^{+} - D^{0})\;[GeV/c^{2}]$'
    if ylabel is None:
        ylabel = r'$\Delta m_{\pi}(D_s^{+} - D^{0})\;[GeV/c^{2}]$'
    ax_main.set_xlabel(xlabel)
    ax_main.set_ylabel(ylabel)
    ax_xhist.set_ylabel("Entries")
    ax_yhist.set_xlabel("Entries")

    # Hide the tick labels on the axes shared with the main panel
    plt.setp(ax_xhist.get_xticklabels(), visible=False)
    plt.setp(ax_yhist.get_yticklabels(), visible=False)

    # Integer tick labels on the side histogram, with the 0 label suppressed so
    # it does not collide with the main panel. A formatter (rather than fixed
    # label strings) keeps labels in sync with the ticks after layout changes.
    ax_yhist.xaxis.set_major_formatter(
        FuncFormatter(lambda tick, _pos: "" if np.isclose(tick, 0) else f"{int(tick)}")
    )

    # Title
    if title:
        fig.suptitle(title, fontsize=20)

    # Leave room at the top for the figure title
    plt.tight_layout(rect=[0, 0, 1, 0.96])
    plt.show()


In [None]:
# plot_2d_with_marginals(DataFrames["All"][(abs(DataFrames["All"]["Ds_mcPDG"])==423)],
#     xvar="Ds_massDifference_0",
#     yvar="Ds_diff_D0pi",
#     xrange=(0.0, 0.25),
#     yrange=(0.1, 0.55),
#     bins=60,
#     title=r"$\bf Generic \; Events$"
# )

In [None]:
# plot_2d_with_marginals(DataFrames["All"],
#     xvar="Ds_massDifference_0",
#     yvar="Ds_diff_D0pi",
#     xrange=(0.0, 0.25),
#     yrange=(0.1, 0.55),
#     bins=60,
#     title=r"$\bf Generic \; Events$"
# )

In [None]:
# --- Histogram configuration ---
Stacked, Density = True, True
Bins = 50
i = 'Ds_massDifference_0'   # alternative: 'Ds_diff_D0pi'
Range = [0.0,0.25]
dM = -1
BD = -1
perBin = ((Range[1] - Range[0])/Bins)*1000
print(f"Width Per Bin: {perBin:.2f} MeV")

# --- Legend entries for the three selections below ---
label1= r'$D^{*0} \rightarrow D^{0} [\gamma \rightarrow  e^{+} e^{-}]$'
label2= r'$D^{*0} \rightarrow D^{0} [\pi^0 \rightarrow \gamma [\gamma \rightarrow e^{+} e^{-}]]$'
label3= r'$D^{*0} \rightarrow D^{0} [\pi^0 \rightarrow \gamma  e^{+} e^{-}]$'
labels=[label1,label2,label3]

# --- Background selections: |Ds_mcPDG| == 423, split by the number of
# --- generator steps to daughter 1 and by the electron's mother PDG code.
bkg = DataFrames["All"]
is_dstar_zero = bkg['Ds_mcPDG'].abs() == 423
passes_dM = bkg['Ds_gammaveto_M_Correction'] >= dM
n_steps = bkg['Ds_genNStepsToDaughter_1'].abs()
e_mother = bkg['e_genMotherPDG'].abs()
data = [
    bkg[is_dstar_zero & (n_steps == steps) & (e_mother == mother) & passes_dM][i]
    for steps, mother in ((2, 22), (3, 22), (2, 111))
]

# --- Signal overlay (unit weights scaled by `factor`) ---
factor = 1
sig = DataFrames["Signal"]
sig_values = sig[(sig['Ds_gammaveto_M_Correction'] >= dM)][i]
plt.hist(
    sig_values,
    color=["#D62728"],
    label="Signal",
    histtype='step',
    density=Density,
    bins=Bins,
    alpha=1,
    range=Range,
    weights=factor*np.ones_like(sig_values),
    linewidth=1.5,
)
plt.hist(
    data,
    color=["#007C91", "#4C6EB1", "#2E2E2E"],
    label=labels,
    density=Density,
    stacked=Stacked,
    bins=Bins,
    alpha=1,
    histtype='step',
    linewidth=1.5,
    range=Range,
)

# --- Title and axis labels ---
plt.title(r'$Electron \; Assigned \; Electron \; Mass$', loc = "left")
plt.ylabel(rf'$Entries/(\; {perBin:.2f}\;MeV/c^2)$')
plt.xlabel(r'$\Delta m(D_s^{+} - D^{0})\;[GeV/c^{2}]$')
plt.legend()
plt.show()

In [None]:
# --- D*+ veto window on Ds_diff_D0pi ---
cut_low, cut_high = 0.142, 0.15
veto_query = "Ds_diff_D0pi <= @cut_low or Ds_diff_D0pi >= @cut_high"

# --- Events surviving the veto ---
df_vetoed = DataFrames["All"].query(veto_query)

# --- Of those, keep candidates with |Ds_mcPDG| == 413 (D*+) ---
df_dstarp = df_vetoed.loc[abs(df_vetoed["Ds_mcPDG"]) == 413]

# --- Tabulate the truth PDG codes of the electron and D0 and of their mothers ---
truth_columns = ["e_mcPDG", "e_genMotherPDG", "D0_mcPDG", "D0_genMotherPDG"]
truth_counts = df_dstarp[truth_columns].value_counts(dropna=False)
print(truth_counts)

In [None]:
# Compare the Ds_diff_D0pi shape of truth-matched D*+ candidates that fall
# inside vs. outside the veto window, and report how many land in each region.

# === Settings ===
cut_low = 0.142
cut_high = 0.15
Bins = 50
Range = (0.13, 0.2)
var = "Ds_diff_D0pi"  # same variable the veto is defined on

# === Select true D*+ events (|Ds_mcPDG| == 413)
df = DataFrames["All"]
df_dstarp = df[df["Ds_mcPDG"].abs() == 413]

# === Inside and outside veto region (based on pion-mass Δm)
# Window edges belong to the "outside" category, matching the veto query
# used elsewhere in this notebook (<= cut_low or >= cut_high is kept).
inside = df_dstarp[(df_dstarp[var] > cut_low) & (df_dstarp[var] < cut_high)]
outside = df_dstarp[(df_dstarp[var] <= cut_low) | (df_dstarp[var] >= cut_high)]

# === Print stats
total = len(df_dstarp)
n_inside = len(inside)
n_outside = len(outside)
print(f"Total D*⁺ events:      {total}")
print(f"Inside veto region:   {n_inside} ({n_inside/total:.2%})")
print(f"Outside veto region:  {n_outside} ({n_outside/total:.2%})")

# === Plot Ds_diff_D0pi
# Explicit bin edges so both histograms share identical binning.
bins = np.linspace(Range[0], Range[1], Bins + 1)

# plt.figure(figsize=(8, 5))
plt.hist(inside[var], bins=bins, histtype='step', linewidth=2, color="#D1495B", density=True, label="Inside veto region")
plt.hist(outside[var], bins=bins, histtype='step', linewidth=2, color="#007C91", density=True, label="Outside veto region")
plt.axvspan(cut_low, cut_high, color='gray', alpha=0.2, label="Veto Window")

plt.xlabel(r'$\Delta m_{\pi}(D_s^{+} - D^{0})\;[GeV/c^{2}]$')
# Both histograms are drawn with density=True, so the axis shows a normalised
# density rather than raw counts.
plt.ylabel("Normalized Entries")
plt.title("Δm(π) for True D*⁺ Events")
plt.legend()
# plt.tight_layout()
plt.show()

In [None]:
# --- Configuration ---
cut_low, cut_high = 0.142, 0.15
Bins = 50
Range = (0.0, 0.7)  # range of the plotted electron momentum, not of Δm
var = "Ds_diff_D0pi"

# --- Truth-matched D*+ candidates (|Ds_mcPDG| == 413) ---
df = DataFrames["All"]
df_dstarp = df[df["Ds_mcPDG"].abs() == 413]

# --- Split on the veto window; the edges count as "outside" ---
dm_values = df_dstarp[var]
inside = df_dstarp[dm_values.gt(cut_low) & dm_values.lt(cut_high)]
outside = df_dstarp[dm_values.le(cut_low) | dm_values.ge(cut_high)]

# --- Region populations ---
total = len(df_dstarp)
n_inside = len(inside)
n_outside = len(outside)
print(f"Total D*⁺ events:      {total}")
print(f"Inside veto region:   {n_inside} ({n_inside/total:.2%})")
print(f"Outside veto region:  {n_outside} ({n_outside/total:.2%})")

# --- Normalised electron-momentum (e_p) shapes with shared bin edges ---
bins = np.linspace(*Range, Bins + 1)
for region_frame, region_color, region_label in (
    (inside, "#D1495B", "Inside veto region"),
    (outside, "#007C91", "Outside veto region"),
):
    plt.hist(
        region_frame["e_p"],
        bins=bins,
        density=True,
        histtype='step',
        linewidth=2,
        color=region_color,
        label=region_label,
    )

plt.xlabel(r'$p_e \; [\mathrm{GeV}/c]$')
plt.ylabel("Normalized Entries")
plt.title("Electron Momentum for True D*⁺ Events")
plt.legend()
plt.tight_layout()
plt.show()

In [None]:
# # === Settings ===
# cut_low = 0.142
# cut_high = 0.15
# Bins = 50
# dm_var = "Ds_diff_D0pi"

# # === DataFrames ===
# df_sig = DataFrames["Signal"]
# df_all = DataFrames["All"]

# # === Restrict All to only D*+ truth-matched events
# df_dstarp = df_all[df_all["Ds_mcPDG"].abs() == 413]

# # === Define veto categories for D*+ only
# inside_mask = (df_dstarp[dm_var] > cut_low) & (df_dstarp[dm_var] < cut_high)
# outside_mask = ~inside_mask

# # === Variable list ===
# variables_to_plot = [
#     'e_chi2','e_ndf','e_trackTime','e_pionID','e_electronID','e_binaryPID_11_211','e_omega',
#     'e_daughter_0_isCloneTrack','e_flightTime','e_formula_E_p','e_mcVirtual','e_isMisidentified',
#     'e_ImpactXY','e_cos_theta','e_phi','e_mcP','e_M','e_pt','e_E','e_p','e_px','e_py','e_pz','e_abs_pz',
#     'e_isOrHasCloneTrack','e_charge','e_isCloneTrack','e_dr','e_dz','e_abs_dr','e_abs_dz','e_z0','e_d0',
#     'e_pValue','e_firstCDCLayer','e_firstPXDLayer','e_firstSVDLayer','e_nPXDHits','e_nVXDHits',
#     'e_nSVDHits','e_nCDCHits','e_inARICHAcceptance','e_inCDCAcceptance','e_inTOPAcceptance','e_mcE',
#     'Ds_M_uncorrected','Ds_M_pi','Ds_massDifference_0','Ds_diff_D0pi','Ds_diff_D0K','Ds_diff_D0e_noVF',
#     'Ds_diff_D0pi_noVF','Ds_diff_D0K_noVF','Ds_cos_theta_D0e_noVF','Ds_formula_diff_D0pi_massDifference_0',
#     'Ds_InvMLambda','Ds_Mode1Veto','Ds_Mode2Veto','Ds_DstarplusVeto','Ds_chiProb','Ds_useCMSFrame_p',
#     'Ds_useCMSFrame_E','Ds_phi_diff','Ds_mcM','Ds_mcM_D0e_emass','Ds_mcM_D0e_pimass',
#     'Ds_MminusMtrue_D0e_emass','Ds_MminusMtrue_D0e_pimass','Ds_decayAngle_0','Ds_cos_decayAngle_0',
#     'Ds_decayAngle_1','Ds_cos_decayAngle_1','Ds_pointingAngle_0','Ds_daughterDiffOfPhi_0_1',
#     'Ds_pointangle','Ds_daughterAngle_0_1','Ds_cos_daughterAngle_0_1','Ds_Angle_D0e','Ds_Angle_Ke',
#     'Ds_Angle_pie','Ds_useCMSFrame_daughterAngle_0_1','Ds_useRestFrame_daughterAngle_0_1',
#     'Ds_useDaughterRestFrame_daughterAngle_0_1_0_1','Ds_useDaughterRestFrame_daughterAngle_0_1_0_1_1',
#     'Ds_psi','Ds_azimuthalAngleInDecayPlane_0_1','Ds_daughterDiffOf_0_1_cos_theta',
#     'Ds_cosAngleBetweenMomentumAndVertexVector','Ds_cosAngleBetweenMomentumAndVertexVectorInXYPlane',
#     'Ds_useRestFrame_daughterDiffOf_0_1_p','Ds_useRestFrame_daughterMotherDiffOf_0_p',
#     'Ds_flightDistance','Ds_distance','Ds_daughterDiffOf_0_1_p','Ds_daughterDiffOf_0_1_E',
#     'Ds_daughterDiffOf_0_1_px','Ds_daughterDiffOf_0_1_py','Ds_daughterDiffOf_0_1_pz',
#     'Ds_daughterDiffOf_0_1_x','Ds_daughterDiffOf_0_1_y','Ds_daughterDiffOf_0_1_z',
#     'Ds_daughterMotherDiffOf_0_p','Ds_daughterMotherDiffOf_0_E','Ds_daughterMotherDiffOf_0_px',
#     'Ds_daughterMotherDiffOf_0_py','Ds_daughterMotherDiffOf_0_pz','Ds_daughterMotherDiffOf_0_x',
#     'Ds_daughterMotherDiffOf_0_y','Ds_daughterMotherDiffOf_0_z','Ds_abs_daughterMotherDiffOf_0_distance',
#     'Ds_L_diff','Ds_daughterMotherDiffOf_0_flightDistance','Ds_daughterMotherDiffOf_0_vertexDistance',
#     'Ds_flightDistanceOfDaughter_0','Ds_gammaveto_M','Ds_gammaveto_M_Correction',
#     'Ds_Dstar0Mode1_M','Ds_Dstar0Mode1_M_Correction','Ds_Dstar0Mode2_M_Correction',
#     'Ds_Dstarplus_massDifference_Correction','Ds_Dstarplus_M_Correction','Ds_Ds_starminusDs',
#     'Ds_Ds_starminusDs_M_Correction','Ds_goodDsplus','Ds_ImpactXY','Ds_cos_theta','Ds_phi','Ds_mcP',
#     'Ds_M','Ds_pt','Ds_E','Ds_p','Ds_px','Ds_py','Ds_pz','Ds_abs_pz','Ds_isOrHasCloneTrack',
#     'Ds_charge','Ds_mcE','D0_charged_product','D0_MomentumAsymmetry','D0_useCMSFrame_E',
#     'D0_useCMSFrame_p','D0_dM','D0_useAlternativeDaughterHypothesis_M_1_K','D0_chiProb',
#     'D0_cos_theta','D0_daughterAngle_0_1','D0_daughterDiffOf_0_1_theta','D0_daughterDiffOf_0_1_cos_theta',
#     'D0_daughterDiffOf_0_1_phi','D0_daughterMotherDiffOf_0_theta','D0_daughterMotherDiffOf_0_cos_theta',
#     'D0_daughterMotherDiffOf_0_phi','D0_decayAngle_0','D0_cos_decayAngle_0','D0_decayAngle_1',
#     'D0_cos_decayAngle_1','D0_significanceOfDistance','D0_flightDistance','D0_useRestFrame_daughterAngle_0_1',
#     'D0_formula_daughter_0_dr_daughter_1_dr','D0_formula_daughter_0_dz_daughter_1_dz','D0_ImpactXY',
#     'D0_phi','D0_mcP','D0_M','D0_pt','D0_E','D0_p','D0_px','D0_py','D0_pz','D0_abs_pz'
# ]

# # === Loop and plot ===
# for var in variables_to_plot:
#     if var not in df_sig.columns or var not in df_dstarp.columns:
#         continue
#     if not np.issubdtype(df_sig[var].dtype, np.number) or np.issubdtype(df_sig[var].dtype, np.bool_):
#         continue

#     # Clean values
#     sig_vals = df_sig[var].replace([np.inf, -np.inf], np.nan).dropna()
#     in_vals = df_dstarp.loc[inside_mask, var].replace([np.inf, -np.inf], np.nan).dropna()
#     out_vals = df_dstarp.loc[outside_mask, var].replace([np.inf, -np.inf], np.nan).dropna()

#     if len(sig_vals) < 10 or len(in_vals) < 10 or len(out_vals) < 10:
#         continue

#     # Smart range
#     all_vals = np.concatenate([sig_vals, in_vals, out_vals])
#     low = np.percentile(all_vals, 1)
#     high = np.percentile(all_vals, 99)
# #     bins = np.linspace(low, high, Bins)
#     bins = np.linspace(-0.4, 0.4, Bins)

#     # Plot
#     plt.clf()
#     plt.hist(sig_vals, bins=bins, histtype='step', label="Signal", density=True, linewidth=2)
#     plt.hist(in_vals, bins=bins, histtype='step', label="Inside veto (D*⁺)", density=True, linewidth=2)
#     plt.hist(out_vals, bins=bins, histtype='step', label="Outside veto (D*⁺)", density=True, linewidth=2)
#     plt.xlabel(var)
#     plt.ylabel("Normalized Entries")
#     plt.title(f"Comparison: {var}")
#     plt.legend()
#     plt.tight_layout()
#     plt.show()

## $D^{*+}$ BDT Cut

In [None]:
# cut_low = 0.142
# cut_high = 0.15

# # Apply sideband cut to 'All' sample
# DataFrames["All"] = DataFrames["All"][
#     (DataFrames["All"]["Ds_diff_D0pi"] <= cut_low) | (DataFrames["All"]["Ds_diff_D0pi"] >= cut_high)
# ]

# # Apply sideband cut to each generator-level sample
# for s in GenEvents:
#     DataFrames[s] = DataFrames[s][
#         (DataFrames[s]["Ds_diff_D0pi"] <= cut_low) | (DataFrames[s]["Ds_diff_D0pi"] >= cut_high)
#     ]

In [None]:
# Replace missing D0_isSignal flags with 0 in the combined background frame
# and then in every per-sample frame.
for s in ["All", *GenEvents]:
    frame = DataFrames[s]
    frame["D0_isSignal"] = frame["D0_isSignal"].replace(np.nan, 0)

In [None]:
# Replace missing Ds_isSignal flags with 0 in the combined background frame
# and then in every per-sample frame.
for s in ["All", *GenEvents]:
    frame = DataFrames[s]
    frame["Ds_isSignal"] = frame["Ds_isSignal"].replace(np.nan, 0)

## Fake $D^0$ BDT Cut

In [None]:
# DataFrames["All"] = DataFrames["All"][(DataFrames["All"]["Ds_FakeD0BDT"]>=0.556)]

# for s in GenEvents[0:]: # loop over samples
#     DataFrames[s] = DataFrames[s][(DataFrames[s]["Ds_FakeD0BDT"]>=0.556)]

# Background Suppression

In [None]:
# Per-column count of missing values in the first sample of `samples`.
DataFrames[samples[0]].isnull().sum()

In [None]:
# Candidate counts for the signal sample and for the combined background.
# The background count uses DataFrames["All"] (the concatenation of every
# background sample); samples[1] is only the "BB" sample and would
# under-report the background.
print("Signal Number: ",len(DataFrames[samples[0]]))
print("Background Number: ",len(DataFrames["All"]))

## Variable Comparison

In [None]:
# Reset matplotlib to its default style, then apply the font sizes used by the
# comparison plots that follow.
plt.style.use('default')
for rc_key, rc_value in (
    ("axes.labelsize", 14),
    ("xtick.labelsize", 12),
    ("ytick.labelsize", 12),
    ("legend.fontsize", 12),
    ("figure.titlesize", 16),
):
    plt.rcParams[rc_key] = rc_value

In [None]:
# --- Histogram configuration ---
Stacked, Density = False, True
Bins = 50
Range = [0, 0.4]
Op = -1
dM = -1
# Variable to compare. Other candidates tried here:
#   "Ds_gammaveto_M_Correction", 'Ds_extraInfo_FakeD0BDT', 'Ds_chiProb'
i = 'Ds_Ds_starminusDs_M_Correction'
perBin = ((Range[1] - Range[0])/Bins)*1000
print(f"Width Per Bin: {perBin:.2f} MeV")

# --- Legend entries and colours (signal first, background second) ---
label1= r'$Signal$'
label2= r'$Background$'
labels=[label1,label2]
colors=["#1f77b4","#d62728"]

# --- Shape comparison: signal sample vs. combined background ---
data = [DataFrames[sample_key][i] for sample_key in ("Signal", "All")]
plt.hist(
    data,
    color=colors,
    label=labels,
    alpha=1,
    range=Range,
    stacked=Stacked,
    density=Density,
    linewidth=2,
    bins=Bins,
    histtype='step',
)

# --- Axis labels (match the variable selected in `i`) ---
plt.ylabel(rf'$Entries/(\; {perBin:.2f}\;MeV/c^2)$')
plt.xlabel(r'$\Delta m(D_s^{*+} - D_{s}^{+})\;[GeV/c^{2}]$')
plt.legend()
plt.show()

## Variable Correlation

In [None]:
# Training inputs, kept as (branch name, plot label) pairs so the two lists
# below cannot drift out of step.
# Currently disabled candidates: "Ds_FakeD0BDT" (Fake D0 suppression), 'e_omega'.
_feature_table = [
    ('Ds_chiProb', r'$p-value_{IP}(D_{s}^{+})$'),
    ('Ds_gammaveto_M_Correction', r'$m(e_{sig}^{+} e_{ROE}^{-})$'),
    ('Ds_Ds_starminusDs_M_Correction', r'$\Delta m(D_s^{*+} - D_{s}^{+})$'),
]

# Branch names fed to the model
Variables = [branch for branch, _ in _feature_table]

# Matching LaTeX labels
features = [latex_label for _, latex_label in _feature_table]

In [None]:
# Correlation matrix of the training variables in the signal sample.
signal_corr = DataFrames["Signal"][Variables].corr()

plt.figure(figsize=(18, 15))
heatmap = sns.heatmap(signal_corr, vmin=-1, vmax=1, cmap="coolwarm", annot=True)
heatmap.set_title('Signal Correlation Heatmap', fontdict={'fontsize':20}, pad=16)

In [None]:
# Correlation matrix of the training variables in the combined background.
background_corr = DataFrames["All"][Variables].corr()

plt.figure(figsize=(18, 15))
heatmap = sns.heatmap(background_corr, vmin=-1, vmax=1, cmap="coolwarm", annot=True)
heatmap.set_title('Background Correlation Heatmap', fontdict={'fontsize':20}, pad=16)

## Data Preprocessing

In [None]:
# #  Organise data ready for the machine learning model

# # for sklearn data are usually organised
# # into one 2D array of shape (n_samples x n_features)
# # containing all the data and one array of categories
# # of length n_samples

# all_MC = []  # define empty list that will contain all features for the MC
# for s in GenEvents:  # loop over the different samples
#     if s != "data":  # only MC should pass this
#         all_MC.append(
#             DataFrames[s][Variables]
#         )  # append the MC dataframe to the list containing all MC features
# X = np.concatenate(
#     all_MC
# )  # concatenate the list of MC dataframes into a single 2D array of features, called X

# all_y = (
#     []
# )  # define empty list that will contain labels whether an event in signal or background
# for s in GenEvents:  # loop over the different samples
#     if s != "data":  # only MC should pass this
#         if "Signal" in s:  # only signal MC should pass this
#             all_y.append(
#                 np.ones(DataFrames[s].shape[0], dtype=np.int32)
#             )  # signal events are labelled with 1
#         else:  # only background MC should pass this
#             all_y.append(
#                 np.zeros(DataFrames[s].shape[0], dtype=np.int32)
#             )  # background events are labelled 0
# y = np.concatenate(
#     all_y
# )  # concatenate the list of labels into a single 1D array of labels, called y

In [None]:
# === Assemble the feature matrix X and label vector y for training ===

all_MC, all_y = [], []   # per-sample feature blocks and their label arrays

for s in GenEvents:
    if s == "data":
        continue  # real data never enters the training set

    df = DataFrames[s]

    if "Signal" not in s:
        # Background MC: every candidate is labelled 0
        all_MC.append(df[Variables])
        all_y.append(np.zeros(len(df), dtype=np.int32))
        continue

    # Signal MC: keep only candidates flagged Ds_isSignal == 1, labelled 1
    true_signal = df.loc[df["Ds_isSignal"] == 1]
    all_MC.append(true_signal[Variables])
    all_y.append(np.ones(len(true_signal), dtype=np.int32))

# Stack the per-sample blocks into the final arrays
X = np.concatenate(all_MC)
y = np.concatenate(all_y)

# Hold-out split: 70% train / 30% test with a fixed seed
# (stratification on y is available but currently disabled: stratify=y)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42
)

In [None]:
# Standardisation of the input features: the scaler is created here and fitted
# below; the next cell applies it to the train, test and full arrays.
scaler = StandardScaler()  # initialise StandardScaler

# Fit only to the training data, so the test set does not influence the
# scaling parameters. As the last expression of the cell, the fitted scaler is
# also what the cell displays.
scaler.fit(X_train)

In [None]:
# Apply the scaler (fitted on the training split) to the train, test and full arrays.
X_train_scaled, X_test_scaled, X_scaled = (
    scaler.transform(features_block) for features_block in (X_train, X_test, X)
)

## Model Training

Here is the step where we train the model.  There are a few things that need to be explained.  

There are a couple of things implemented below called "callbacks".  Callbacks allow you to control how the learning is performed.  There are two callbacks used here.

1. `ReduceLROnPlateau` is a callback that reduces the learning rate when the model training does not find an improvement in (in this case) the validation loss after 5 epochs.  It reduces the learning rate by a factor of 0.2, down to a minimum of 0.00001.  This can help with problems like getting stuck in a local minimum.

2. `EarlyStopping` stops the training when no improvement has been seen for a number of epochs that you set (`patience`).  This helps reduce the chance of overfitting.  A rule of thumb is to set the patience to 10\% of the total number of epochs, though this really depends on you and your project.

Callbacks are optional and you can remove or alter them here as you like.

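There is also the optional `validation_split` argument in the fit (train) function.  What this does is hold out a user-defined random portion of the training data at each epoch to perform a self-evaluation.

**Note:** the PyTorch training loop below implements neither of these callbacks nor a validation split; it runs a fixed number of epochs over the full training set. They would have to be added by hand (for example with `torch.optim.lr_scheduler.ReduceLROnPlateau` and a manual early-stopping check on a held-out set).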

In [None]:
# Convert the scaled training features and labels to float32 tensors.
# Labels get a trailing dimension, (N,) -> (N, 1), to match the model's
# single-logit output in the loss.
X_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
y_tensor = torch.tensor(y_train, dtype=torch.float32).unsqueeze(1)

# Mini-batch loader over the training set, reshuffled every epoch.
dataset = TensorDataset(X_tensor, y_tensor)
dataloader = DataLoader(dataset, batch_size=64, shuffle=True)

# Class-imbalance handling.
# 'balanced' weights are n_samples / (n_classes * count_k), returned in the
# sorted order of np.unique(y_train): weights[0] -> class 0 (background),
# weights[1] -> class 1 (signal).
weights = class_weight.compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y_train)

# BCEWithLogitsLoss expects pos_weight = n_negative / n_positive, which equals
# weights[1] / weights[0]. The inverse ratio would down-weight the positive
# class whenever it is the minority, amplifying the imbalance.
pos_weight = torch.tensor([weights[1] / weights[0]], dtype=torch.float32)

In [None]:
class BinaryClassifier(nn.Module):
    """Small fully connected network: input_dim -> 8 -> 4 -> 1.

    The two hidden layers use ReLU activations. The output is a single raw
    logit per row; no sigmoid is applied here because the loss used for
    training (BCEWithLogitsLoss) works on logits directly.
    """

    def __init__(self, input_dim):
        super().__init__()
        first_width, second_width = 8, 4
        self.fc1 = nn.Linear(input_dim, first_width)
        self.fc2 = nn.Linear(first_width, second_width)
        self.out = nn.Linear(second_width, 1)

    def forward(self, x):
        """Return logits of shape (batch, 1) for a (batch, input_dim) input."""
        hidden = x
        for hidden_layer in (self.fc1, self.fc2):
            hidden = F.relu(hidden_layer(hidden))
        # Note: No sigmoid here — handled by loss
        return self.out(hidden)

In [None]:
# Loss on raw logits, with the positive class re-weighted by pos_weight.
criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)

# One input node per feature column of the training tensor.
model = BinaryClassifier(X_tensor.shape[1])
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

In [None]:
n_epochs = 10

# Plain mini-batch training: one optimiser step per batch, and the mean of the
# per-batch losses is reported at the end of every epoch.
for epoch in range(n_epochs):
    model.train()
    batch_losses = []

    for batch_features, batch_labels in dataloader:
        optimizer.zero_grad()
        batch_loss = criterion(model(batch_features), batch_labels)
        batch_loss.backward()
        optimizer.step()
        batch_losses.append(batch_loss.item())

    avg_loss = sum(batch_losses) / len(dataloader)
    print(f"Epoch {epoch+1}/{n_epochs}, Loss: {avg_loss:.4f}")

In [None]:
# Only run this cell if hiddenlayer is installed
import hiddenlayer as hl

# Trace the network with a single-row batch and write the diagram to a PNG file.
dummy_input = X_tensor[:1]
model_graph = hl.build_graph(model, dummy_input)
model_graph.save("pytorch_model_diagram", format="png")

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# === Track losses manually ===
# The model and optimizer are not re-created in this cell, so these epochs
# continue training from whatever state the model is already in.
loss_history = []

for epoch in range(n_epochs):
    model.train()
    epoch_batch_losses = []

    for batch_features, batch_labels in dataloader:
        optimizer.zero_grad()
        batch_loss = criterion(model(batch_features), batch_labels)
        batch_loss.backward()
        optimizer.step()
        epoch_batch_losses.append(batch_loss.item())

    avg_loss = sum(epoch_batch_losses) / len(dataloader)
    loss_history.append(avg_loss)
    print(f"Epoch {epoch+1}/{n_epochs}, Loss: {avg_loss:.4f}")

# === Plotting loss curve ===
history_df = pd.DataFrame({'loss': loss_history})

loss_fig, loss_ax = plt.subplots(figsize=(12, 6))
history_df.plot(ax=loss_ax, linewidth=2)
loss_ax.set_ylim(bottom=0)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
loss_ax.set_title("Training Loss History", fontsize=16)
loss_ax.set_xlabel("Epoch", fontsize=14)
loss_ax.set_ylabel("Loss", fontsize=14)
loss_ax.grid(False)

# Save to file
loss_fig.savefig("Ds2D0enue_PyTorch_training_loss.png", bbox_inches="tight", pad_inches=0.1)
plt.show()

In [None]:
from sklearn.metrics import accuracy_score

# === Predict probabilities ===
# `model` is a torch.nn.Module, which has no Keras-style `.predict`. Run a
# forward pass in eval mode without gradient tracking instead, then turn the
# raw logits into probabilities with a sigmoid (the network itself outputs logits).
model.eval()
with torch.no_grad():
    test_logits = model(torch.tensor(X_test_scaled, dtype=torch.float32))
y_proba = torch.sigmoid(test_logits).cpu().numpy().flatten()  # values between 0 and 1

# === Apply threshold to get binary prediction ===
y_pred = (y_proba > 0.5).astype(int)

# === Evaluate ===
acc = accuracy_score(y_test, y_pred)
print(f"Accuracy: {acc:.4f}")

In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 for the thresholded test predictions.
report_text = classification_report(y_test, y_pred)
print(report_text)

## Parameter Optimization

This optimization uses too many resources and causes the connection to drop.

In [None]:
from sklearn.metrics import roc_curve, auc
import pandas as pd

# Grid search over network depth, width and learning rate.
# NOTE(review): `build_model`, `my_callbacks` and `class_weights` are not defined
# in the part of the notebook visible here, and `.fit` / `.predict` follow the
# Keras API rather than the PyTorch model trained above — confirm they exist.
# NOTE(review): the best configuration is chosen by AUC on the test split, so the
# test AUC of `best_model` is optimistically biased; a separate validation split
# would avoid that.
# This cell rebinds the notebook-level name `model` on every iteration.
results = []
best_auc = -np.inf  # lower than any real AUC, so the first valid result is always kept
best_model = None
best_config = None
best_history = None

# Grid search over architectures and learning rates
for n_hidden in [1, 2, 3]:
    for n_neurons in [4, 8, 16]:
        for lr in [1e-3, 1e-2]:
            print(f"Training: n_hidden={n_hidden}, n_neurons={n_neurons}, learning_rate={lr}")
            model = build_model(
                n_hidden=n_hidden,
                n_neurons=n_neurons,
                learning_rate=lr,
                input_shape=[X_train_scaled.shape[1]]
            )
            history = model.fit(
                X_train_scaled,
                y_train,
                epochs=30,
                validation_split=0.2,
                batch_size=64,
                callbacks=my_callbacks,
                class_weight=class_weights,
                verbose=0
            )
            y_proba = model.predict(X_test_scaled).flatten()
            fpr, tpr, _ = roc_curve(y_test, y_proba)
            auc_score = auc(fpr, tpr)
            results.append((n_hidden, n_neurons, lr, auc_score))

            # Save best model together with its own training history
            if auc_score > best_auc:
                best_auc = auc_score
                best_model = model
                best_config = (n_hidden, n_neurons, lr)
                best_history = history

# Save to DataFrame
df_opt = pd.DataFrame(results, columns=["n_hidden", "n_neurons", "learning_rate", "AUC"])

# Show best result
if best_config is None:
    # Happens only if no configuration produced a comparable (non-NaN) AUC;
    # previously this case crashed while formatting the message below.
    print("No configuration produced a valid AUC; best_model is None.")
else:
    # Later cells plot `history` next to results from `best_model`, so make sure
    # it refers to the best configuration rather than the last one trained.
    history = best_history
    print(f"Best config: n_hidden={best_config[0]}, n_neurons={best_config[1]}, learning_rate={best_config[2]}, AUC={best_auc:.4f}")

In [None]:
# Heatmap of AUC by depth (rows) and width (columns). Only the lr = 1e-3 slice of
# the scan is shown so the table stays two-dimensional.
pivot = (
    df_opt.loc[df_opt["learning_rate"] == 1e-3]
    .pivot(index="n_hidden", columns="n_neurons", values="AUC")
)

heatmap_fig, heatmap_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(pivot, annot=True, cmap="viridis", fmt=".3f", ax=heatmap_ax)
heatmap_ax.set_title("AUC vs Hidden Layers and Neurons (lr=1e-3)")
heatmap_ax.set_xlabel("Neurons per Layer")
heatmap_ax.set_ylabel("Number of Hidden Layers")
plt.show()

## Feature Importance

In [None]:
from sklearn.metrics import roc_auc_score
import numpy as np

# Permutation feature importance: shuffle one input column at a time and record
# how much the test AUC drops relative to the unshuffled baseline.
baseline_auc = roc_auc_score(y_test, best_model.predict(X_test_scaled).flatten())
print(f"Baseline AUC: {baseline_auc:.4f}")

# Fixed seed so the ranking is reproducible from run to run (the previous
# np.random.shuffle call used the unseeded global RNG).
rng = np.random.default_rng(42)

importances = []

for i, var in enumerate(features):
    X_permuted = X_test_scaled.copy()
    X_permuted[:, i] = rng.permutation(X_permuted[:, i])  # permute one column at a time
    y_pred_perm = best_model.predict(X_permuted).flatten()
    perm_auc = roc_auc_score(y_test, y_pred_perm)
    drop = baseline_auc - perm_auc
    importances.append((var, drop))

# Sort (largest AUC drop first) and plot
importances.sort(key=lambda x: x[1], reverse=True)

if importances:
    vars_sorted, drops = zip(*importances)
    plt.figure(figsize=(8, 5))
    plt.barh(vars_sorted, drops)
    plt.xlabel("AUC Drop After Permutation")
    plt.title("Feature Importance via Permutation")
    plt.gca().invert_yaxis()  # most important feature at the top
    plt.tight_layout()
    plt.show()
else:
    # zip(*[]) cannot be unpacked into two names, so skip the plot when there is nothing to show
    print("No features to evaluate.")

In [None]:
# Explainer assumes the model is Keras-based
# NOTE(review): `shap` is not imported in the part of the notebook visible here — confirm.
shap_background = X_train_scaled[:100]  # background samples
shap_sample = X_test_scaled[:100]

explainer = shap.DeepExplainer(best_model, shap_background)
shap_values = explainer.shap_values(shap_sample)

# Summary plot
shap.summary_plot(shap_values[0], features=shap_sample, feature_names=features)


## Overfitting Check

In [None]:
from scipy.stats import ks_2samp
import numpy as np

def get_pulls(counts, errors, pdf):
    """Return the pulls ``(counts - pdf) / errors`` element-wise.

    Parameters
    ----------
    counts : array-like
        Observed values per bin.
    errors : array-like
        Uncertainty on ``counts`` per bin.
    pdf : array-like
        Reference (expected) values per bin.

    Returns
    -------
    numpy.ndarray (or a NumPy scalar for scalar inputs)
        The pull per bin. Bins whose error is exactly zero (e.g. empty bins
        with a sqrt(N) error) have no defined pull and are returned as NaN
        instead of +/-inf, and no divide-by-zero warning is emitted.
    """
    counts = np.asarray(counts, dtype=float)
    errors = np.asarray(errors, dtype=float)
    pdf = np.asarray(pdf, dtype=float)

    # The division is still evaluated for every bin; silence the warnings for
    # the zero-error bins, which are overwritten with NaN just below.
    with np.errstate(divide='ignore', invalid='ignore'):
        raw_pulls = (counts - pdf) / errors

    pulls = np.where(errors == 0, np.nan, raw_pulls)
    # np.where returns a 0-d array for scalar inputs; hand back a plain scalar then.
    return pulls if pulls.ndim else pulls[()]

def compare_train_test_nn(model, X_train, y_train, X_test, y_test):
    """Overlay train and test classifier-output distributions and draw pull panels.

    The model is evaluated separately on background (label < 0.5) and signal
    (label > 0.5) rows of both splits. Training distributions are drawn as
    normalised histograms, test distributions as points with sqrt(N) errors
    scaled to the same normalisation. The legend title shows the two-sample
    K-S p-values (train vs test) for signal and background, and the two lower
    panels show the test-minus-train pulls per bin.

    Parameters
    ----------
    model : object with a ``predict(X)`` method returning an array of scores
    X_train, y_train, X_test, y_test : features and labels of the two splits

    Returns
    -------
    list
        ``[train_bkg, train_sig, test_bkg, test_sig]`` flattened score arrays.
    """
    # Scores in the fixed order: train bkg, train sig, test bkg, test sig
    decisions = []
    for inputs, truth in ((X_train, y_train), (X_test, y_test)):
        for selection in (truth < 0.5, truth > 0.5):
            decisions.append(model.predict(inputs[selection]).flatten())
    train_bkg, train_sig, test_bkg, test_sig = decisions

    lw = 3
    bins = 50
    fig, (ax_dist, ax_pull_bkg, ax_pull_sig) = plt.subplots(
        3, 1, figsize=(10, 10), gridspec_kw={'height_ratios': [1, 0.2, 0.2]})
    bin_edges = np.linspace(0, 1, bins + 1)

    # Factors that turn raw test counts into the same density scale as the
    # density=True training histograms (bin width is 1 / bins on [0, 1]).
    test_bkg_scale = bins / len(test_bkg)
    test_sig_scale = bins / len(test_sig)
    test_bkg_counts = np.histogram(test_bkg, bins=bin_edges)[0]
    test_sig_counts = np.histogram(test_sig, bins=bin_edges)[0]

    # Training distributions: outlines first (their bin contents feed the pulls) ...
    outline_style = dict(histtype='step', bins=bin_edges, density=True, linewidth=lw)
    train_bkg_counts = ax_dist.hist(train_bkg, color='tab:blue',
                                    label='Train Background', **outline_style)[0]
    train_sig_counts = ax_dist.hist(train_sig, color='tab:red',
                                    label='Train Signal', **outline_style)[0]

    # ... then the translucent fills
    fill_style = dict(histtype='stepfilled', alpha=0.4, bins=bin_edges, density=True)
    ax_dist.hist(train_bkg, color='tab:blue', **fill_style)
    ax_dist.hist(train_sig, color='tab:red', **fill_style)

    bin_width = bin_edges[1] - bin_edges[0]
    bin_centers = bin_edges[:-1] + 0.5 * bin_width

    # Test distributions as points with scaled sqrt(N) uncertainties
    test_bkg_density = test_bkg_scale * test_bkg_counts
    test_bkg_error = test_bkg_scale * np.sqrt(test_bkg_counts)
    test_sig_density = test_sig_scale * test_sig_counts
    test_sig_error = test_sig_scale * np.sqrt(test_sig_counts)

    ax_dist.errorbar(bin_centers, test_bkg_density, yerr=test_bkg_error,
                     label='Test Background', color='tab:blue', marker='o', linewidth=lw, ls='')
    ax_dist.errorbar(bin_centers, test_sig_density, yerr=test_sig_error,
                     label='Test Signal', color='tab:red', marker='o', linewidth=lw, ls='')

    ax_dist.set_title(r'$D_{s}^{+} \rightarrow D^{0} e^{+} \nu_{e}$', loc='left')
    ax_dist.set_xlim(0, 1)
    ax_dist.set_ylim(0)
    ax_dist.set_ylabel('Event Density')

    # Index [1] of the ks_2samp result is the p-value
    ks_sig = ks_2samp(train_sig, test_sig)[1]
    ks_bkg = ks_2samp(train_bkg, test_bkg)[1]

    leg = ax_dist.legend(loc='upper center', title=f"Sig K-S: {ks_sig:.3f}\nBkg K-S: {ks_bkg:.3f}")
    leg._legend_box.align = "left"

    # Pull panels: (test - train) / test error, background first, then signal
    pulls_bkg = get_pulls(test_bkg_density, test_bkg_error, train_bkg_counts)
    ax_pull_bkg.bar(bin_centers, pulls_bkg, width=bin_width)
    ax_pull_bkg.set_xlim(0, 1)
    ax_pull_bkg.set_ylabel('Pulls')
    ax_pull_bkg.set_ylim(-5, 5)

    pulls_sig = get_pulls(test_sig_density, test_sig_error, train_sig_counts)
    ax_pull_sig.bar(bin_centers, pulls_sig, width=bin_width, color='tab:red')
    ax_pull_sig.set_xlim(0, 1)
    ax_pull_sig.set_ylabel('Pulls')
    ax_pull_sig.set_ylim(-5, 5)
    ax_pull_sig.set_xlabel('NN Output')

    return decisions

In [None]:
# Draw the train/test overtraining comparison for the selected model and keep the
# four score arrays (train bkg, train sig, test bkg, test sig).
decisions = compare_train_test_nn(
    model=best_model,
    X_train=X_train_scaled,
    y_train=y_train,
    X_test=X_test_scaled,
    y_test=y_test,
)

## Model Check

### Basf2 ROC

In [None]:
# # compute ROC 
# sig_train=decisions[1]
# sig_test=decisions[3]
# bkg_train=decisions[0]
# bkg_test=decisions[2]

# bdt_cuts=np.linspace(0,1,100)
# sig_efficiency_train=[]
# bkg_rejection_train=[]
# den_sig_train=len(sig_train)
# den_bkg_train=len(bkg_train)

# sig_efficiency_test=[]
# bkg_rejection_test=[]
# den_sig_test=len(sig_test)
# den_bkg_test=len(bkg_test)


# for cut in bdt_cuts:
#     num_sig_train=len([el for el in sig_train if el>cut])
#     num_bkg_train=len([el for el in bkg_train if el>cut])
#     num_sig_test=len([el for el in sig_test if el>cut])
#     num_bkg_test=len([el for el in bkg_test if el>cut])
    
#     sig_efficiency_test.append(num_sig_test/den_sig_test)
#     bkg_rejection_test.append(1-(num_bkg_test/den_bkg_test))
#     sig_efficiency_train.append(num_sig_train/den_sig_train)
#     bkg_rejection_train.append(1-(num_bkg_train/den_bkg_train))

# fig,axs=plt.subplots(1,1,figsize=(8,6))
# lw=2
# axs.plot([1, 0], [0, 1], color='grey', linestyle='--')
# axs.plot(bkg_rejection_train,sig_efficiency_train,color='tab:blue',marker='',linewidth=lw,label='Train')
# axs.plot(bkg_rejection_test,sig_efficiency_test,color='tab:red',marker='',linewidth=lw,ls='--',label='Test')
# axs.set_title(r'$D_{s}^{+} \rightarrow D^{0} e^{+} \nu_{e}$',loc='left')

# axs.set_ylim(0,1.05)
# axs.set_xlim(0,1.05)
# axs.legend(loc='lower left')
# axs.set_xlabel('Background rejection')
# axs.set_ylabel('Signal efficiency')
# plt.tight_layout()

# plt.show()

In [None]:
from sklearn.metrics import roc_curve, auc
import numpy as np

# === NN Probabilities ===
# Scores and sklearn ROC curves, test split first and then the training split.
y_score_test = best_model.predict(X_test_scaled).flatten()
fpr_test, tpr_test, thresholds_test = roc_curve(y_test, y_score_test)
area_test = auc(fpr_test, tpr_test)

y_score_train = best_model.predict(X_train_scaled).flatten()
fpr_train, tpr_train, thresholds_train = roc_curve(y_train, y_score_train)
area_train = auc(fpr_train, tpr_train)

# === Separate signal and background scores ===
sig_train, bkg_train = y_score_train[y_train == 1], y_score_train[y_train == 0]
sig_test, bkg_test = y_score_test[y_test == 1], y_score_test[y_test == 0]

# Group them if needed
decisions = [bkg_train, sig_train, bkg_test, sig_test]

# === Threshold Scan ===
# Number of candidates strictly above each of 100 cut values in [0, 1].
bdt_cuts = np.linspace(0, 1, 100)
n_sig_train_pass = [np.sum(sig_train > cut_value) for cut_value in bdt_cuts]
n_bkg_train_pass = [np.sum(bkg_train > cut_value) for cut_value in bdt_cuts]
n_sig_test_pass = [np.sum(sig_test > cut_value) for cut_value in bdt_cuts]
n_bkg_test_pass = [np.sum(bkg_test > cut_value) for cut_value in bdt_cuts]

# Signal efficiency and background rejection per cut
sig_eff_train = [n_pass / len(sig_train) for n_pass in n_sig_train_pass]
bkg_rej_train = [1 - (n_pass / len(bkg_train)) for n_pass in n_bkg_train_pass]
sig_eff_test = [n_pass / len(sig_test) for n_pass in n_sig_test_pass]
bkg_rej_test = [1 - (n_pass / len(bkg_test)) for n_pass in n_bkg_test_pass]

# === Optimal FoM ===
# FoM = S / sqrt(S + B) on the test split (unweighted counts); 0 when nothing passes.
fom_vals = np.array([
    n_sig / np.sqrt(n_sig + n_bkg) if (n_sig + n_bkg) > 0 else 0
    for n_sig, n_bkg in zip(n_sig_test_pass, n_bkg_test_pass)
])
best_idx = np.argmax(fom_vals)
best_cut = bdt_cuts[best_idx]

# === Plot ===
roc_fig, roc_ax = plt.subplots(1, 1, figsize=(7, 6))
lw = 2

roc_ax.plot(bkg_rej_train, sig_eff_train, color='tab:blue', linewidth=lw, label=f'Train (AUC = {area_train:.2f})')
roc_ax.plot(bkg_rej_test, sig_eff_test, color='tab:red', linestyle='--', linewidth=lw, label=f'Test (AUC = {area_test:.2f})')

# Overfitting gap shading (only where the train efficiency exceeds the test efficiency)
train_above_test = np.array(sig_eff_train) > np.array(sig_eff_test)
roc_ax.fill_between(bkg_rej_test, sig_eff_train, sig_eff_test,
                    where=train_above_test,
                    color='gray', alpha=0.2, label='Overfit Gap')

# Best FoM cut: cross-hair and marker at the test working point
best_rejection, best_efficiency = bkg_rej_test[best_idx], sig_eff_test[best_idx]
roc_ax.axhline(best_efficiency, color='black', ls='--', linewidth=1.6,
               label=f'Best FoM Cut = {best_cut:.3f}')
roc_ax.axvline(best_rejection, color='black', ls='--', linewidth=1.6)
roc_ax.scatter(best_rejection, best_efficiency, color='green', s=50)

# Labels
roc_ax.set_title(r'$D_{s}^{+} \rightarrow D^{0} e^{+} \nu_{e}$', loc='left')
roc_ax.set_ylim(0, 1.05)
roc_ax.set_xlim(0, 1.05)
roc_ax.set_xlabel('Background rejection')
roc_ax.set_ylabel('Signal efficiency')
roc_ax.legend(loc='lower left')
roc_ax.grid(True)
plt.tight_layout()
plt.show()


### Machine Learning ROC

In [None]:
from sklearn.metrics import roc_curve, auc

# === Predict probabilities ===
# ROC curve and area for the test split, then for the training split.
y_score_test = best_model.predict(X_test_scaled).flatten()
fpr_test, tpr_test, thresholds_test = roc_curve(y_test, y_score_test)
area_test = auc(fpr_test, tpr_test)

y_score_train = best_model.predict(X_train_scaled).flatten()
fpr_train, tpr_train, thresholds_train = roc_curve(y_train, y_score_train)
area_train = auc(fpr_train, tpr_train)

# === Plot ROC Curves ===
ml_roc_fig, ml_roc_ax = plt.subplots(figsize=(7, 6))
ml_roc_ax.plot([0, 1], [0, 1], color='grey', linestyle='--', label='Random Classifier')
ml_roc_ax.plot(fpr_test, tpr_test, label=f'Test ROC curve (AUC = {area_test:.2f})', color='tab:red')
ml_roc_ax.plot(fpr_train, tpr_train, label=f'Train ROC curve (AUC = {area_train:.2f})', color='tab:blue')
ml_roc_ax.set_xlim(0.0, 1.0)
ml_roc_ax.set_ylim(0.0, 1.0)
ml_roc_ax.set_xlabel('False Positive Rate', fontsize=14)
ml_roc_ax.set_ylabel('True Positive Rate', fontsize=14)
ml_roc_ax.set_title('ROC Curve: Train vs Test', fontsize=16)
ml_roc_ax.legend(loc='lower right')
ml_roc_ax.grid(True)
ml_roc_ax.set_aspect('equal', adjustable='box')  # square axes so the diagonal is at 45 degrees
plt.tight_layout()
plt.show()


In [None]:
from sklearn.metrics import roc_auc_score

# === Predict probabilities ===
y_pred_proba = best_model.predict(X_test_scaled).flatten()

# === Compute ROC AUC ===
# Single-number summary of the test ROC curve.
roc_auc = roc_auc_score(y_true=y_test, y_score=y_pred_proba)
print(f"ROC AUC Score: {roc_auc:.2f}")

### Other Checks

Check whether the neural network is overfitting

In [None]:
# Convert training history to DataFrame
# NOTE(review): `history.history` is the Keras History layout; `history` is whatever
# the most recent `.fit(...)` call in the notebook returned — confirm it belongs to
# the model being checked.
history_df = pd.DataFrame(history.history)

# Plot training and validation loss
loss_fig, loss_ax = plt.subplots(figsize=(10, 6))
loss_ax.plot(history_df["loss"], label="Training Loss", linewidth=2)
loss_ax.plot(history_df["val_loss"], label="Validation Loss", linewidth=2)
loss_ax.set_xlabel("Epoch", fontsize=14)
loss_ax.set_ylabel("Loss", fontsize=14)
loss_ax.set_title("Overfitting Check: Loss vs Epoch", fontsize=16)
loss_ax.legend()
loss_ax.set_ylim(bottom=0)
loss_ax.grid(True)
plt.show()

Accuracy

In [None]:
# Plot accuracy (if you're using it as a metric)
# Requires "accuracy" / "val_accuracy" columns in history_df.
acc_fig, acc_ax = plt.subplots(figsize=(10, 6))
acc_ax.plot(history_df["accuracy"], label="Training Accuracy", linewidth=2)
acc_ax.plot(history_df["val_accuracy"], label="Validation Accuracy", linewidth=2)
acc_ax.set_xlabel("Epoch", fontsize=14)
acc_ax.set_ylabel("Accuracy", fontsize=14)
acc_ax.set_title("Overfitting Check: Accuracy vs Epoch", fontsize=16)
acc_ax.legend()
acc_ax.set_ylim(0, 1)
acc_ax.grid(True)
plt.show()

## BDT Cut Optimization

In [None]:
def compute_fom_curve(scores, labels, weights=None, n_thresholds=200):
    """Scan a classifier-score cut and evaluate FoM = S / sqrt(S + B) at each step.

    Parameters
    ----------
    scores : array
        Classifier output per candidate.
    labels : array
        Truth label per candidate; 1 counts as signal, 0 as background.
    weights : array, optional
        Per-candidate weights. When omitted every candidate counts as 1.
    n_thresholds : int
        Number of equally spaced thresholds scanned over [0, 1].

    Returns
    -------
    tuple
        ``(thresholds, foms, best_threshold, best_fom)``. A candidate passes
        when its score is strictly greater than the threshold; the FoM is 0
        where S + B is not positive. Ties pick the lowest threshold.
    """
    thresholds = np.linspace(0, 1, n_thresholds)
    is_signal = labels == 1
    is_background = labels == 0

    def _selected_yield(selection):
        # Weighted sum when weights are supplied, plain count otherwise
        if weights is None:
            return np.sum(selection)
        return np.sum(weights[selection])

    def _fom_at(threshold):
        passed = scores > threshold
        n_sig = _selected_yield(is_signal & passed)
        n_bkg = _selected_yield(is_background & passed)
        return n_sig / np.sqrt(n_sig + n_bkg) if (n_sig + n_bkg) > 0 else 0

    foms = np.array([_fom_at(threshold) for threshold in thresholds])
    best_idx = np.argmax(foms)
    return thresholds, foms, thresholds[best_idx], foms[best_idx]


In [None]:
# Predict scores with best_model (use scaled test input)
scores = best_model.predict(X_test_scaled).flatten()
# Unit weight for every test candidate (or use MC weights)
weights = np.ones_like(y_test)

fom_scan = compute_fom_curve(scores, y_test, weights=weights)
thresholds, foms, best_thresh, best_fom = fom_scan

print(f"Best threshold: {best_thresh:.3f}")
print(f"Best FoM: {best_fom:.3f}")

In [None]:
# FoM as a function of the cut value; the shaded band marks the region below the best cut.
fom_fig, fom_ax = plt.subplots()
fom_ax.plot(thresholds, foms)
fom_ax.axvline(best_thresh, color='red', linestyle='--', label=f'Best = {best_thresh:.3f}')
fom_ax.axvspan(0, best_thresh, color='gray', alpha=0.2)
fom_ax.set_xlabel("NN Output Threshold")
fom_ax.set_ylabel("FoM = S / √(S + B)")
fom_ax.set_title("FoM Scan vs NN Output Threshold")
fom_ax.legend()
fom_ax.grid(True)
plt.show()

In [None]:
from sklearn.preprocessing import StandardScaler

# Scale signal and all inputs with the scaler fitted on the training split
df_signal_scaled, df_all_scaled = (
    scaler.transform(DataFrames[sample_key][Variables]) for sample_key in ("Signal", "All")
)

# Predict scores with best_model and store them as a new "Ds_BkgNN" column in each sample
for sample_key, scaled_inputs in (("Signal", df_signal_scaled), ("All", df_all_scaled)):
    DataFrames[sample_key]["Ds_BkgNN"] = best_model.predict(scaled_inputs).flatten()

In [None]:
from Functions import optimize_cut, plot_save

# Optimise the cut on the NN score that was just written to the "Ds_BkgNN" column.
# The call previously pointed at "Ds_BkgBDT", which is not the column this notebook
# fills, so the NN output was never the variable being optimised.
# NOTE(review): the plotting cells further down still select on "Ds_BkgBDT" with
# hard-coded thresholds — confirm which classifier they are meant to use.
cut = optimize_cut(
    df_sig=DataFrames["Signal"],                  # used for plotting signal vs background
    df_bkg=DataFrames["All"],
    Signal=DataFrames["Signal"],                  # used for FoM numerator (truth-matched signal)
    Background=DataFrames["All"],                 # used for FoM denominator (everything else)
    var="Ds_BkgNN",                               # new classifier variable
    FoM="Ds_BkgNN",                               # same as var here
    xlabel="Background Classifier Output",
    Bins=50,
    Range=[0, 1],
    varmin=0,
    varmax=0.99,
    select="right",                               # keep events with higher classifier output
    Width=False,
    query_signal="Ds_isSignal == 1"
)

print(f"Best cut is: {cut:.3f}")

# Plots

In [None]:
# Reset matplotlib to its default style, then apply the font sizes used by the plots below.
plt.style.use('default')

plot_font_sizes = {
    "axes.labelsize": 14,
    "xtick.labelsize": 12,
    "ytick.labelsize": 12,
    "legend.fontsize": 12,
    "figure.titlesize": 16,
}
plt.rcParams.update(plot_font_sizes)

In [None]:
# Ds_isSignal category counts (NaN kept) in the signal sample: all candidates first,
# then only those with Ds_BkgBDT >= 0.455.
signal_truth_counts = DataFrames["Signal"][['Ds_isSignal']].value_counts(normalize=False,dropna=False)
print(signal_truth_counts.apply(lambda x: f"{x:.6f}"))

signal_truth_counts_cut = DataFrames["Signal"].query('Ds_BkgBDT>=0.455')[['Ds_isSignal']].value_counts(normalize=False,dropna=False)
print(signal_truth_counts_cut.apply(lambda x: f"{x:.6f}"))

In [None]:
# D0_isSignal category counts (NaN kept) in the combined sample: all candidates first,
# then only those with Ds_BkgBDT >= 0.455.
all_truth_counts = DataFrames["All"][['D0_isSignal']].value_counts(normalize=False,dropna=False)
print(all_truth_counts.apply(lambda x: f"{x:.6f}"))

all_truth_counts_cut = DataFrames["All"].query('Ds_BkgBDT>=0.455')[['D0_isSignal']].value_counts(normalize=False,dropna=False)
print(all_truth_counts_cut.apply(lambda x: f"{x:.6f}"))

In [None]:
# D0 mass-difference distribution of the "Other" background category after the
# fake-D0 suppression cut (Ds_FakeD0BDT >= BD).
Stacked = False
Density = False
Bins = 50
var = 'D0_dM'
Range = [-0.02, 0.02]
BD = 0.657
perBin = ((Range[1] - Range[0])/Bins)*1000
print("Width Per Bin: {width:.2f} MeV".format(width = perBin))

# label1= r'$Other$'
# The legend is built from BD so it always states the cut that is actually applied
# (it previously read "FDS BDT >= 0.8" while the selection used BD = 0.657).
label1= rf'$Other \; (FDS \; BDT \geq {BD})$'

labels=[label1]
colors = ['C5']

all_events = DataFrames["All"]
# "Other" flag set, or a candidate whose D0_mcPDG is +-421 but which is not truth-matched
is_other = abs(all_events['Ds_D0_Other']) == 1
is_unmatched_d0 = (abs(all_events['D0_mcPDG']) == 421) & (abs(all_events['D0_isSignal']) == 0)
passes_fake_d0_bdt = all_events["Ds_FakeD0BDT"] >= BD
data=[
      all_events[(is_other | is_unmatched_d0) & passes_fake_d0_bdt][var],
      ]

plt.hist(data, color=colors, label=labels, density=Density, stacked=Stacked, bins=Bins, alpha=1, histtype='step', linewidth=1.5, range=Range)
# plt.axvspan(0.02,0.04,color='gray',alpha=0.2)
# plt.axvline(0.02,ls='--',color='gray')
# plt.axvline(0.04,ls='--',color='gray')

# Title
#--------
plt.title(r'$\bf Generic \; Events$', loc = "left")
# NOTE(review): other plots of DataFrames["All"] quote 1443.999 fb^-1 — confirm the luminosity shown here.
plt.title(r'$\int\mathcal{L}dt\approx\;200$ fb$^{-1}$', loc = "right")
# Label
#-------
plt.ylabel(r'$Entries/(\; {width:.2f}\;MeV/c^2)$'.format(width = perBin))
plt.xlabel(r'$m(D^{0}) - m_{PDG}(D^{0}) \;[GeV/c^{2}]$')
# plt.yscale("log")
# plt.xscale("log")
# plt.ylim(0, 500)
plt.legend()
plt.show()

Suggested Background Break-up

In [None]:
# Stacked = False
# Density = False
# Bins = 50
# Range = [0.1, 0.6]
# Op = 0.721
# dM = -1
# # i = 'e_cos_theta'
# # i = 'Ds_vpho_CMS_daughterAngle'
# i = 'Ds_diff_D0pi'
# # i = 'Ds_chiProb_noIP'
# # i = 'Ds_chiProb'
# # i = 'Ds_extraInfo_FastBDT'
# # i = 'D0_chiProb'
# # i = 'Ds_Ds_starminusDs_M_Correction'
# # i = "Ds_gammaveto_M_Correction"
# # i = 'D0_chiProb'
# # i = "Ds_L_diff"
# # var = 'e_cos_theta'
# # i = 'e_pt'
# perBin = ((Range[1] - Range[0])/Bins)*1000
# # perBin = ((Range[1] - Range[0])/Bins)
# print("Width Per Bin: {width:.2f} MeV".format(width = perBin))

# label1= r'$D^{*+} \rightarrow D^{0} \pi^{+}$'
# label3= r'$D^{0}$'
# label4= r'$Other$'

# labels1=[label1,label3,label4]
# colors1=['C1','C2','C3']
# data1=[
#       DataFrames["ccbar"][(abs(DataFrames["ccbar"]['Ds_D0_Dstarplus'])==1) & (DataFrames["ccbar"]['Ds_gammaveto_M_Correction']>=dM) & (DataFrames["ccbar"]["Ds_BkgBDT"]>=Op)][i],
#       DataFrames["ccbar"][(abs(DataFrames["ccbar"]['Ds_D0_NoDstarplusDstar0'])==1) & (DataFrames["ccbar"]['Ds_gammaveto_M_Correction']>=dM) & (DataFrames["ccbar"]["Ds_BkgBDT"]>=Op)][i],
#       DataFrames["ccbar"][(abs(DataFrames["ccbar"]['Ds_D0_Other'])==1) & (DataFrames["ccbar"]['Ds_gammaveto_M_Correction']>=dM) & (DataFrames["ccbar"]["Ds_BkgBDT"]>=Op)][i],
#       ]
# labels2=[r'$D^{*0} \; (Comb.)$',r'$D^{*0} \; (Peak)$']
# colors2=['C4','C5']
# data2=[
#       DataFrames["ccbar"][(abs(DataFrames["ccbar"]['Ds_mcPDG'])!=423) & (abs(DataFrames["ccbar"]['Ds_D0_Dstar0'])==1) & (DataFrames["ccbar"]["Ds_BkgBDT"]>=Op)][i],
#       DataFrames["ccbar"][(abs(DataFrames["ccbar"]['Ds_mcPDG'])==423) & (abs(DataFrames["ccbar"]['Ds_D0_Dstar0'])==1) & (DataFrames["ccbar"]["Ds_BkgBDT"]>=Op)][i],
#       ]

# # factor = 0.1
# # plt.hist(DataFrames["Signal"][(DataFrames["Signal"]['Ds_gammaveto_M_Correction']>=dM) & (DataFrames["Signal"]["Ds_BS"]>=Op)][i], label="Signal", histtype='step', density=Density, bins=Bins, alpha=1, range=Range, weights=factor*np.ones_like(DataFrames["Signal"][(DataFrames["Signal"]['Ds_gammaveto_M_Correction']>=dM) & (DataFrames["Signal"]["Ds_BS"]>=Op)][i]), ls='--', linewidth=1.5)
# plt.hist(data1, color=colors1, label=labels1, density=Density, stacked=Stacked, bins=Bins, alpha=1, histtype='step', linewidth=1.5, range=Range)
# plt.hist(data2, color=colors2, label=labels2, density=Density, stacked=False, bins=Bins, alpha=1, histtype='step', linewidth=1.5, range=Range)
# # plt.axvspan(Range[0],0.15,color='gray',alpha=0.2)
# # plt.axvline(0.58,ls='--',color='gray')

# # Title
# #--------
# plt.title(r'$BDT \; \geq 0.721$', loc = "left")
# # Label
# #-------
# # plt.ylabel(r'$Entries/(\; {width:.2f}\;)$'.format(width = perBin))
# # plt.ylabel(r'$Entries/(\; {width:.2f}\;MeV/c)$'.format(width = perBin))
# plt.ylabel(r'$Entries/(\; {width:.2f}\;MeV/c^2)$'.format(width = perBin))
# # plt.xlabel(r'$p_{t} (e^{+}) [GeV/c]$')
# # plt.xlabel(r'$\Delta \theta(D_s^{+} \; K^{+/-}/K_{s}^{0}) \; [rad]$')
# # plt.xlabel(r'$cos\theta \; (e^{+})$')
# # plt.xlabel(r'$p-value(D^{0})$')
# # plt.xlabel(r'$p-value(D_{s}^{+})$')
# # plt.xlabel(r'$p-value_{IP}(D_{s}^{+})$')
# # plt.xlabel(r'$Fake D^{0} Suppression(D^{0})$')
# # plt.xlabel(r'$m(e_{sig}^{+} e_{ROE}^{-})\;[GeV/c^{2}]$')
# # plt.xlabel(r'$p_{t} \; (e^{+})\;[GeV/c]$')
# plt.xlabel(r'$\Delta m(D_s^{+} - D^{0})\;[GeV/c^{2}]$')
# # plt.xlabel(r'$\Delta m(D_s^{*+} - D_{s}^{+})\;[GeV/c^{2}]$')
# # plt.xlabel(r'$cos\theta \; (e^{+})$')
# # plt.xlabel(r'$p-value(D^{0})$')
# # plt.xlabel(r'$\mid \vec{x}_{D_{s}^{+}} - \vec{x}_{D^{0}} \mid \; [cm]$')
# # plt.xlabel(r'$dz \; (e^{+}) \; [cm]$')
# # plt.yscale("log")
# # plt.xscale("log")
# # plt.ylim(0, 500)
# plt.legend()
# plt.show()

In [None]:
# Stacked Delta-m (electron hypothesis) distribution of the combined sample, split by
# D0 truth category, for candidates with Ds_BkgBDT >= BS.
Stacked = True
Density = False
Bins = 50
# var = 'Ds_diff_D0pi'
var = 'Ds_massDifference_0'
Range = [0.0, 0.25]
BS = 0.404
Samples = "All"
perBin = ((Range[1] - Range[0])/Bins)*1000
print("Width Per Bin: {width:.2f} MeV".format(width = perBin))

label1= r'$Other$'
label2= r'$Prompt \; D^{0}$'
label3= r'$D^{*0} \rightarrow D^{0} X$'
label4= r'$D^{*+} \rightarrow D^{0} X$'

labels=[label1,label2,label3,label4]
colors=['C5','C4','C1','C2',]

all_events = DataFrames["All"]
d0_truth_flag = all_events['D0_isSignal']
d0_unmatched = (d0_truth_flag.isna()) | (abs(d0_truth_flag) == 0)
d0_matched = abs(d0_truth_flag) == 1
d0_mother_abs = abs(all_events['D0_genMotherPDG'])     # 413 = D*+, 423 = D*0
passes_bkg_cut = all_events["Ds_BkgBDT"] >= BS

data=[
      # Other: D0 not truth-matched (flag 0 or missing)
      all_events[d0_unmatched & (DataFrames[Samples]["Ds_BkgBDT"]>=BS)][var],
      # Prompt D0: matched, mother is neither D*+ nor D*0
      all_events[((d0_mother_abs != 413) & (d0_mother_abs != 423)) & d0_matched & passes_bkg_cut][var],
      # D*0 -> D0 X
      all_events[(d0_mother_abs == 423) & d0_matched & passes_bkg_cut][var],
      # D*+ -> D0 X
      all_events[(d0_mother_abs == 413) & d0_matched & passes_bkg_cut][var],
      ]

# factor = 0.7
# plt.hist(DataFrames["Signal"][(DataFrames["Signal"]["Ds_BkgBDT"]>=BS)][var], label="Signal", histtype='step', density=Density, bins=Bins, alpha=1, range=Range, weights=factor*np.ones_like(DataFrames["Signal"][(DataFrames["Signal"]["Ds_BkgBDT"]>=BS)][var]), ls='--', linewidth=1.5)
plt.hist(data, color=colors, label=labels, density=Density, stacked=Stacked, bins=Bins, alpha=1, histtype='step', linewidth=2, range=Range)
# plt.axvspan(Range[0],0.16,color='gray',alpha=0.2)
# plt.axvline(0.16,ls='--',color='gray')

# Title
#--------
# plt.title(r'$\bf Generic \; Events$', loc = "left")
plt.title(r'$\bf Generic \; Events$' + "\n" + r"$BDT \geq 0.404$", loc = "left")
plt.title(r'$\int\mathcal{L}dt\approx\;1443.999$ fb$^{-1}$', loc = "right")
# Label
#-------
plt.ylabel(r'$Entries/(\; {width:.2f}\;MeV/c^2)$'.format(width = perBin))
plt.xlabel(r'$\Delta m_{e}(D_s^{+} - D^{0})\;[GeV/c^{2}]$')
# plt.yscale("log")
# plt.xscale("log")
# plt.ylim(0, 30000)
plt.legend()
plt.show()

In [None]:
# Stacked Delta-m (pion hypothesis) distribution of the combined sample, split by
# D0 truth category. BS = -1 is used here as the Ds_BkgBDT threshold.
Stacked = True
Density = False
Bins = 50
var = 'Ds_diff_D0pi'
# var = 'Ds_massDifference_0'
Range = [0.1, 0.4]
BS = -1
Samples = "All"
perBin = ((Range[1] - Range[0])/Bins)*1000
print("Width Per Bin: {width:.2f} MeV".format(width = perBin))

label1= r'$Other$'
label2= r'$Prompt \; D^{0}$'
label3= r'$D^{*0} \rightarrow D^{0} X$'
label4= r'$D^{*+} \rightarrow D^{0} X$'

labels=[label1,label2,label3,label4]
colors=['C5','C4','C1','C2',]

all_events = DataFrames["All"]
d0_truth_flag = all_events['D0_isSignal']
d0_unmatched = (d0_truth_flag.isna()) | (abs(d0_truth_flag) == 0)
d0_matched = abs(d0_truth_flag) == 1
d0_mother_abs = abs(all_events['D0_genMotherPDG'])     # 413 = D*+, 423 = D*0
passes_bkg_cut = all_events["Ds_BkgBDT"] >= BS

data=[
      # Other: D0 not truth-matched (flag 0 or missing)
      all_events[d0_unmatched & (DataFrames[Samples]["Ds_BkgBDT"]>=BS)][var],
      # Prompt D0: matched, mother is neither D*+ nor D*0
      all_events[((d0_mother_abs != 413) & (d0_mother_abs != 423)) & d0_matched & passes_bkg_cut][var],
      # D*0 -> D0 X
      all_events[(d0_mother_abs == 423) & d0_matched & passes_bkg_cut][var],
      # D*+ -> D0 X
      all_events[(d0_mother_abs == 413) & d0_matched & passes_bkg_cut][var],
      ]

# factor = 0.7
# plt.hist(DataFrames["Signal"][(DataFrames["Signal"]["Ds_BkgBDT"]>=BS)][var], label="Signal", histtype='step', density=Density, bins=Bins, alpha=1, range=Range, weights=factor*np.ones_like(DataFrames["Signal"][(DataFrames["Signal"]["Ds_BkgBDT"]>=BS)][var]), ls='--', linewidth=1.5)
plt.hist(data, color=colors, label=labels, density=Density, stacked=Stacked, bins=Bins, alpha=1, histtype='step', linewidth=2, range=Range)
# plt.axvspan(Range[0],0.16,color='gray',alpha=0.2)
# plt.axvline(0.16,ls='--',color='gray')

# Title
#--------
plt.title(r'$\bf Generic \; Events$', loc = "left")
# plt.title(r'$\bf Generic \; Events$' + "\n" + r"$BDT \geq 0.525$", loc = "left")
plt.title(r'$\int\mathcal{L}dt\approx\;1443.999$ fb$^{-1}$', loc = "right")
# Label
#-------
plt.ylabel(r'$Entries/(\; {width:.2f}\;MeV/c^2)$'.format(width = perBin))
plt.xlabel(r'$\Delta m_{\pi}(D_s^{+} - D^{0})\;[GeV/c^{2}]$')
# plt.yscale("log")
# plt.xscale("log")
plt.ylim(0, 10000)
plt.legend()
plt.show()

In [None]:
# Stacked Delta-m distribution of the combined sample, split by the truth PDG code of
# the Ds candidate (Ds_mcPDG), after the gamma-veto mass and Ds_BkgBDT selections.
Stacked = True
Density = False
Bins = 50
# i = 'Ds_diff_D0pi'
i = 'Ds_massDifference_0'
Range = [0.0, 0.25]
BS = 0.5
Op = -1
dM = -1
Hits = -1
perBin = ((Range[1] - Range[0])/Bins)*1000
print("Width Per Bin: {width:.2f} MeV".format(width = perBin))

label1= r'$Comb.$'
label2= r'$NaN$'
label3= r'$D^{*0}$'
label4= r'$D^{*+} \rightarrow D^{0} \pi^{+}$'

labels=[label1,label2,label3,label4]
colors=["#DD8452","#C44E52","#55A868","#4C72B0"]

all_events = DataFrames["All"]
ds_pdg_abs = abs(all_events["Ds_mcPDG"])               # 413 = D*+, 423 = D*0
ds_pdg_missing = all_events["Ds_mcPDG"].isna()
base_selection = (all_events['Ds_gammaveto_M_Correction']>=dM) & (all_events["Ds_BkgBDT"]>=BS)

data=[
      # Comb.: a PDG code is present but it is neither D*+ nor D*0
      all_events[((ds_pdg_abs != 413) & (ds_pdg_abs != 423) & (~ds_pdg_missing)) & base_selection][i],
      # NaN: no PDG code
      all_events[ds_pdg_missing & base_selection][i],
      # D*0
      all_events[(ds_pdg_abs == 423) & base_selection][i],
      # D*+
      all_events[(ds_pdg_abs == 413) & base_selection][i]
      ]

# factor = 0.5
# plt.hist(DataFrames["Signal"][(DataFrames["Signal"]['Ds_gammaveto_M_Correction']>=dM) & (DataFrames["Signal"]['e_nPXDHits']>Hits)][i], label="Signal", histtype='step', density=Density, bins=Bins, alpha=1, range=Range, weights=factor*np.ones_like(DataFrames["Signal"][(DataFrames["Signal"]['Ds_gammaveto_M_Correction']>=dM)][i]), ls='--', linewidth=1.5)
plt.hist(data, color=colors, label=labels, density=Density, stacked=Stacked, bins=Bins, alpha=1, histtype='step', linewidth=1.5, range=Range)
# plt.axvspan(Range[0],0.15,color='gray',alpha=0.2)
# plt.axvline(0.15,ls='--',color='gray')

# Title
#--------
# NOTE(review): other cells label 'Ds_massDifference_0' as the electron-hypothesis
# Delta m and 'Ds_diff_D0pi' as the pion one — confirm this title matches the plotted variable.
plt.title(r'$e^{+}$ mass hypothesis: pion', loc = "left")
plt.title(r'$\int\mathcal{L}dt\approx\;1443.999$ fb$^{-1}$', loc = "right")
# Label
#-------
plt.ylabel(r'$Entries/(\; {width:.2f}\;MeV/c^2)$'.format(width = perBin))
plt.xlabel(r'$\Delta m(D_s^{+} - D^{0})\;[GeV/c^{2}]$')
# plt.yscale("log")
# plt.xscale("log")
# plt.ylim(0, 30000)
plt.legend()
plt.show()

In [None]:
# Stacked Delta-m (electron hypothesis) distribution of the signal sample, split by the
# Ds truth-matching flag, for candidates with Ds_BkgBDT >= BS.
Bins=50
Density = False
Stacked = True
Range = [0.0,0.25]
BS = -1
perBin = ((Range[1] - Range[0])/Bins)*1000
# var = 'Ds_diff_D0pi'
var = 'Ds_massDifference_0'
print("Width Per Bin: {width:.2f} MeV".format(width = perBin))

label1= r'$isSignal(D_s^{+})=1$'
label2= r'$isSignal(D_s^{+})=0$'
label3= r'$NaN$'

labels=[label1,label2,label3]
colors=['#7eb0d5','#fd7f6f','purple']

signal_events = DataFrames["Signal"]
passes_bkg_cut = signal_events["Ds_BkgBDT"] >= BS

data = [signal_events[(signal_events['Ds_isSignal']==1) & passes_bkg_cut][var],
        signal_events[(signal_events['Ds_isSignal']==0) & passes_bkg_cut][var],
        signal_events[(signal_events['Ds_isSignal'].isna()) & passes_bkg_cut][var]
       ]

# Reverse the order so the truth-matched component ends up on top of the stack
plt.hist(data[::-1], color=colors[::-1], label=labels[::-1], alpha=1, range=Range, linewidth=2, stacked=Stacked, density=Density, bins=Bins, histtype='step')
# plt.axvspan(Range[0],0.16,color='gray',alpha=0.2)
# plt.axvline(0.16,ls='--',color='gray')

# Title
#---------
# Signal
# The cut line of the title is derived from BS so it cannot disagree with the
# selection (it previously always read "BDT >= 0.525", even with BS = -1).
# It is omitted when BS <= 0, which is treated as "no cut"
# (assumes the classifier output lies in [0, 1] — TODO confirm).
cut_note = ("\n" + rf"$BDT \geq {BS}$") if BS > 0 else ""
plt.title(r'$2M\;Events$' + cut_note, loc = "left")
plt.title(r'$\bf Signal\;Events$', loc = "right")
# # Background
# plt.title(r'$\int\mathcal{L}dt\approx\;100$ fb$^{-1}$', loc = "left")
# plt.title(r'$\bf Generic\;c\bar{c}\;Events$', loc = "right")
# Label
#---------
plt.ylabel(r'$Entries/(\; {width:.2f}\;MeV/c^2)$'.format(width = perBin))
plt.xlabel(r'$\Delta m_{e}(D_s^{+} - D^{0})\;[GeV/c^{2}]$')
# plt.yscale("log")
plt.legend()
plt.show()

In [None]:
# Stacked histogram of `plot_var` for the "All" sample after a BDT threshold
# and after removing candidates whose `cut_var` lies strictly inside
# (cut_low, cut_high). Categories are defined by |Ds_mcPDG|.

# === Settings ===
Stacked = True
Density = False
Bins = 50
Range = [0.0, 0.25]
BDT = 0.616  # lower threshold applied to Ds_BkgBDT
# Bin width scaled by 1000 (the axis is labelled in GeV/c^2, the width in MeV).
perBin = ((Range[1] - Range[0]) / Bins) * 1000
print("Width Per Bin: {:.2f} MeV".format(perBin))

# Data source and variables
df = DataFrames["All"][(DataFrames["All"]["Ds_BkgBDT"] >= BDT)]
cut_var = "Ds_diff_D0pi"
plot_var = 'Ds_massDifference_0'
pdg_var = 'Ds_mcPDG'

# Sideband cut (exclude D*⁺ peak): keep rows at or outside the window edges.
cut_low = 0.142
cut_high = 0.15
df_cut = df.query(f"{cut_var} <= @cut_low or {cut_var} >= @cut_high")

# === Categories based on true Ds_mcPDG ===
# Rows with NaN Ds_mcPDG fail both equality tests and therefore land in `other`.
dstar_plus = df_cut[abs(df_cut[pdg_var]) == 413][plot_var]
dstar_zero = df_cut[abs(df_cut[pdg_var]) == 423][plot_var]
other = df_cut[(abs(df_cut[pdg_var]) != 413) & (abs(df_cut[pdg_var]) != 423)][plot_var]

# === Plot ===
plt.hist([other, dstar_zero, dstar_plus],
         color=["#2E2E2E", "#4C6EB1", "#007C91"],
         label=["Other", r"$D^{*0}$", r"$D^{*+}$"],
         density=Density,
         stacked=Stacked,
         bins=Bins,
         range=Range,
         histtype='step',
         linewidth=2)

# Titles and labels
# The threshold and the excluded window are rendered from BDT / cut_low /
# cut_high so the annotation cannot drift from the applied selection
# (the title previously read 0.596 while the cut was 0.616).
plt.title(r'$BDT \geq ' + f'{BDT}' + r'$' + '\n' +
          r'$\Delta m_{\pi}(D_s^{+} - D^{0}) \notin [' + f'{cut_low}' + r',\; ' + f'{cut_high}' +
          r'] \; \mathrm{GeV}/c^{2}$', loc="left")
plt.title(r'$\int\mathcal{L}dt\approx\;1444$ fb$^{-1}$', loc="right")
plt.xlabel(r'$\Delta m_{e}(D_s^{+} - D^{0})\;[GeV/c^{2}]$')
plt.ylabel(r'$Entries/(\;{:.2f}\;MeV/c^2)$'.format(perBin))
plt.legend()
# plt.tight_layout()
plt.show()

In [None]:
# Same selection as a BDT-thresholded sideband plot of `plot_var`, with the
# D*+ category stacked directly above "Other" and D*0 on top.

# --- Histogram configuration ---
Stacked, Density = True, False
Bins = 50
Range = [0.0, 0.25]
BDT = 0.616  # lower threshold applied to Ds_BkgBDT
perBin = ((Range[1] - Range[0]) / Bins) * 1000
print("Width Per Bin: {:.2f} MeV".format(perBin))

# --- Column names ---
cut_var = "Ds_diff_D0pi"
plot_var = 'Ds_massDifference_0'
pdg_var = 'Ds_mcPDG'

# --- Candidates passing the BDT threshold ---
_all_candidates = DataFrames["All"]
df = _all_candidates[_all_candidates["Ds_BkgBDT"] >= BDT]

# --- Sideband selection: drop rows strictly inside (cut_low, cut_high) ---
cut_low = 0.142
cut_high = 0.15
_in_sideband = (df[cut_var] <= cut_low) | (df[cut_var] >= cut_high)
df_cut = df[_in_sideband]

# --- Truth categories from |Ds_mcPDG| ---
_abs_pdg = df_cut[pdg_var].abs()
dstar_plus = df_cut.loc[_abs_pdg == 413, plot_var]
dstar_zero = df_cut.loc[_abs_pdg == 423, plot_var]
# Everything that is neither 413 nor 423, including rows with NaN Ds_mcPDG.
other = df_cut.loc[~_abs_pdg.isin([413, 423]), plot_var]

# --- Draw ---
_hist_kwargs = dict(
    density=Density,
    stacked=Stacked,
    bins=Bins,
    range=Range,
    histtype='step',
    linewidth=2,
)
plt.hist(
    [other, dstar_plus, dstar_zero],
    color=["#2E2E2E", "#007C91", "#4C6EB1"],
    label=["Other", r"$D^{*+}$", r"$D^{*0}$"],
    **_hist_kwargs,
)

# --- Annotations ---
_left_title = (r'$BDT \geq 0.616$' + '\n' +
               r'$\Delta m_{\pi}(D_s^{+} - D^{0}) \notin [0.142,\; 0.15] \; \mathrm{GeV}/c^{2}$')
plt.title(_left_title, loc="left")
plt.title(r'$\int\mathcal{L}dt\approx\;1444$ fb$^{-1}$', loc="right")
plt.xlabel(r'$\Delta m_{e}(D_s^{+} - D^{0})\;[GeV/c^{2}]$')
plt.ylabel(r'$Entries/(\;{:.2f}\;MeV/c^2)$'.format(perBin))
plt.legend()
# plt.tight_layout()
plt.show()

# Save BDT Output

In [None]:
# # === Output directory ===
# output_dir = "/group/belle2/users2022/amubarak/02-Grid/ML_Trained/"
# os.makedirs(output_dir, exist_ok=True)

# # === Base input path for original files ===
# base_input_dir = "/group/belle2/users2022/amubarak/02-Grid/Sample_Grid"
# Date = "0526"
# Attempt = "0"

# # === Save each DataFrame using original filename with _withBDT suffix ===
# for s in samples:
#     if s == "Signal":
#         original_name = "Ds2D0enu-Signal.root"
#         original_path = "/home/belle2/amubarak/C01-Simulated_Events/" + original_name
#     else:
#         original_name = f"Ds2D0e-Generic_Ds_{Date}25_{Attempt}_{s}.root"
#         original_path = os.path.join(base_input_dir, original_name)

#     # Strip .root and append _withBDT.root
#     output_name = original_name.replace(".root", "_withBDT.root")
#     out_path = os.path.join(output_dir, output_name)

#     # Save to ROOT file
#     with uproot.recreate(out_path) as f:
#         f["Dstree"] = DataFrames[s]