In [None]:
import psutil

# Snapshot of system memory; sizes are printed in decimal gigabytes (1e9 bytes).
mem = psutil.virtual_memory()
for caption, n_bytes in (("Total RAM", mem.total), ("Available", mem.available)):
    print(f"{caption}: {n_bytes / 1e9:.2f} GB")

In [None]:
# !pip uninstall -y scikit-learn
# !pip install scikit-learn==1.3.1

In [None]:
# ! pip install --upgrade pip
# ! pip install --user xgboost seaborn
# ! pip install --user bayesian-optimization

In [None]:
# import mplhep
import sys

import seaborn as sns

import numpy as np
import pandas as pd
import uproot
from matplotlib import pyplot as plt

from sklearn.datasets import make_classification,make_regression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import auc,roc_curve,confusion_matrix,classification_report,precision_recall_curve,mean_squared_error,accuracy_score,roc_auc_score
from sklearn.model_selection import GridSearchCV, cross_validate, validation_curve,train_test_split,KFold,learning_curve,cross_val_score
from sklearn.utils import compute_sample_weight
from scipy.stats import ks_2samp
from sklearn.tree import DecisionTreeClassifier

from hep_ml import gradientboosting as ugb
from hep_ml.uboost import uBoostClassifier

from sklearn.model_selection import RandomizedSearchCV

from scipy.stats import randint, uniform

In [None]:
# Notebook-wide matplotlib font sizes (axis labels, tick labels, legend, figure title).
font_sizes = {
    "axes.labelsize": 16,
    "xtick.labelsize": 12,
    "ytick.labelsize": 12,
    "legend.fontsize": 16,
    "figure.titlesize": 20,
}
for rc_key, rc_value in font_sizes.items():
    plt.rcParams[rc_key] = rc_value

In [None]:
# Raise the pandas display limits so rows and columns are effectively never truncated.
for option_name in ('display.max_rows', 'display.max_columns'):
    pd.set_option(option_name, 200000)

In [None]:
# Make the analysis helper modules importable (used by `from Functions import ...`).
# Guarded so that re-running this cell does not append the same entry again.
functions_dir = "/home/belle2/amubarak/Ds2D0enue_Analysis/07-Python_Functions/"
if functions_dir not in sys.path:
    sys.path.append(functions_dir)

# Prep-Work

### Import Data

Correct Charge

In [None]:
import os
import uproot
import pandas as pd

# --- Branch selection ---------------------------------------------------------
# Save_var.txt lists one branch name per line; names may be quoted and/or
# followed by a comma. Blank lines and lines starting with '#' are ignored.
variables_to_load = []
with open("/home/belle2/amubarak/Ds2D0enue_Analysis/03-Grid/Save_var.txt") as var_file:
    for raw_line in var_file:
        entry = raw_line.strip()
        if not entry or entry.startswith("#"):
            continue
        variables_to_load.append(entry.strip(",").strip('"').strip("'"))

# The fake-D0 BDT output is always requested, even if the list omits it.
if "Ds_FakeD0BDT" not in variables_to_load:
    variables_to_load += ["Ds_FakeD0BDT"]

# --- Samples and input location -------------------------------------------------
samples = ["Signal", "BB", "ccbar", "ddbar", "ssbar", "taupair", "uubar"]
GenEvents = list(samples)

Date = "0530"
Attempt = "0"
input_dir = "/group/belle/users/amubarak/03-ML/FakeD0/"

# --- Read the `Dstree` tree of each sample into a pandas DataFrame ------------
DataFrames = {}
for sample_name in samples:
    # The signal file has a fixed name; generic files encode date, attempt and sample.
    file_name = (
        "Ds2D0enu-Signal_withBDT.root"
        if sample_name == "Signal"
        else f"Ds2D0e-Generic_Ds_{Date}25_{Attempt}_{sample_name}_withBDT.root"
    )
    file_path = os.path.join(input_dir, file_name)

    print(f"Loading: {file_path}")
    DataFrames[sample_name] = uproot.concatenate(
        f"{file_path}:Dstree", filter_name=variables_to_load, library="pd"
    )

# --- Combined frames --------------------------------------------------------------
# "All": every generic sample stacked together; "uds": the three light-quark samples.
background_samples = ["BB", "ccbar", "ddbar", "ssbar", "taupair", "uubar"]
background_frames = [DataFrames[name] for name in background_samples]
DataFrames["All"] = pd.concat(background_frames, ignore_index=True)

uds_frames = [DataFrames[name] for name in ("uubar", "ddbar", "ssbar")]
DataFrames["uds"] = pd.concat(uds_frames, ignore_index=True)

Wrong Charge

In [None]:
import os
import uproot
import pandas as pd

# --- Branch selection ---------------------------------------------------------
# Same parsing as for the correct-charge sample: one branch per line, optional
# quotes/trailing comma removed, blank and '#'-prefixed lines skipped.
variables_to_load = []
with open("/home/belle2/amubarak/Ds2D0enue_Analysis/03-Grid/Save_var.txt") as var_file:
    for raw_line in var_file:
        entry = raw_line.strip()
        if not entry or entry.startswith("#"):
            continue
        variables_to_load.append(entry.strip(",").strip('"').strip("'"))

# The fake-D0 BDT output is always requested, even if the list omits it.
if "Ds_FakeD0BDT" not in variables_to_load:
    variables_to_load += ["Ds_FakeD0BDT"]

# --- Wrong-charge samples and input location -----------------------------------
samples_WCh = ["Signal_WCh", "BB_WCh", "ccbar_WCh", "ddbar_WCh", "ssbar_WCh", "taupair_WCh", "uubar_WCh", "Data_WCh"]
background_WCh = ["BB_WCh", "ccbar_WCh", "ddbar_WCh", "ssbar_WCh", "taupair_WCh", "uubar_WCh"]

Date_WCh = "0630"
Attempt_WCh = "0"
input_dir_WCh = "/group/belle/users/amubarak/03-ML/FakeD0_WCh/"

# Reuse the dictionary from earlier cells when it exists so already-loaded samples survive.
if "DataFrames" not in globals():
    DataFrames = {}

# --- Read the `Dstree` tree of each wrong-charge sample ------------------------
for sample_name in samples_WCh:
    if sample_name != "Signal_WCh":
        # File names use the bare sample tag, without the "_WCh" suffix.
        tag = sample_name.replace("_WCh", "")
        file_name = f"Ds2D0e-Generic_Ds_{Date_WCh}25_{Attempt_WCh}_{tag}_withBDT.root"
    else:
        file_name = "Ds2D0enu-Signal_WCh_withBDT.root"
    file_path = os.path.join(input_dir_WCh, file_name)

    print(f"Loading: {file_path}")
    DataFrames[sample_name] = uproot.concatenate(
        f"{file_path}:Dstree", filter_name=variables_to_load, library="pd"
    )

# --- Combined wrong-charge frames ---------------------------------------------
wch_background_frames = [DataFrames[name] for name in background_WCh]
DataFrames["All_WCh"] = pd.concat(wch_background_frames, ignore_index=True)

wch_uds_frames = [DataFrames[name] for name in ("uubar_WCh", "ddbar_WCh", "ssbar_WCh")]
DataFrames["uds_WCh"] = pd.concat(wch_uds_frames, ignore_index=True)

Reverse PID

In [None]:
import os
import uproot
import pandas as pd

# --- Branch selection ---------------------------------------------------------
# One branch per line in Save_var.txt; optional quotes/trailing comma removed,
# blank and '#'-prefixed lines skipped.
variables_to_load = []
with open("/home/belle2/amubarak/Ds2D0enue_Analysis/03-Grid/Save_var.txt") as var_file:
    for raw_line in var_file:
        entry = raw_line.strip()
        if not entry or entry.startswith("#"):
            continue
        variables_to_load.append(entry.strip(",").strip('"').strip("'"))

# The fake-D0 BDT output is always requested, even if the list omits it.
if "Ds_FakeD0BDT" not in variables_to_load:
    variables_to_load += ["Ds_FakeD0BDT"]

# --- Reverse-PID samples and input location ------------------------------------
samples_ReverseID = ["Signal_ReverseID", "BB_ReverseID", "ccbar_ReverseID", "ddbar_ReverseID", "ssbar_ReverseID", "taupair_ReverseID", "uubar_ReverseID", "Data_ReverseID"]
background_ReverseID = ["BB_ReverseID", "ccbar_ReverseID", "ddbar_ReverseID", "ssbar_ReverseID", "taupair_ReverseID", "uubar_ReverseID"]

Date_ReverseID = "0626"
Attempt_ReverseID = "0"
input_dir_ReverseID = "/group/belle/users/amubarak/03-ML/FakeD0_ReverseID/"

# Reuse the dictionary from earlier cells when it exists so already-loaded samples survive.
if "DataFrames" not in globals():
    DataFrames = {}

# --- Read the `Dstree` tree of each reverse-PID sample -------------------------
for sample_name in samples_ReverseID:
    if sample_name != "Signal_ReverseID":
        # File names use the bare sample tag, without the "_ReverseID" suffix.
        tag = sample_name.replace("_ReverseID", "")
        file_name = f"Ds2D0e-Generic_Ds_{Date_ReverseID}25_{Attempt_ReverseID}_{tag}_withBDT.root"
    else:
        file_name = "Ds2D0enu-Signal_ReverseID_withBDT.root"
    file_path = os.path.join(input_dir_ReverseID, file_name)

    print(f"Loading: {file_path}")
    DataFrames[sample_name] = uproot.concatenate(
        f"{file_path}:Dstree", filter_name=variables_to_load, library="pd"
    )

# --- Combined reverse-PID frames ----------------------------------------------
reverse_id_background_frames = [DataFrames[name] for name in background_ReverseID]
DataFrames["All_ReverseID"] = pd.concat(reverse_id_background_frames, ignore_index=True)

reverse_id_uds_frames = [
    DataFrames[name] for name in ("uubar_ReverseID", "ddbar_ReverseID", "ssbar_ReverseID")
]
DataFrames["uds_ReverseID"] = pd.concat(reverse_id_uds_frames, ignore_index=True)

Reverse PID and Wrong Charge

In [None]:
# import os
# import uproot
# import pandas as pd

# # === Load only selected branches ===
# with open("/home/belle2/amubarak/Ds2D0enue_Analysis/03-Grid/Save_var.txt") as f:
#     variables_to_load = [
#         line.strip().strip(",").strip('"').strip("'")
#         for line in f
#         if line.strip() and not line.strip().startswith("#")
#     ]

# # Ensure BDT variable is included
# if "Ds_FakeD0BDT" not in variables_to_load:
#     variables_to_load.append("Ds_FakeD0BDT")

# # === Wrong-charge samples ===
# samples_ReverseID_WCh = ["BB_ReverseID_WCh", "ccbar_ReverseID_WCh", "ddbar_ReverseID_WCh", "ssbar_ReverseID_WCh", "taupair_ReverseID_WCh", "uubar_ReverseID_WCh", "Data_ReverseID_WCh"]
# background_ReverseID_WCh = ["BB_ReverseID_WCh", "ccbar_ReverseID_WCh", "ddbar_ReverseID_WCh", "ssbar_ReverseID_WCh", "taupair_ReverseID_WCh", "uubar_ReverseID_WCh"]

# Date_ReverseID_WCh = "0708"
# Attempt_ReverseID_WCh = "0"
# input_dir_ReverseID_WCh = "/group/belle/users/amubarak/03-ML/FakeD0_ReverseID_WCh/"

# # === Load wrong-charge ROOT files into DataFrames ===
# DataFrames = {} if "DataFrames" not in globals() else DataFrames

# for s in samples_ReverseID_WCh:
#     if s == "Signal_ReverseID_WCh":
#         file_path = os.path.join(input_dir_ReverseID_WCh, "Ds2D0enu-Signal_ReverseID_WCh_withBDT.root")
#     else:
#         tag = s.replace("_ReverseID_WCh", "")
#         file_path = os.path.join(
#             input_dir_ReverseID_WCh,
#             f"Ds2D0e-Generic_Ds_{Date_ReverseID_WCh}25_{Attempt_ReverseID_WCh}_{tag}_withBDT.root"
#         )

#     print(f"Loading: {file_path}")
#     DataFrames[s] = uproot.concatenate(
#         f"{file_path}:Dstree",
#         filter_name=variables_to_load,
#         library="pd"
#     )

# # === Combine wrong-charge backgrounds ===
# DataFrames["All_ReverseID_WCh"] = pd.concat([DataFrames[s] for s in background_ReverseID_WCh], ignore_index=True)
# DataFrames["uds_ReverseID_WCh"] = pd.concat(
#     [DataFrames[s] for s in ["uubar_ReverseID_WCh", "ddbar_ReverseID_WCh", "ssbar_ReverseID_WCh"]],
#     ignore_index=True
# )

The cells below list the loaded samples and the variables available in the signal sample.

In [None]:
# Show which samples (dictionary keys) are currently held in DataFrames.
print(DataFrames.keys())

In [None]:
# List every column (branch) present in the signal DataFrame.
DataFrames['Signal'].columns.tolist()

### Setup
The code below will be used to apply cuts to the data.  
The range of the plots.

In [None]:
# Electron ID
#-------------------
# DataFrames["Signal"] = DataFrames["Signal"][DataFrames["Signal"]['e_electronID']>=0.95]
# DataFrames["ccbar"] = DataFrames["ccbar"][DataFrames["ccbar"]['e_electronID']>=0.95]
# DataFrames["Signal"] = DataFrames["Signal"][DataFrames["Signal"]['Ds_gammaveto_em_electronID']>=0.95]
# DataFrames["ccbar"] = DataFrames["ccbar"][DataFrames["ccbar"]['Ds_gammaveto_em_electronID']>=0.95]

# Photon Conversion
#-------------------
# DataFrames[samples[0]] = DataFrames[samples[0]][DataFrames[samples[0]]['Ds_gammaveto_M_Correction']>=0.1]
# DataFrames[samples[1]] = DataFrames[samples[1]][DataFrames[samples[1]]['Ds_gammaveto_M_Correction']>=0.1]

# Peaking Background Removal
#----------------------------
# DataFrames["ccbar"] = DataFrames["ccbar"][(DataFrames["ccbar"]['Ds_diff_D0pi']>=0.15)]
# DataFrames["Signal"] = DataFrames["Signal"][(DataFrames["Signal"]['Ds_diff_D0pi']>=0.15)]

# # Vertex Fitting
# #----------------
# DataFrames["Signal"] = DataFrames["Signal"][DataFrames["Signal"]['Ds_chiProb']>=0.01]
# DataFrames["ccbar"] = DataFrames["ccbar"][DataFrames["ccbar"]['Ds_chiProb']>=0.01]

# Dalitz Removal
#----------------------------
# DataFrames["ccbar"] = DataFrames["ccbar"][(DataFrames["ccbar"]['Ds_pi0veto_M_Correction']<=0.08) | (DataFrames["ccbar"]['Ds_pi0veto_M_Correction']>=0.16)]
# DataFrames["Signal"] = DataFrames["Signal"][(DataFrames["Signal"]['Ds_pi0veto_M_Correction']<=0.08) | (DataFrames["Signal"]['Ds_pi0veto_M_Correction']>=0.16)]

# Vertex Fit
#----------------
# DataFrames[samples[0]] = DataFrames[samples[0]][DataFrames[samples[0]]['Ds_chiProb_rank']==1]
# DataFrames[samples[1]] = DataFrames[samples[1]][DataFrames[samples[1]]['Ds_chiProb_rank']==1]

# D0 Invariant Mass
#-----------------------
# DataFrames[samples[0]] = DataFrames[samples[0]][(DataFrames[samples[0]]['Ds_D0_sideband']==1)]
# DataFrames[samples[1]] = DataFrames[samples[1]][(DataFrames[samples[1]]['Ds_D0_sideband']==1)]

## $D^{*+}$ Suppression

In [None]:
# Normalised |e_mcPDG| composition of reverse-PID background candidates with
# e_electronID < 0.5 (NaN values are kept as their own category).
# Only the e_mcPDG column is selected before taking the absolute value, so the
# whole frame is neither copied nor required to be purely numeric.
reverse_id_bkg = DataFrames["All_ReverseID"]
low_eid_pdg = reverse_id_bkg.loc[reverse_id_bkg["e_electronID"] < 0.5, ["e_mcPDG"]].abs()
print(low_eid_pdg.value_counts(normalize=True, dropna=False).apply(lambda x: f"{x:.6f}"))

In [None]:
# cut_low = 0.14543 - (2 * 0.00041121)
# cut_high = 0.14543 + (2 * 0.00041121)

# for key in DataFrames.keys():
#     df = DataFrames[key]
#     if "Ds_diff_D0pi" in df.columns:
#         DataFrames[key] = df[
#             (df["Ds_diff_D0pi"] <= cut_low) | (df["Ds_diff_D0pi"] >= cut_high)
#         ]

In [None]:
# Replace NaN in the D0 truth flag with 0, first in the combined background
# frame and then in every individual sample.
for frame_key in ["All", *GenEvents]:
    frame = DataFrames[frame_key]
    frame["D0_isSignal"] = frame["D0_isSignal"].replace(np.nan, 0)

In [None]:
# Replace NaN in the Ds truth flag with 0, first in the combined background
# frame and then in every individual sample.
for frame_key in ["All", *GenEvents]:
    frame = DataFrames[frame_key]
    frame["Ds_isSignal"] = frame["Ds_isSignal"].replace(np.nan, 0)

## Fake $D^0$ BDT Cut

In [None]:
# DataFrames["All"] = DataFrames["All"][(DataFrames["All"]["Ds_FakeD0BDT"]>=0.556)]

# for s in GenEvents[0:]: # loop over samples
#     DataFrames[s] = DataFrames[s][(DataFrames[s]["Ds_FakeD0BDT"]>=0.556)]

# Background Suppression

In [None]:
# Number of NaN entries per column in the first sample listed in `samples`.
DataFrames[samples[0]].isna().sum()

In [None]:
# Candidate counts of the signal sample and of the combined background sample.
for caption, frame_key in (("Signal Number: ", "Signal"), ("Background Number: ", "All")):
    print(caption, len(DataFrames[frame_key]))

## Variable Comparison

In [None]:
# Reset matplotlib to its default style, then set the font sizes for the comparison plots.
plt.style.use('default')
comparison_font_sizes = {
    "axes.labelsize": 16,
    "xtick.labelsize": 12,
    "ytick.labelsize": 12,
    "legend.fontsize": 14,
    "figure.titlesize": 18,
}
for rc_key, rc_value in comparison_font_sizes.items():
    plt.rcParams[rc_key] = rc_value

In [None]:
# Branches to compare, with the matching axis label at the same position in `features`.
Variables = [
    "Ds_FakeD0BDT",
    "Ds_chiProb",
    "Ds_gammaveto_M_Correction",
    "Ds_Ds_starminusDs_M_Correction"
]

features = [
    r'$Fake\;D^{0}\;Suppression$',
    r'$p$-value$_{IP}(D_{s}^{+})$',
    r'$m(e_{\mathrm{sig}}^{+}e_{\mathrm{ROE}}^{-})\;[\mathrm{GeV}/c^2]$',
    r'$\Delta m(D_{s}^{*+} - D_{s}^{+})\;[\mathrm{GeV}/c^2]$'
]

# Histogram range per branch; a branch without an entry here is skipped.
ranges = {
    "Ds_FakeD0BDT": [0.0, 1.0],
    "Ds_chiProb": [0.0, 1.0],
    "Ds_gammaveto_M_Correction": [0.0, 0.2],
    "Ds_Ds_starminusDs_M_Correction": [0.0, 0.2]
}

bins = 50
density = True  # each histogram is normalised to unit area

colors = {
    "signal": "#1f77b4",     # Blue
    "background": "#d62728"  # Red
}
labels = {
    "signal": r'$Signal$',
    "background": r'$Background$'
}

# Signal: truth-flagged (Ds_isSignal == 1) candidates of the signal sample.
# Background: the combined "All" frame.
signal_frame = DataFrames["Signal"]
df_true_signal = signal_frame[signal_frame["Ds_isSignal"] == 1]
df_background = DataFrames["All"]

# One figure per branch, signal and background overlaid as step histograms.
for var, axis_label in zip(Variables, features):
    var_range = ranges.get(var)
    if var_range is None:
        print(f"Skipping {var}: no range defined.")
        continue

    lower_edge, upper_edge = var_range
    bin_width = (upper_edge - lower_edge) / bins

    for category, frame in (("signal", df_true_signal), ("background", df_background)):
        plt.hist(frame[var].dropna(), label=labels[category],
                 histtype='step', density=density,
                 bins=bins, range=var_range, linewidth=2, color=colors[category])

    plt.xlabel(axis_label)
    plt.ylabel(r'$Norm.\;Entries/({:.3f})$'.format(bin_width))
    plt.legend(loc='upper right')
    plt.show()


## Variable Correlation

In [None]:
# Training inputs, defined as (branch name, plot label) pairs so the two lists
# below cannot drift out of step with each other.
# Candidates currently left out: Ds_Ds_starminusDs_M_Correction, D0_cos_theta.
variable_labels = [
    ("Ds_FakeD0BDT", r'$Fake\;D^{0}\;Suppression$'),
    ("Ds_chiProb", r'$p-value_{IP}(D_{s}^{+})$'),
    ("e_dr", r'$d_{r}(e)$'),
    ("e_dz", r'$d_{z}(e)$'),
    ("e_cos_theta", r'$\cos\theta(e)$'),
    ("e_pt", r'$p_{t}(e)$'),
    ("e_p", r'$p(e)$'),
]

Variables = [branch for branch, _ in variable_labels]
features = [label for _, label in variable_labels]

In [None]:
# Correlation matrix of the training variables in the signal sample.
plt.figure(figsize=(18, 15))
signal_corr = DataFrames["Signal"][Variables].corr()
heatmap = sns.heatmap(signal_corr, annot=True, cmap="coolwarm", vmin=-1, vmax=1)
heatmap.set_title('Signal Correlation Heatmap', fontdict={'fontsize':20}, pad=16)

In [None]:
# Correlation matrix of the training variables in the combined background sample.
plt.figure(figsize=(18, 15))
background_corr = DataFrames["All"][Variables].corr()
heatmap = sns.heatmap(background_corr, annot=True, cmap="coolwarm", vmin=-1, vmax=1)
heatmap.set_title('Background Correlation Heatmap', fontdict={'fontsize':20}, pad=16)

## Data Preprocessing

In [None]:
# #  Organise data ready for the machine learning model

# # for sklearn data are usually organised
# # into one 2D array of shape (n_samples x n_features)
# # containing all the data and one array of categories
# # of length n_samples

# all_MC = []  # define empty list that will contain all features for the MC
# for s in GenEvents:  # loop over the different samples
#     if s != "data":  # only MC should pass this
#         all_MC.append(
#             DataFrames[s][Variables]
#         )  # append the MC dataframe to the list containing all MC features
# X = np.concatenate(
#     all_MC
# )  # concatenate the list of MC dataframes into a single 2D array of features, called X

# all_y = (
#     []
# )  # define empty list that will contain labels whether an event in signal or background
# for s in GenEvents:  # loop over the different samples
#     if s != "data":  # only MC should pass this
#         if "Signal" in s:  # only signal MC should pass this
#             all_y.append(
#                 np.ones(DataFrames[s].shape[0], dtype=np.int32)
#             )  # signal events are labelled with 1
#         else:  # only background MC should pass this
#             all_y.append(
#                 np.zeros(DataFrames[s].shape[0], dtype=np.int32)
#             )  # background events are labelled 0
# y = np.concatenate(
#     all_y
# )  # concatenate the list of labels into a single 1D array of labels, called y

In [None]:
# === Assemble the feature table X and the label vector y ===
# fitvar is carried along with the training variables because uBoost is told
# to keep the response uniform in it (see the model-training cell).
fitvar = "Ds_massDifference_0"
Variables_with_fitvar = Variables + [fitvar]

all_MC, all_y = [], []
for s in GenEvents:
    if s == "data":
        continue
    is_signal_sample = "Signal" in s
    sample_df = DataFrames[s]
    if is_signal_sample:
        # Only truth-flagged candidates of the signal sample are used as class 1.
        sample_df = sample_df[sample_df["Ds_isSignal"] == 1]
    make_labels = np.ones if is_signal_sample else np.zeros
    all_y.append(make_labels(len(sample_df), dtype=np.int32))
    all_MC.append(sample_df[Variables_with_fitvar])

# Keep X as a DataFrame (column names are needed later), y as a plain array.
X = pd.concat(all_MC)
y = np.concatenate(all_y)

# === 70/30 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

## Model Training

In [None]:
# === uBoost training
# 'balanced' sample weights are computed from the class labels of the training split.
weights = compute_sample_weight('balanced', y_train)

# Classifier settings: trained on `Variables`, with uniformity requested in
# `fitvar` for label 1.
uboost_settings = dict(
    uniform_features=[fitvar],
    uniform_label=1,
    train_features=Variables,
    n_estimators=100,
    efficiency_steps=12,
    n_threads=4,
    random_state=42,
)
uboost = uBoostClassifier(**uboost_settings)

uboost.fit(X_train, y_train, sample_weight=weights)

## Parameter Optimization

This optimization uses too many resources and causes the connection to drop.

In [None]:
from hep_ml.uboost import uBoostClassifier
from scipy.stats import randint, uniform
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import ParameterSampler
from tqdm import tqdm

# Random search over uBoost hyper-parameters.
# Each sampled configuration is trained on the training split and ranked by its
# ROC AUC on the test split.
# NOTE(review): ranking on the test split makes the quoted best AUC optimistic;
# a separate validation split would be needed for an unbiased estimate.

fitvar = "Ds_massDifference_0"  # decorrelation target

# Define the parameter space
param_dist = {
    "max_depth": randint(1, 5),
    "n_estimators": randint(100, 201),
    "efficiency_steps": randint(6, 16)  # typical range for uBoost
}

# Sample 50 combinations (fixed seed, so the list is reproducible)
param_list = list(ParameterSampler(param_dist, n_iter=50, random_state=42))

best_model = None
best_score = -np.inf
best_params = None

for params in tqdm(param_list):
    model = uBoostClassifier(
        uniform_features=[fitvar],
        uniform_label=1,
        train_features=Variables,
        base_estimator=DecisionTreeClassifier(max_depth=params["max_depth"]),
        n_estimators=params["n_estimators"],
        efficiency_steps=params["efficiency_steps"],
        n_threads=4,
        random_state=42
    )

    try:
        model.fit(X_train, y_train, sample_weight=weights)
        y_pred = model.predict_proba(X_test)[:, 1]
        score = roc_auc_score(y_test, y_pred)
    except Exception as e:
        # Best effort: report the failing configuration and try the next one.
        print(f"Failed for params: {params} — {e}")
        continue

    if score > best_score:
        best_score, best_model, best_params = score, model, params

# === Results ===
if best_model is None:
    # Every configuration failed; there is nothing to report or to adopt.
    print("\nNo parameter combination could be fitted.")
else:
    uboost_final = best_model
    print("\nBest parameters found:")
    for k, v in best_params.items():
        print(f"{k:20s}: {v}")
    print(f"Best ROC AUC Score: {best_score:.4f}")

In [None]:
# Use the baseline uBoost model trained above as the final classifier for all following cells.
uboost_final = uboost

## Feature Importance

In [None]:
from sklearn.inspection import permutation_importance

# Permutation importance of every column of X_test for the final classifier.
result = permutation_importance(uboost_final, X_test, y_test, n_repeats=10, random_state=42, n_jobs=-1)

# X_test holds the training variables plus the uniformity variable (fitvar), so
# there is one importance value per X_test column. Label each column with its
# plot label from `features`, falling back to the raw column name when none is
# defined, instead of assuming `features` has the same length.
label_by_column = dict(zip(Variables, features))
feature_labels = [label_by_column.get(column, column) for column in X_test.columns]

# Convert to DataFrame, most important feature first
feature_imp = pd.DataFrame({'Value': result.importances_mean, 'Feature': feature_labels})
feature_imp = feature_imp.sort_values(by="Value", ascending=False)

# Plot
plt.figure(figsize=(16, 8))
sns.barplot(x="Value", y="Feature", data=feature_imp)
plt.title('Permutation Importance (uBoost)')
plt.tight_layout()
plt.show()

## Overfitting Check

In [None]:
from scipy import stats
def get_pulls(counts, errors, pdf):
    """Return the per-bin pulls ``(counts - pdf) / errors``.

    Parameters
    ----------
    counts : array-like
        Observed values per bin.
    errors : array-like
        Uncertainty on ``counts`` per bin.
    pdf : array-like
        Expected (reference) values per bin.

    Returns
    -------
    numpy.ndarray
        Pull per bin. Bins whose error is exactly zero (for example empty
        bins) are returned as NaN instead of raising or producing inf.
    """
    counts = np.asarray(counts, dtype=float)
    errors = np.asarray(errors, dtype=float)
    pdf = np.asarray(pdf, dtype=float)

    difference = counts - pdf
    # Pre-fill with NaN and divide only where the error is non-zero.
    pull = np.full(np.broadcast(difference, errors).shape, np.nan)
    np.divide(difference, errors, out=pull, where=errors != 0)
    return pull

In [None]:
def compare_train_test(clf, X_train, y_train, X_test, y_test):
    """Compare classifier-output distributions of the training and test samples.

    The classifier output is evaluated separately for background (``y < 0.5``)
    and signal (``y > 0.5``) entries of both samples. The top panel overlays
    the normalised training histograms with the test points (with sqrt(N)
    errors); the two lower panels show the test-minus-train pulls for
    background and signal. The two-sample Kolmogorov-Smirnov p-values
    (train vs. test) are written into the legend title.

    Parameters
    ----------
    clf
        Classifier. If it has ``predict_proba`` that is used; otherwise it is
        called as a torch model whose second output holds the class scores.
    X_train, X_test
        Feature containers that can be indexed with a boolean mask.
    y_train, y_test
        Label arrays aligned with the feature containers.

    Returns
    -------
    list
        ``[train_background, train_signal, test_background, test_signal]``
        arrays of classifier outputs.
    """
    decisions = []  # filled as [train bkg, train sig, test bkg, test sig]
    for X, y in ((X_train, y_train), (X_test, y_test)):  # train, then test
        if hasattr(clf, "predict_proba"):
            d1 = clf.predict_proba(X[y < 0.5])[:, 1]  # background
            d2 = clf.predict_proba(X[y > 0.5])[:, 1]  # signal
        else:
            # Fallback for torch models. Imported here because the notebook
            # does not import torch at the top; without this the branch would
            # fail with a NameError.
            import torch
            from torch.autograd import Variable

            X_tensor = torch.as_tensor(X, dtype=torch.float)
            y_tensor = torch.as_tensor(y, dtype=torch.long)
            X_var, y_var = Variable(X_tensor), Variable(y_tensor)
            d1 = clf(X_var[y_var < 0.5])[1][:, 1].cpu().detach().numpy()  # background
            d2 = clf(X_var[y_var > 0.5])[1][:, 1].cpu().detach().numpy()  # signal
        decisions += [d1, d2]

    lw = 3

    fig, axs = plt.subplots(3, 1, figsize=(10, 10), gridspec_kw={'height_ratios': [1, 0.2, 0.2]})

    bins = 50

    # With range (0, 1) the bin width is 1/bins, so counts * bins / N is the
    # same density normalisation that `density=True` applies to the training
    # histograms below.
    test_bkg_count_weight = bins / len(decisions[2])
    test_sig_count_weight = bins / len(decisions[3])
    test_bkg_counts, test_bkg_bins = np.histogram(decisions[2], bins=bins, range=(0, 1))
    test_sig_counts, test_sig_bins = np.histogram(decisions[3], bins=bins, range=(0, 1))

    # Training sample: outline histograms (their bin contents are kept for the pulls) ...
    train_bkg_counts, train_bkg_bins, _etc = axs[0].hist(decisions[0], color='tab:blue',
            histtype='step', bins=bins, density=True, range=(0, 1), linewidth=lw, label='Train Background')
    train_sig_counts, train_sig_bins, _etc = axs[0].hist(decisions[1], color='tab:red',
            histtype='step', bins=bins, density=True, range=(0, 1), linewidth=lw, label=r'Train Signal')
    # ... plus a translucent fill of the same distributions.
    axs[0].hist(decisions[0], color='tab:blue',
            histtype='stepfilled', alpha=0.4, bins=bins, density=True, range=(0, 1))
    axs[0].hist(decisions[1], color='tab:red',
            histtype='stepfilled', alpha=0.4, bins=bins, density=True, range=(0, 1))
    bin_width = test_bkg_bins[1] - test_bkg_bins[0]
    bin_centers = [el + (bin_width / 2) for el in test_bkg_bins[:-1]]

    # Test sample: points with sqrt(N) errors, scaled to the same density.
    axs[0].errorbar(bin_centers, test_bkg_count_weight * test_bkg_counts,
                yerr=test_bkg_count_weight * np.sqrt(test_bkg_counts), label='Test Background', color='tab:blue',
                marker='o', linewidth=lw, ls='')
    axs[0].errorbar(bin_centers, test_sig_count_weight * test_sig_counts,
                yerr=test_sig_count_weight * np.sqrt(test_sig_counts), label='Test Signal', color='tab:red',
                marker='o', linewidth=lw, ls='')
    axs[0].set_title(r'$D_{s}^{+} \rightarrow D^{0} e^{+} \nu_{e}$', loc='left')
    axs[0].set_xlim(0, 1)
    axs[0].set_ylim(0)
    axs[0].set_ylabel('Event Density')

    # Two-sample K-S test, train vs. test; index [1] of the result is the p-value.
    ks_p_value_sig = ks_2samp(decisions[1], decisions[3])[1]
    ks_p_value_bkg = ks_2samp(decisions[0], decisions[2])[1]

    leg = axs[0].legend(loc='upper center', title=f"Sig K-S test score: {ks_p_value_sig:0.3f}" +
                      "\n" + f"Bkg K-S test score: {ks_p_value_bkg:0.3f}")
    leg._legend_box.align = "left"

    # Background pulls: (test density - train density) / test error.
    pulls = get_pulls(test_bkg_count_weight * test_bkg_counts, test_bkg_count_weight * np.sqrt(test_bkg_counts), np.array(train_bkg_counts))
    axs[1].bar(bin_centers, pulls, width=bin_width)
    axs[1].set_xlim(0, 1)
    axs[1].set_ylabel('Pulls')
    axs[1].set_ylim(-5, 5)

    # Signal pulls, same definition.
    pulls = get_pulls(test_sig_count_weight * test_sig_counts, test_sig_count_weight * np.sqrt(test_sig_counts), np.array(train_sig_counts))
    axs[2].bar(bin_centers, pulls, width=bin_width, color='tab:red')
    axs[2].set_xlim(0, 1)
    axs[2].set_ylabel('Pulls')
    axs[2].set_ylim(-5, 5)
    axs[2].set_xlabel(r'BDT output')

    return decisions

In [None]:
# Draw the train/test comparison for the final classifier and keep the score arrays it returns.
decisions = compare_train_test(uboost_final, X_train, y_train, X_test, y_test)

## Model Check

### Basf2 ROC

In [None]:
# # compute ROC 
# sig_train=decisions[1]
# sig_test=decisions[3]
# bkg_train=decisions[0]
# bkg_test=decisions[2]

# bdt_cuts=np.linspace(0,1,100)
# sig_efficiency_train=[]
# bkg_rejection_train=[]
# den_sig_train=len(sig_train)
# den_bkg_train=len(bkg_train)

# sig_efficiency_test=[]
# bkg_rejection_test=[]
# den_sig_test=len(sig_test)
# den_bkg_test=len(bkg_test)


# for cut in bdt_cuts:
#     num_sig_train=len([el for el in sig_train if el>cut])
#     num_bkg_train=len([el for el in bkg_train if el>cut])
#     num_sig_test=len([el for el in sig_test if el>cut])
#     num_bkg_test=len([el for el in bkg_test if el>cut])
    
#     sig_efficiency_test.append(num_sig_test/den_sig_test)
#     bkg_rejection_test.append(1-(num_bkg_test/den_bkg_test))
#     sig_efficiency_train.append(num_sig_train/den_sig_train)
#     bkg_rejection_train.append(1-(num_bkg_train/den_bkg_train))

# fig,axs=plt.subplots(1,1,figsize=(8,6))
# lw=2
# axs.plot([1, 0], [0, 1], color='grey', linestyle='--')
# axs.plot(bkg_rejection_train,sig_efficiency_train,color='tab:blue',marker='',linewidth=lw,label='Train')
# axs.plot(bkg_rejection_test,sig_efficiency_test,color='tab:red',marker='',linewidth=lw,ls='--',label='Test')
# axs.set_title(r'$D_{s}^{+} \rightarrow D^{0} e^{+} \nu_{e}$',loc='left')

# axs.set_ylim(0,1.05)
# axs.set_xlim(0,1.05)
# axs.legend(loc='lower left')
# axs.set_xlabel('Background rejection')
# axs.set_ylabel('Signal efficiency')
# plt.tight_layout()

# plt.show()

In [None]:
# Classifier scores (probability of class 1) from the trained uBoost model.
# `uboost_final` is the model this notebook trains; each split is scored once.
train_scores = uboost_final.predict_proba(X_train)[:, 1]
test_scores = uboost_final.predict_proba(X_test)[:, 1]
y_score_train, y_score_test = train_scores, test_scores

fpr_test, tpr_test, thresholds_test = roc_curve(y_test, y_score_test)
area_test = auc(fpr_test, tpr_test)

fpr_train, tpr_train, thresholds_train = roc_curve(y_train, y_score_train)
area_train = auc(fpr_train, tpr_train)

# Use y_train and y_test to separate signal/background
sig_train = train_scores[y_train == 1]
bkg_train = train_scores[y_train == 0]
sig_test = test_scores[y_test == 1]
bkg_test = test_scores[y_test == 0]

# Same ordering as returned by compare_train_test
decisions = [bkg_train, sig_train, bkg_test, sig_test]

# Scan 100 equally spaced cuts on the classifier output.
bdt_cuts = np.linspace(0, 1, 100)

sig_eff_train = []
bkg_rej_train = []
sig_eff_test = []
bkg_rej_test = []
fom_vals = []

for cut in bdt_cuts:
    num_sig_train = np.sum(sig_train > cut)
    num_bkg_train = np.sum(bkg_train > cut)
    num_sig_test = np.sum(sig_test > cut)
    num_bkg_test = np.sum(bkg_test > cut)

    # FoM = S / sqrt(S + B) from the raw test-sample counts (0 when nothing passes).
    # NOTE(review): counts are not scaled to a common luminosity - confirm this is intended.
    fom = num_sig_test / np.sqrt(num_sig_test + num_bkg_test) if (num_sig_test + num_bkg_test) > 0 else 0
    fom_vals.append(fom)

    sig_eff_train.append(num_sig_train / len(sig_train))
    bkg_rej_train.append(1 - (num_bkg_train / len(bkg_train)))
    sig_eff_test.append(num_sig_test / len(sig_test))
    bkg_rej_test.append(1 - (num_bkg_test / len(bkg_test)))

# Find optimal FoM point
fom_vals = np.array(fom_vals)
best_idx = np.argmax(fom_vals)
best_cut = bdt_cuts[best_idx]

# Plot
fig, axs = plt.subplots(1, 1, figsize=(7, 6))
lw = 2

# axs.plot([0, 1], [0, 1], color='grey', linestyle='--', label='Random')
axs.plot(bkg_rej_train, sig_eff_train, color='tab:blue', linewidth=lw, label=f'Train (AUC = {area_train:.2f})')
axs.plot(bkg_rej_test, sig_eff_test, color='tab:red', linestyle='--', linewidth=lw, label=f'Test (AUC = {area_test:.2f})')

# ① Shade the overfit gap
# NOTE(review): both efficiency curves are drawn against the *test* rejection
# here, while the train curve above uses the train rejection, so the shaded
# band only approximates the area between the two plotted curves.
axs.fill_between(bkg_rej_test,
                 sig_eff_train,
                 sig_eff_test,
                 where=(np.array(sig_eff_train) > np.array(sig_eff_test)),
                 color='gray', alpha=0.2, label='Overfit Gap')

# ② Mark the optimal cut point (from test curve)
axs.axhline(sig_eff_test[best_idx], color='black', ls='--', linewidth=1.6,
            label=f'Best FoM Cut = {best_cut:.3f}')
axs.axvline(bkg_rej_test[best_idx], color='black', ls='--', linewidth=1.6)
axs.scatter(bkg_rej_test[best_idx], sig_eff_test[best_idx], color='green', s=50)

# Axis labels and formatting
axs.set_title(r'$D_{s}^{+} \rightarrow D^{0} e^{+} \nu_{e}$', loc='left')
axs.set_ylim(0, 1.05)
axs.set_xlim(0, 1.05)
axs.set_xlabel('Background rejection')
axs.set_ylabel('Signal efficiency')
axs.legend(loc='lower left')
axs.grid(True)
plt.tight_layout()
plt.show()


### Machine Learning ROC

In [None]:
# Standard ROC curves (true-positive rate vs. false-positive rate) for the
# trained uBoost model on the test and training splits.
y_score_test = uboost_final.predict_proba(X_test)[:, 1]
fpr_test, tpr_test, thresholds_test = roc_curve(y_test, y_score_test)
area_test = auc(fpr_test, tpr_test)

y_score_train = uboost_final.predict_proba(X_train)[:, 1]
fpr_train, tpr_train, thresholds_train = roc_curve(y_train, y_score_train)
area_train = auc(fpr_train, tpr_train)

# Diagonal: the line on which TPR equals FPR.
plt.plot([0, 1], [0, 1], color='grey', linestyle='--')
plt.plot(fpr_test, tpr_test, label=f'Test ROC curve (AUC = {area_test:.2f})')
plt.plot(fpr_train, tpr_train, label=f'Train ROC curve (AUC = {area_train:.2f})')
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.0)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='lower right')
# We can make the plot look nicer by forcing the grid to be square
plt.gca().set_aspect('equal', adjustable='box')

In [None]:
# Make predictions on the test set
y_pred_proba = xgbm_final.predict_proba(X_test)[:, 1]

# Calculate the ROC AUC score
roc_auc = roc_auc_score(y_test, y_pred_proba)

print(f"ROC AUC Score: {roc_auc:.2f}")

### Other Checks

Check if the Classifier (uBoost) Is Overfitting

In [None]:
# Predict on training and validation sets
train_preds = xgbm_final.predict(X_train)
val_preds = xgbm_final.predict(X_test)

# Calculate accuracy scores
train_accuracy = accuracy_score(y_train, train_preds)
val_accuracy = accuracy_score(y_test, val_preds)

print(f"Training Accuracy: {train_accuracy:.4f}")
print(f"Validation Accuracy: {val_accuracy:.4f}")

# Check for large difference between train and validation accuracy
if train_accuracy - val_accuracy > 0.1:
    print("Warning: The model may be overfitting!")

Check if the Classifier (uBoost) Is Underfitting

In [None]:
# Predict on training and validation sets
train_preds = xgbm_final.predict(X_train)
val_preds = xgbm_final.predict(X_test)

# Calculate MSE for training and validation sets
train_mse = mean_squared_error(y_train, train_preds)
val_mse = mean_squared_error(y_test, val_preds)

print(f"Training MSE: {train_mse:.4f}")
print(f"Validation MSE: {val_mse:.4f}")

# Check if both training and validation MSE are high
if train_mse > 100 and val_mse > 100:
    print("Warning: The model may be underfitting!")
    print("Consider increasing model complexity by adding more estimators, reducing learning rate, or adjusting other hyperparameters.")

## BDT Cut Optimization

In [None]:
# Apply BDT to all DataFrames that contain the required Variables
for key in DataFrames.keys():
    df = DataFrames[key]
    
    # Check: make sure all input BDT variables exist in this DataFrame
    if all(var in df.columns for var in Variables):
        # Apply BDT and store the result
        DataFrames[key]["Ds_BkgBDT"] = uboost_final.predict_proba(df[Variables])[:, 1].astype(np.float32)

In [None]:
from Functions import optimize_cut, plot_save

# Scan the classifier output for the best cut value.
# NOTE(review): optimize_cut lives in the external Functions module; the
# argument roles noted below are carried over from the original annotations
# and have not been checked against its implementation.
cut = optimize_cut(
    # Variable being scanned and the figure-of-merit selector (same name here)
    var="Ds_BkgBDT",
    FoM="Ds_BkgBDT",
    # Frames used for the signal-vs-background plot
    df_sig=DataFrames["Signal"],
    df_bkg=DataFrames["All"],
    # Frames used for the figure of merit; signal is truth-matched via query_signal
    Signal=DataFrames["Signal"],
    Background=DataFrames["All"],
    query_signal="Ds_isSignal == 1",
    # Histogram binning and the scanned interval
    Bins=50,
    Range=[0, 1],
    varmin=0,
    varmax=0.98,
    # "right": keep events with higher classifier output
    select="right",
    Width=False,
    xlabel="Background Classifier Output",
)

print(f"Best cut is: {cut:.3f}")

# Plots

In [None]:
# Reset matplotlib to its default style, then re-apply the notebook's font sizes
plt.style.use('default')

PLOT_RC_OVERRIDES = {
    "axes.labelsize": 16,      # axis labels
    "xtick.labelsize": 12,     # x tick labels
    "ytick.labelsize": 12,     # y tick labels
    "legend.fontsize": 16,     # legend entries
    "figure.titlesize": 20,    # figure-level title
}
plt.rcParams.update(PLOT_RC_OVERRIDES)

Suggested Background Break-up

In [None]:
# Mass-difference distribution of DataFrames["All"] after the BDT cut, split by
# the Ds_mcPDG value of each candidate (|413|, |423|, missing, everything else).
Stacked = False
Density = False
Bins = 50
# var = 'Ds_diff_D0pi'
var = 'Ds_massDifference_0'
Range = [0.0, 0.25]
BS = 0.82  # minimum Ds_BkgBDT value kept; also shown in the plot title
Samples = "All"
perBin = ((Range[1] - Range[0])/Bins)*1000  # bin width in MeV (Range is in GeV)
print("Width Per Bin: {width:.2f} MeV".format(width = perBin))

label1= r'$Comb.$'
label2= r'$NaN$'
label3= r'$D^{*0} \rightarrow D^{0} \; \pi^0 / \gamma$'
label4= r'$D^{*+} \rightarrow D^{0} \pi^+$'

labels=[label1,label2,label3,label4]
colors=[
    "#2E2E2E",  # Comb. (dark gray-black)
    "#D62728",  # NaN (dark red)
    "#4C6EB1",  # D*0 (muted blue)
    "#006400",  # D*+ → D0π+ (deep green)
]
df_all = DataFrames["All"][(DataFrames["All"]["Ds_BkgBDT"]>=BS)]

# |Ds_mcPDG| is computed once; rows with a missing value are handled separately
# because NaN compares unequal to every code.
mc_pdg_abs = df_all["Ds_mcPDG"].abs()
mc_pdg_missing = df_all["Ds_mcPDG"].isna()
data = [
    df_all[(~mc_pdg_missing) & (mc_pdg_abs != 413) & (mc_pdg_abs != 423)][var],
    df_all[mc_pdg_missing][var],
    df_all[mc_pdg_abs == 423][var],
    df_all[mc_pdg_abs == 413][var],
]

# factor = 0.7
# plt.hist(DataFrames["Signal"][(DataFrames["Signal"]["Ds_BkgBDT"]>=BS)][var], label="Signal", histtype='step', density=Density, bins=Bins, alpha=1, range=Range, weights=factor*np.ones_like(DataFrames["Signal"][(DataFrames["Signal"]["Ds_BkgBDT"]>=BS)][var]), ls='--', linewidth=1.5)
plt.hist(data, color=colors, label=labels, density=Density, stacked=Stacked, bins=Bins, alpha=1, histtype='step', linewidth=2, range=Range)
# plt.axvspan(Range[0],0.16,color='gray',alpha=0.2)
# plt.axvline(0.16,ls='--',color='gray')

# Title
#--------
# plt.title(r'$\bf Generic \; Events$', loc = "left")
# The cut value in the title is taken from BS so it always matches the applied
# selection (it was previously hard-coded to 0.869 while BS was 0.82).
plt.title(r'$\bf Generic \; Events$' + "\n" + rf"$BDT \geq {BS}$", loc = "left")
plt.title(r'$\int\mathcal{L}dt\approx\;1443.999$ fb$^{-1}$', loc = "right")
# Label
#-------
plt.ylabel(r'$Entries/(\; {width:.2f}\;MeV/c^2)$'.format(width = perBin))
plt.xlabel(r'$\Delta m_{e}(D_s^{+} - D^{0})\;[GeV/c^{2}]$')
# plt.yscale("log")
# plt.xscale("log")
# plt.ylim(0, 30000)
plt.legend()
plt.show()

In [None]:
# Stacked mass-difference distribution of DataFrames["All_ReverseID"] after the
# e_electronID <= 0.5 and BDT selections, split by the Ds_mcPDG value of each
# candidate (|413|, |423|, missing, everything else).
Stacked = True
Density = False
Bins = 50
# var = 'Ds_diff_D0pi'
var = 'Ds_massDifference_0'
Range = [0.0, 0.15]
BS = 0.828  # minimum Ds_BkgBDT value kept; also shown in the plot title
Samples = "All"
perBin = ((Range[1] - Range[0])/Bins)*1000  # bin width in MeV (Range is in GeV)
print("Width Per Bin: {width:.2f} MeV".format(width = perBin))

label1= r'$Comb.$'
label2= r'$NaN$'
label3= r'$D^{*0} \rightarrow D^{0} \; \pi^0 / \gamma$'
label4= r'$D^{*+} \rightarrow D^{0} \pi^+$'

labels=[label1,label2,label3,label4]
colors=[
    "#2E2E2E",  # Comb. (dark gray-black)
    "#D62728",  # NaN (dark red)
    "#4C6EB1",  # D*0 (muted blue)
    "#006400",  # D*+ → D0π+ (deep green)
]
df_all = DataFrames["All_ReverseID"][(DataFrames["All_ReverseID"]["e_electronID"]<=0.5) & (DataFrames["All_ReverseID"]["Ds_BkgBDT"]>=BS)]

# |Ds_mcPDG| is computed once; rows with a missing value are handled separately
# because NaN compares unequal to every code.
mc_pdg_abs = df_all["Ds_mcPDG"].abs()
mc_pdg_missing = df_all["Ds_mcPDG"].isna()
data = [
    df_all[(~mc_pdg_missing) & (mc_pdg_abs != 413) & (mc_pdg_abs != 423)][var],
    df_all[mc_pdg_missing][var],
    df_all[mc_pdg_abs == 423][var],
    df_all[mc_pdg_abs == 413][var],
]

# factor = 0.7
# plt.hist(DataFrames["Signal"][(DataFrames["Signal"]["Ds_BkgBDT"]>=BS)][var], label="Signal", histtype='step', density=Density, bins=Bins, alpha=1, range=Range, weights=factor*np.ones_like(DataFrames["Signal"][(DataFrames["Signal"]["Ds_BkgBDT"]>=BS)][var]), ls='--', linewidth=1.5)
plt.hist(data, color=colors, label=labels, density=Density, stacked=Stacked, bins=Bins, alpha=1, histtype='step', linewidth=2, range=Range)
# plt.axvspan(Range[0],0.16,color='gray',alpha=0.2)
# plt.axvline(0.16,ls='--',color='gray')

# Title
#--------
# plt.title(r'$\bf Generic \; Events$', loc = "left")
# The cut value in the title is taken from BS so it always matches the applied
# selection (it was previously hard-coded to 0.869 while BS was 0.828).
# NOTE(review): the sample plotted is "All_ReverseID" with e_electronID <= 0.5,
# but the title still reads "Generic Events" — confirm the intended wording.
plt.title(r'$\bf Generic \; Events$' + "\n" + rf"$BDT \geq {BS}$", loc = "left")
plt.title(r'$\int\mathcal{L}dt\approx\;1443.999$ fb$^{-1}$', loc = "right")
# Label
#-------
plt.ylabel(r'$Entries/(\; {width:.2f}\;MeV/c^2)$'.format(width = perBin))
plt.xlabel(r'$\Delta m_{e}(D_s^{+} - D^{0})\;[GeV/c^{2}]$')
# plt.yscale("log")
# plt.xscale("log")
# plt.ylim(0, 30000)
plt.legend()
plt.show()

In [None]:
# Stacked mass-difference distribution of DataFrames["Signal"] after the BDT
# cut, split by the Ds_isSignal flag (1, 0, or missing).
Bins=50
Density = False
Stacked = True
Range = [0.0,0.25]
BS = -1  # minimum Ds_BkgBDT value kept; also shown in the plot title
perBin = ((Range[1] - Range[0])/Bins)*1000  # bin width in MeV (Range is in GeV)
# var = 'Ds_diff_D0pi'
var = 'Ds_massDifference_0'
print("Width Per Bin: {width:.2f} MeV".format(width = perBin))

label1= r'$isSignal(D_s^{+})=1$'
label2= r'$isSignal(D_s^{+})=0$'
label3= r'$NaN$'

labels=[label1,label2,label3]
colors=['#7eb0d5','#fd7f6f','purple']

# Shared selection: candidates whose classifier output passes the BS cut
signal_df = DataFrames["Signal"]
passes_bdt = signal_df["Ds_BkgBDT"]>=BS

data = [signal_df[(signal_df['Ds_isSignal']==1) & passes_bdt][var],
        signal_df[(signal_df['Ds_isSignal']==0) & passes_bdt][var],
        signal_df[(signal_df['Ds_isSignal'].isna()) & passes_bdt][var]
       ]


# The lists are passed to hist in reverse order (NaN, isSignal=0, isSignal=1)
plt.hist(data[::-1], color=colors[::-1], label=labels[::-1], alpha=1, range=Range, linewidth=2, stacked=Stacked, density=Density, bins=Bins, histtype='step')
# plt.axvspan(Range[0],0.16,color='gray',alpha=0.2)
# plt.axvline(0.16,ls='--',color='gray')

# Title
#---------
# Signal
# plt.title(r'$2M\;Events$', loc = "left")
# The cut value in the title is taken from BS so it always matches the applied
# selection (it was previously hard-coded to 0.525 while BS was -1).
plt.title(r'$2M\;Events$'+"\n"+rf"$BDT \geq {BS}$", loc = "left")
plt.title(r'$\bf Signal\;Events$', loc = "right")
# # Background
# plt.title(r'$\int\mathcal{L}dt\approx\;100$ fb$^{-1}$', loc = "left")
# plt.title(r'$\bf Generic\;c\bar{c}\;Events$', loc = "right")
# Label
#---------
plt.ylabel(r'$Entries/(\; {width:.2f}\;MeV/c^2)$'.format(width = perBin))
plt.xlabel(r'$\Delta m_{e}(D_s^{+} - D^{0})\;[GeV/c^{2}]$')
# plt.yscale("log") 
plt.legend()
plt.show()

# Save BDT Output

Correct Charge

In [None]:
# import os
# import uproot
# import numpy as np  # Make sure this is imported if you're working in a standalone script

# # === Samples to process ===
# samples = ["Signal", "BB", "ccbar", "ddbar", "ssbar", "taupair", "uubar"]

# # === Output directory for new Bkg BDT files ===
# output_dir = "/group/belle/users/amubarak/03-ML/BkgBDT/"
# os.makedirs(output_dir, exist_ok=True)

# # === Base input info used to construct filenames ===
# base_input_dir = "/group/belle/users/amubarak/02-Grid/Sample_Grid"
# Date = "0530"
# Attempt = "0"

# # === Loop over samples and write output ROOT files ===
# for s in samples:
#     if s not in DataFrames:
#         print(f"Warning: {s} not in DataFrames — skipping.")
#         continue

#     # Convert Ds_BkgBDT to float32 if present
#     if "Ds_BkgBDT" in DataFrames[s].columns:
#         DataFrames[s]["Ds_BkgBDT"] = DataFrames[s]["Ds_BkgBDT"].astype(np.float32)

#     # Construct the original file name
#     if s == "Signal":
#         original_name = "Ds2D0enu-Signal.root"
#     else:
#         original_name = f"Ds2D0e-Generic_Ds_{Date}25_{Attempt}_{s}.root"

#     # Build output file name with BkgBDT tag
#     output_name = original_name.replace(".root", "_withBkgBDT.root")
#     out_path = os.path.join(output_dir, output_name)

#     # Save DataFrame to ROOT
#     with uproot.recreate(out_path) as f:
#         f["Dstree"] = DataFrames[s]

#     print(f"Saved: {out_path}")

Wrong Charge

In [None]:
# import os
# import uproot
# import numpy as np  # Required for dtype conversion

# # === Define wrong-charge samples ===
# samples_WCh = ["Signal_WCh", "BB_WCh", "ccbar_WCh", "ddbar_WCh", "ssbar_WCh", "taupair_WCh", "uubar_WCh", "Data_WCh"]

# # === Output directory for BkgBDT files (wrong charge) ===
# output_dir_WCh = "/group/belle/users/amubarak/03-ML/BkgBDT_WCh/"
# os.makedirs(output_dir_WCh, exist_ok=True)

# # === Base input path for wrong-charge files ===
# base_input_dir_WCh = "/group/belle/users/amubarak/02-Grid/Sample_Grid_WCh"
# Date_WCh = "0530"
# Attempt_WCh = "0"

# # === Save each wrong-charge DataFrame with updated Ds_BkgBDT ===
# for s in samples_WCh:
#     if s not in DataFrames:
#         print(f"Warning: {s} not in DataFrames — skipping.")
#         continue

#     # Convert Ds_BkgBDT to float32 if present
#     if "Ds_BkgBDT" in DataFrames[s].columns:
#         DataFrames[s]["Ds_BkgBDT"] = DataFrames[s]["Ds_BkgBDT"].astype(np.float32)

#     # Set original file name
#     if s == "Signal_WCh":
#         original_name = "Ds2D0enu-Signal_WCh.root"
#     else:
#         tag = s.replace("_WCh", "")
#         original_name = f"Ds2D0e-Generic_Ds_{Date_WCh}25_{Attempt_WCh}_{tag}.root"

#     # Build output path with _withBkgBDT suffix
#     output_name = original_name.replace(".root", "_withBkgBDT.root")
#     out_path = os.path.join(output_dir_WCh, output_name)

#     # Save DataFrame to ROOT
#     with uproot.recreate(out_path) as f:
#         f["Dstree"] = DataFrames[s]

#     print(f"Saved: {out_path}")

Reverse PID

In [None]:
# import os
# import uproot
# import numpy as np  # Ensure this is available for dtype conversion

# # === Define ReverseID samples ===
# samples_ReverseID = ["Signal_ReverseID", "BB_ReverseID", "ccbar_ReverseID", "ddbar_ReverseID", "ssbar_ReverseID", "taupair_ReverseID", "uubar_ReverseID", "Data_ReverseID"]

# # === Output directory for BkgBDT ReverseID files ===
# output_dir_ReverseID = "/group/belle/users/amubarak/03-ML/BkgBDT_ReverseID/"
# os.makedirs(output_dir_ReverseID, exist_ok=True)

# # === Base input path for ReverseID ===
# base_input_dir_ReverseID = "/group/belle/users/amubarak/02-Grid/Sample_Grid_ReverseID"
# Date_ReverseID = "0530"
# Attempt_ReverseID = "0"

# # === Save each ReverseID DataFrame with Ds_BkgBDT ===
# for s in samples_ReverseID:
#     if s not in DataFrames:
#         print(f"Warning: {s} not in DataFrames — skipping.")
#         continue

#     # Convert Ds_BkgBDT to float32 if present
#     if "Ds_BkgBDT" in DataFrames[s].columns:
#         DataFrames[s]["Ds_BkgBDT"] = DataFrames[s]["Ds_BkgBDT"].astype(np.float32)

#     # Set original file name
#     if s == "Signal_ReverseID":
#         original_name = "Ds2D0enu-Signal_ReverseID.root"
#     else:
#         tag = s.replace("_ReverseID", "")
#         original_name = f"Ds2D0e-Generic_Ds_{Date_ReverseID}25_{Attempt_ReverseID}_{tag}.root"

#     # Build output file name with _withBkgBDT suffix
#     output_name = original_name.replace(".root", "_withBkgBDT.root")
#     out_path = os.path.join(output_dir_ReverseID, output_name)

#     # Save DataFrame to ROOT
#     with uproot.recreate(out_path) as f:
#         f["Dstree"] = DataFrames[s]

#     print(f"Saved: {out_path}")

Reverse PID and Wrong Charge

In [None]:
import os
import uproot
import numpy as np  # Ensure this is available for dtype conversion

# Write each ReverseID + wrong-charge DataFrame, including its Ds_BkgBDT column,
# to a "<original name>_withBkgBDT.root" file containing a "Dstree" tree.

# === Define ReverseID + wrong-charge samples ===
samples_ReverseID_WCh = ["All_ReverseID_WCh", "BB_ReverseID_WCh", "ccbar_ReverseID_WCh", "ddbar_ReverseID_WCh", "ssbar_ReverseID_WCh", "taupair_ReverseID_WCh", "uubar_ReverseID_WCh", "uds_ReverseID_WCh", "Data_ReverseID_WCh"]

# === Output directory for BkgBDT ReverseID + wrong-charge files ===
output_dir_ReverseID_WCh = "/group/belle/users/amubarak/03-ML/BkgBDT_ReverseID_WCh/"
os.makedirs(output_dir_ReverseID_WCh, exist_ok=True)

# === Pieces used to rebuild the original file names ===
# base_input_dir_ReverseID_WCh is not read in this cell; it is kept because it
# is a notebook-level name that other cells may use.
base_input_dir_ReverseID_WCh = "/group/belle/users/amubarak/02-Grid/Sample_Grid_ReverseID_WCh"
Date_ReverseID_WCh = "0530"
Attempt_ReverseID_WCh = "0"

# === Save each ReverseID + wrong-charge DataFrame with Ds_BkgBDT ===
for s in samples_ReverseID_WCh:
    if s not in DataFrames:
        print(f"Warning: {s} not in DataFrames — skipping.")
        continue

    # The output name advertises "_withBkgBDT", so a frame that never received
    # the classifier column must not be written under that name.
    if "Ds_BkgBDT" not in DataFrames[s].columns:
        print(f"Warning: {s} has no Ds_BkgBDT column — skipping.")
        continue

    # Store the classifier output as float32
    DataFrames[s]["Ds_BkgBDT"] = DataFrames[s]["Ds_BkgBDT"].astype(np.float32)

    # Set original file name
    # "Signal_ReverseID_WCh" is not in the sample list above, so the first
    # branch is currently inactive; it only applies if that sample is added.
    if s == "Signal_ReverseID_WCh":
        original_name = "Ds2D0enu-Signal_ReverseID_WCh.root"
    else:
        tag = s.replace("_ReverseID_WCh", "")
        original_name = f"Ds2D0e-Generic_Ds_{Date_ReverseID_WCh}25_{Attempt_ReverseID_WCh}_{tag}.root"

    # Build output file name with _withBkgBDT suffix
    output_name = original_name.replace(".root", "_withBkgBDT.root")
    out_path = os.path.join(output_dir_ReverseID_WCh, output_name)

    # Save DataFrame to ROOT (recreate overwrites any existing file at out_path)
    with uproot.recreate(out_path) as f:
        f["Dstree"] = DataFrames[s]

    print(f"Saved: {out_path}")