In [None]:
# !pip uninstall -y scikit-learn
# !pip install scikit-learn==1.3.1

In [None]:
# ! pip install --upgrade pip
# ! pip install --user xgboost seaborn
# ! pip install --user bayesian-optimization

In [None]:
# import mplhep
import sys

import seaborn as sns

import numpy as np
import pandas as pd
import uproot
from matplotlib import pyplot as plt

from sklearn.datasets import make_classification,make_regression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import auc,roc_curve,confusion_matrix,classification_report,precision_recall_curve,mean_squared_error,accuracy_score,roc_auc_score
from sklearn.model_selection import GridSearchCV, cross_validate, validation_curve,train_test_split,KFold,learning_curve,cross_val_score
from sklearn.utils import compute_sample_weight
from scipy.stats import ks_2samp

import xgboost
from xgboost import XGBClassifier

from sklearn.model_selection import RandomizedSearchCV

from scipy.stats import randint, uniform

In [None]:
# Global matplotlib font sizes used by every figure in this notebook.
# Axis labels
plt.rcParams["axes.labelsize"] = 16
# Tick labels (both axes)
plt.rcParams["xtick.labelsize"] = 12
plt.rcParams["ytick.labelsize"] = 12
# Legend entries
plt.rcParams["legend.fontsize"] = 14
# Figure-level title
plt.rcParams["figure.titlesize"] = 20

In [None]:
# Raise the pandas display limits so large frames are shown without
# row/column truncation.
pd.options.display.max_rows = 200000
pd.options.display.max_columns = 200000

In [None]:
# Make the directory holding the shared analysis helper modules importable.
# The membership check keeps the cell idempotent: re-running it no longer
# appends the same directory to sys.path again.
PYTHON_FUNCTIONS_DIR = "/home/belle2/amubarak/Ds2D0enue_Analysis/07-Python_Functions/"
if PYTHON_FUNCTIONS_DIR not in sys.path:
    sys.path.append(PYTHON_FUNCTIONS_DIR)

# Prep-Work

### Import Data

Incorrect Charge
- This is currently the only sample that contains the $D^{0}$ tree. Saving the tree for the other control samples, or for the other samples in general, should not matter.

In [None]:
import os
import pandas as pd
import uproot
from tqdm import tqdm

# === Configuration ===
# DataFrames maps a sample name to the pandas DataFrame read from its ROOT tree.
DataFrames = {}
samples_WCh = [
    "Signal_WCh", "BB_WCh", "ccbar_WCh", "ddbar_WCh",
    "ssbar_WCh", "taupair_WCh", "uubar_WCh", "Data_WCh"
]
# Date_WCh and Attempt_WCh are substituted into the grid-output file name below.
Date_WCh = "0630"
Attempt_WCh = "0"

# === Load one sample at a time ===
# Load failures are recorded (not just printed) so that the combination step
# can report exactly which inputs are missing.
failed_WCh = {}
for sample in tqdm(samples_WCh, desc="Loading WCh samples"):
    # The signal sample is read from a dedicated file; every other sample
    # follows the grid-output naming pattern. Both use the D02kmpiptree tree.
    if sample == "Signal_WCh":
        path = "/home/belle2/amubarak/C01-Simulated_Events/Ds2D0enu-Signal_WCh.root:D02kmpiptree"
    else:
        path = f"/group/belle/users/amubarak/02-Grid/Sample_Grid_WCh/Ds2D0e-Generic_Ds_{Date_WCh}25_{Attempt_WCh}_{sample}.root:D02kmpiptree"

    try:
        df = uproot.concatenate(path, library='pd')
    except Exception as e:
        # Best effort: keep loading the remaining samples and report afterwards.
        failed_WCh[sample] = e
        print(f"❌ Failed: {sample} — {e}")
        continue

    DataFrames[sample] = df
    print(f"✔️ Loaded: {path} [{len(df):,} entries]")

# === Combine wrong-charge backgrounds ===
background_WCh = ["BB_WCh", "ccbar_WCh", "ddbar_WCh", "ssbar_WCh", "taupair_WCh", "uubar_WCh"]

# Every background component is required for the combined frames. Fail with an
# explicit message instead of an opaque KeyError raised from inside pd.concat.
missing_background_WCh = [s for s in background_WCh if s not in DataFrames]
if missing_background_WCh:
    raise RuntimeError(
        "Cannot build 'All_WCh'/'uds_WCh': failed to load "
        + ", ".join(missing_background_WCh)
    ) from failed_WCh.get(missing_background_WCh[0])

# All generic background components stacked into one frame.
DataFrames["All_WCh"] = pd.concat([DataFrames[s] for s in background_WCh], ignore_index=True)
# Only the uubar, ddbar and ssbar components.
DataFrames["uds_WCh"] = pd.concat(
    [DataFrames[s] for s in ["uubar_WCh", "ddbar_WCh", "ssbar_WCh"]],
    ignore_index=True
)

The cells below list the loaded samples and the available variables.

In [None]:
# Show which samples (including the combined ones) are available.
loaded_sample_keys = DataFrames.keys()
print(loaded_sample_keys)

In [None]:
# Display every column name of the combined background frame.
all_wch_columns = DataFrames["All_WCh"].columns
all_wch_columns.tolist()

### Setup
The code below is used to apply cuts to the data and to set the range of the plots.

In [None]:
# NOTE: every selection in this cell is commented out, so running the cell
# changes nothing. The disabled lines index DataFrames with "Signal" / "ccbar"
# and with a `samples` list, whereas the loading cell shown above only defines
# `samples_WCh` and keys ending in "_WCh" — adapt the names before re-enabling.

# Electron ID
#-------------------
# DataFrames["Signal"] = DataFrames["Signal"][DataFrames["Signal"]['e_electronID']>=0.95]
# DataFrames["ccbar"] = DataFrames["ccbar"][DataFrames["ccbar"]['e_electronID']>=0.95]
# DataFrames["Signal"] = DataFrames["Signal"][DataFrames["Signal"]['Ds_gammaveto_em_electronID']>=0.95]
# DataFrames["ccbar"] = DataFrames["ccbar"][DataFrames["ccbar"]['Ds_gammaveto_em_electronID']>=0.95]

# Photon Conversion
#-------------------
# DataFrames[samples[0]] = DataFrames[samples[0]][DataFrames[samples[0]]['Ds_gammaveto_M_Correction']>=0.1]
# DataFrames[samples[1]] = DataFrames[samples[1]][DataFrames[samples[1]]['Ds_gammaveto_M_Correction']>=0.1]

# Peaking Background Removal
#----------------------------
# DataFrames["ccbar"] = DataFrames["ccbar"][(DataFrames["ccbar"]['Ds_diff_D0pi']>=0.15)]
# DataFrames["Signal"] = DataFrames["Signal"][(DataFrames["Signal"]['Ds_diff_D0pi']>=0.15)]

# # Vertex Fitting
# #----------------
# DataFrames["Signal"] = DataFrames["Signal"][DataFrames["Signal"]['Ds_chiProb']>=0.01]
# DataFrames["ccbar"] = DataFrames["ccbar"][DataFrames["ccbar"]['Ds_chiProb']>=0.01]

# Dalitz Removal
#----------------------------
# DataFrames["ccbar"] = DataFrames["ccbar"][(DataFrames["ccbar"]['Ds_pi0veto_M_Correction']<=0.08) | (DataFrames["ccbar"]['Ds_pi0veto_M_Correction']>=0.16)]
# DataFrames["Signal"] = DataFrames["Signal"][(DataFrames["Signal"]['Ds_pi0veto_M_Correction']<=0.08) | (DataFrames["Signal"]['Ds_pi0veto_M_Correction']>=0.16)]

# Vertex Fit
#----------------
# DataFrames[samples[0]] = DataFrames[samples[0]][DataFrames[samples[0]]['Ds_chiProb_rank']==1]
# DataFrames[samples[1]] = DataFrames[samples[1]][DataFrames[samples[1]]['Ds_chiProb_rank']==1]

# D0 Invariant Mass
#-----------------------
# DataFrames[samples[0]] = DataFrames[samples[0]][(DataFrames[samples[0]]['Ds_D0_sideband']==1)]
# DataFrames[samples[1]] = DataFrames[samples[1]][(DataFrames[samples[1]]['Ds_D0_sideband']==1)]

# Fake $D^0$ Suppression
Data/MC comparison.

In [None]:
import os
import numpy as np
import matplotlib.pyplot as plt

# === Settings ===
bins = 50          # number of equal-width bins per variable
density = False    # True: compare unit-area shapes; False: compare yields
figsize = (7, 7)

# === Luminosity Scaling ===
lum_data = 364.093   # fb⁻¹
lum_MC = 1443.999    # fb⁻¹
scale_factor = lum_data / lum_MC   # scales MC yields to the data luminosity

# === Correct Input Variables and Labels ===
# `Variables` (column names) and `features` (axis labels) are paired by position.
Variables = [
    'K_kmpip_abs_dr',
    'pi_kmpip_abs_dr',
    'K_kmpip_kaonID',
    'pi_kmpip_pionID',
    'D0_kmpip_dM',
    'D0_kmpip_chiProb',
    'D0_kmpip_flightDistance',
    'D0_kmpip_useCMSFrame_p',
    'D0_kmpip_cos_decayAngle_1',
]

features = [
    r'$dr(K^{-})\;[\mathrm{cm}]$',
    r'$dr(\pi^{+})\;[\mathrm{cm}]$',
    r'$kaonID(K^{-})$',
    r'$pionID(\pi^{+})$',
    r'$m(D^{0}) - m_{PDG}(D^{0})\;[\mathrm{GeV}/c^{2}]$',
    r'$p$-value$(D^{0})$',
    r'$Flight \; Distance(D^{0})\;[\mathrm{cm}]$',
    r'$p^{*} (D^{0})\;[\mathrm{GeV}/c]$',
    r'$\cos\theta^*_{daughter_1}$',
]

# Histogram range [low, high] per variable; entries outside it are not binned.
ranges = {
    'K_kmpip_abs_dr': [0, 0.08],
    'pi_kmpip_abs_dr': [0, 0.08],
    'K_kmpip_kaonID': [0.5, 1],
    'pi_kmpip_pionID': [0.2, 1],
    'D0_kmpip_dM': [-0.02, 0.02],
    'D0_kmpip_chiProb': [0, 1],
    'D0_kmpip_flightDistance': [-0.4, 0.4],
    'D0_kmpip_useCMSFrame_p': [2.5, 5.0],
    'D0_kmpip_cos_decayAngle_1': [-1, 1],
}

colors = {
    'mc': '#007C91',
    'data': 'black',
}

# zip() would silently drop trailing entries if the two lists got out of sync.
assert len(Variables) == len(features), "Variables and features must have the same length"

# === Load Data ===
df_mc = DataFrames["All_WCh"]
df_data = DataFrames["Data_WCh"]

# === Plot Loop ===
for var, label in zip(Variables, features):
    if var not in ranges:
        print(f"Skipping {var}: no defined range")
        continue
    if var not in df_data.columns or var not in df_mc.columns:
        # Same best-effort policy as for a missing range: skip, do not crash.
        print(f"Skipping {var}: column missing from data or MC")
        continue

    Range = ranges[var]
    bin_width = (Range[1] - Range[0]) / bins
    edges = np.linspace(Range[0], Range[1], bins + 1)
    bin_centers = 0.5 * (edges[:-1] + edges[1:])

    # Clean and drop NaN/inf
    data_vals = df_data[var].replace([np.inf, -np.inf], np.nan).dropna()
    mc_vals = df_mc[var].replace([np.inf, -np.inf], np.nan).dropna()

    # Raw per-bin counts; Poisson errors are always derived from these.
    hist_data_raw, _ = np.histogram(data_vals, bins=edges)
    hist_mc_raw, _ = np.histogram(mc_vals, bins=edges)

    # Per-sample weight that turns raw counts into the plotted quantity.
    if density:
        # Unit-area shapes: normalise each sample by its own in-range count and
        # the bin width. No luminosity factor here, otherwise the two shapes
        # would no longer be comparable.
        weight_data = 1.0 / (max(hist_data_raw.sum(), 1) * bin_width)
        weight_mc = 1.0 / (max(hist_mc_raw.sum(), 1) * bin_width)
    else:
        # Yields: data as counted, MC scaled to the data luminosity.
        weight_data = 1.0
        weight_mc = scale_factor

    hist_data = hist_data_raw * weight_data
    err_data = np.sqrt(hist_data_raw) * weight_data
    hist_mc = hist_mc_raw * weight_mc
    err_mc = np.sqrt(hist_mc_raw) * weight_mc

    # Ratio and uncertainty
    with np.errstate(divide='ignore', invalid='ignore'):
        # Bins without MC have no defined ratio: NaN keeps them off the plot
        # instead of drawing a misleading point at 0.
        ratio = np.where(hist_mc > 0, hist_data / hist_mc, np.nan)

        # Relative errors added in quadrature; an empty bin contributes 0.
        rel_err_data = np.where(hist_data > 0, err_data / hist_data, 0.0)
        rel_err_mc = np.where(hist_mc > 0, err_mc / hist_mc, 0.0)
        err_ratio = ratio * np.sqrt(rel_err_data**2 + rel_err_mc**2)

    # === Plot
    fig, (ax1, ax2) = plt.subplots(nrows=2, sharex=True, figsize=figsize,
                                   gridspec_kw={"height_ratios": [3, 1]})

    # Top: Data vs MC
    # Pre-binned MC is drawn by placing one entry at each left bin edge and
    # weighting it with that bin's content.
    ax1.hist(edges[:-1], bins=edges, weights=hist_mc, histtype='step',
             linewidth=2.5, color=colors['mc'], label="Generic MC")

    ax1.errorbar(bin_centers, hist_data, yerr=err_data, fmt='o',
                 color=colors['data'], label="Data", markersize=3,
                 capsize=1, elinewidth=1)

    # Two significant digits: a fixed '.3f' rounded 0.0016 to 0.002 and
    # 0.0008 to 0.001, mislabelling the bin width.
    ax1.set_ylabel(r'$Entries/({:.2g})$'.format(bin_width))
    # Take the luminosity from lum_data so the title cannot drift from the scaling.
    ax1.set_title(rf"$\int\mathcal{{L}}dt =\;{lum_data}$ fb$^{{-1}}$", loc="right")
    ax1.set_xlim(Range)
    ax1.legend(loc='upper right')

    # Bottom: Ratio
    ax2.axhline(1.0, color='black', lw=1)
    ax2.axhline(1.1, color='gray', lw=1, ls='dashed')
    ax2.axhline(0.9, color='gray', lw=1, ls='dashed')
    ax2.errorbar(bin_centers, ratio, yerr=err_ratio, fmt='o',
                 color='black', markersize=3, capsize=1)

    ax2.set_ylabel("Data / MC")
    ax2.set_xlabel(label)
    ax2.set_xlim(Range)
    ax2.set_ylim(0.5, 1.5)

    plt.show()
    # Close this specific figure so figures do not pile up in memory.
    plt.close(fig)