In [1]:
import json
import uproot
import awkward as ak
import vector
import numpy as np
import matplotlib.pyplot as plt
import warnings

# Global matplotlib defaults applied to every figure created after this cell runs.
plt.rcParams.update({
    'figure.figsize': (10, 6),
    'axes.grid': True,
    'grid.alpha': 0.6,
    'grid.linestyle': '--',
    'font.size': 14,
    "figure.dpi": 200,
})

# Suppress a harmless warning from the vector library with awkward arrays.
# Only warnings whose text starts with this message are silenced.
warnings.filterwarnings("ignore", message="Passing an awkward array to a ufunc")

# Register the vector library's behaviors with awkward so that records zipped
# with a vector name (e.g. "Momentum4D") gain methods such as deltaR.
ak.behavior.update(vector.backends.awkward.behavior)

# --- 1. CONFIGURATION ---
# All user-changeable settings are here: they are read once from the JSON file
# below into the module-level CONFIG dict used by the cells that follow.
with open("hh-bbbb-obj-config.json", "r") as config_file:
    CONFIG = json.load(config_file)

# --- 2. DATA LOADING & PREPARATION FUNCTIONS ---
from data_loading_helpers import load_and_prepare_data, select_gen_b_quarks_from_higgs

# --- 3. ANALYSIS FUNCTIONS ---
from analysis_helpers import *

# --- 4. PLOTTING FUNCTIONS ---
from plotting_helpers import *

# --- 5. SELECTION CUT FUNCTIONS ---
def apply_custom_cuts(reco_jets, config, key, kinematic_only=False):
    """
    Select the jets that pass the cuts configured under ``config[key]``.

    Parameters
    ----------
    reco_jets : awkward.Array
        Jet collection exposing ``pt`` and ``eta`` and, unless
        ``kinematic_only`` is set, the tagger score fields used below.
    config : dict
        Global CONFIG dict. ``config[key]`` must provide ``pt_cut`` and
        ``eta_cut``. ``b_tag_cut``, ``tagger_name`` and (for offline jets)
        the veto thresholds are only read when tagger cuts are applied.
    key : str
        Which sub-config to use: "offline", "l1ng" or "l1ext".
    kinematic_only : bool
        If True, only apply kinematic cuts, i.e., pt and eta cuts.
        If False, apply the tagger cuts as well.

    Returns
    -------
    awkward.Array
        ``reco_jets`` restricted to the jets passing every applied cut.

    Warns
    -----
    UserWarning
        If tagger cuts were requested but ``key`` (or the offline
        ``tagger_name``) is not one this function knows how to handle; in
        that case only the kinematic cuts are applied.
    """
    subcfg = config[key]

    pt_cut = subcfg["pt_cut"]
    eta_cut = subcfg["eta_cut"]

    print(f"\nApplying custom pT cut of {pt_cut} GeV for {key} jets...")
    final_mask = (reco_jets.pt > pt_cut) & (abs(reco_jets.eta) < eta_cut)

    if not kinematic_only:
        # Tagger settings are looked up lazily so that a kinematic-only
        # selection does not require them to be present in the config.
        b_tag_cut = subcfg["b_tag_cut"]
        tagger_name = subcfg["tagger_name"]
        print(f"Applying custom cuts for {tagger_name} ({key})...")

        if key == "offline":
            if tagger_name.startswith("btagPNet"):
                charm_veto_cut = subcfg["charm_veto_cut"]
                final_mask = (
                    final_mask
                    & (reco_jets.btagPNetCvB < charm_veto_cut)
                    & (reco_jets.btagPNetB > b_tag_cut)
                )

            elif tagger_name.startswith("btagUParTAK4"):
                charm_veto_cut = subcfg["charm_veto_cut"]
                electron_veto_cut = subcfg["electron_veto_cut"]
                muon_veto_cut = subcfg["muon_veto_cut"]
                final_mask = (
                    final_mask
                    & (reco_jets.btagUParTAK4CvB < charm_veto_cut)
                    & (reco_jets.btagUParTAK4Ele < electron_veto_cut)
                    & (reco_jets.btagUParTAK4Mu < muon_veto_cut)
                    & (reco_jets.btagUParTAK4probb > b_tag_cut)
                )

            else:
                # Previously this case fell through silently, which looks
                # like a b-tagged selection but is kinematic-only.
                warnings.warn(
                    f"Unsupported offline tagger '{tagger_name}': "
                    "no tagger cut applied, only kinematic cuts."
                )

        elif key in ("l1ng", "l1ext"):
            # For L1 collections the tagger score is a single field whose
            # name is given directly by the config.
            final_mask = final_mask & (getattr(reco_jets, tagger_name) > b_tag_cut)

        else:
            warnings.warn(
                f"Unknown jet collection key '{key}': "
                "no tagger cut applied, only kinematic cuts."
            )

    return reco_jets[final_mask]



In [3]:
import numpy as np
import awkward as ak
import vector
import uproot
import json
import os

# Register vector behavior with awkward. The first cell of this notebook makes
# the same call; repeating it here lets this cell run on its own.
ak.behavior.update(vector.backends.awkward.behavior)

def process_batch(events, config, max_constituents=30):
    """
    Turn one batch of events into per-jet constituent features and b-jet labels.

    For every jet in the ``config["l1ng"]["collection_name"]`` collection,
    the ``L1BarrelExtPuppi`` candidates within deltaR < 0.4 of the jet axis
    are gathered, ordered by descending pT, and truncated or zero-padded to
    ``max_constituents`` entries.

    Parameters
    ----------
    events : awkward.Array
        Batch with one nested record per collection (jets, candidates and
        the gen-level collection read by ``select_gen_b_quarks_from_higgs``).
    config : dict
        Global config; only ``config["l1ng"]["collection_name"]`` is read.
    max_constituents : int
        Number of constituent slots kept per jet.

    Returns
    -------
    X_batch : numpy.ndarray
        Shape ``(total_jets, max_constituents, 9)``. Feature order:
        log(pT_rel), log(deltaR), d_eta, d_phi, dxy, z0, puppiWeight, id,
        charge. Padded slots are 0 in every feature.
    y_batch : numpy.ndarray
        Shape ``(total_jets,)``, float32; 1.0 if a b-quark from a Higgs decay
        lies within deltaR < 0.4 of the jet, else 0.0.
    """

    # --- 1. Select Objects ---
    jets = events[config["l1ng"]["collection_name"]]
    cands = events["L1BarrelExtPuppi"]

    # Only the direction is needed for deltaR. Keeping the optional candidate
    # fields out of the vector means a missing one is handled by the
    # default-value fallback in get_matched_prop instead of failing here.
    direction_fields = ["pt", "eta", "phi"]
    jets["vector"] = ak.zip({k: jets[k] for k in direction_fields}, with_name="Momentum4D")
    cands["vector"] = ak.zip({k: cands[k] for k in direction_fields}, with_name="Momentum4D")

    # --- 2. Labeling (Signal vs Background) ---
    # Filter for b-quarks from Higgs
    gen_b = select_gen_b_quarks_from_higgs(events)
    gen_b["vector"] = ak.zip({k: gen_b[k] for k in direction_fields}, with_name="Momentum4D")

    # Broadcasting for labeling
    # jets: (E, J, 1), gen: (E, 1, G)
    dr_labels = jets.vector[:, :, None].deltaR(gen_b.vector[:, None, :])
    is_b_jet = ak.any(dr_labels < 0.4, axis=2)
    labels = ak.values_astype(is_b_jet, np.float32)

    # --- 3. Constituent Matching ---
    # jets: (E, J, 1), candidates: (E, 1, C) -> deltaR matrix (E, J, C)
    dr_matrix = jets.vector[:, :, None].deltaR(cands.vector[:, None, :])
    in_cone = dr_matrix < 0.4

    def gather_in_cone(values):
        # Broadcast a per-candidate (E, C) array to (E, J, C) and keep only
        # the entries inside each jet's cone -> (E, J, SubConstituents).
        # Masking the broadcast array directly avoids a flatten/unflatten
        # round trip (ak.unflatten only accepts one-dimensional counts).
        values_b, mask_b = ak.broadcast_arrays(values[:, None, :], in_cone)
        return values_b[mask_b]

    # Order constituents by descending pT so that truncation to
    # max_constituents drops the softest ones rather than arbitrary ones.
    matched_pt = gather_in_cone(cands["pt"])
    pt_order = ak.argsort(matched_pt, axis=2, ascending=False, stable=True)

    # --- 4. Feature Extraction ---
    def get_matched_prop(field_name, default_val=0.0):
        if field_name in cands.fields:
            return gather_in_cone(cands[field_name])[pt_order]

        # Handle missing fields: constant array with the matched shape.
        return ak.zeros_like(matched_pt, dtype=np.float32) + default_val

    m_pt = matched_pt[pt_order]
    m_eta = get_matched_prop("eta")
    m_phi = get_matched_prop("phi")
    m_dxy = get_matched_prop("dxy")
    m_z0 = get_matched_prop("z0")
    m_w = get_matched_prop("puppiWeight", 1.0)
    # The candidate particle-type field is called "id" in this collection;
    # looking up "pdgId" always hit the fallback and produced all zeros.
    m_id = get_matched_prop("id", 0.0)
    m_charge = get_matched_prop("charge", 0.0)

    # --- 5. Relative Features ---
    # Broadcast Jet properties (E, J) to match Constituents (E, J, SubC)
    j_pt = jets.pt[:, :, None]
    j_eta = jets.eta[:, :, None]
    j_phi = jets.phi[:, :, None]

    # 1. Log pT Rel (both pTs floored at 1e-3 to keep the log finite)
    log_pt_rel = np.log(np.maximum(m_pt, 1e-3) / np.maximum(j_pt, 1e-3))

    # 2. Delta Eta/Phi
    deta = m_eta - j_eta
    dphi = m_phi - j_phi
    dphi = (dphi + np.pi) % (2 * np.pi) - np.pi  # Wrap to [-pi, pi)

    # 3. Log Delta R (floored at 1e-3 to keep the log finite)
    log_dr = np.log(np.maximum(np.sqrt(deta**2 + dphi**2), 1e-3))

    # --- 6. Padding & Stacking ---

    def pad_and_fill(arr, target=max_constituents):
        # Pad/clip axis 2 (constituents) to exactly `target` and fill None with 0
        return ak.fill_none(ak.pad_none(arr, target, axis=2, clip=True), 0.0)

    # Stack features
    features_list = [
        pad_and_fill(log_pt_rel),
        pad_and_fill(log_dr),
        pad_and_fill(deta),
        pad_and_fill(dphi),
        pad_and_fill(m_dxy),
        pad_and_fill(m_z0),
        pad_and_fill(m_w),
        pad_and_fill(m_id),
        pad_and_fill(m_charge)
    ]

    # Flatten structure: (Events, Jets, Const) -> (Total_Jets, Const)
    X_batch = np.stack([ak.to_numpy(ak.flatten(f, axis=1)) for f in features_list], axis=-1)

    # Flatten Labels: (Events, Jets) -> (Total_Jets)
    y_batch = ak.to_numpy(ak.flatten(labels, axis=None))

    return X_batch, y_batch

def generate_dataset(config_path, output_file="l1_training_data.npz", chunk_size=10000):
    """
    Build the training dataset by streaming the input ROOT file(s) in chunks.

    Each chunk is regrouped into one nested record per collection and passed
    to ``process_batch``. The per-chunk arrays are accumulated in memory and
    written once, at the end, with ``np.savez_compressed`` under the keys
    ``X`` and ``y``.

    Parameters
    ----------
    config_path : str
        Path to the JSON config; ``file_pattern``, ``tree_name`` and
        ``l1ng.collection_name`` are read from it.
    output_file : str
        Destination ``.npz`` file.
    chunk_size : int
        ``step_size`` handed to ``uproot.iterate``.

    Raises
    ------
    ValueError
        If the input yields no batches, so there is nothing to save.
    """
    with open(config_path, "r") as f:
        config = json.load(f)

    file_pattern = config["file_pattern"]
    tree_name = config["tree_name"]

    # Collections needed downstream; only their branches are read from disk.
    collections_to_load = [
        config["l1ng"]["collection_name"],      # Jets
        "L1BarrelExtPuppi",                     # Candidates
        "GenPart"                               # Labels
    ]
    branch_filter = [f"{prefix}_*" for prefix in collections_to_load]

    all_X = []
    all_y = []

    print(f"Processing {file_pattern} in chunks of {chunk_size}...")

    # Iterate over the file
    for batch_events in uproot.iterate(
        f"{file_pattern}:{tree_name}",
        filter_name=branch_filter,
        step_size=chunk_size,
        library="ak",
    ):

        print("Reshaping data into nested objects...")
        for prefix in collections_to_load:
            full_prefix = prefix + "_"
            prefixed_fields = [field for field in batch_events.fields if field.startswith(full_prefix)]
            if not prefixed_fields:
                print(f"Warning: No fields found with prefix '{prefix}_'. Skipping.")
                continue
            # Strip only the leading prefix: str.replace would also remove
            # any later occurrence of it inside the branch name.
            field_map = {field[len(full_prefix):]: batch_events[field] for field in prefixed_fields}
            batch_events[prefix] = ak.zip(field_map)

        # Process
        X_chunk, y_chunk = process_batch(batch_events, config)

        all_X.append(X_chunk)
        all_y.append(y_chunk)
        print(f"  Processed batch: {len(X_chunk)} jets")

    if not all_X:
        raise ValueError(
            f"No events were read from {file_pattern}:{tree_name}; nothing to save."
        )

    # Final Concatenation
    final_X = np.concatenate(all_X, axis=0)
    final_y = np.concatenate(all_y, axis=0)

    print(f"Saving {final_X.shape} dataset to {output_file}...")
    np.savez_compressed(output_file, X=final_X, y=final_y)
    print("Done.")


In [None]:
# Build the training dataset described by the JSON config. With the default
# arguments the result is written to "l1_training_data.npz".
generate_dataset("hh-bbbb-obj-config.json")

In [4]:
import matplotlib.pyplot as plt

# Exploratory version of the jet-constituent feature extraction, run on a
# single file and keeping every intermediate as a notebook-level variable.
# NOTE(review): the input path is hard-coded here rather than taken from
# CONFIG["file_pattern"] — confirm this is intentional.
file_pattern = "/vols/cms/at3722/root-obj-perf/data/hh4b_puppi_pf/hh4b/data_0.root"
tree_name = CONFIG["tree_name"]
max_events = CONFIG["max_events"]
collections_to_load = [
    CONFIG["l1ng"]["collection_name"],      # Jets
    "L1BarrelExtPuppi",                     # Candidates
    "GenPart"                               # Labels
]
events = load_and_prepare_data(file_pattern, tree_name, collections_to_load, max_events=max_events)

# Number of constituent slots kept per jet after padding/clipping.
n_constituents = 16

l1_col = CONFIG["l1ng"]["collection_name"]
l1_puppi_col = "L1BarrelExtPuppi"

gen_b = select_gen_b_quarks_from_higgs(events)

# Insert length-1 axes so jets and candidates broadcast against each other:
# jets index as [event, jet, 1], candidates as [event, 1, candidate].
l1_jet_vecs = events[l1_col].vector[:, :, None]
l1_cand_vec = events[l1_puppi_col].vector[:, None, :]

# deltaR between every jet and every candidate of the same event; a
# candidate counts as a constituent of a jet when it lies within 0.4.
dR_matrix = l1_jet_vecs.deltaR(l1_cand_vec)
in_cone = dR_matrix < 0.4

# Per-jet lists of the in-cone candidates' positions within their event.
# NOTE(review): cand_idxs_matched is not used again in the visible cells.
cands_indices = ak.local_index(events[l1_puppi_col], axis=1)
indices_broadcast, mask_broadcast = ak.broadcast_arrays(cands_indices[:, None, :], in_cone)
cand_idxs_matched = indices_broadcast[mask_broadcast] 

# Same selection applied to the full candidate records, then ordered by
# descending pT within each jet (stable sort keeps ties in original order).
cands_broadcast, mask_broadcast = ak.broadcast_arrays(events[l1_puppi_col][:, None, :], in_cone)
matched_cands = cands_broadcast[mask_broadcast] 
matched_pt_sorted_idxs = ak.argsort(matched_cands.pt, axis=2, ascending=False, stable=True)
matched_cands = matched_cands[matched_pt_sorted_idxs]

# Jet kinematics with a trailing length-1 axis so they broadcast over the
# constituents of each jet.
j_pt = events[l1_col].pt[:, :, None]
j_eta = events[l1_col].eta[:, :, None]
j_phi = events[l1_col].phi[:, :, None]

m_pt = matched_cands.pt
m_eta = matched_cands.eta
m_phi = matched_cands.phi

# 1. Log pT Rel (both pTs are floored at 1e-3 so the log stays finite)
log_pt_rel = np.log(np.maximum(m_pt, 1e-3) / np.maximum(j_pt, 1e-3))

# 2. Delta Eta/Phi
deta = m_eta - j_eta
dphi = m_phi - j_phi
dphi = (dphi + np.pi) % (2 * np.pi) - np.pi # Wrap to [-pi, pi]

# 3. Log Delta R (floored at 1e-3 so the log stays finite)
log_dr = np.log(np.maximum(np.sqrt(deta**2 + dphi**2), 1e-3))

# 4. impact parameter dxy and z0
m_dxy = matched_cands.dxy
m_z0 = matched_cands.z0

# 5. puppi weight
m_w = matched_cands.puppiWeight

# 6. pdgId (stored on the candidates under the field name "id")
m_id = matched_cands.id

# 7. charge
m_charge = matched_cands.charge

# 8. energy
m_e = matched_cands.e


def pad_and_fill(arr, target=n_constituents):
    """Force axis 2 (constituents) to exactly `target` entries, zero-filling gaps.

    Lists longer than `target` are clipped; shorter ones are padded with
    None, which is then replaced by 0.0.
    """
    fixed_length = ak.pad_none(arr, target, axis=2, clip=True)
    zero_filled = ak.fill_none(fixed_length, 0.0)
    return zero_filled

# Per-constituent features, each padded/clipped to n_constituents slots.
# The order of this list is the order of the last axis of X.
# Padded slots are 0.0 in every feature, including the log-valued ones.
feature_list = [
    pad_and_fill(log_pt_rel),
    pad_and_fill(log_dr),
    pad_and_fill(deta),
    pad_and_fill(dphi),
    pad_and_fill(m_dxy),
    pad_and_fill(m_z0),
    pad_and_fill(m_w),
    pad_and_fill(m_id),
    pad_and_fill(m_charge),
    pad_and_fill(m_e),
]

# Merge the event and jet axes, then stack the features along a new last
# axis: X has one row per jet, n_constituents slots, len(feature_list) features.
X = np.stack([ak.to_numpy(ak.flatten(f, axis=1)) for f in feature_list], axis=-1)

# Per-jet labels from the cross-matching helper, cast to float32 and
# flattened to one entry per jet so that they line up with the rows of X.
from analysis_helpers import get_purity_mask_cross_matched
is_pure_label = get_purity_mask_cross_matched(gen_b, events[l1_col])
labels = ak.values_astype(is_pure_label, np.float32)
display(labels)
Y = ak.to_numpy(ak.flatten(labels, axis=None))

# [E, n_jets, n_constituents] exactly because clip=True in pad_none
padded_matched_cands = pad_and_fill(matched_cands, n_constituents)


# Bare expression that is not the last one in the cell, so its value is
# not displayed; only the final `X, Y` below is rendered.
padded_matched_cands[0, 0, 0].fields

X, Y









# for idx in range(3):
#     print(f"\nEvent {idx}:")
#     l1_jets = events[l1_col][idx]
#     l1_cands = events[l1_puppi_col][idx]
#     print(f"  L1 Jets: {len(l1_jets)}")
#     print(f"  L1 Candidates: {len(l1_cands)}")

#     plt.scatter(
#         l1_cands.phi,
#         l1_cands.eta,
#         s=5,
#         c='gray',
#         alpha=0.5,
#         label='L1 Candidates'
#     )
#     plt.scatter(
#         l1_jets.phi,
#         l1_jets.eta,
#         s=50,
#         c='red',
#         marker='x',
#         label='L1 Jets'
#     )
#     plt.xlabel('Phi')
#     plt.ylabel('Eta')
#     plt.title(f'L1 Jets and Candidates - Event {idx}')
#     plt.legend()
#     plt.show()

#     l1_jets_expanded = l1_jets
    
#     for jdx in range(min(5, len(l1_jets))):
#         jet = l1_jets[jdx]
#         print(f"    Jet {jdx}: pt={jet.pt:.2f}, eta={jet.eta:.2f}, phi={jet.phi:.2f}")
#         # Find constituents within DeltaR < 0.4
#         jet_vec = vector.obj(pt=jet.pt, eta=jet.eta, phi=jet.phi)
#         constituents_in_jet = []
#         for cdx in range(len(l1_cands)):
#             cand = l1_cands[cdx]
#             cand_vec = vector.obj(pt=cand.pt, eta=cand.eta, phi=cand.phi)
#             delta_r = jet_vec.deltaR(cand_vec)
#             if delta_r < 0.4:
#                 constituents_in_jet.append(cand)
#         print(f"      Constituents in DeltaR<0.4: {len(constituents_in_jet)}")

Loading data from /vols/cms/at3722/root-obj-perf/data/hh4b_puppi_pf/hh4b/data_0.root...
Reshaping data into nested objects...
Creating 4-vector objects...
Loaded and restructured 1000 events.
Selecting gen-level b-quarks...
Found 4000 b-quarks from Higgs decays.


  return impl(*broadcasted_args, **(kwargs or {}))
  return impl(*broadcasted_args, **(kwargs or {}))


(array([[[-9.06365097e-01, -4.27261734e+00, -1.33132935e-02, ...,
           2.00000000e+00,  0.00000000e+00,  4.05755959e+01],
         [-1.00352883e+00, -5.08091450e+00,  4.13513184e-03, ...,
           3.00000000e+00,  0.00000000e+00,  3.68633995e+01],
         [-2.33039975e+00, -2.18690467e+00,  9.14001465e-02, ...,
           3.00000000e+00,  0.00000000e+00,  9.88432217e+00],
         ...,
         [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
           0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
         [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
           0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
         [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
           0.00000000e+00,  0.00000000e+00,  0.00000000e+00]],
 
        [[-1.34730887e+00, -4.73386145e+00, -8.78906250e-03, ...,
           0.00000000e+00, -1.00000000e+00,  2.70730629e+01],
         [-1.44615471e+00, -3.94351459e+00,  8.66699219e-03, ...,
           0.00000000

In [None]:
import matplotlib.pyplot as plt
# Distribution of log|dxy| over all matched constituents.
# The magnitude is floored at 1e-10 so that zero impact parameters do not
# produce -inf. The absolute value is taken because dxy is signed and the
# log of a negative number is NaN.
log_abs_dxy = np.log(
    np.maximum(np.abs(ak.to_numpy(ak.flatten(m_dxy, axis=None))), 1e-10)
)

fig, ax = plt.subplots()
ax.hist(log_abs_dxy, bins=50)
ax.set_xlabel("log |dxy|")
ax.set_ylabel("Constituents")
ax.set_title("Matched constituent transverse impact parameter")
plt.show()

: 

In [None]:
# Scratch cell: prints a fixed string only; no visible cell depends on it.
print("Hello world")

: 