In [2]:
import uproot
import os
from tqdm import tqdm
import pandas as pd

# Directory containing the ROOT files
root_dir = "../data/val_dataset/val_5M/"  

# List to store file names and row counts
file_info = []

# Iterate through each file in the directory
for root_file in tqdm(os.listdir(root_dir)):
    if root_file.endswith(".root"):
        file_path = os.path.join(root_dir, root_file)
        try:
            # Open the ROOT file and get the first tree
            with uproot.open(file_path) as f:
                # List available keys to find the tree
                tree_name = [key for key in f.keys() if "tree" in key.lower() or "events" in key.lower()]
                if tree_name:
                    tree = f[tree_name[0]]
                    num_entries = tree.num_entries
                    file_info.append({"file": root_file, "rows": num_entries})
                else:
                    file_info.append({"file": root_file, "rows": "No tree found"})
        except Exception as e:
            file_info.append({"file": root_file, "rows": f"Error: {e}"})

# Convert the results to a DataFrame
df = pd.DataFrame(file_info)

# Display the DataFrame
print(df)

# Save the DataFrame to a CSV file
csv_path = "root_file_lengths.csv"
df.to_csv(csv_path, index=False)
print(f"File lengths saved to: {csv_path}")


  0%|          | 0/50 [00:00<?, ?it/s]

100%|██████████| 50/50 [00:01<00:00, 43.97it/s]


                    file    rows
0       HToWW4Q_123.root  100000
1         WToQQ_122.root  100000
2         WToQQ_123.root  100000
3         HToCC_122.root  100000
4         WToQQ_120.root  100000
5         ZToQQ_121.root  100000
6         HToBB_120.root  100000
7     HToWW2Q1L_123.root  100000
8         HToCC_123.root  100000
9     HToWW2Q1L_120.root  100000
10        HToBB_124.root  100000
11     TTBarLep_123.root  100000
12        TTBar_120.root  100000
13        HToBB_122.root  100000
14        HToGG_122.root  100000
15     TTBarLep_124.root  100000
16        TTBar_121.root  100000
17    HToWW2Q1L_121.root  100000
18    HToWW2Q1L_122.root  100000
19  ZJetsToNuNu_123.root  100000
20        TTBar_123.root  100000
21        ZToQQ_124.root  100000
22    HToWW2Q1L_124.root  100000
23        HToCC_124.root  100000
24  ZJetsToNuNu_120.root  100000
25  ZJetsToNuNu_122.root  100000
26  ZJetsToNuNu_121.root  100000
27        HToBB_123.root  100000
28      HToWW4Q_122.root  100000
29      HT

In [3]:
import uproot

filepath = "../data/val_dataset/val_5M/HToBB_120.root"  # Path to ROOT file

# Open ROOT file
file = uproot.open(filepath)

# Print all tree names
print("Available Trees:", file.keys())

# Open the main tree (assuming it's named 'tree')
tree = file["tree"]

# Print all branches (columns) available in the ROOT file
print("Available Branches (Columns):", tree.keys())


Available Trees: ['tree;1']
Available Branches (Columns): ['part_px', 'part_py', 'part_pz', 'part_energy', 'part_deta', 'part_dphi', 'part_d0val', 'part_d0err', 'part_dzval', 'part_dzerr', 'part_charge', 'part_isChargedHadron', 'part_isNeutralHadron', 'part_isPhoton', 'part_isElectron', 'part_isMuon', 'label_QCD', 'label_Hbb', 'label_Hcc', 'label_Hgg', 'label_H4q', 'label_Hqql', 'label_Zqq', 'label_Wqq', 'label_Tbqq', 'label_Tbl', 'jet_pt', 'jet_eta', 'jet_phi', 'jet_energy', 'jet_nparticles', 'jet_sdmass', 'jet_tau1', 'jet_tau2', 'jet_tau3', 'jet_tau4', 'aux_genpart_eta', 'aux_genpart_phi', 'aux_genpart_pid', 'aux_genpart_pt', 'aux_truth_match']


In [7]:
import uproot
import numpy as np

# Path to the specific ROOT file
file_path = "../data/val_dataset/val_5M/HToBB_120.root"

# Open the ROOT file
with uproot.open(file_path) as f:
    # Identify the tree (assuming there's only one tree or named accordingly)
    tree_name = [key for key in f.keys() if "tree" in key.lower() or "events" in key.lower()]
    
    if not tree_name:
        print("No tree found in the file.")
    else:
        tree = f[tree_name[0]]
        
        # Load the 'label_Hbb' branch
        label_hbb = tree["label_Hbb"].array(library="np")

        # Check for zeros or False values
        zero_count = np.sum(label_hbb == 0)
        true_count = np.sum(label_hbb == 1)
        
        # Print the result
        print(f"Total Entries: {len(label_hbb)}")
        print(f"Entries with label_Hbb = 0/False: {zero_count}")
        print(f"Entries with label_Hbb = 1/True: {true_count}")
        
        # Check if any 0/False values exist
        if zero_count > 0:
            print("There are entries with label_Hbb = 0/False.")
        else:
            print("All entries have label_Hbb = 1/True.")


Total Entries: 100000
Entries with label_Hbb = 0/False: 0
Entries with label_Hbb = 1/True: 100000
All entries have label_Hbb = 1/True.
