In [1]:
import os
import pandas as pd

In [2]:
# Function to load Parquet files and extract relevant data
def load_and_label_parquet_files(input_dir, signal_prefix="ZH"):
    """
    Load all Parquet files from a directory and assign labels based on file names.

    The sample name of a file is the part of its base name (extension removed)
    that precedes the first underscore. A file is labelled as signal when that
    name equals ``signal_prefix`` and as background otherwise.

    Parameters:
    - input_dir (str): Path to the directory containing Parquet files.
    - signal_prefix (str): Prefix to identify signal files.

    Returns:
    - list: Column names from the first Parquet file (first in sorted path order).
    - pd.DataFrame: Summary DataFrame with name (prefix), event number, and signal flag,
      one row per (name, signal) pair with event numbers summed over files.

    Raises:
    - FileNotFoundError: If no entry in input_dir ends with '.parquet'.
    """
    # os.listdir returns entries in arbitrary order; sorting makes both the
    # "first file" used for the column list and the log output reproducible.
    parquet_files = sorted(
        os.path.join(input_dir, entry)
        for entry in os.listdir(input_dir)
        if entry.endswith('.parquet')
    )
    if not parquet_files:
        raise FileNotFoundError(f"No Parquet files found in {input_dir}")

    data_summary = []
    columns = None

    for file in parquet_files:
        # Drop the extension before splitting so that a name without any
        # underscore (e.g. "ZH.parquet") still yields the bare prefix "ZH".
        stem = os.path.splitext(os.path.basename(file))[0]
        prefix = stem.split('_')[0]

        # Determine label: 1 for signal, 0 for background
        signal_flag = 1 if prefix == signal_prefix else 0

        # Load Parquet file (the whole file is read; only its length and
        # column names are used below)
        print(f"Loading {file}...")
        df = pd.read_parquet(file)

        # Save column names from the first file
        if columns is None:
            columns = list(df.columns)

        # Add to summary
        data_summary.append({
            "name": prefix,
            "event number": len(df),
            "signal": signal_flag
        })

    # Create a summary DataFrame
    summary_df = pd.DataFrame(data_summary)

    # Combine rows with the same prefix and sum event numbers
    summary_df = summary_df.groupby(["name", "signal"], as_index=False).sum()

    return columns, summary_df

In [3]:
# Directory holding the training Parquet files. The path is relative, so it is
# resolved against the kernel's current working directory.
input_dir = "./data/train"

In [4]:
# Build the per-sample summary. A FileNotFoundError (e.g. no Parquet files
# found) is printed instead of propagating.
try:
    columns, analysis_df = load_and_label_parquet_files(input_dir)

    # Header line followed by the column list on its own line
    print("Columns in the training files:", columns, sep="\n")

    # Rich-render the summary table in the notebook
    display(analysis_df)
except FileNotFoundError as err:
    print(err)

Loading ./data/train/DYto2L-2Jets_MLL-50_FxFx_2022_postEE_DiJet_bx_CR_BB_combined.parquet...
Loading ./data/train/DYto2L-2Jets_MLL-50_FxFx_2022_postEE_DiJet_bx_CR_B_combined.parquet...
Loading ./data/train/DYto2L-2Jets_MLL-50_FxFx_2022_postEE_DiJet_bx_CR_LF_combined.parquet...
Loading ./data/train/DYto2L-2Jets_MLL-50_FxFx_2022_postEE_DiJet_bx_CR_TT_combined.parquet...
Loading ./data/train/DYto2L-2Jets_MLL-50_FxFx_2022_postEE_DiJet_bx_SR_2L2B_combined.parquet...
Loading ./data/train/DYto2L-2Jets_MLL-50_FxFx_2022_postEE_DiJet_cx_CR_BB_combined.parquet...
Loading ./data/train/DYto2L-2Jets_MLL-50_FxFx_2022_postEE_DiJet_cx_CR_B_combined.parquet...
Loading ./data/train/DYto2L-2Jets_MLL-50_FxFx_2022_postEE_DiJet_cx_CR_LF_combined.parquet...
Loading ./data/train/DYto2L-2Jets_MLL-50_FxFx_2022_postEE_DiJet_cx_CR_TT_combined.parquet...
Loading ./data/train/DYto2L-2Jets_MLL-50_FxFx_2022_postEE_DiJet_cx_SR_2L2B_combined.parquet...
Loading ./data/train/DYto2L-2Jets_MLL-50_FxFx_2022_postEE_DiJet_incl

Unnamed: 0,name,signal,event number
0,DYto2L-2Jets,0,712705
1,TTTo2L2Nu,0,67843
2,ZH,1,412946


In [5]:
# Generate a PRL-style LaTeX table
def _escape_latex(value):
    """Format ``value`` as text and escape characters that are special in LaTeX."""
    # str.translate substitutes in a single pass, so the backslashes and braces
    # introduced by one replacement are never re-escaped by another.
    replacements = str.maketrans({
        "\\": r"\textbackslash{}",
        "&": r"\&",
        "%": r"\%",
        "$": r"\$",
        "#": r"\#",
        "_": r"\_",
        "{": r"\{",
        "}": r"\}",
        "~": r"\textasciitilde{}",
        "^": r"\textasciicircum{}",
    })
    return f"{value}".translate(replacements)


def generate_prl_style_table(df, output_file=None, escape=True):
    """
    Generate a PRL-style LaTeX table summarizing the file analysis.

    Parameters:
    - df (pd.DataFrame): DataFrame containing the analysis summary. Must have
      the columns "name", "event number" and "signal".
    - output_file (str): Path to save the LaTeX table (optional).
    - escape (bool): Escape LaTeX special characters (&, %, $, #, _, {, }, ~, ^, \\)
      in cell values so that arbitrary sample names compile. Pass False to
      insert the values verbatim (e.g. when they already contain LaTeX markup).

    Returns:
    - str: PRL-style LaTeX table as a string.

    Raises:
    - KeyError: If one of the required columns is missing from df.
    """
    # Define the LaTeX structure for PRL format
    header = (
        "\\begin{table}[h!]\n"
        "\\centering\n"
        "\\begin{tabular}{lrl}\n"  # Define columns: l = left-aligned, r = right-aligned
        "\\hline\n"
        "Name & Event Number & Signal \\\\\n"
        "\\hline\n"
    )
    footer = (
        "\\hline\n"
        "\\end{tabular}\n"
        "\\caption{Summary of Training Files.}\n"
        "\\label{tab:training_summary}\n"
        "\\end{table}\n"
    )

    # format(value) with no spec renders exactly like an f-string placeholder
    render = _escape_latex if escape else format

    # Iterate column-wise instead of with iterrows(): iterrows() builds one
    # Series per row and does not preserve per-column dtypes.
    rows = [
        f"{render(name)} & {render(count)} & {render(flag)} \\\\\n"
        for name, count, flag in zip(df["name"], df["event number"], df["signal"])
    ]

    latex_table = header + "".join(rows) + footer

    # Save the table if requested
    if output_file:
        # Explicit encoding so the written file does not depend on the platform default
        with open(output_file, "w", encoding="utf-8") as f:
            f.write(latex_table)
        print(f"PRL-style LaTeX table saved to {output_file}")

    return latex_table

# Render the summary as a LaTeX table, write it to disk, and echo it below
prl_latex_table = generate_prl_style_table(
    analysis_df,
    output_file="file_summary_prl.tex",
)
print(prl_latex_table)

PRL-style LaTeX table saved to file_summary_prl.tex
\begin{table}[h!]
\centering
\begin{tabular}{lrl}
\hline
Name & Event Number & Signal \\
\hline
DYto2L-2Jets & 712705 & 0 \\
TTTo2L2Nu & 67843 & 0 \\
ZH & 412946 & 1 \\
\hline
\end{tabular}
\caption{Summary of Training Files.}
\label{tab:training_summary}
\end{table}

