In [131]:
from matplotlib import pyplot as plt
import pandas as pd
from pathlib import Path
import numpy as np
import os
import time
import seaborn as sns
from matplotlib.colors import ListedColormap
import matplotlib.ticker as ticker
import matplotlib.font_manager as fm

### Merge meta information

In [132]:
%%time

# ---------------------
# Load dataframe
# ---------------------
read_manually = True    # True: read the full metrics CSV; False: read the local meta frame
subset_to_10k = False   # True: keep only rows whose `path` occurs in the 10_240 subset frame

if not read_manually:
    df = pd.read_csv('./frames/meta_frame.csv', sep='|')
else:
    # metrics file (BLEU etc.); this `meta_and_metrics_only` frame already
    # comes without the parsers' text columns, so no column filter is needed
    p = Path('/lus/eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/database/FINAL_FINAL_meta_and_metrics_only.csv')
    df = pd.read_csv(p, sep='|')

    if subset_to_10k:
        # frame that lists the documents of the subset (10_240)
        df_10240 = pd.read_csv('/eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/database/scaling_data/frames/df_orig_10240.csv', sep='|')

        # restrict to those documents and detach from the full frame
        df_subset = df.loc[df['path'].isin(df_10240['path'])]
        df = df_subset.copy()

CPU times: user 94.1 ms, sys: 29.2 ms, total: 123 ms
Wall time: 129 ms


In [133]:
# Number of rows in the loaded frame
len(df)

23398

In [134]:
# Analysis switches
normalized_flag = False  # True: read the '<score>_<parser>_norm' columns instead of the raw ones
median_flag = True       # True: summarise with median/IQR, False: with mean/std
threshold = 0.1          # scores below this value are counted as failures

# Display spelling of every parser, keyed by the name used in the column labels
name_dict = {
    'nougat': 'Nougat',
    'pymupdf': 'PyMuPDF',
    'grobid': 'GROBID',
    'marker': 'Marker',
    'tesseract': 'Tesseract',
    'pypdf': 'pypdf',
}

# Score families present in the frame
score_types = ['bleu', 'rouge', 'car']

def compute_iqr(column: pd.Series) -> float:
    """Return the interquartile range (75th minus 25th percentile) of `column`."""
    lower, upper = (column.quantile(q) for q in (0.25, 0.75))
    return upper - lower
    
# Column names per score type, e.g. score_list_dict['bleu'] -> ['bleu_nougat', 'bleu_pymupdf', ...]
norm_str = '_norm' if normalized_flag else ''
score_list_dict = {
    score: [f'{score}_{parser}{norm_str}' for parser in name_dict]
    for score in score_types
}

# Parsers (row indices), in name_dict order; a list so it can serve as a DataFrame index
parsers = list(name_dict)


def _center_and_spread(values: pd.Series, use_median: bool) -> tuple:
    """Return (median, IQR) when `use_median` is set, else (mean, std); (0.0, 0.0) for empty input."""
    if values.empty:
        return 0.0, 0.0
    if use_median:
        return values.median(), compute_iqr(values)
    return values.mean(), values.std()


# Completion rate: share of documents whose BLEU score is not below `threshold`.
# NOTE(review): NaN scores compare False against the threshold and therefore
# count as completed -- confirm that this is intended.
score_columns = score_list_dict['bleu']
colnames = [name_dict[p] for p in parsers]
bomb_frequencies = (df[score_columns] < threshold).mean()
success_rate = {c: 1.0 - f for c, f in zip(colnames, bomb_frequencies)}

# Per-parser centre/spread of every score type, conditional on score > threshold.
# `score_columns` stays bound to the last score type's columns after the loop,
# exactly as before, because a later cell reads it.
center_name, spread_name = ('median', 'iqr') if median_flag else ('mean', 'std')
center_dict, spread_dict = {}, {}
for score_type in score_types:
    score_columns = score_list_dict[score_type]
    stats = [
        _center_and_spread(df.loc[df[col] > threshold, col], median_flag)
        for col in score_columns
    ]
    center_dict[f'{score_type}_{center_name}'] = [center for center, _ in stats]
    spread_dict[f'{score_type}_{spread_name}'] = [spread for _, spread in stats]

# Keep the flag-specific dictionary names available to other cells
if median_flag:
    median_dict, iqr_dict = center_dict, spread_dict
else:
    mean_dict, std_dict = center_dict, spread_dict

# Combine completion rate with the BLEU and CAR statistics, parsers as row indices
overview_data = {'Completion rate': [success_rate[name_dict[p]] for p in parsers]}
for score_type in ('bleu', 'car'):
    for stat_name, stat_dict in ((center_name, center_dict), (spread_name, spread_dict)):
        key = f'{score_type}_{stat_name}'
        overview_data[key] = stat_dict[key]
df_overview = pd.DataFrame(overview_data, index=parsers)

# parser order
parsers_order = ['nougat',  'marker',  'grobid', 'tesseract', 'pymupdf', 'pypdf']
df_overview = df_overview.reindex(parsers_order)

(df_overview*100.).round(2)

Unnamed: 0,Completion rate,bleu_median,bleu_iqr,car_median,car_iqr
nougat,95.11,47.42,20.96,66.59,16.88
marker,90.67,51.28,20.09,65.86,20.72
grobid,77.31,29.43,20.6,56.73,15.78
tesseract,96.37,47.62,17.3,65.64,15.37
pymupdf,90.15,50.14,21.24,64.01,23.19
pypdf,85.75,44.09,21.09,40.92,24.11


In [137]:
def _format_latex_cell(value: float, best: float) -> str:
    """Format `value` with one decimal place, set in bold when it equals `best`."""
    text = f"{value:.1f}"
    return f"\\textbf{{{text}}}" if value == best else text


def generate_latex_table_with_highlights(df: pd.DataFrame, median_flag: bool,
                                         display_names: "dict | None" = None) -> str:
    """Render the parser overview frame as a LaTeX table with the best values in bold.

    Parameters
    ----------
    df : pd.DataFrame
        One row per parser with the columns 'Completion rate' and, for both
        'bleu' and 'car', either '<score>_median'/'<score>_iqr' (median_flag=True)
        or '<score>_mean'/'<score>_std' (median_flag=False). Values are
        multiplied by 100 and rounded to one decimal before formatting.
    median_flag : bool
        True produces the Median/IQR table, False the Mean/Std table.
    display_names : dict, optional
        Maps a row index to the label printed in the 'Parser' column. Defaults
        to the spellings of the notebook's `name_dict`; an index without an
        entry falls back to its capitalized string form.

    Returns
    -------
    str
        The LaTeX source of the table. The highest completion rate and
        median/mean and the lowest IQR/std of each score are wrapped in textbf;
        ties are all highlighted.

    Raises
    ------
    KeyError
        If a required column is missing from `df`.
    """
    if display_names is None:
        # str.capitalize() would print 'Pymupdf', 'Grobid' and 'Pypdf'
        display_names = {'nougat': 'Nougat', 'pymupdf': 'PyMuPDF', 'grobid': 'GROBID',
                         'marker': 'Marker', 'tesseract': 'Tesseract', 'pypdf': 'pypdf'}

    # Multiply all values by 100 and round to one decimal place
    df = (df * 100.0).round(1)

    if median_flag:
        center, spread = 'median', 'iqr'
        center_label, spread_label = 'Median', 'IQR'
        caption_suffix = ''
    else:
        center, spread = 'mean', 'std'
        center_label, spread_label = 'Mean', 'Std'
        caption_suffix = ' (mean)'

    # Value to highlight per column, in table column order:
    # maximum for rates and centres, minimum for spreads
    best = {'Completion rate': df['Completion rate'].max()}
    for score in ('bleu', 'car'):
        best[f'{score}_{center}'] = df[f'{score}_{center}'].max()
        best[f'{score}_{spread}'] = df[f'{score}_{spread}'].min()

    stat_header = f"\\textbf{{{center_label}}} & \\textbf{{{spread_label}}}"
    header_lines = [
        r"\begin{table}[htbp]",
        r"\caption{Accuracy: Document- (coverage rate), word- (BLEU), and character-level accuracy" + caption_suffix + "}",
        r"\begin{center}",
        r"\begin{tabular}{|c|c|c|c|c|c|}",
        r"\hline",
        r"\textbf{Parser} & \textbf{Coverage [\%]} & \multicolumn{2}{|c|}{\textbf{BLEU [\%]}} & \multicolumn{2}{|c|}{\textbf{CAR [\%]}} \\",
        r"\cline{3-6}",
        f" &  & {stat_header} & {stat_header} \\\\",
        r"\hline",
    ]
    footer_lines = [
        r"\end{tabular}",
        r"\label{tab1}",
        r"\end{center}",
        r"\end{table}",
    ]

    # Header and footer lines are indented by four spaces and the block ends
    # with that indent, so the emitted text matches the previous layout exactly
    indent = "\n    "
    parts = [indent + indent.join(header_lines) + indent]

    # One table row per parser, each followed by a horizontal rule
    for index, row in df.iterrows():
        label = display_names.get(index, str(index).capitalize())
        cells = [_format_latex_cell(row[col], best_value) for col, best_value in best.items()]
        parts.append(" & ".join([label] + cells) + " \\\\ \n")
        parts.append(r"\hline" + "\n")

    parts.append(indent + indent.join(footer_lines) + indent)
    return "".join(parts)

# Render df_overview as a LaTeX table and print the source.
# median_flag=True yields the median/IQR table, median_flag=False the mean/std table;
# the flag must match the one df_overview was computed with, since it selects the column names.
latex_code = generate_latex_table_with_highlights(df_overview, median_flag=median_flag)
print(latex_code)




    \begin{table}[htbp]
    \caption{Accuracy: Document- (completion rate), word- (BLEU), and character-level accuracy}
    \begin{center}
    \begin{tabular}{|c|c|c|c|c|c|}
    \hline
    \textbf{Parser} & \textbf{Completion [\%]} & \multicolumn{2}{|c|}{\textbf{BLEU [\%]}} & \multicolumn{2}{|c|}{\textbf{CAR [\%]}} \\
    \cline{3-6}
     &  & \textbf{Median} & \textbf{IQR} & \textbf{Median} & \textbf{IQR} \\
    \hline
    Nougat & 95.1 & 47.4 & 21.0 & \textbf{66.6} & 16.9 \\ 
\hline
Marker & 90.7 & \textbf{51.3} & 20.1 & 65.9 & 20.7 \\ 
\hline
Grobid & 77.3 & 29.4 & 20.6 & 56.7 & 15.8 \\ 
\hline
Tesseract & \textbf{96.4} & 47.6 & \textbf{17.3} & 65.6 & \textbf{15.4} \\ 
\hline
Pymupdf & 90.2 & 50.1 & 21.2 & 64.0 & 23.2 \\ 
\hline
Pypdf & 85.7 & 44.1 & 21.1 & 40.9 & 24.1 \\ 
\hline

    \end{tabular}
    \label{tab1}
    \end{center}
    \end{table}
    


In [53]:
# Inspect which columns the loaded frame provides
df.columns

Index(['path', 'bleu_nougat', 'rouge_nougat', 'car_nougat', 'bleu_nougat_norm',
       'rouge_nougat_norm', 'car_nougat_norm', 'bleu_pymupdf', 'rouge_pymupdf',
       'car_pymupdf', 'bleu_pymupdf_norm', 'rouge_pymupdf_norm',
       'car_pymupdf_norm', 'bleu_grobid', 'rouge_grobid', 'car_grobid',
       'bleu_grobid_norm', 'rouge_grobid_norm', 'car_grobid_norm',
       'bleu_marker', 'rouge_marker', 'car_marker', 'bleu_marker_norm',
       'rouge_marker_norm', 'car_marker_norm', 'bleu_tesseract',
       'rouge_tesseract', 'car_tesseract', 'bleu_tesseract_norm',
       'rouge_tesseract_norm', 'car_tesseract_norm', 'bleu_pypdf',
       'rouge_pypdf', 'car_pypdf', 'bleu_pypdf_norm', 'rouge_pypdf_norm',
       'car_pypdf_norm', 'category', 'subcategory'],
      dtype='object')

In [27]:
# Show the overview table unscaled (fractions, not multiplied by 100)
df_overview

Unnamed: 0,Completion rate,bleu_median,bleu_iqr,car_median,car_iqr
nougat,0.975,0.46231,0.300216,0.661556,0.222016
pymupdf,0.945312,0.498479,0.354954,0.654622,0.297272
grobid,0.825195,0.223779,0.259463,0.54256,0.192155
marker,0.919922,0.495679,0.347011,0.648923,0.329011
tesseract,0.98291,0.470957,0.224311,0.670663,0.191685
pypdf,0.929102,0.416638,0.331006,0.349962,0.246086


In [None]:
# Tabulate bomb_frequencies: per score column, the share of rows below `threshold`
pd.DataFrame({'Failure Rate' : bomb_frequencies})

In [None]:
# NOTE(review): `font_prop` is not defined in any cell visible here -- presumably a
# matplotlib FontProperties object created elsewhere; confirm, otherwise this cell
# raises NameError on a fresh kernel.
font_prop.set_size(23)

In [None]:
# Mean of every column in `score_columns`, as plain Python floats.
# NOTE(review): despite the name, no threshold filter is applied here -- confirm
# whether a conditional mean (scores > threshold) was intended.
cond_means = [float(df[col].mean()) for col in score_columns]

In [None]:
# Inspect the frame's columns (same check as in an earlier cell)
df.columns