# Read Data From LM Eval Harness

In [3]:
from glob import glob

eval_logs_folder = "top_evals"
all_json_files = glob(f"{eval_logs_folder}/*/*.json")
print(all_json_files)

['top_evals/zaydzuhri__myopic-1.8B-4096-model/results_2025-08-20T11-59-09.690564.json', 'top_evals/zaydzuhri__mtp-1.8B-4096-model/results_2025-08-20T12-31-13.727702.json', 'top_evals/zaydzuhri__vanilla-1.8B-4096-model/results_2025-08-20T12-34-56.285949.json', 'top_evals/zaydzuhri__vanilla-340M-4096-model/results_2025-08-20T11-23-06.323725.json', 'top_evals/zaydzuhri__mtp-340M-4096-model/results_2025-08-20T11-12-35.491684.json', 'top_evals/zaydzuhri__top-7B-4096-model/results_2025-08-20T10-46-34.638307.json', 'top_evals/zaydzuhri__myopic-340M-4096-model/results_2025-08-20T11-20-54.956874.json', 'top_evals/zaydzuhri__vanilla-7B-4096-model/results_2025-08-20T14-15-37.501701.json', 'top_evals/zaydzuhri__mtp-7B-4096-model/results_2025-08-20T10-59-16.724946.json']


# Create Plot Using Background Gradient

In [None]:
import json
import pandas as pd

exception_to_percentage = [
    "Perplexity",
    "Bits"
]

def generate_latex_comparison_table(json_sources, baseline_source_name, precision=2):
    """
    Parses multiple JSON results and generates a LaTeX comparison table where
    each non-baseline cell is colored with a heatmap style based on its
    difference from the baseline.

    Args:
        json_sources (dict): A dictionary where keys are source names (e.g., "Model A")
                             and values are the JSON path.
        baseline_source_name (str): The name of the source to use as the baseline
                                    for coloring.
        precision (int): The number of decimal places for the scores.

    Returns:
        str: A string containing the generated LaTeX table.
    """
    if baseline_source_name not in json_sources:
        return "Error: Baseline source name not found in json_sources."

    # --- 1. Parse all JSONs into a list of records ---
    all_records = []
    for source_name, json_path in json_sources.items():
        try:
            with open(json_path, 'r') as file:
                data = json.load(file)
            results = data.get("results", {})
        except (FileNotFoundError, json.JSONDecodeError) as e:
            print(f"Warning: Could not read or parse file for source '{source_name}' at {json_path}. Error: {e}. Skipping.")
            continue

        for task_name, task_data in results.items():
            for key, value in task_data.items():
                if "alias" in key or "stderr" in key:
                    continue
                try:
                    numeric_value = float(value)
                except (ValueError, TypeError):
                    continue
                all_records.append({
                    "source": source_name,
                    "task": task_name.replace('_', ' ').replace('openai', '').strip().title(),
                    "metric": key.split(',')[0].replace('_', ' ').title(),
                    "value": numeric_value
                })

    if not all_records:
        return "No valid data found to generate a table."

    # --- 2. Create and Pivot DataFrame ---
    df = pd.DataFrame(all_records)
    pivot_df = df.pivot_table(index=['task', 'metric'], columns='source', values='value')
    other_sources = sorted([s for s in json_sources if s != baseline_source_name])
    valid_columns = [baseline_source_name] + [s for s in other_sources if s in pivot_df.columns]
    pivot_df = pivot_df[valid_columns]

    # --- 3. Build LaTeX String with Direct Cell Coloring ---
    latex_parts = [
        "% Add this to your LaTeX preamble: \\usepackage[table]{xcolor} \\usepackage{multirow} \\usepackage{booktabs}",
        "\\begin{table}[htbp!]",
        "\\centering",
        "\\caption{Comparison of evaluation results. Colors relative to baseline.}",
        "\\label{tab:generated_comparison}"
    ]
    source_names = pivot_df.columns.tolist()
    column_format = "l|l|" + "r" * len(source_names)
    latex_parts.append(f"\\begin{{tabular}}{{{column_format}}}")
    latex_parts.append("\\toprule")
    
    header_cols = ["Task", "Metric"] + source_names
    latex_parts.append(" & ".join(header_cols) + " \\\\")
    latex_parts.append("\\midrule")

    # Table Body
    for i, (task_name, group) in enumerate(pivot_df.groupby(level='task', sort=True)):
        num_metrics = len(group)
        for j, ((_, metric_name), row) in enumerate(group.sort_index().iterrows()):
            
            # --- CORRECTED LOGIC FOR PERCENTAGE CONVERSION ---
            def format_value(val, metric):
                if pd.isna(val):
                    return val
                # Check if the metric is one of the exceptions.
                is_exception = any(ex.lower() in metric.lower() for ex in exception_to_percentage)
                # If it's NOT an exception, multiply by 100.
                if not is_exception:
                    return val * 100
                return val

            baseline_val = format_value(row[baseline_source_name], metric_name)
            
            # Start the row with the uncolored baseline value
            formatted_cells = [f"{baseline_val:.{precision}f}" if pd.notna(baseline_val) else '---']

            # Process each of the other sources for coloring
            for source in other_sources:
                compare_val = format_value(row.get(source), metric_name)
                
                if pd.notna(baseline_val) and pd.notna(compare_val):
                    # For delta calculation, use original non-percentage values for better scaling
                    original_baseline = row[baseline_source_name]
                    original_compare = row.get(source)
                    delta = (original_compare - original_baseline)
                    
                    # --- Heatmap Logic ---
                    max_delta_for_color = 0.1 
                    normalized_delta = min(abs(delta) / max_delta_for_color, 1.0)
                    intensity = int(normalized_delta * 60)

                    is_good = (delta < 0) if "Perplexity" in metric_name else (delta > 0)
                    color = "green" if is_good else "red"
                    
                    cell_str = (f"\\cellcolor{{{color}!{intensity}}}"
                                f"{{{compare_val:.{precision}f}}}")
                    formatted_cells.append(cell_str)
                else:
                    formatted_cells.append('---')
            
            # Join all cells for the final row string
            row_content = " & ".join(formatted_cells)
            if j == 0:
                line = f"\\multirow[c]{{{num_metrics}}}{{*}}{{{task_name}}} & {metric_name} & {row_content} \\\\"
            else:
                line = f" & {metric_name} & {row_content} \\\\"
            latex_parts.append(line)
        
        if i < len(pivot_df.index.get_level_values('task').unique()) - 1:
            latex_parts.append(f"\\cmidrule{{1-{len(header_cols)}}}")

    latex_parts.extend(["\\bottomrule", "\\end{tabular}", "\\end{table}"])
    return "\n".join(latex_parts), df


In [89]:
mtp_json_file = "top_evals/zaydzuhri__mtp-340M-4096-model/results_2025-08-20T11-12-35.491684.json"
vanilla_json_file = "top_evals/zaydzuhri__vanilla-340M-4096-model/results_2025-08-20T11-23-06.323725.json"
top_json_file = "top_evals/zaydzuhri__myopic-340M-4096-model/results_2025-08-20T11-20-54.956874.json"

latex_output, df = generate_latex_comparison_table(
    {"NTP" : vanilla_json_file, 
     "MTP" : mtp_json_file,
     "TOP" : top_json_file}, 
     baseline_source_name="NTP",
)
print(latex_output)

% Add this to your LaTeX preamble: \usepackage[table]{xcolor} \usepackage{multirow} \usepackage{booktabs}
\begin{table}[htbp!]
\centering
\caption{Comparison of evaluation results. Colors relative to baseline.}
\label{tab:generated_comparison}
\begin{tabular}{l|l|rrr}
\toprule
Task & Metric & NTP & MTP & TOP \\
\midrule
\multirow[c]{2}{*}{Arc Challenge} & Acc & 26.54 & \cellcolor{green!9}{28.07} & \cellcolor{green!9}{28.07} \\
 & Acc Norm & 28.84 & \cellcolor{green!6}{29.86} & \cellcolor{green!3}{29.35} \\
\cmidrule{1-5}
\multirow[c]{2}{*}{Arc Easy} & Acc & 60.23 & \cellcolor{green!21}{63.80} & \cellcolor{green!18}{63.26} \\
 & Acc Norm & 56.52 & \cellcolor{green!11}{58.38} & \cellcolor{green!10}{58.29} \\
\cmidrule{1-5}
\multirow[c]{2}{*}{Hellaswag} & Acc & 35.52 & \cellcolor{red!0}{35.38} & \cellcolor{red!0}{35.43} \\
 & Acc Norm & 42.53 & \cellcolor{green!1}{42.73} & \cellcolor{green!6}{43.57} \\
\cmidrule{1-5}
\multirow[c]{2}{*}{Lambada} & Acc & 36.35 & \cellcolor{red!6}{35.32} & \

In [90]:
df[df['metric'] == 'Acc Norm'].groupby('source')['value'].mean()

source
MTP    0.549709
NTP    0.538886
TOP    0.557166
Name: value, dtype: float64

In [105]:
from glob import glob

eval_logs_folder = "top_evals"
all_json_files = glob(f"{eval_logs_folder}/*/*.json")
print(all_json_files)

['top_evals/zaydzuhri__myopic-1.8B-4096-model/results_2025-08-20T11-59-09.690564.json', 'top_evals/zaydzuhri__mtp-1.8B-4096-model/results_2025-08-20T12-31-13.727702.json', 'top_evals/zaydzuhri__vanilla-1.8B-4096-model/results_2025-08-20T12-34-56.285949.json', 'top_evals/zaydzuhri__vanilla-340M-4096-model/results_2025-08-20T11-23-06.323725.json', 'top_evals/zaydzuhri__mtp-340M-4096-model/results_2025-08-20T11-12-35.491684.json', 'top_evals/zaydzuhri__top-7B-4096-model/results_2025-08-20T10-46-34.638307.json', 'top_evals/zaydzuhri__myopic-340M-4096-model/results_2025-08-20T11-20-54.956874.json', 'top_evals/zaydzuhri__vanilla-7B-4096-model/results_2025-08-20T14-15-37.501701.json', 'top_evals/zaydzuhri__mtp-7B-4096-model/results_2025-08-20T10-59-16.724946.json']


In [77]:
print(mtp_json_file)
print(vanilla_json_file)
print(top_json_file)

top_evals/zaydzuhri__mtp-1.8B-4096-model/results_2025-08-20T12-31-13.727702.json
top_evals/zaydzuhri__vanilla-1.8B-4096-model/results_2025-08-20T12-34-56.285949.json
top_evals/zaydzuhri__myopic-1.8B-4096-model/results_2025-08-20T11-59-09.690564.json


In [76]:
size = "1.8B"

for json_file in all_json_files:
    if size in json_file:
        if "mtp" in json_file:
            mtp_json_file = json_file
        elif "vanilla" in json_file:
            vanilla_json_file = json_file
        elif "myopic" in json_file or "top" in json_file:
            top_json_file = json_file

latex_output, df = generate_latex_comparison_table(
    {"NTP" : vanilla_json_file, 
     "MTP" : mtp_json_file,
     "Top" : top_json_file}, 
     baseline_source_name="NTP",
)
print(latex_output)

% Add this to your LaTeX preamble: \usepackage[table]{xcolor} \usepackage{multirow} \usepackage{booktabs}
\begin{table}[htbp!]
\centering
\caption{Comparison of evaluation results. Colors relative to baseline.}
\label{tab:generated_comparison}
\begin{tabular}{l|l|rrr}
\toprule
Task & Metric & NTP & MTP & Top \\
\midrule
\multirow[c]{2}{*}{Arc Challenge} & Acc & 35.58 & \cellcolor{green!16}{38.40} & \cellcolor{green!22}{39.25} \\
 & Acc Norm & 38.65 & \cellcolor{green!11}{40.61} & \cellcolor{green!22}{42.32} \\
\cmidrule{1-5}
\multirow[c]{2}{*}{Arc Easy} & Acc & 72.81 & \cellcolor{red!0}{72.69} & \cellcolor{green!4}{73.48} \\
 & Acc Norm & 67.05 & \cellcolor{green!21}{70.66} & \cellcolor{green!18}{70.12} \\
\cmidrule{1-5}
\multirow[c]{2}{*}{Hellaswag} & Acc & 46.03 & \cellcolor{red!8}{44.61} & \cellcolor{red!1}{45.75} \\
 & Acc Norm & 60.05 & \cellcolor{red!10}{58.29} & \cellcolor{green!2}{60.45} \\
\cmidrule{1-5}
\multirow[c]{2}{*}{Lambada} & Acc & 49.58 & \cellcolor{red!9}{47.93} & \c

In [None]:
print(df[df["metric"] == "Acc Norm"].head())

   source           task    metric     value
1     NTP  Arc Challenge  Acc Norm  0.386519
3     NTP       Arc Easy  Acc Norm  0.670455
5     NTP      Hellaswag  Acc Norm  0.600478
10    NTP           Piqa  Acc Norm  0.735038
12    NTP           Sciq  Acc Norm  0.864000


In [88]:
df[df['metric'] == 'Acc Norm'].groupby('source')['value'].mean()

source
MTP    0.659666
NTP    0.651298
Top    0.669883
Name: value, dtype: float64

In [None]:
size = "7B"

for json_file in all_json_files:
    if size in json_file:
        if "mtp" in json_file:
            mtp_json_file = json_file
        elif "vanilla" in json_file:
            vanilla_json_file = json_file
        elif "myopic" in json_file or "top" in json_file:
            top_json_file = json_file

latex_output, df = generate_latex_comparison_table(
    {"NTP" : vanilla_json_file, 
     "MTP" : mtp_json_file,
     "TOP" : top_json_file}, 
     baseline_source_name="NTP",
)
print(latex_output)

% Add this to your LaTeX preamble: \usepackage[table]{xcolor} \usepackage{multirow} \usepackage{booktabs}
\begin{table}[htbp!]
\centering
\caption{Comparison of evaluation results. Colors relative to baseline.}
\label{tab:generated_comparison}
\begin{tabular}{l|l|rrr}
\toprule
Task & Metric & NTP & MTP & TOP \\
\midrule
\multirow[c]{2}{*}{Arc Challenge} & Acc & 45.05 & \cellcolor{red!8}{43.69} & \cellcolor{red!5}{44.20} \\
 & Acc Norm & 45.48 & \cellcolor{green!0}{45.56} & \cellcolor{green!5}{46.42} \\
\cmidrule{1-5}
\multirow[c]{2}{*}{Arc Easy} & Acc & 77.31 & \cellcolor{green!2}{77.69} & \cellcolor{green!4}{78.03} \\
 & Acc Norm & 74.07 & \cellcolor{red!1}{73.86} & \cellcolor{green!3}{74.62} \\
\cmidrule{1-5}
\multirow[c]{2}{*}{Hellaswag} & Acc & 50.95 & \cellcolor{red!8}{49.58} & \cellcolor{green!3}{51.53} \\
 & Acc Norm & 67.43 & \cellcolor{red!9}{65.85} & \cellcolor{green!7}{68.73} \\
\cmidrule{1-5}
\multirow[c]{2}{*}{Lambada} & Acc & 55.89 & \cellcolor{red!16}{53.13} & \cellcolor

# Create Plot Using Small Number at Bottom Corner

In [123]:
import json
import pandas as pd

exception_to_percentage = [
    "Perplexity",
    "Bits"
]

def generate_latex_comparison_table(
        json_sources, 
        baseline_source_name, 
        precision=2, 
        size: str | None = None,
        add_metric_column: bool = True
    ):
    """
    Parses multiple JSON results and generates a LaTeX comparison table. Each
    non-baseline cell contains the score and a small, colored delta indicating
    the change from the baseline.

    Args:
        json_sources (dict): A dictionary where keys are source names (e.g., "Model A")
                             and values are the JSON path.
        baseline_source_name (str): The name of the source to use as the baseline
                                    for calculating the delta.
        precision (int): The number of decimal places for the scores.
        size (str | None): Optional size parameter to include in the caption.
        add_metric_column (bool): If True, includes the 'Metric' column.

    Returns:
        str: A string containing the generated LaTeX table.
        pd.DataFrame: The processed pandas DataFrame.
    """
    if baseline_source_name not in json_sources:
        return "Error: Baseline source name not found in json_sources."

    # --- 1. Parse all JSONs into a list of records ---
    all_records = []
    for source_name, json_path in json_sources.items():
        try:
            with open(json_path, 'r') as file:
                data = json.load(file)
            results = data.get("results", {})
        except (FileNotFoundError, json.JSONDecodeError) as e:
            print(f"Warning: Could not read or parse file for source '{source_name}' at {json_path}. Error: {e}. Skipping.")
            continue

        for task_name, task_data in results.items():
            for key, value in task_data.items():
                if "alias" in key or "stderr" in key:
                    continue
                try:
                    numeric_value = float(value)
                except (ValueError, TypeError):
                    continue
                all_records.append({
                    "source": source_name,
                    "task": task_name.replace('_', ' ').replace('openai', '').strip().title(),
                    "metric": key.split(',')[0].replace('_', ' ').title(),
                    "value": numeric_value
                })

    if not all_records:
        return "No valid data found to generate a table."

    # --- 2. Create and Pivot DataFrame ---
    df = pd.DataFrame(all_records)
    pivot_df = df.pivot_table(index=['task', 'metric'], columns='source', values='value')
    other_sources = sorted([s for s in json_sources if s != baseline_source_name])
    valid_columns = [baseline_source_name] + [s for s in other_sources if s in pivot_df.columns]
    pivot_df = pivot_df[valid_columns]

    # --- 3. Build LaTeX String with Score and Small Delta ---
    latex_parts = [
        "% Add this to your LaTeX preamble: \\usepackage[table]{xcolor} \\usepackage{multirow} \\usepackage{booktabs}",
        "\\begin{table}[htbp!]",
        "\\centering",
        f"\\caption{{Comparison of evaluation results{' for size ' + size if size else ''}. Deltas relative to baseline.}}",
        "\\label{tab:generated_comparison}"
    ]
    source_names = pivot_df.columns.tolist()
    
    column_format = "l|"
    if add_metric_column:
        column_format += "l|"
    column_format += "r" * len(source_names)
    
    latex_parts.append(f"\\begin{{tabular}}{{{column_format}}}")
    latex_parts.append("\\toprule")
    
    header_cols = ["Task"]
    if add_metric_column:
        header_cols.append("Metric")
    header_cols += source_names
    latex_parts.append(" & ".join(header_cols) + " \\\\")
    latex_parts.append("\\midrule")

    # Table Body
    for i, (task_name, group) in enumerate(pivot_df.groupby(level='task', sort=True)):
        num_metrics = len(group)
        for j, ((_, metric_name), row) in enumerate(group.sort_index().iterrows()):
            
            def format_value(val, metric):
                if pd.isna(val): return val
                is_exception = any(ex.lower() in metric.lower() for ex in exception_to_percentage)
                return val * 100 if not is_exception else val

            baseline_val = format_value(row[baseline_source_name], metric_name)
            value_cells = [f"{baseline_val:.{precision}f}" if pd.notna(baseline_val) else '---']

            for source in other_sources:
                compare_val = format_value(row.get(source), metric_name)
                
                if pd.notna(baseline_val) and pd.notna(compare_val):
                    delta = compare_val - baseline_val
                    is_neutral = abs(delta) < 1e-6

                    if is_neutral:
                        cell_str = f"{{{compare_val:.{precision}f}}}"
                    else:
                        is_good = (delta > 0) if not any(ex.lower() in metric_name.lower() for ex in exception_to_percentage) else (delta < 0)
                        color = "green!70!black" if is_good else "red!70!black"
                        delta_part = f"\\ \\textsubscript{{\\textcolor{{{color}}}{{{delta:+.{precision}f}}}}}"
                        cell_str = f"{{{compare_val:.{precision}f}}}{delta_part}"
                    
                    value_cells.append(cell_str)
                else:
                    value_cells.append('---')
            
            row_content = " & ".join(value_cells)
            
            # --- FIXED LOGIC ---
            if add_metric_column:
                if j == 0: # First row of a task group
                    line = f"\\multirow[c]{{{num_metrics}}}{{*}}{{{task_name}}} & {metric_name} & {row_content} \\\\"
                else: # Subsequent rows
                    line = f" & {metric_name} & {row_content} \\\\"
            else: # If not adding metric column
                if j == 0:
                    line = f"\\multirow[c]{{{num_metrics}}}{{*}}{{{task_name}}} & {row_content} \\\\"
                else:
                    line = f" & {row_content} \\\\"
            
            latex_parts.append(line)
        
        if i < len(pivot_df.index.get_level_values('task').unique()) - 1:
            latex_parts.append(f"\\cmidrule{{1-{len(header_cols)}}}")

    latex_parts.extend(["\\bottomrule", "\\end{tabular}", "\\end{table}"])
    return "\n".join(latex_parts), df


In [124]:
def create_plot_specify_size(size: str, add_metric_column: bool = True) -> pd.DataFrame:
    for json_file in all_json_files:
        if size in json_file:
            if "mtp" in json_file:
                mtp_json_file = json_file
            elif "vanilla" in json_file:
                vanilla_json_file = json_file
            elif "myopic" in json_file or "top" in json_file:
                top_json_file = json_file

    latex_output, df = generate_latex_comparison_table(
        {"NTP" : vanilla_json_file, 
        "MTP" : mtp_json_file,
        "TOP" : top_json_file}, 
        baseline_source_name="NTP",
        size=size,
        add_metric_column=add_metric_column
    )
    print(latex_output)

    return df

df = create_plot_specify_size("7B", False)


% Add this to your LaTeX preamble: \usepackage[table]{xcolor} \usepackage{multirow} \usepackage{booktabs}
\begin{table}[htbp!]
\centering
\caption{Comparison of evaluation results for size 7B. Deltas relative to baseline.}
\label{tab:generated_comparison}
\begin{tabular}{l|rrr}
\toprule
Task & NTP & MTP & TOP \\
\midrule
\multirow[c]{2}{*}{Arc Challenge} & 45.05 & {43.69}\ \textsubscript{\textcolor{red!70!black}{-1.37}} & {44.20}\ \textsubscript{\textcolor{red!70!black}{-0.85}} \\
 & 45.65 & {45.56}\ \textsubscript{\textcolor{red!70!black}{-0.09}} & {46.42}\ \textsubscript{\textcolor{green!70!black}{+0.77}} \\
\cmidrule{1-4}
\multirow[c]{2}{*}{Arc Easy} & 77.31 & {77.69}\ \textsubscript{\textcolor{green!70!black}{+0.38}} & {78.03}\ \textsubscript{\textcolor{green!70!black}{+0.72}} \\
 & 74.03 & {73.86}\ \textsubscript{\textcolor{red!70!black}{-0.17}} & {74.62}\ \textsubscript{\textcolor{green!70!black}{+0.59}} \\
\cmidrule{1-4}
\multirow[c]{2}{*}{Hellaswag} & 50.96 & {49.58}\ \textsubs

# New Table With Combined Tasks

In [None]:
import json
import pandas as pd

exception_to_percentage = [
    "Perplexity",
    "Bits"
]

def generate_latex_comparison_table(
        json_sources, 
        baseline_source_name, 
        precision=2, 
        size: str | None = None,
        add_metric_column: bool = True
    ):
    """
    Parses multiple JSON results and generates a LaTeX comparison table. Each
    non-baseline cell contains the score and a small, colored delta indicating
    the change from the baseline.

    Args:
        json_sources (dict): A dictionary where keys are source names (e.g., "Model A")
                             and values are the JSON path.
        baseline_source_name (str): The name of the source to use as the baseline
                                    for calculating the delta.
        precision (int): The number of decimal places for the scores.
        size (str | None): Optional size parameter to include in the caption.
        add_metric_column (bool): If True, includes the 'Metric' column.

    Returns:
        str: A string containing the generated LaTeX table.
        pd.DataFrame: The processed pandas DataFrame.
    """
    if baseline_source_name not in json_sources:
        return "Error: Baseline source name not found in json_sources."

    # --- 1. Parse all JSONs into a list of records ---
    all_records = []
    for source_name, json_path in json_sources.items():
        try:
            with open(json_path, 'r') as file:
                data = json.load(file)
            results = data.get("results", {})
        except (FileNotFoundError, json.JSONDecodeError) as e:
            print(f"Warning: Could not read or parse file for source '{source_name}' at {json_path}. Error: {e}. Skipping.")
            continue

        for task_name, task_data in results.items():
            for key, value in task_data.items():
                if "alias" in key or "stderr" in key:
                    continue
                try:
                    numeric_value = float(value)
                except (ValueError, TypeError):
                    continue
                all_records.append({
                    "source": source_name,
                    "task": task_name.replace('_', ' ').replace('openai', '').strip().title(),
                    "metric": key.split(',')[0].replace('_', ' ').title(),
                    "value": numeric_value
                })

    if not all_records:
        return "No valid data found to generate a table."

    # --- 2. Create and Pivot DataFrame ---
    df = pd.DataFrame(all_records)
    # combined_task = ["Arc Challenge", "Arc Easy", "Hellaswag", "Piqa", "Sciq"]
    # remove_acc_norm = df.drop(df[df["task"].isin(combined_task)].index)
    # new_values = df[df["task"].isin(combined_task)][(df["metric"] == "Acc Norm")].groupby("source")["value"].mean().to_dict()
    # new_rows = [
    #     {
    #         "source": key,
    #         "task": "Combined",
    #         "metric": "Acc Norm",
    #         "value": value
    #     }
    #     for key, value in new_values.items()
    # ]
    # concat_df = pd.concat([remove_acc_norm, pd.DataFrame(new_rows)], ignore_index=True)
    # df = concat_df
    
    # Filter out specific metrics
    all_metric = set(df["metric"].unique()) - {"Word Perplexity", "Byte Perplexity"}
    df = df[df["metric"].isin(all_metric)]

    pivot_df = df.pivot_table(index=['task', 'metric'], columns='source', values='value')
    other_sources = sorted([s for s in json_sources if s != baseline_source_name])
    valid_columns = [baseline_source_name] + [s for s in other_sources if s in pivot_df.columns]
    pivot_df = pivot_df[valid_columns]

    # --- 3. Build LaTeX String with Score and Small Delta ---
    latex_parts = [
        "% Add this to your LaTeX preamble: \\usepackage[table]{xcolor} \\usepackage{multirow} \\usepackage{booktabs}",
        "\\begin{table}[htbp!]",
        "\\centering",
        f"\\caption{{Comparison of evaluation results{' for size ' + size if size else ''}. Deltas relative to baseline.}}",
        "\\label{tab:generated_comparison}"
    ]
    
    # --- DUAL COLUMN SETUP ---
    column_format = "l|"
    if add_metric_column:
        column_format += "l|"
    column_format += "r" # Baseline column
    if other_sources:
        # For each other source, create two columns with a small space between them
        column_format += "r@{\\hspace{3pt}}l" * len(other_sources)

    latex_parts.append(f"\\begin{{tabular}}{{{column_format}}}")
    latex_parts.append("\\toprule")
    
    # Create header with multicolumn for non-baseline sources
    header_parts = ["Task"]
    if add_metric_column:
        header_parts.append("Metric")
    header_parts.append(baseline_source_name)
    for source in other_sources:
        header_parts.append(f"\\multicolumn{{2}}{{c}}{{{source}}}")
    latex_parts.append(" & ".join(header_parts) + " \\\\")
    latex_parts.append("\\midrule")

    # Table Body
    for i, (task_name, group) in enumerate(pivot_df.groupby(level='task', sort=True)):
        num_metrics = len(group)
        for j, ((_, metric_name), row) in enumerate(group.sort_index().iterrows()):
            
            def format_value(val, metric):
                if pd.isna(val): return val
                is_exception = any(ex.lower() in metric.lower() for ex in exception_to_percentage)
                return val * 100 if not is_exception else val

            baseline_val = format_value(row[baseline_source_name], metric_name)
            value_cells = [f"{baseline_val:.{precision}f}" if pd.notna(baseline_val) else '---']

            for source in other_sources:
                compare_val = format_value(row.get(source), metric_name)
                
                if pd.notna(baseline_val) and pd.notna(compare_val):
                    delta = compare_val - baseline_val
                    rounded_delta = round(delta, precision)

                    score_cell = f"{{{compare_val:.{precision}f}}}"
                    delta_cell = "" # Default to empty

                    if rounded_delta != 0.0:
                        is_good = (rounded_delta > 0) if not any(ex.lower() in metric_name.lower() for ex in exception_to_percentage) else (rounded_delta < 0)
                        color = "green!70!black" if is_good else "red!70!black"
                        delta_cell = f"\\textsubscript{{\\textcolor{{{color}}}{{{rounded_delta:+.{precision}f}}}}}"
                    
                    value_cells.append(score_cell)
                    value_cells.append(delta_cell)
                else:
                    value_cells.append('---')
                    value_cells.append('') # Empty cell for delta column
            
            row_content = " & ".join(value_cells)
            
            if add_metric_column:
                if j == 0:
                    line = f"\\multirow[c]{{{num_metrics}}}{{*}}{{{task_name}}} & {metric_name} & {row_content} \\\\"
                else:
                    line = f" & {metric_name} & {row_content} \\\\"
            else:
                if j == 0:
                    line = f"\\multirow[c]{{{num_metrics}}}{{*}}{{{task_name}}} & {row_content} \\\\"
                else:
                    line = f" & {row_content} \\\\"
            
            latex_parts.append(line)
        
        # Calculate total columns for cmidrule
        total_cols = 1 + (1 if add_metric_column else 0) + 1 + (2 * len(other_sources))
        if i < len(pivot_df.index.get_level_values('task').unique()) - 1:
            latex_parts.append(f"\\cmidrule{{1-{total_cols}}}")

    latex_parts.extend(["\\bottomrule", "\\end{tabular}", "\\end{table}"])
    return "\n".join(latex_parts), df


In [19]:
def create_plot_specify_size(size: str, add_metric_column: bool = True) -> pd.DataFrame:
    for json_file in all_json_files:
        if size in json_file:
            if "mtp" in json_file:
                mtp_json_file = json_file
            elif "vanilla" in json_file:
                vanilla_json_file = json_file
            elif "myopic" in json_file or "top" in json_file:
                top_json_file = json_file

    latex_output, df = generate_latex_comparison_table(
        {"NTP" : vanilla_json_file, 
        "MTP" : mtp_json_file,
        "TOP" : top_json_file}, 
        baseline_source_name="NTP",
        size=size,
        add_metric_column=add_metric_column
    )
    print(latex_output)

    return df

df = create_plot_specify_size("1.8B", False)


% Add this to your LaTeX preamble: \usepackage[table]{xcolor} \usepackage{multirow} \usepackage{booktabs}
\begin{table}[htbp!]
\centering
\caption{Comparison of evaluation results for size 1.8B. Deltas relative to baseline.}
\label{tab:generated_comparison}
\begin{tabular}{l|rr@{\hspace{3pt}}lr@{\hspace{3pt}}l}
\toprule
Task & NTP & \multicolumn{2}{c}{MTP} & \multicolumn{2}{c}{TOP} \\
\midrule
\multirow[c]{1}{*}{Combined} & 65.13 & {65.97} & \textsubscript{\textcolor{green!70!black}{+0.84}} & {66.99} & \textsubscript{\textcolor{green!70!black}{+1.86}} \\
\cmidrule{1-6}
\multirow[c]{2}{*}{Lambada} & 49.58 & {47.93} & \textsubscript{\textcolor{red!70!black}{-1.65}} & {50.34} & \textsubscript{\textcolor{green!70!black}{+0.76}} \\
 & 11.38 & {13.69} & \textsubscript{\textcolor{red!70!black}{+2.31}} & {11.19} & \textsubscript{\textcolor{green!70!black}{-0.19}} \\
\cmidrule{1-6}
\multirow[c]{1}{*}{Nq Open} & 4.54 & {4.46} & \textsubscript{\textcolor{red!70!black}{-0.08}} & {5.37} & \textsubs

  new_values = df[df["task"].isin(combined_task)][(df["metric"] == "Acc Norm")].groupby("source")["value"].mean().to_dict()


In [129]:
df["task"].unique()

array(['Arc Challenge', 'Arc Easy', 'Hellaswag', 'Lambada', 'Nq Open',
       'Piqa', 'Sciq', 'Social Iqa', 'Triviaqa', 'Wikitext'], dtype=object)

In [136]:
combined_task = ["Arc Challenge", "Arc Easy", "Hellaswag", "Piqa", "Sciq"]
df[df["task"].isin(combined_task)][df["metric"] == "Acc Norm"]["value"].mean()

  df[df["task"].isin(combined_task)][df["metric"] == "Acc Norm"]["value"].mean()


np.float64(0.7071827045421929)

In [146]:
remove_acc_norm = df.drop(df[df["task"].isin(combined_task)].index)
remove_acc_norm

Unnamed: 0,source,task,metric,value
6,NTP,Lambada,Perplexity,7.967722
7,NTP,Lambada,Acc,0.558898
8,NTP,Nq Open,Exact Match,0.07313
13,NTP,Social Iqa,Acc,0.443705
14,NTP,Triviaqa,Exact Match,0.242755
15,NTP,Wikitext,Word Perplexity,11.663074
16,NTP,Wikitext,Byte Perplexity,1.583067
17,NTP,Wikitext,Bits Per Byte,0.662723
24,MTP,Lambada,Perplexity,8.993055
25,MTP,Lambada,Acc,0.531341


In [None]:
# Add new row
# new_row = {
#     "source" : "Combined",
#     "task"
# }
new_values = df[df["task"].isin(combined_task)][(df["metric"] == "Acc Norm")].groupby("source")["value"].mean().to_dict()
new_rows = [
    {
        "source": key,
        "task": "Combined",
        "metric": "Acc Norm",
        "value": value
    }
    for key, value in new_values.items()
]

[{'source': 'MTP', 'task': 'Combined', 'metric': 'Acc Norm', 'value': 0.7006293719297261}, {'source': 'NTP', 'task': 'Combined', 'metric': 'Acc Norm', 'value': 0.7054061409393804}, {'source': 'TOP', 'task': 'Combined', 'metric': 'Acc Norm', 'value': 0.7155126007574719}]


  new_values = df[df["task"].isin(combined_task)][(df["metric"] == "Acc Norm")].groupby("source")["value"].mean().to_dict()


In [155]:
dropped_df = df.drop(df[df["task"].isin(combined_task)].index)

In [156]:
concat_df = pd.concat([dropped_df, pd.DataFrame(new_rows)], ignore_index=True)

In [157]:
concat_df

Unnamed: 0,source,task,metric,value
0,NTP,Lambada,Perplexity,7.967722
1,NTP,Lambada,Acc,0.558898
2,NTP,Nq Open,Exact Match,0.07313
3,NTP,Social Iqa,Acc,0.443705
4,NTP,Triviaqa,Exact Match,0.242755
5,NTP,Wikitext,Word Perplexity,11.663074
6,NTP,Wikitext,Byte Perplexity,1.583067
7,NTP,Wikitext,Bits Per Byte,0.662723
8,MTP,Lambada,Perplexity,8.993055
9,MTP,Lambada,Acc,0.531341


# Combined Combined Table

In [4]:
import json
import pandas as pd

exception_to_percentage = [
    "Perplexity",
    "Bits"
]

def generate_combined_latex_table(
        json_sources_by_size, 
        baseline_source_name, 
        precision=2, 
        add_metric_column: bool = True,
        combined: bool = False
    ):
    """
    Parses multiple JSON results across different sizes and generates a single
    combined LaTeX comparison table.

    Args:
        json_sources_by_size (dict): A dictionary where keys are sizes (e.g., "7B")
                                     and values are dictionaries of JSON paths for that size.
        baseline_source_name (str): The name of the source to use as the baseline.
        precision (int): The number of decimal places for the scores.
        add_metric_column (bool): If True, includes the 'Metric' column.

    Returns:
        str: A string containing the generated LaTeX table.
        pd.DataFrame: The processed pandas DataFrame.
    """
    if not json_sources_by_size:
        return "Error: No JSON sources provided."

    # --- 1. Parse all JSONs from all sizes into a list of records ---
    all_records = []
    for size, json_sources in json_sources_by_size.items():
        if baseline_source_name not in json_sources:
            return f"Error: Baseline source '{baseline_source_name}' not found for size '{size}'."
        for source_name, json_path in json_sources.items():
            try:
                with open(json_path, 'r') as file:
                    data = json.load(file)
                results = data.get("results", {})
            except (FileNotFoundError, json.JSONDecodeError) as e:
                print(f"Warning: Could not process file for '{size} {source_name}'. Error: {e}. Skipping.")
                continue

            for task_name, task_data in results.items():
                for key, value in task_data.items():
                    if "alias" in key or "stderr" in key:
                        continue
                    try:
                        numeric_value = float(value)
                    except (ValueError, TypeError):
                        continue
                    all_records.append({
                        "size": size,
                        "source": source_name,
                        "task": task_name.replace('_', ' ').replace('openai', '').strip().title(),
                        "metric": key.split(',')[0].replace('_', ' ').title(),
                        "value": numeric_value
                    })

    if not all_records:
        return "No valid data found to generate a table."

    # --- 2. Create and Pivot DataFrame ---
    df = pd.DataFrame(all_records)
    
    if combined:
        # Handle the "Combined" task logic
        combined_task_names = ["Arc Challenge", "Arc Easy", "Hellaswag", "Piqa", "Sciq"]
        avg_acc_norm = df[
            (df["task"].isin(combined_task_names)) & 
            (df["metric"] == "Acc Norm")
        ].groupby(["size", "source"])["value"].mean().reset_index()
        
        new_rows = []
        for _, row in avg_acc_norm.iterrows():
            new_rows.append({
                "size": row["size"],
                "source": row["source"],
                "task": "Combined",
                "metric": "Acc Norm",
                "value": row["value"]
            })
        
        # Keep original rows that are not part of the combined calculation and add the new combined rows
        df_filtered = df.drop(df[df["task"].isin(combined_task_names)].index)
        df = pd.concat([df_filtered, pd.DataFrame(new_rows)], ignore_index=True)

    # Pivot the table to create a multi-level column structure
    pivot_df = df.pivot_table(index=['task', 'metric'], columns=['size', 'source'], values='value')
    
    # Define column order
    size_order = ["340M", "1.8B", "7B"]
    model_order = [baseline_source_name] + sorted([s for s in list(json_sources_by_size.values())[0] if s != baseline_source_name])
    
    try:
        pivot_df = pivot_df.reindex(columns=pd.MultiIndex.from_product([size_order, model_order], names=['size', 'source']))
    except KeyError:
        return "Error: Mismatch in model names across different sizes."


    # --- 3. Build Combined LaTeX String ---
    latex_parts = [
        "% Add this to your LaTeX preamble: \\usepackage[table]{xcolor} \\usepackage{multirow} \\usepackage{booktabs}",
        "\\begin{table*}[htbp!]", # Use table* for full page width
        "\\centering",
        "\\small",
        "\\setlength{\\tabcolsep}{3pt}", # Reduce column padding
        "\\caption{Combined comparison of evaluation results across model sizes.}",
        "\\label{tab:combined_comparison}"
    ]
    
    num_models = len(model_order)
    num_sizes = len(size_order)
    cols_per_model = 2 # score + delta
    cols_per_size = cols_per_model * num_models

    # --- ALIGNMENT FIX ---
    # Define column format to treat every model as a pair of columns for consistent spacing.
    column_format = "l|"
    if add_metric_column:
        column_format += "l|"
    # Use r@{\hspace{2pt}}l for every model pair for a tighter look
    column_format += ("r@{\\hspace{2pt}}l" * num_models + "|") * num_sizes
    column_format = column_format[:-1] # Remove the last extra '|'

    latex_parts.append(f"\\begin{{tabular}}{{{column_format}}}")
    latex_parts.append("\\toprule")
    
    # --- Header Row 1 (Sizes) ---
    header1_parts = ["Task"]
    if add_metric_column:
        header1_parts.append("Metric")
    for size in size_order:
        # Each size group now spans a consistent number of columns
        header1_parts.append(f"\\multicolumn{{{cols_per_size}}}{{c|}}{{{size}}}")
    latex_parts.append(" & ".join(header1_parts) + " \\\\")

    # --- Header Row 2 (Models) ---
    header2_parts = [""] * (1 + (1 if add_metric_column else 0))
    for _ in size_order:
        for source in model_order:
             # Each model name spans two columns (score + delta)
             header2_parts.append(f"\\multicolumn{{{cols_per_model}}}{{c}}{{{source}}}")
    latex_parts.append(" & ".join(header2_parts) + " \\\\")
    latex_parts.append("\\midrule")

    # --- Placeholder Rows ---
    placeholder_cols = cols_per_size * num_sizes
    placeholder_text = " & ".join([""] * (placeholder_cols))
    latex_parts.append(f"Param & {placeholder_text} \\\\" if add_metric_column else f"Param {placeholder_text} \\\\")
    latex_parts.append(f"Loss & {placeholder_text} \\\\" if add_metric_column else f"Train {placeholder_text} \\\\")
    latex_parts.append("\\midrule")

    # --- Table Body ---
    for i, (task_name, group) in enumerate(pivot_df.groupby(level='task', sort=True)):
        num_metrics = len(group)
        for j, ((_, metric_name), row) in enumerate(group.sort_index().iterrows()):
            
            def format_value(val, metric):
                if pd.isna(val): return val
                is_exception = any(ex.lower() in metric.lower() for ex in exception_to_percentage)
                return val * 100 if not is_exception else val

            all_cells = []
            for size in size_order:
                for source in model_order:
                    val = format_value(row.get((size, source)), metric_name)
                    
                    if source == baseline_source_name:
                        # Baseline just gets the score and an empty delta cell
                        all_cells.append(f"{val:.{precision}f}" if pd.notna(val) else '---')
                        all_cells.append('')
                        continue

                    baseline_val = format_value(row.get((size, baseline_source_name)), metric_name)
                    if pd.notna(baseline_val) and pd.notna(val):
                        delta = val - baseline_val
                        rounded_delta = round(delta, precision)

                        score_cell = f"{{{val:.{precision}f}}}"
                        delta_cell = ""
                        if rounded_delta != 0.0:
                            is_good = (rounded_delta > 0) if not any(ex.lower() in metric_name.lower() for ex in exception_to_percentage) else (rounded_delta < 0)
                            color = "green!70!black" if is_good else "red!70!black"
                            delta_cell = f"\\textsubscript{{\\textcolor{{{color}}}{{{rounded_delta:+.{precision}f}}}}}"
                        
                        all_cells.append(score_cell)
                        all_cells.append(delta_cell)
                    else:
                        all_cells.extend(['---', ''])
            
            row_content = " & ".join(all_cells)
            
            # Construct the final row string
            line_start = ""
            if add_metric_column:
                if j == 0:
                    line_start = f"\\multirow[c]{{{num_metrics}}}{{*}}{{{task_name}}} & {metric_name}"
                else:
                    line_start = f" & {metric_name}"
            else:
                if j == 0:
                    line_start = f"\\multirow[c]{{{num_metrics}}}{{*}}{{{task_name}}}"
                else:
                    line_start = ""
            
            latex_parts.append(f"{line_start} & {row_content} \\\\")
        
        total_cols = 1 + (1 if add_metric_column else 0) + placeholder_cols
        if i < len(pivot_df.index.get_level_values('task').unique()) - 1:
            latex_parts.append(f"\\cmidrule{{1-{total_cols}}}")

    latex_parts.extend(["\\bottomrule", "\\end{tabular}", "\\end{table*}"])
    print("\n".join(latex_parts))
    return df


In [None]:
json_sources_by_size = {"340M" : {}, "1.8B" : {}, "7B" : {}}

for json_file in all_json_files:
    for key, value in json_sources_by_size.items():
        if key in json_file:
            if "mtp" in json_file:
                json_sources_by_size[key]["MTP"] = json_file
            elif "vanilla" in json_file:
                json_sources_by_size[key]["NTP"] = json_file
            elif "myopic" in json_file or "top" in json_file:
                json_sources_by_size[key]["TOP"] = json_file

print(json_sources_by_size)


{'340M': {'NTP': 'top_evals/zaydzuhri__vanilla-340M-4096-model/results_2025-08-20T11-23-06.323725.json', 'MTP': 'top_evals/zaydzuhri__mtp-340M-4096-model/results_2025-08-20T11-12-35.491684.json', 'TOP': 'top_evals/zaydzuhri__myopic-340M-4096-model/results_2025-08-20T11-20-54.956874.json'}, '1.8B': {'TOP': 'top_evals/zaydzuhri__myopic-1.8B-4096-model/results_2025-08-20T11-59-09.690564.json', 'MTP': 'top_evals/zaydzuhri__mtp-1.8B-4096-model/results_2025-08-20T12-31-13.727702.json', 'NTP': 'top_evals/zaydzuhri__vanilla-1.8B-4096-model/results_2025-08-20T12-34-56.285949.json'}, '7B': {'TOP': 'top_evals/zaydzuhri__top-7B-4096-model/results_2025-08-20T10-46-34.638307.json', 'NTP': 'top_evals/zaydzuhri__vanilla-7B-4096-model/results_2025-08-20T14-15-37.501701.json', 'MTP': 'top_evals/zaydzuhri__mtp-7B-4096-model/results_2025-08-20T10-59-16.724946.json'}}


In [6]:
df = generate_combined_latex_table(
    json_sources_by_size, 
    baseline_source_name="NTP",
    precision=2,
    add_metric_column=False
)

% Add this to your LaTeX preamble: \usepackage[table]{xcolor} \usepackage{multirow} \usepackage{booktabs}
\begin{table*}[htbp!]
\centering
\small
\setlength{\tabcolsep}{3pt}
\caption{Combined comparison of evaluation results across model sizes.}
\label{tab:combined_comparison}
\begin{tabular}{l|r@{\hspace{2pt}}lr@{\hspace{2pt}}lr@{\hspace{2pt}}l|r@{\hspace{2pt}}lr@{\hspace{2pt}}lr@{\hspace{2pt}}l|r@{\hspace{2pt}}lr@{\hspace{2pt}}lr@{\hspace{2pt}}l}
\toprule
Task & \multicolumn{6}{c|}{340M} & \multicolumn{6}{c|}{1.8B} & \multicolumn{6}{c|}{7B} \\
 & \multicolumn{2}{c}{NTP} & \multicolumn{2}{c}{MTP} & \multicolumn{2}{c}{TOP} & \multicolumn{2}{c}{NTP} & \multicolumn{2}{c}{MTP} & \multicolumn{2}{c}{TOP} & \multicolumn{2}{c}{NTP} & \multicolumn{2}{c}{MTP} & \multicolumn{2}{c}{TOP} \\
\midrule
Param  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  \\
Train  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  \\
\midrule
\multirow[c]{2}{*}{Arc Challenge} & 26.54 &  & {28.07} & \textsubscrip

In [32]:
def create_plot_specify_size(size: str, add_metric_column: bool = True) -> pd.DataFrame:
    for json_file in all_json_files:
        if size in json_file:
            if "mtp" in json_file:
                mtp_json_file = json_file
            elif "vanilla" in json_file:
                vanilla_json_file = json_file
            elif "myopic" in json_file or "top" in json_file:
                top_json_file = json_file

    latex_output, df = generate_latex_comparison_table(
        {"NTP" : vanilla_json_file, 
        "MTP" : mtp_json_file,
        "TOP" : top_json_file}, 
        baseline_source_name="NTP",
        size=size,
        add_metric_column=add_metric_column
    )
    print(latex_output)

    return df

df = create_plot_specify_size("7B", False)


% Add this to your LaTeX preamble: \usepackage[table]{xcolor} \usepackage{multirow} \usepackage{booktabs}
\begin{table}[htbp!]
\centering
\caption{Comparison of evaluation results for size 7B. Deltas relative to baseline.}
\label{tab:generated_comparison}
\begin{tabular}{l|rr@{\hspace{3pt}}lr@{\hspace{3pt}}l}
\toprule
Task & NTP & \multicolumn{2}{c}{MTP} & \multicolumn{2}{c}{TOP} \\
\midrule
\multirow[c]{1}{*}{Combined} & 70.54 & {70.06} & \textsubscript{\textcolor{red!70!black}{-0.48}} & {71.55} & \textsubscript{\textcolor{green!70!black}{+1.01}} \\
\cmidrule{1-6}
\multirow[c]{2}{*}{Lambada} & 55.89 & {53.13} & \textsubscript{\textcolor{red!70!black}{-2.76}} & {57.03} & \textsubscript{\textcolor{green!70!black}{+1.14}} \\
 & 7.97 & {8.99} & \textsubscript{\textcolor{red!70!black}{+1.03}} & {7.64} & \textsubscript{\textcolor{green!70!black}{-0.32}} \\
\cmidrule{1-6}
\multirow[c]{1}{*}{Nq Open} & 7.31 & {7.40} & \textsubscript{\textcolor{green!70!black}{+0.08}} & {7.70} & \textsubscript

  new_values = df[df["task"].isin(combined_task)][(df["metric"] == "Acc Norm")].groupby("source")["value"].mean().to_dict()


# Plot with Ordering and Filtering

In [7]:
import json
import pandas as pd

exception_to_percentage = [
    "Perplexity",
    "Bits"
]

def generate_combined_latex_table(
        json_sources_by_size, 
        baseline_source_name, 
        task_metric_order=None,
        precision=2, 
        add_metric_column: bool = True,
        combined: bool = False
    ):
    """
    Parses multiple JSON results and generates a single combined LaTeX comparison table.
    Allows for specifying the tasks, metrics, and their order in the final table.

    Args:
        json_sources_by_size (dict): A dictionary where keys are sizes (e.g., "7B")
                                     and values are dictionaries of JSON paths for that size.
        baseline_source_name (str): The name of the source to use as the baseline.
        task_metric_order (list, optional): A list of tuples to specify the tasks, 
                                            metrics, and their order. If None, all tasks/metrics
                                            are included. 
                                            Example:
                                            [
                                                ("Lambada", ["Acc", "Perplexity"]),
                                                ("Hellaswag", ["Acc Norm"]),
                                                ("Arc Challenge", ["Acc Norm"]),
                                            ]
        precision (int): The number of decimal places for the scores.
        add_metric_column (bool): If True, includes the 'Metric' column.
        combined (bool): If True, calculates a "Combined" task average from specific tasks.

    Returns:
        str: A string containing the generated LaTeX table.
        pd.DataFrame: The processed pandas DataFrame.
    """
    if not json_sources_by_size:
        return "Error: No JSON sources provided."

    # --- 1. Parse all JSONs from all sizes into a list of records ---
    all_records = []
    for size, json_sources in json_sources_by_size.items():
        if baseline_source_name not in json_sources:
            return f"Error: Baseline source '{baseline_source_name}' not found for size '{size}'."
        for source_name, json_path in json_sources.items():
            try:
                with open(json_path, 'r') as file:
                    data = json.load(file)
                results = data.get("results", {})
            except (FileNotFoundError, json.JSONDecodeError) as e:
                print(f"Warning: Could not process file for '{size} {source_name}'. Error: {e}. Skipping.")
                continue

            for task_name, task_data in results.items():
                for key, value in task_data.items():
                    if "alias" in key or "stderr" in key:
                        continue
                    try:
                        numeric_value = float(value)
                    except (ValueError, TypeError):
                        continue
                    all_records.append({
                        "size": size,
                        "source": source_name,
                        "task": task_name.replace('_', ' ').replace('openai', '').strip().title(),
                        "metric": key.split(',')[0].replace('_', ' ').title(),
                        "value": numeric_value
                    })

    if not all_records:
        return "No valid data found to generate a table."

    # --- 2. Create DataFrame and Apply Custom Filtering/Ordering ---
    df = pd.DataFrame(all_records)

    if task_metric_order:
        # Create a mapping of (task, metric) to a sorting index
        order_map = {}
        i = 0
        for task, metrics in task_metric_order:
            for metric in metrics:
                order_map[(task, metric)] = i
                i += 1
        
        # Filter the DataFrame to only include the specified tasks and metrics
        df = df[df.apply(lambda row: (row['task'], row['metric']) in order_map, axis=1)].copy()
        
        # Add a temporary column for sorting and then sort the DataFrame
        df['sort_order'] = df.apply(lambda row: order_map.get((row['task'], row['metric'])), axis=1)
        df = df.sort_values('sort_order').drop(columns=['sort_order'])

    if combined:
        # Handle the "Combined" task logic
        combined_task_names = ["Arc Challenge", "Arc Easy", "Hellaswag", "Piqa", "Sciq"]
        avg_acc_norm = df[
            (df["task"].isin(combined_task_names)) & 
            (df["metric"] == "Acc Norm")
        ].groupby(["size", "source"])["value"].mean().reset_index()
        
        new_rows = []
        for _, row in avg_acc_norm.iterrows():
            new_rows.append({
                "size": row["size"],
                "source": row["source"],
                "task": "Combined",
                "metric": "Acc Norm",
                "value": row["value"]
            })
        
        # Keep original rows that are not part of the combined calculation and add the new combined rows
        df_filtered = df.drop(df[df["task"].isin(combined_task_names)].index)
        df = pd.concat([df_filtered, pd.DataFrame(new_rows)], ignore_index=True)

    # Pivot the table, ensuring the sort order from above is maintained
    pivot_df = df.pivot_table(index=['task', 'metric'], columns=['size', 'source'], values='value', sort=False)
    
    # Define column order
    size_order = ["340M", "1.8B", "7B"]
    model_order = [baseline_source_name] + sorted([s for s in list(json_sources_by_size.values())[0] if s != baseline_source_name])
    
    try:
        pivot_df = pivot_df.reindex(columns=pd.MultiIndex.from_product([size_order, model_order], names=['size', 'source']))
    except KeyError:
        return "Error: Mismatch in model names across different sizes."


    # --- 3. Build Combined LaTeX String ---
    latex_parts = [
        "% Add this to your LaTeX preamble: \\usepackage[table]{xcolor} \\usepackage{multirow} \\usepackage{booktabs}",
        "\\begin{table*}[htbp!]", # Use table* for full page width
        "\\centering",
        "\\small",
        "\\setlength{\\tabcolsep}{3pt}", # Reduce column padding
        "\\caption{Combined comparison of evaluation results across model sizes.}",
        "\\label{tab:combined_comparison}"
    ]
    
    num_models = len(model_order)
    num_sizes = len(size_order)
    cols_per_model = 2 # score + delta
    cols_per_size = cols_per_model * num_models

    # Define column format
    column_format = "l|"
    if add_metric_column:
        column_format += "l|"
    column_format += ("r@{\\hspace{2pt}}l" * num_models + "|") * num_sizes
    column_format = column_format[:-1] # Remove the last extra '|'

    latex_parts.append(f"\\begin{{tabular}}{{{column_format}}}")
    latex_parts.append("\\toprule")
    
    # Header Row 1 (Sizes)
    header1_parts = ["Task"]
    if add_metric_column:
        header1_parts.append("Metric")
    for size in size_order:
        header1_parts.append(f"\\multicolumn{{{cols_per_size}}}{{c|}}{{{size}}}")
    latex_parts.append(" & ".join(header1_parts) + " \\\\")

    # Header Row 2 (Models)
    header2_parts = [""] * (1 + (1 if add_metric_column else 0))
    for _ in size_order:
        for source in model_order:
              header2_parts.append(f"\\multicolumn{{{cols_per_model}}}{{c}}{{{source}}}")
    latex_parts.append(" & ".join(header2_parts) + " \\\\")
    latex_parts.append("\\midrule")

    # Placeholder Rows
    placeholder_cols = cols_per_size * num_sizes
    placeholder_text = " & ".join([""] * (placeholder_cols))
    latex_parts.append(f"Param & {placeholder_text} \\\\" if add_metric_column else f"Param {placeholder_text} \\\\")
    latex_parts.append(f"Loss & {placeholder_text} \\\\" if add_metric_column else f"Train {placeholder_text} \\\\")
    latex_parts.append("\\midrule")

    # --- Table Body ---
    # Group by task, maintaining the custom order by setting sort=False
    for i, (task_name, group) in enumerate(pivot_df.groupby(level='task', sort=False)):
        num_metrics = len(group)
        for j, ((_, metric_name), row) in enumerate(group.sort_index().iterrows()):
            
            def format_value(val, metric):
                if pd.isna(val): return val
                is_exception = any(ex.lower() in metric.lower() for ex in exception_to_percentage)
                return val * 100 if not is_exception else val

            all_cells = []
            for size in size_order:
                for source in model_order:
                    val = format_value(row.get((size, source)), metric_name)
                    
                    if source == baseline_source_name:
                        # Baseline just gets the score and an empty delta cell
                        all_cells.append(f"{val:.{precision}f}" if pd.notna(val) else '---')
                        all_cells.append('')
                        continue

                    baseline_val = format_value(row.get((size, baseline_source_name)), metric_name)
                    if pd.notna(baseline_val) and pd.notna(val):
                        delta = val - baseline_val
                        rounded_delta = round(delta, precision)

                        score_cell = f"{{{val:.{precision}f}}}"
                        delta_cell = ""
                        if rounded_delta != 0.0:
                            is_good = (rounded_delta > 0) if not any(ex.lower() in metric_name.lower() for ex in exception_to_percentage) else (rounded_delta < 0)
                            color = "green!70!black" if is_good else "red!70!black"
                            delta_cell = f"\\textsubscript{{\\textcolor{{{color}}}{{{rounded_delta:+.{precision}f}}}}}"
                        
                        all_cells.append(score_cell)
                        all_cells.append(delta_cell)
                    else:
                        all_cells.extend(['---', ''])
            
            row_content = " & ".join(all_cells)
            
            # Construct the final row string
            line_start = ""
            if add_metric_column:
                if j == 0:
                    line_start = f"\\multirow[c]{{{num_metrics}}}{{*}}{{{task_name}}} & {metric_name}"
                else:
                    line_start = f" & {metric_name}"
            else:
                if j == 0:
                    line_start = f"\\multirow[c]{{{num_metrics}}}{{*}}{{{task_name}}}"
                else:
                    line_start = ""
            
            latex_parts.append(f"{line_start} & {row_content} \\\\")
        
        total_cols = 1 + (1 if add_metric_column else 0) + placeholder_cols
        if i < len(pivot_df.index.get_level_values('task').unique()) - 1:
            latex_parts.append(f"\\cmidrule{{1-{total_cols}}}")

    latex_parts.extend(["\\bottomrule", "\\end{tabular}", "\\end{table*}"])
    print("\n".join(latex_parts))
    return df


In [8]:
json_sources_by_size = {"340M" : {}, "1.8B" : {}, "7B" : {}}

for json_file in all_json_files:
    for key, value in json_sources_by_size.items():
        if key in json_file:
            if "mtp" in json_file:
                json_sources_by_size[key]["MTP"] = json_file
            elif "vanilla" in json_file:
                json_sources_by_size[key]["NTP"] = json_file
            elif "myopic" in json_file or "top" in json_file:
                json_sources_by_size[key]["TOP"] = json_file

print(json_sources_by_size)


{'340M': {'NTP': 'top_evals/zaydzuhri__vanilla-340M-4096-model/results_2025-08-20T11-23-06.323725.json', 'MTP': 'top_evals/zaydzuhri__mtp-340M-4096-model/results_2025-08-20T11-12-35.491684.json', 'TOP': 'top_evals/zaydzuhri__myopic-340M-4096-model/results_2025-08-20T11-20-54.956874.json'}, '1.8B': {'TOP': 'top_evals/zaydzuhri__myopic-1.8B-4096-model/results_2025-08-20T11-59-09.690564.json', 'MTP': 'top_evals/zaydzuhri__mtp-1.8B-4096-model/results_2025-08-20T12-31-13.727702.json', 'NTP': 'top_evals/zaydzuhri__vanilla-1.8B-4096-model/results_2025-08-20T12-34-56.285949.json'}, '7B': {'TOP': 'top_evals/zaydzuhri__top-7B-4096-model/results_2025-08-20T10-46-34.638307.json', 'NTP': 'top_evals/zaydzuhri__vanilla-7B-4096-model/results_2025-08-20T14-15-37.501701.json', 'MTP': 'top_evals/zaydzuhri__mtp-7B-4096-model/results_2025-08-20T10-59-16.724946.json'}}


In [10]:
df["task"].unique(), df["metric"].unique()

(array(['Arc Challenge', 'Arc Easy', 'Hellaswag', 'Lambada', 'Nq Open',
        'Piqa', 'Sciq', 'Social Iqa', 'Triviaqa', 'Wikitext'], dtype=object),
 array(['Acc', 'Acc Norm', 'Perplexity', 'Exact Match', 'Word Perplexity',
        'Byte Perplexity', 'Bits Per Byte'], dtype=object))

In [11]:
task_metric_order = [
    ["Lambada", ["Acc", "Perplexity"]],
    ["Hellaswag", ["Acc Norm"]],
    ["Arc Challenge", ["Acc Norm"]],
    ["Piqa", ["Acc Norm"]],
    ["Sciq", ["Acc Norm"]],
    ["Social Iqa", ["Acc"]],
    ["Nq Open", ["Exact Match"]],
    ["Triviaqa", ["Exact Match"]],
]

In [12]:
df = generate_combined_latex_table(
    json_sources_by_size, 
    baseline_source_name="NTP",
    precision=2,
    task_metric_order=task_metric_order,
    add_metric_column=False
)
    

% Add this to your LaTeX preamble: \usepackage[table]{xcolor} \usepackage{multirow} \usepackage{booktabs}
\begin{table*}[htbp!]
\centering
\small
\setlength{\tabcolsep}{3pt}
\caption{Combined comparison of evaluation results across model sizes.}
\label{tab:combined_comparison}
\begin{tabular}{l|r@{\hspace{2pt}}lr@{\hspace{2pt}}lr@{\hspace{2pt}}l|r@{\hspace{2pt}}lr@{\hspace{2pt}}lr@{\hspace{2pt}}l|r@{\hspace{2pt}}lr@{\hspace{2pt}}lr@{\hspace{2pt}}l}
\toprule
Task & \multicolumn{6}{c|}{340M} & \multicolumn{6}{c|}{1.8B} & \multicolumn{6}{c|}{7B} \\
 & \multicolumn{2}{c}{NTP} & \multicolumn{2}{c}{MTP} & \multicolumn{2}{c}{TOP} & \multicolumn{2}{c}{NTP} & \multicolumn{2}{c}{MTP} & \multicolumn{2}{c}{TOP} & \multicolumn{2}{c}{NTP} & \multicolumn{2}{c}{MTP} & \multicolumn{2}{c}{TOP} \\
\midrule
Param  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  \\
Train  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  \\
\midrule
\multirow[c]{2}{*}{Lambada} & 36.35 &  & {35.32} & \textsubscript{\tex