# Read Data From LM Eval Harness

In [5]:
import json
import pandas as pd

json_file = "top_evals/zaydzuhri__vanilla-7B-4096-model/results_2025-08-19T13-51-01.158842.json"

with open(json_file, 'r') as file:
    data = json.load(file)

In [9]:
results = data['results']
benchmark_score_name = {}
for benchmark, values in results.items():
    names = []
    for metric, score in values.items():
        if metric != "alias" and "stderr" not in metric:
            names.append(metric)
    benchmark_score_name[benchmark] = names

print(benchmark_score_name)

{'arc_challenge': ['acc,none', 'acc_norm,none'], 'arc_easy': ['acc,none', 'acc_norm,none'], 'hellaswag': ['acc,none', 'acc_norm,none'], 'lambada_openai': ['perplexity,none', 'acc,none'], 'nq_open': ['exact_match,remove_whitespace'], 'piqa': ['acc,none', 'acc_norm,none'], 'sciq': ['acc,none', 'acc_norm,none'], 'triviaqa': ['exact_match,remove_whitespace'], 'wikitext': ['word_perplexity,none', 'byte_perplexity,none', 'bits_per_byte,none']}


In [4]:
import json
import pandas as pd

json_file = "top_evals/zaydzuhri__vanilla-7B-4096-model/results_2025-08-19T13-51-01.158842.json"

with open(json_file, 'r') as file:
    data = json.load(file)

if "results" in data:
    df = pd.DataFrame(data["results"])

df.head()

Unnamed: 0,arc_challenge,arc_easy,hellaswag,lambada_openai,nq_open,piqa,sciq,triviaqa,wikitext
alias,arc_challenge,arc_easy,hellaswag,lambada_openai,nq_open,piqa,sciq,triviaqa,wikitext
"acc,none",0.450512,0.773148,0.50946,0.558898,,0.76333,0.929,,
"acc_stderr,none",0.01454,0.008594,0.004989,0.006917,,0.009917,0.008126,,
"acc_norm,none",0.454778,0.740741,0.674268,,,0.770403,0.886,,
"acc_norm_stderr,none",0.014552,0.008992,0.004677,,,0.009813,0.010055,,


In [73]:
import json
import pandas as pd

exception_to_percentage = [
    "Perplexity",
    "Bits"
]

def generate_latex_comparison_table(json_sources, baseline_source_name, precision=2):
    """
    Parses multiple JSON results and generates a LaTeX comparison table where
    each non-baseline cell is colored with a heatmap style based on its
    difference from the baseline.

    Args:
        json_sources (dict): A dictionary where keys are source names (e.g., "Model A")
                             and values are the JSON path.
        baseline_source_name (str): The name of the source to use as the baseline
                                    for coloring.
        precision (int): The number of decimal places for the scores.

    Returns:
        str: A string containing the generated LaTeX table.
    """
    if baseline_source_name not in json_sources:
        return "Error: Baseline source name not found in json_sources."

    # --- 1. Parse all JSONs into a list of records ---
    all_records = []
    for source_name, json_path in json_sources.items():
        try:
            with open(json_path, 'r') as file:
                data = json.load(file)
            results = data.get("results", {})
        except (FileNotFoundError, json.JSONDecodeError) as e:
            print(f"Warning: Could not read or parse file for source '{source_name}' at {json_path}. Error: {e}. Skipping.")
            continue

        for task_name, task_data in results.items():
            for key, value in task_data.items():
                if "alias" in key or "stderr" in key:
                    continue
                try:
                    numeric_value = float(value)
                except (ValueError, TypeError):
                    continue
                all_records.append({
                    "source": source_name,
                    "task": task_name.replace('_', ' ').replace('openai', '').strip().title(),
                    "metric": key.split(',')[0].replace('_', ' ').title(),
                    "value": numeric_value
                })

    if not all_records:
        return "No valid data found to generate a table."

    # --- 2. Create and Pivot DataFrame ---
    df = pd.DataFrame(all_records)
    pivot_df = df.pivot_table(index=['task', 'metric'], columns='source', values='value')
    other_sources = sorted([s for s in json_sources if s != baseline_source_name])
    valid_columns = [baseline_source_name] + [s for s in other_sources if s in pivot_df.columns]
    pivot_df = pivot_df[valid_columns]

    # --- 3. Build LaTeX String with Direct Cell Coloring ---
    latex_parts = [
        "% Add this to your LaTeX preamble: \\usepackage[table]{xcolor} \\usepackage{multirow} \\usepackage{booktabs}",
        "\\begin{table}[htbp!]",
        "\\centering",
        "\\caption{Comparison of evaluation results. Colors relative to baseline.}",
        "\\label{tab:generated_comparison}"
    ]
    source_names = pivot_df.columns.tolist()
    column_format = "l|l|" + "r" * len(source_names)
    latex_parts.append(f"\\begin{{tabular}}{{{column_format}}}")
    latex_parts.append("\\toprule")
    
    header_cols = ["Task", "Metric"] + source_names
    latex_parts.append(" & ".join(header_cols) + " \\\\")
    latex_parts.append("\\midrule")

    # Table Body
    for i, (task_name, group) in enumerate(pivot_df.groupby(level='task', sort=True)):
        num_metrics = len(group)
        for j, ((_, metric_name), row) in enumerate(group.sort_index().iterrows()):
            
            # --- CORRECTED LOGIC FOR PERCENTAGE CONVERSION ---
            def format_value(val, metric):
                if pd.isna(val):
                    return val
                # Check if the metric is one of the exceptions.
                is_exception = any(ex.lower() in metric.lower() for ex in exception_to_percentage)
                # If it's NOT an exception, multiply by 100.
                if not is_exception:
                    return val * 100
                return val

            baseline_val = format_value(row[baseline_source_name], metric_name)
            
            # Start the row with the uncolored baseline value
            formatted_cells = [f"{baseline_val:.{precision}f}" if pd.notna(baseline_val) else '---']

            # Process each of the other sources for coloring
            for source in other_sources:
                compare_val = format_value(row.get(source), metric_name)
                
                if pd.notna(baseline_val) and pd.notna(compare_val):
                    # For delta calculation, use original non-percentage values for better scaling
                    original_baseline = row[baseline_source_name]
                    original_compare = row.get(source)
                    delta = (original_compare - original_baseline)
                    
                    # --- Heatmap Logic ---
                    max_delta_for_color = 0.1 
                    normalized_delta = min(abs(delta) / max_delta_for_color, 1.0)
                    intensity = int(normalized_delta * 60)

                    is_good = (delta < 0) if "Perplexity" in metric_name else (delta > 0)
                    color = "green" if is_good else "red"
                    
                    cell_str = (f"\\cellcolor{{{color}!{intensity}}}"
                                f"{{{compare_val:.{precision}f}}}")
                    formatted_cells.append(cell_str)
                else:
                    formatted_cells.append('---')
            
            # Join all cells for the final row string
            row_content = " & ".join(formatted_cells)
            if j == 0:
                line = f"\\multirow[c]{{{num_metrics}}}{{*}}{{{task_name}}} & {metric_name} & {row_content} \\\\"
            else:
                line = f" & {metric_name} & {row_content} \\\\"
            latex_parts.append(line)
        
        if i < len(pivot_df.index.get_level_values('task').unique()) - 1:
            latex_parts.append(f"\\cmidrule{{1-{len(header_cols)}}}")

    latex_parts.extend(["\\bottomrule", "\\end{tabular}", "\\end{table}"])
    return "\n".join(latex_parts), df


In [89]:
mtp_json_file = "top_evals/zaydzuhri__mtp-340M-4096-model/results_2025-08-20T11-12-35.491684.json"
vanilla_json_file = "top_evals/zaydzuhri__vanilla-340M-4096-model/results_2025-08-20T11-23-06.323725.json"
top_json_file = "top_evals/zaydzuhri__myopic-340M-4096-model/results_2025-08-20T11-20-54.956874.json"

latex_output, df = generate_latex_comparison_table(
    {"NTP" : vanilla_json_file, 
     "MTP" : mtp_json_file,
     "TOP" : top_json_file}, 
     baseline_source_name="NTP",
)
print(latex_output)

% Add this to your LaTeX preamble: \usepackage[table]{xcolor} \usepackage{multirow} \usepackage{booktabs}
\begin{table}[htbp!]
\centering
\caption{Comparison of evaluation results. Colors relative to baseline.}
\label{tab:generated_comparison}
\begin{tabular}{l|l|rrr}
\toprule
Task & Metric & NTP & MTP & TOP \\
\midrule
\multirow[c]{2}{*}{Arc Challenge} & Acc & 26.54 & \cellcolor{green!9}{28.07} & \cellcolor{green!9}{28.07} \\
 & Acc Norm & 28.84 & \cellcolor{green!6}{29.86} & \cellcolor{green!3}{29.35} \\
\cmidrule{1-5}
\multirow[c]{2}{*}{Arc Easy} & Acc & 60.23 & \cellcolor{green!21}{63.80} & \cellcolor{green!18}{63.26} \\
 & Acc Norm & 56.52 & \cellcolor{green!11}{58.38} & \cellcolor{green!10}{58.29} \\
\cmidrule{1-5}
\multirow[c]{2}{*}{Hellaswag} & Acc & 35.52 & \cellcolor{red!0}{35.38} & \cellcolor{red!0}{35.43} \\
 & Acc Norm & 42.53 & \cellcolor{green!1}{42.73} & \cellcolor{green!6}{43.57} \\
\cmidrule{1-5}
\multirow[c]{2}{*}{Lambada} & Acc & 36.35 & \cellcolor{red!6}{35.32} & \

In [90]:
df[df['metric'] == 'Acc Norm'].groupby('source')['value'].mean()

source
MTP    0.549709
NTP    0.538886
TOP    0.557166
Name: value, dtype: float64

In [75]:
from glob import glob

eval_logs_folder = "top_evals"
all_json_files = glob(f"{eval_logs_folder}/*/*.json")
print(all_json_files)

['top_evals/zaydzuhri__myopic-1.8B-4096-model/results_2025-08-20T11-59-09.690564.json', 'top_evals/zaydzuhri__mtp-1.8B-4096-model/results_2025-08-20T12-31-13.727702.json', 'top_evals/zaydzuhri__vanilla-1.8B-4096-model/results_2025-08-20T12-34-56.285949.json', 'top_evals/zaydzuhri__vanilla-340M-4096-model/results_2025-08-20T11-23-06.323725.json', 'top_evals/zaydzuhri__mtp-340M-4096-model/results_2025-08-20T11-12-35.491684.json', 'top_evals/zaydzuhri__top-7B-4096-model/results_2025-08-20T10-46-34.638307.json', 'top_evals/zaydzuhri__myopic-340M-4096-model/results_2025-08-20T11-20-54.956874.json', 'top_evals/zaydzuhri__vanilla-7B-4096-model/results_2025-08-19T13-51-01.158842.json', 'top_evals/zaydzuhri__mtp-7B-4096-model/results_2025-08-20T10-59-16.724946.json']


In [77]:
print(mtp_json_file)
print(vanilla_json_file)
print(top_json_file)

top_evals/zaydzuhri__mtp-1.8B-4096-model/results_2025-08-20T12-31-13.727702.json
top_evals/zaydzuhri__vanilla-1.8B-4096-model/results_2025-08-20T12-34-56.285949.json
top_evals/zaydzuhri__myopic-1.8B-4096-model/results_2025-08-20T11-59-09.690564.json


In [76]:
size = "1.8B"

for json_file in all_json_files:
    if size in json_file:
        if "mtp" in json_file:
            mtp_json_file = json_file
        elif "vanilla" in json_file:
            vanilla_json_file = json_file
        elif "myopic" in json_file or "top" in json_file:
            top_json_file = json_file

latex_output, df = generate_latex_comparison_table(
    {"NTP" : vanilla_json_file, 
     "MTP" : mtp_json_file,
     "Top" : top_json_file}, 
     baseline_source_name="NTP",
)
print(latex_output)

% Add this to your LaTeX preamble: \usepackage[table]{xcolor} \usepackage{multirow} \usepackage{booktabs}
\begin{table}[htbp!]
\centering
\caption{Comparison of evaluation results. Colors relative to baseline.}
\label{tab:generated_comparison}
\begin{tabular}{l|l|rrr}
\toprule
Task & Metric & NTP & MTP & Top \\
\midrule
\multirow[c]{2}{*}{Arc Challenge} & Acc & 35.58 & \cellcolor{green!16}{38.40} & \cellcolor{green!22}{39.25} \\
 & Acc Norm & 38.65 & \cellcolor{green!11}{40.61} & \cellcolor{green!22}{42.32} \\
\cmidrule{1-5}
\multirow[c]{2}{*}{Arc Easy} & Acc & 72.81 & \cellcolor{red!0}{72.69} & \cellcolor{green!4}{73.48} \\
 & Acc Norm & 67.05 & \cellcolor{green!21}{70.66} & \cellcolor{green!18}{70.12} \\
\cmidrule{1-5}
\multirow[c]{2}{*}{Hellaswag} & Acc & 46.03 & \cellcolor{red!8}{44.61} & \cellcolor{red!1}{45.75} \\
 & Acc Norm & 60.05 & \cellcolor{red!10}{58.29} & \cellcolor{green!2}{60.45} \\
\cmidrule{1-5}
\multirow[c]{2}{*}{Lambada} & Acc & 49.58 & \cellcolor{red!9}{47.93} & \c

In [None]:
print(df[df["metric"] == "Acc Norm"].head())

   source           task    metric     value
1     NTP  Arc Challenge  Acc Norm  0.386519
3     NTP       Arc Easy  Acc Norm  0.670455
5     NTP      Hellaswag  Acc Norm  0.600478
10    NTP           Piqa  Acc Norm  0.735038
12    NTP           Sciq  Acc Norm  0.864000


In [88]:
df[df['metric'] == 'Acc Norm'].groupby('source')['value'].mean()

source
MTP    0.659666
NTP    0.651298
Top    0.669883
Name: value, dtype: float64

In [91]:
size = "7B"

for json_file in all_json_files:
    if size in json_file:
        if "mtp" in json_file:
            mtp_json_file = json_file
        elif "vanilla" in json_file:
            vanilla_json_file = json_file
        elif "top" in json_file or "top" in json_file:
            top_json_file = json_file

latex_output, df = generate_latex_comparison_table(
    {"NTP" : vanilla_json_file, 
     "MTP" : mtp_json_file,
     "TOP" : top_json_file}, 
     baseline_source_name="NTP",
)
print(latex_output)

% Add this to your LaTeX preamble: \usepackage[table]{xcolor} \usepackage{multirow} \usepackage{booktabs}
\begin{table}[htbp!]
\centering
\caption{Comparison of evaluation results. Colors relative to baseline.}
\label{tab:generated_comparison}
\begin{tabular}{l|l|rrr}
\toprule
Task & Metric & NTP & MTP & TOP \\
\midrule
\multirow[c]{2}{*}{Arc Challenge} & Acc & 45.05 & \cellcolor{red!8}{43.69} & \cellcolor{red!5}{44.20} \\
 & Acc Norm & 45.48 & \cellcolor{green!0}{45.56} & \cellcolor{green!5}{46.42} \\
\cmidrule{1-5}
\multirow[c]{2}{*}{Arc Easy} & Acc & 77.31 & \cellcolor{green!2}{77.69} & \cellcolor{green!4}{78.03} \\
 & Acc Norm & 74.07 & \cellcolor{red!1}{73.86} & \cellcolor{green!3}{74.62} \\
\cmidrule{1-5}
\multirow[c]{2}{*}{Hellaswag} & Acc & 50.95 & \cellcolor{red!8}{49.58} & \cellcolor{green!3}{51.53} \\
 & Acc Norm & 67.43 & \cellcolor{red!9}{65.85} & \cellcolor{green!7}{68.73} \\
\cmidrule{1-5}
\multirow[c]{2}{*}{Lambada} & Acc & 55.89 & \cellcolor{red!16}{53.13} & \cellcolor