# Read Data From LM Eval Harness

In [5]:
import json
import pandas as pd

# Path to the results file this notebook inspects.
json_file = "top_evals/zaydzuhri__vanilla-7B-4096-model/results_2025-08-19T13-51-01.158842.json"

# Read as UTF-8 explicitly so the result does not depend on the platform's
# default text encoding (which open() would otherwise fall back to).
with open(json_file, 'r', encoding='utf-8') as file:
    data = json.load(file)

In [9]:
results = data['results']

# For every benchmark, collect the names of its score metrics: every key of
# the benchmark's entry except the "alias" label and the standard-error keys.
benchmark_score_name = {
    benchmark: [
        metric
        for metric, _score in values.items()
        if metric != "alias" and "stderr" not in metric
    ]
    for benchmark, values in results.items()
}

print(benchmark_score_name)

{'arc_challenge': ['acc,none', 'acc_norm,none'], 'arc_easy': ['acc,none', 'acc_norm,none'], 'hellaswag': ['acc,none', 'acc_norm,none'], 'lambada_openai': ['perplexity,none', 'acc,none'], 'nq_open': ['exact_match,remove_whitespace'], 'piqa': ['acc,none', 'acc_norm,none'], 'sciq': ['acc,none', 'acc_norm,none'], 'triviaqa': ['exact_match,remove_whitespace'], 'wikitext': ['word_perplexity,none', 'byte_perplexity,none', 'bits_per_byte,none']}


In [4]:
import json
import pandas as pd

# Path to the results file this cell tabulates.
json_file = "top_evals/zaydzuhri__vanilla-7B-4096-model/results_2025-08-19T13-51-01.158842.json"

# Read as UTF-8 explicitly so the result does not depend on the platform's
# default text encoding (which open() would otherwise fall back to).
with open(json_file, 'r', encoding='utf-8') as file:
    data = json.load(file)

# Fail with a clear message when the file has no "results" section. Guarding
# only the assignment would leave `df` undefined (or stale from an earlier
# run of this cell) by the time `df.head()` executes below.
if "results" not in data:
    raise KeyError(f"'results' section not found in {json_file}")

# One column per benchmark, one row per metric key.
df = pd.DataFrame(data["results"])

df.head()

Unnamed: 0,arc_challenge,arc_easy,hellaswag,lambada_openai,nq_open,piqa,sciq,triviaqa,wikitext
alias,arc_challenge,arc_easy,hellaswag,lambada_openai,nq_open,piqa,sciq,triviaqa,wikitext
"acc,none",0.450512,0.773148,0.50946,0.558898,,0.76333,0.929,,
"acc_stderr,none",0.01454,0.008594,0.004989,0.006917,,0.009917,0.008126,,
"acc_norm,none",0.454778,0.740741,0.674268,,,0.770403,0.886,,
"acc_norm_stderr,none",0.014552,0.008992,0.004677,,,0.009813,0.010055,,


In [None]:
import json
import pandas as pd

def _format_delta_cell(delta, metric_name, precision):
    """
    Formats one delta as a LaTeX cell with a heatmap-style background.

    Args:
        delta (float): Compared value minus baseline value.
        metric_name (str): Display name of the metric; used to decide whether
                           a decrease counts as an improvement.
        precision (int): The number of decimal places for the delta.

    Returns:
        str: A cellcolor fragment for the delta column (no leading '&').
    """
    # Normalize delta to a 0-1 range for intensity, capped at a reasonable max.
    # A value of 0.1 delta (10 percentage points) is considered max intensity.
    max_delta_for_color = 0.1
    normalized_delta = min(abs(delta) / max_delta_for_color, 1.0)
    # Intensity from 0 (white) to 60 (strong color)
    intensity = int(normalized_delta * 60)

    # Perplexity and bits-per-byte improve as they go down; every other metric
    # is treated as higher-is-better.
    lowered_name = metric_name.lower()
    lower_is_better = "perplexity" in lowered_name or "bits per byte" in lowered_name
    is_good = (delta < 0) if lower_is_better else (delta > 0)
    color = "green" if is_good else "red"

    # Use \cellcolor for the background heatmap
    return f"\\cellcolor{{{color}!{intensity}}}{{{delta:+.{precision}f}}}"


def generate_latex_comparison_table(json_sources, baseline_source_name, precision=2):
    """
    Parses multiple JSON results, uses pandas to structure the data, and
    generates a LaTeX comparison table with a heatmap-style delta column.

    Each file is expected to hold a top-level "results" mapping of
    task -> {metric key -> value}. Keys containing "alias" or "stderr" and
    values that cannot be converted to float are ignored. Files that are
    missing or are not valid JSON are skipped with a printed warning.

    The delta column is only emitted when at least two sources have data. It
    compares the baseline with the first remaining source (the other sources
    are ordered alphabetically); further sources get a value column but no
    delta of their own.

    The generated table relies on the xcolor (table option), booktabs and
    multirow LaTeX packages.

    Args:
        json_sources (dict): A dictionary where keys are source names (e.g., "Model A")
                             and values are the JSON path.
        baseline_source_name (str): The name of the source to use as the baseline
                                    for calculating the delta.
        precision (int): The number of decimal places for the scores.

    Returns:
        str: A string containing the generated LaTeX table, or a short
             message string when the baseline is unknown, the baseline has no
             usable data, or no data could be loaded at all.
    """
    if baseline_source_name not in json_sources:
        return "Error: Baseline source name not found in json_sources."

    # --- 1. Parse all JSONs into a list of records ---
    all_records = []
    for source_name, json_path in json_sources.items():
        try:
            # Explicit UTF-8 so parsing does not depend on the platform default.
            with open(json_path, 'r', encoding='utf-8') as file:
                data = json.load(file)
            results = data.get("results", {})
        except (FileNotFoundError, json.JSONDecodeError) as e:
            print(f"Warning: Could not read or parse file for source '{source_name}' at {json_path}. Error: {e}. Skipping.")
            continue

        for task_name, task_data in results.items():
            for key, value in task_data.items():
                if "alias" in key or "stderr" in key:
                    continue
                try:
                    numeric_value = float(value)
                except (ValueError, TypeError):
                    # Non-numeric entries (e.g. placeholder strings) are not scores.
                    continue
                all_records.append({
                    "source": source_name,
                    "task": task_name.replace('_', ' ').replace('openai', '').strip().title(),
                    "metric": key.split(',')[0].replace('_', ' ').title(),
                    "value": numeric_value
                })

    if not all_records:
        return "No valid data found to generate a table."

    # --- 2. Create and Pivot DataFrame ---
    df = pd.DataFrame(all_records)
    # NOTE(review): metric keys are reduced to the part before the first comma,
    # so two variants of the same metric within one task share a row and are
    # averaged by pivot_table's default aggregation - confirm this is intended.
    pivot_df = df.pivot_table(index=['task', 'metric'], columns='source', values='value')

    # The baseline can be absent here even though it was requested: its file
    # may have been skipped above or contained no numeric scores. Report that
    # instead of failing with a KeyError on the column selection below.
    if baseline_source_name not in pivot_df.columns:
        return f"Error: No valid data could be loaded for baseline source '{baseline_source_name}'."

    other_sources = [s for s in json_sources if s != baseline_source_name]
    valid_columns = [baseline_source_name] + sorted([s for s in other_sources if s in pivot_df.columns])
    pivot_df = pivot_df[valid_columns]

    # --- 3. Calculate Delta and Build LaTeX String ---
    latex_parts = [
        # \cellcolor needs xcolor[table]; \toprule/\midrule/\cmidrule/\bottomrule
        # need booktabs; \multirow needs multirow.
        "% Add this to your LaTeX preamble: \\usepackage[table]{xcolor}, \\usepackage{booktabs}, \\usepackage{multirow}",
        "\\begin{table}[htbp!]",
        "\\centering",
        "\\caption{Comparison of evaluation results. $\\Delta$ = Model - Baseline.}",
        "\\label{tab:generated_comparison}"
    ]
    source_names = pivot_df.columns.tolist()
    has_delta = len(source_names) > 1
    # Two label columns, one right-aligned column per source, plus the delta column.
    column_format = "l|l|" + "r" * (len(source_names) + has_delta)
    latex_parts.append(f"\\begin{{tabular}}{{{column_format}}}")
    latex_parts.append("\\toprule")

    header_cols = ["Task", "Metric"] + source_names
    if has_delta:
        header_cols.append("$\\Delta$")
    latex_parts.append(" & ".join(header_cols) + " \\\\")
    latex_parts.append("\\midrule")

    # Table Body
    num_tasks = len(pivot_df.index.get_level_values('task').unique())
    for i, (task_name, group) in enumerate(pivot_df.groupby(level='task', sort=True)):
        num_metrics = len(group)
        for j, ((_, metric_name), row) in enumerate(group.sort_index().iterrows()):
            row_values = [f"{val:.{precision}f}" if pd.notna(val) else '---' for val in row]

            delta_str = ""
            if has_delta:
                baseline_val = row[baseline_source_name]
                compare_val = row[source_names[1]]

                if pd.notna(baseline_val) and pd.notna(compare_val):
                    delta = compare_val - baseline_val
                    delta_str = " & " + _format_delta_cell(delta, metric_name, precision)
                else:
                    delta_str = " & ---"

            # Only the first row of a task carries the (row-spanning) task label.
            if j == 0:
                line = f"\\multirow[c]{{{num_metrics}}}{{*}}{{{task_name}}} & {metric_name} & {' & '.join(row_values)}{delta_str} \\\\"
            else:
                line = f" & {metric_name} & {' & '.join(row_values)}{delta_str} \\\\"
            latex_parts.append(line)

        # Separate tasks with a rule, but not after the last one.
        if i < num_tasks - 1:
            latex_parts.append(f"\\cmidrule{{1-{len(header_cols)}}}")

    latex_parts.extend(["\\bottomrule", "\\end{tabular}", "\\end{table}"])
    return "\n".join(latex_parts)

# Second results file, compared against the `json_file` defined in an earlier cell.
non_vanilla_json_file = "top_evals/zaydzuhri__vanilla-7B-4096-model-test/results_2025-08-19T13-51-01.158842.json"

# Build the comparison with "Vanilla" as the baseline column and show the LaTeX source.
latex_output = generate_latex_comparison_table(
    json_sources={
        "Vanilla": json_file,
        "Non-vanilla": non_vanilla_json_file,
    },
    baseline_source_name="Vanilla",
)
print(latex_output)

\begin{table}[htbp!]
\centering
\caption{Comparison of evaluation results. $\Delta$ = Model - Baseline.}
\label{tab:generated_comparison}
\begin{tabular}{l|l|rrr}
\toprule
Task & Metric & Vanilla & Non-vanilla & $\Delta$ \\
\midrule
\multirow[c]{2}{*}{Arc Challenge} & Acc & 0.45 & 0.46 & \textcolor{green!70!black}{+0.01} \\
 & Acc Norm & 0.45 & 0.46 & \textcolor{green!70!black}{+0.01} \\
\cmidrule{1-5}
\multirow[c]{2}{*}{Arc Easy} & Acc & 0.77 & 0.76 & \textcolor{red!70!black}{-0.01} \\
 & Acc Norm & 0.74 & 0.75 & \textcolor{green!70!black}{+0.01} \\
\cmidrule{1-5}
\multirow[c]{2}{*}{Hellaswag} & Acc & 0.51 & 0.52 & \textcolor{green!70!black}{+0.01} \\
 & Acc Norm & 0.67 & 0.66 & \textcolor{red!70!black}{-0.01} \\
\cmidrule{1-5}
\multirow[c]{2}{*}{Lambada} & Acc & 0.56 & 0.46 & \textcolor{red!70!black}{-0.10} \\
 & Perplexity & 7.97 & 7.87 & \textcolor{green!70!black}{-0.10} \\
\cmidrule{1-5}
\multirow[c]{1}{*}{Nq Open} & Exact Match & 0.07 & 0.08 & \textcolor{green!70!black}{+0.01} \\