In [12]:
import os
import pandas as pd
import json


# Users whose runs are collected. extract_config_from_run_name looks for the lower-cased
# user name inside each run directory name.
USERS = ["David"]
# Display names of the reported metrics, in reporting order.
METRICS = ["BLEU", "ROUGE"]
# Maps each metric display name to the key read from a run's summarization_summary.txt JSON.
METRIC_NAMES = {
    "BLEU": "bleu_score",
    "ROUGE": "rougeLsum_fmeasure",
}
# TODO(armand): It would be interesting to see Mauve here. It should be low / neutral, as the
# summary should not contain the style.

# Model names matched against run directory names; create_document emits one table row per
# entry, in this order.
GENERATIVE_MODELS = [
    "Mistral-7B-Instruct-v0.2",
    "Mistral-7B-Instruct-v0.3",
    "Phi-3-mini-4k-instruct",
    "Meta-Llama-3-8B-Instruct",
]

# Directory scanned by load_results; every sub-directory is treated as one run.
# NOTE(review): absolute, machine-specific path — consider making this configurable.
RESULTS_DIR = "/nfs/scistore19/alistgrp/anicolic/repos/PanzaMailFork/results/summarization"

# Column layout of the long-format frame built by load_results (one row per run and metric).
COLUMNS = ["model", "user", "metric", "score", "seed", "metadata"]

In [14]:
def extract_config_from_run_name(run_name):
    """Recover the (user, model, seed) triple encoded in a run directory name.

    Args:
        run_name: Name of a run directory. It must contain the lower-cased name of
            one of USERS, the name of one of GENERATIVE_MODELS, and end with
            ``seed`` followed by an integer.

    Returns:
        Tuple ``(user, model, seed)`` where ``user`` and ``model`` are the first
        matching entries of USERS / GENERATIVE_MODELS and ``seed`` is an int.

    Raises:
        IndexError: If no user or no model matches ``run_name``.
        ValueError: If the text after the last ``seed`` is not an integer.
    """
    matching_users = [name for name in USERS if name.lower() in run_name]
    user = matching_users[0]

    matching_models = []
    for candidate in GENERATIVE_MODELS:
        # Only the part before the first "/" is searched for in the run name.
        prefix = candidate.split("/")[0]
        if prefix in run_name:
            matching_models.append(candidate)
    model = matching_models[0]

    # Everything after the last occurrence of "seed" is parsed as the seed.
    _, _, seed_text = run_name.rpartition("seed")
    seed = int(seed_text)
    return user, model, seed


def load_results(results_dir):
    """Load the per-run summarization metrics into one long-format DataFrame.

    Every sub-directory of ``results_dir`` is treated as one run. Its name is
    parsed with extract_config_from_run_name and its
    ``summarization_summary.txt`` file is read as JSON. One row is produced per
    run and per entry of METRICS.

    Args:
        results_dir: Path of the directory that contains the run directories.

    Returns:
        DataFrame with the columns listed in COLUMNS. ``metadata`` is always
        None. The frame is empty (but has all columns) if no run directory is
        found.

    Raises:
        FileNotFoundError: If a run directory has no summary file.
        KeyError: If a summary file lacks one of the keys in METRIC_NAMES.
    """
    # Collect plain records and build the frame once at the end. Appending to a
    # DataFrame row by row copies the whole frame on every iteration and relied
    # on the private DataFrame._append API.
    records = []
    for directory in os.listdir(results_dir):
        run_dir = os.path.join(results_dir, directory)
        if not os.path.isdir(run_dir):
            continue
        user, model, seed = extract_config_from_run_name(directory)
        results_file = os.path.join(run_dir, "summarization_summary.txt")
        with open(results_file, "r", encoding="utf-8") as f:
            results = json.load(f)

        for metric in METRICS:
            records.append({
                "model": model,
                "user": user,
                "metric": metric,
                "score": results[METRIC_NAMES[metric]],
                "seed": seed,
                "metadata": None,
            })

    # Passing columns= keeps the column order and yields a well-formed empty frame.
    return pd.DataFrame(records, columns=COLUMNS)


# Collapse the per-seed rows into one row per (model, user, metric): "score" holds the
# mean over seeds and "std" the standard deviation over seeds.
results_df = (
    load_results(RESULTS_DIR)
    .groupby(["model", "user", "metric"])
    .agg(score=("score", "mean"), std=("score", "std"))
    .reset_index()
)
print(results_df)

                      model   user metric     score       std
0  Meta-Llama-3-8B-Instruct  David   BLEU  0.609128  0.013291
1  Meta-Llama-3-8B-Instruct  David  ROUGE  0.708167  0.011996
2  Mistral-7B-Instruct-v0.2  David   BLEU  0.258485  0.005686
3  Mistral-7B-Instruct-v0.2  David  ROUGE  0.383959  0.003760
4  Mistral-7B-Instruct-v0.3  David   BLEU  0.293148  0.005861
5  Mistral-7B-Instruct-v0.3  David  ROUGE  0.421916  0.005892
6    Phi-3-mini-4k-instruct  David   BLEU  0.331739  0.005762
7    Phi-3-mini-4k-instruct  David  ROUGE  0.457962  0.004634


  results_df = results_df._append({


In [38]:
from pylatex import Document, Section, Tabular, MultiColumn, MultiRow, Command, NoEscape

def format_as_math(number):
    """Wrap ``number`` in ``$...$`` and mark the result as NoEscape.

    Args:
        number: Value (or already formatted string) to place between the dollar signs.

    Returns:
        NoEscape string of the form ``$<number>$``.
    """
    inline_math = "${}$".format(number)
    return NoEscape(inline_math)

def get_colored_column():
    """Return a column-spec prefix that shades a tabular column light gray.

    Returns:
        NoEscape string ``>{\\columncolor{gray!10}}`` meant to be prepended to a
        column specifier such as ``c``.
    """
    # Raw string: "\c" is not a valid Python escape sequence, so the non-raw
    # literal triggers an invalid-escape warning on recent Python versions.
    # The runtime value is unchanged.
    # NOTE(review): \columncolor presumably needs the LaTeX colortbl package to be
    # loaded by the document — confirm before using this column spec.
    return NoEscape(r">{\columncolor{gray!10}}")

def create_document(results_df):
    """Build a pylatex document holding the results table.

    The table has one row per entry of GENERATIVE_MODELS and one column per
    (user, metric) pair from USERS x METRICS. Each cell shows ``mean \\pm std``
    with three decimals; the highest mean of each (user, metric) column is
    set in bold. A missing (model, user, metric) combination is rendered as
    ``-`` and reported on stdout.

    Args:
        results_df: Aggregated results with the columns ``model``, ``user``,
            ``metric``, ``score`` (mean) and ``std``.

    Returns:
        The pylatex Document containing the table.
    """
    doc = Document("results")

    # One left-aligned column for the method name, then one centered column per
    # (user, metric) pair.
    columns_format = "l" + "c" * (len(USERS) * len(METRICS))

    # Highest mean score per (user, metric), computed once instead of once per cell.
    best_scores = {
        (user, metric): results_df[
            (results_df["user"] == user) & (results_df["metric"] == metric)
        ]["score"].max()
        for user in USERS
        for metric in METRICS
    }

    with doc.create(Tabular(columns_format)) as table:
        table.add_hline()

        # First header row: an empty cell, then each user name spanning its metric columns.
        user_columns = [" "]
        for user in USERS:
            user_columns.append(MultiColumn(len(METRICS), align='c', data=Command('texttt', user)))
        table.add_row(user_columns)
        table.add_hline()

        # Second header row: the metric names, repeated for every user.
        metrics_columns = ["Method"]
        for _ in USERS:
            metrics_columns.extend(METRICS)
        table.add_row(metrics_columns)

        for model in GENERATIVE_MODELS:
            clean_line = [Command('texttt', model)]
            for user in USERS:
                for metric in METRICS:
                    rows = results_df[
                        (results_df["model"] == model)
                        & (results_df["user"] == user)
                        & (results_df["metric"] == metric)
                    ]
                    if rows.empty:
                        # Expected when a run is missing: keep the table complete
                        # with a placeholder instead of failing.
                        print(f"No results for model={model}, user={user}, metric={metric}")
                        clean_line.append("-")
                        continue

                    score = rows["score"].values[0]
                    std = rows["std"].values[0]
                    # Raw f-strings keep the LaTeX backslashes literal ("\t" and "\p"
                    # must not be read as Python escape sequences).
                    if score == best_scores[(user, metric)]:
                        score_text = rf"\textbf{{{score:.3f}}}"
                    else:
                        score_text = f"{score:.3f}"
                    clean_line.append(format_as_math(rf"{score_text} \pm {std:.3f}"))
            table.add_row(clean_line)

        table.add_hline()

    return doc

# Build the table document from the aggregated results and print its LaTeX source.
doc = create_document(results_df)
latex_source = doc.dumps()
print(latex_source)

\documentclass{article}%
\usepackage[T1]{fontenc}%
\usepackage[utf8]{inputenc}%
\usepackage{lmodern}%
\usepackage{textcomp}%
\usepackage{lastpage}%
%
%
%
\begin{document}%
\normalsize%
\begin{tabular}{lcc}%
\hline%
 &\multicolumn{2}{c}{\texttt{David}}\\%
\hline%
Method&BLEU&ROUGE\\%
\texttt{Mistral{-}7B{-}Instruct{-}v0.2}&$0.258 \pm 0.006$&$0.384 \pm 0.004$\\%
\texttt{Mistral{-}7B{-}Instruct{-}v0.3}&$0.293 \pm 0.006$&$0.422 \pm 0.006$\\%
\texttt{Phi{-}3{-}mini{-}4k{-}instruct}&$0.332 \pm 0.006$&$0.458 \pm 0.005$\\%
\texttt{Meta{-}Llama{-}3{-}8B{-}Instruct}&$\textbf{0.609} \pm 0.013$&$\textbf{0.708} \pm 0.012$\\%
\hline%
\end{tabular}%
\end{document}
