In [30]:
import os
import pandas as pd
import json


USERS = ["David", "Jamie"]
METRICS = ["BLEU", "ROUGE"]
METRIC_NAMES = {
    "BLEU": "bleu_score",
    "ROUGE": "rougeLsum_fmeasure",
}
# TODO(armand): Would be interesting to see Mauve here. Should be low / neural as the summary
# should not contain the style.

GENERATIVE_MODELS = [
    "Mistral-7B-Instruct-v0.2",
    "Mistral-7B-Instruct-v0.3",
    "Phi-3-mini-4k-instruct",
    "Meta-Llama-3-8B-Instruct",
]

RESULTS_DIR = "/nfs/scistore19/alistgrp/anicolic/repos/PanzaMailFork/results/summarization"

COLUMNS = ["model", "user", "metric", "score", "seed", "metadata"]

In [31]:
def extract_config_from_run_name(run_name):
    user = [u for u in USERS if u.lower() in run_name][0]
    model = [m for m in GENERATIVE_MODELS if m.split("/")[0] in run_name][0]
    seed = int(run_name.split("seed")[-1])
    return user, model, seed


def load_results(results_dir):
    results_df = pd.DataFrame(columns=COLUMNS)
    for directory in os.listdir(results_dir):
        if not os.path.isdir(os.path.join(results_dir, directory)):
            continue
        user, model, seed = extract_config_from_run_name(directory)
        results_file = os.path.join(results_dir, directory, "summarization_summary.txt")
        with open(results_file, "r") as f:
            results = json.load(f)

        for metric in METRICS:
            score = results[METRIC_NAMES[metric]]
            results_df = results_df._append({
                "model": model,
                "user": user,
                "metric": metric,
                "score": score,
                "seed": seed,
                "metadata": None
            }, ignore_index=True)

    return results_df


results_df = load_results(RESULTS_DIR)
results_df = results_df.groupby(["model", "user", "metric"]).agg({'score': ['mean', 'std']}).reset_index()
results_df.columns = ["model", "user", "metric", "score", "std"]
print(results_df)

  results_df = results_df._append({


                       model   user metric     score       std
0   Meta-Llama-3-8B-Instruct  David   BLEU  0.393968  0.011395
1   Meta-Llama-3-8B-Instruct  David  ROUGE  0.547511  0.012599
2   Meta-Llama-3-8B-Instruct    Jen   BLEU  0.263950  0.011373
3   Meta-Llama-3-8B-Instruct    Jen  ROUGE  0.411270  0.014505
4   Mistral-7B-Instruct-v0.2  David   BLEU  0.218736  0.009001
5   Mistral-7B-Instruct-v0.2  David  ROUGE  0.352652  0.007311
6   Mistral-7B-Instruct-v0.2    Jen   BLEU  0.132566  0.008641
7   Mistral-7B-Instruct-v0.2    Jen  ROUGE  0.251095  0.011969
8   Mistral-7B-Instruct-v0.3  David   BLEU  0.229480  0.021447
9   Mistral-7B-Instruct-v0.3  David  ROUGE  0.372849  0.020707
10  Mistral-7B-Instruct-v0.3    Jen   BLEU  0.163716  0.014411
11  Mistral-7B-Instruct-v0.3    Jen  ROUGE  0.306663  0.015951
12    Phi-3-mini-4k-instruct  David   BLEU  0.267566  0.020191
13    Phi-3-mini-4k-instruct  David  ROUGE  0.421528  0.017806
14    Phi-3-mini-4k-instruct    Jen   BLEU  0.180290  0

In [32]:
from pylatex import Document, Section, Tabular, MultiColumn, MultiRow, Command, NoEscape

def format_as_math(number):
    """Return number formatted as LaTeX math."""
    return NoEscape(f"${number}$")

def get_colored_column():
    """Return a colored column formatter."""
    return NoEscape(">{\columncolor{gray!10}}")

def create_document(results_df, users):
    doc = Document("results")

    columns_format = ""
    columns_format += "l"
    for i, user in enumerate(users):
        for metric in METRICS:
            if len(USERS) > 1 and i % 2 == 0:
                columns_format += get_colored_column()
            columns_format += "c"
    # with doc.create(Tabular("c" * (1 + len(USERS) * len(METRICS)))) as table:
    with doc.create(Tabular(columns_format)) as table:
        table.add_hline()

        # Add user columns
        user_columns = []
        user_columns.append(" ")
        for i, user in enumerate(users):
            if False: #i % 2 == 0:
                user_columns.append(MultiColumn(3, align='c', data=Command('cellcolor', arguments=['gray!10'], extra_arguments=Command('texttt', user))))
            else:
                user_columns.append(MultiColumn(len(METRICS), align='c', data=Command('texttt', user)))
        table.add_row(user_columns)
        table.add_hline()

        # Add metric columns
        metrics_columns = []
        metrics_columns.append("Model")
        for user in users:
            for metric in METRICS:
                metrics_columns.append(metric)
        table.add_row(metrics_columns)

        for model in GENERATIVE_MODELS:
            clean_line = []
            clean_line.append(Command('texttt', model))
            for user in users:
                for metric in METRICS:
                    try:
                        score = results_df[(results_df["model"] == model) & (results_df["user"] == user) & (results_df["metric"] == metric)]["score"].values[0]
                        if score == results_df[(results_df["user"] == user) & (results_df["metric"] == metric)]["score"].max():
                            score = NoEscape(f"\\textbf{{{score:.3f}}}")
                        else:
                            score = f"{score:.3f}"
                        std = results_df[(results_df["model"] == model) & (results_df["user"] == user) & (results_df["metric"] == metric)]["std"].values[0]
                        std = f"{std:.3f}"
                        score = format_as_math(f"{score} \pm {std}")
                    except Exception as e:
                        print(e)
                        score = "-"
                        std = ""
                    # score = format_as_math(f"{score} \pm {std}")
                    clean_line.append(score)
            table.add_row(clean_line)


        table.add_hline()

    return doc

doc = create_document(results_df, USERS)
print(doc.dumps())

\documentclass{article}%
\usepackage[T1]{fontenc}%
\usepackage[utf8]{inputenc}%
\usepackage{lmodern}%
\usepackage{textcomp}%
\usepackage{lastpage}%
%
%
%
\begin{document}%
\normalsize%
\begin{tabular}{l>{\columncolor{gray!10}}c>{\columncolor{gray!10}}ccc}%
\hline%
 &\multicolumn{2}{c}{\texttt{David}}&\multicolumn{2}{c}{\texttt{Jen}}\\%
\hline%
Model&BLEU&ROUGE&BLEU&ROUGE\\%
\texttt{Mistral{-}7B{-}Instruct{-}v0.2}&$0.219 \pm 0.009$&$0.353 \pm 0.007$&$0.133 \pm 0.009$&$0.251 \pm 0.012$\\%
\texttt{Mistral{-}7B{-}Instruct{-}v0.3}&$0.229 \pm 0.021$&$0.373 \pm 0.021$&$0.164 \pm 0.014$&$0.307 \pm 0.016$\\%
\texttt{Phi{-}3{-}mini{-}4k{-}instruct}&$0.268 \pm 0.020$&$0.422 \pm 0.018$&$0.180 \pm 0.006$&$0.331 \pm 0.011$\\%
\texttt{Meta{-}Llama{-}3{-}8B{-}Instruct}&$\textbf{0.394} \pm 0.011$&$\textbf{0.548} \pm 0.013$&$\textbf{0.264} \pm 0.011$&$\textbf{0.411} \pm 0.015$\\%
\hline%
\end{tabular}%
\end{document}


In [33]:
mean_results_df = results_df.groupby(["model", "metric"]).agg({'score': ['mean'], 'std': ['mean']}).reset_index()
mean_results_df.columns = ["model", "metric", "score", "std"]
mean_results_df["user"] = "all_users"
print(mean_results_df)


doc = create_document(mean_results_df, ["all_users"])
print(doc.dumps())

                      model metric     score       std       user
0  Meta-Llama-3-8B-Instruct   BLEU  0.328959  0.011384  all_users
1  Meta-Llama-3-8B-Instruct  ROUGE  0.479390  0.013552  all_users
2  Mistral-7B-Instruct-v0.2   BLEU  0.175651  0.008821  all_users
3  Mistral-7B-Instruct-v0.2  ROUGE  0.301873  0.009640  all_users
4  Mistral-7B-Instruct-v0.3   BLEU  0.196598  0.017929  all_users
5  Mistral-7B-Instruct-v0.3  ROUGE  0.339756  0.018329  all_users
6    Phi-3-mini-4k-instruct   BLEU  0.223928  0.013331  all_users
7    Phi-3-mini-4k-instruct  ROUGE  0.376145  0.014241  all_users
\documentclass{article}%
\usepackage[T1]{fontenc}%
\usepackage[utf8]{inputenc}%
\usepackage{lmodern}%
\usepackage{textcomp}%
\usepackage{lastpage}%
%
%
%
\begin{document}%
\normalsize%
\begin{tabular}{l>{\columncolor{gray!10}}c>{\columncolor{gray!10}}c}%
\hline%
 &\multicolumn{2}{c}{\texttt{all\_users}}\\%
\hline%
Model&BLEU&ROUGE\\%
\texttt{Mistral{-}7B{-}Instruct{-}v0.2}&$0.176 \pm 0.009$&$0.302 \pm 0