In [39]:

import matplotlib.pyplot as plt
import tabulate

import numpy as np

import random
import os
import json

import pandas as pd

import pandas as pd
import wandb
api = wandb.Api()

USERS = ["David", "Jeff", "Kay", "Sara", "Tana"]
METRICS = ["BLEU", "Rouge", "Mauve"]
METRIC_NAMES = {
    "BLEU": "BLEU",
    "Rouge": "rougeLsum_fmeasure",
    "Mauve": "MAUVE"

}
MODEL_TYPES = ["FFT", "RoSA", "LoRA"]

COLUMNS = ["model", "user", "model_type", "metric", "RAFT", "RAG", "score", "seed", "metadata"]

BASE_MODEL_RESULTS = "/nfs/scistore19/alistgrp/anicolic/repos/PanzaMailFork/results/logs_base_models"

# GENERATIVE_MODEL = "Phi-3-mini-4k-instruct"
# GENERATIVE_MODEL = "Mistral-7B-Instruct-v0.2"
GENERATIVE_MODEL = "Meta-Llama-3-8B-Instruct"

In [40]:
def extract_config_from_run_name(run_name):
    user = [u for u in USERS if u.lower() in run_name][0]
    model_type = [m for m in MODEL_TYPES if m.lower() in run_name][0]
    if model_type == "RoSA" and "lr0.0-epochs" in run_name:
        model_type = "LoRA"
    raft = "RAFT" in run_name
    seed = int(run_name.split("seed")[-1].split("-")[0])
    return user, model_type, raft, seed


def load_wandb_project_runs(project_name):
    results_df = pd.DataFrame(columns=COLUMNS)
    runs = api.runs(project_name)
    for run in runs:
        if run.summary["_timestamp"] < 1717813799.4198122:
            continue  # Skip older runs
        user, model_type, raft, seed = extract_config_from_run_name(run.name)
        if user in ["David", "Jeff"] and "lr0.0001" in run.name:
            continue
        if user in ["Kay", "Sara", "Tana"] and "lr0.0001" not in run.name:
            continue
        summary = run.summary._json_dict
        for rag in [False, True]:
            for metric in METRICS:
                try:
                    score = summary[f"EVAL/{METRIC_NAMES[metric]}{'-RAG' if rag else ''}-mean"]
                except Exception as e:
                    print(f"Skipping {run.name} because of {e}")
                    continue
                results_df = results_df._append({
                    "model": "Llama3",
                    "user": user,
                    "model_type": model_type,
                    "metric": metric,
                    "RAFT": raft,
                    "RAG": rag,
                    "score": score,
                    "seed": seed,
                    "metadata": None
                }, ignore_index=True)

    print(len(results_df))
    return results_df

# all_projects = [
#     ("david", "diverse-vit/panza-david_anonymous-Phi3-June8"),
#     ("jeff", "diverse-vit/panza-jeff_johnson-Phi3-June8"),
#     ("kay", "diverse-vit/panza-kay_brown-Phi3-June8"),
#     ("sara", "diverse-vit/panza-shackleton_sara-Phi3-June8"),
#     ("tana", "diverse-vit/panza-tana_williams-Phi3-June8"),
# ]

all_projects = [
    ("david", "diverse-vit/panza-david_anonymous-llama3-June9"),
    ("jeff", "diverse-vit/panza-jeff_johnson-llama3-June9"),
    ("kay", "diverse-vit/panza-kay_brown-llama3-June9"),
    ("sara", "diverse-vit/panza-shackleton_sara-llama3-June9"),
    ("tana", "diverse-vit/panza-tana_williams-llama3-June9"),
]

results_df = pd.DataFrame(columns=COLUMNS)
for _, project_name in all_projects:
    results_df = pd.concat([results_df, load_wandb_project_runs(project_name)])

# results_df = load_wandb_project_runs("diverse-vit/panza-jeff_johnson-Phi3-June8")

print(results_df)
results_df = results_df.groupby(['model', 'user', 'model_type', 'metric', 'RAFT', 'RAG']).agg({'score': ['mean', 'std']}).reset_index()
results_df.columns = ['model', 'user', 'model_type', 'metric', 'RAFT', 'RAG', 'score', 'score_std']
print(results_df)

  results_df = results_df._append({


Skipping panza_david_anonymous_llama3_bf16-bs8-fft-lr1e-05-epochs3-wu20-seed41-PREAMBLE-10783 because of 'EVAL/BLEU-mean'
Skipping panza_david_anonymous_llama3_bf16-bs8-fft-lr1e-05-epochs3-wu20-seed41-PREAMBLE-10783 because of 'EVAL/rougeLsum_fmeasure-mean'
Skipping panza_david_anonymous_llama3_bf16-bs8-fft-lr1e-05-epochs3-wu20-seed41-PREAMBLE-10783 because of 'EVAL/MAUVE-mean'
Skipping panza_david_anonymous_llama3_bf16-bs8-fft-lr1e-05-epochs3-wu20-seed41-PREAMBLE-10783 because of 'EVAL/BLEU-RAG-mean'
Skipping panza_david_anonymous_llama3_bf16-bs8-fft-lr1e-05-epochs3-wu20-seed41-PREAMBLE-10783 because of 'EVAL/rougeLsum_fmeasure-RAG-mean'
Skipping panza_david_anonymous_llama3_bf16-bs8-fft-lr1e-05-epochs3-wu20-seed41-PREAMBLE-10783 because of 'EVAL/MAUVE-RAG-mean'
102


  results_df = pd.concat([results_df, load_wandb_project_runs(project_name)])
  results_df = results_df._append({


108


  results_df = results_df._append({


108


  results_df = results_df._append({


108


  results_df = results_df._append({


108
      model   user model_type metric   RAFT    RAG     score seed metadata
0    Llama3  David       LoRA   BLEU   True  False  0.221758   43     None
1    Llama3  David       LoRA  Rouge   True  False  0.376480   43     None
2    Llama3  David       LoRA  Mauve   True  False  0.130876   43     None
3    Llama3  David       LoRA   BLEU   True   True  0.259400   43     None
4    Llama3  David       LoRA  Rouge   True   True  0.407788   43     None
..      ...    ...        ...    ...    ...    ...       ...  ...      ...
103  Llama3   Tana        FFT  Rouge  False  False  0.358420   41     None
104  Llama3   Tana        FFT  Mauve  False  False  0.801999   41     None
105  Llama3   Tana        FFT   BLEU  False   True  0.242749   41     None
106  Llama3   Tana        FFT  Rouge  False   True  0.328652   41     None
107  Llama3   Tana        FFT  Mauve  False   True  0.902079   41     None

[534 rows x 9 columns]
      model   user model_type metric   RAFT    RAG     score  score_std


In [41]:
def extract_config_from_results_name(file_name):
    user = [u for u in USERS if u.lower() in file_name][0]
    seed = int(file_name.split("seed")[-1].split("_")[0])
    rag = "RAG" in file_name
    return user, seed, rag


def load_base_model_results(results_path):
    results_df = pd.DataFrame(columns=COLUMNS)
    for file_name in os.listdir(results_path):
        if GENERATIVE_MODEL in file_name and "summary" in file_name: # and file_name.endswith(".json"):
            user, seed, rag = extract_config_from_results_name(file_name)
            with open(os.path.join(results_path, file_name)) as f:
                results = json.load(f)
            for metric in METRICS:
                score = results["means"][METRIC_NAMES[metric]]
                results_df = results_df._append({
                    "model": GENERATIVE_MODEL,
                    "user": user,
                    "model_type": "Base",
                    "metric": metric,
                    "RAFT": False,
                    "RAG": rag,
                    "score": score,
                    "seed": seed,
                    "metadata": None
                }, ignore_index=True)

    return results_df

base_results_df = load_base_model_results(BASE_MODEL_RESULTS)
base_results_df = base_results_df.groupby(['model', 'user', 'model_type', 'metric', 'RAFT', 'RAG']).agg({'score': ['mean', 'std']}).reset_index()
base_results_df.columns = ['model', 'user', 'model_type', 'metric', 'RAFT', 'RAG', 'score', 'score_std']
print(base_results_df)

  results_df = results_df._append({


                       model   user model_type metric   RAFT    RAG     score  \
0   Meta-Llama-3-8B-Instruct  David       Base   BLEU  False  False  0.083463   
1   Meta-Llama-3-8B-Instruct  David       Base   BLEU  False   True  0.107134   
2   Meta-Llama-3-8B-Instruct  David       Base  Mauve  False  False  0.009092   
3   Meta-Llama-3-8B-Instruct  David       Base  Mauve  False   True  0.017172   
4   Meta-Llama-3-8B-Instruct  David       Base  Rouge  False  False  0.181028   
5   Meta-Llama-3-8B-Instruct  David       Base  Rouge  False   True  0.212159   
6   Meta-Llama-3-8B-Instruct   Jeff       Base   BLEU  False  False  0.108474   
7   Meta-Llama-3-8B-Instruct   Jeff       Base   BLEU  False   True  0.115027   
8   Meta-Llama-3-8B-Instruct   Jeff       Base  Mauve  False  False  0.004072   
9   Meta-Llama-3-8B-Instruct   Jeff       Base  Mauve  False   True  0.005002   
10  Meta-Llama-3-8B-Instruct   Jeff       Base  Rouge  False  False  0.181796   
11  Meta-Llama-3-8B-Instruct

In [42]:
def get_mock_results():
    df = pd.DataFrame(columns=COLUMNS)

    for user in USERS:
        for model_type in MODEL_TYPES:
            for metric in METRICS:
                for rag in [False, True]:
                    for raft in [False, True]:
                        df = df._append(
                            {
                                "model": "model",
                                "user": user,
                                "model_type": model_type,
                                "metric": metric,
                                "RAFT": raft,
                                "RAG": rag,
                                "score": random.random(),
                                "metadata": None,
                            },
                            ignore_index=True,
                        )

    return df


def create_all_results_table(results_df, base_results_df=None):
    table = []
    table_bold = []
    if base_results_df is not None:
        for rag in [False, True]:
            rag_str = "-RAG" if rag else ""
            model_str = f"Pretrained{rag_str}"
            line = [model_str]
            line_bold = [False]
            for user in USERS:
                for metric in METRICS:
                    try:
                        score = base_results_df[
                            (base_results_df["user"] == user)
                            & (base_results_df["metric"] == metric)
                            & (base_results_df["RAG"] == rag)
                        ]["score"].values[0]
                        # best_score = base_results_df[
                        #     (base_results_df["user"] == user)
                        #     & (base_results_df["metric"] == metric)]["score"].max()
                        # if score == best_score:
                        #     line_bold.append(True)
                        # else:
                        #     line_bold.append(False)
                        line_bold.append(False)
                    except Exception as e:
                        print(e)
                        score = "-"
                        line_bold.append(False)
                    line.append(score)
            table.append(line)
            table_bold.append(line_bold)

    for model_type in MODEL_TYPES:
        for raft in [False, True]:
            for rag in [False, True]:
                # Create line for given model type and raft
                filtered_df = results_df[
                    (results_df["model_type"] == model_type)
                    & (results_df["RAFT"] == raft)
                    & (results_df["RAG"] == rag)
                ]
                raft_str = "-RAFT" if raft else ""
                rag_str = "-RAG" if rag else ""
                model_str = f"{model_type}{raft_str}{rag_str}"
                line = [model_str]
                line_bold = [False]
                for user in USERS:
                    for metric in METRICS:
                        try:
                            score = filtered_df[
                                (filtered_df["user"] == user) & (filtered_df["metric"] == metric)
                            ]["score"].values[0]
                            best_score = results_df[(results_df["user"] == user) & (results_df["metric"] == metric)]["score"].max()
                            # best_score = results_df[(results_df["user"] == user) & (results_df["metric"] == metric) & (results_df["model_type"] == model_type)]["score"].max()
                            if score == best_score:
                                line_bold.append(True)
                            else:
                                line_bold.append(False)
                        except Exception as e:
                            print(e)
                            score = "-"
                            line_bold.append(False)
                        line.append(score)
                table.append(line)
                table_bold.append(line_bold)
    return table, table_bold

# mock_df = get_mock_results()
table, table_bold = create_all_results_table(results_df, base_results_df)
for i, line in enumerate(table):
    print(line)
    print(table_bold[i])

['Pretrained', 0.0834630674868822, 0.18102784951527914, 0.009091640952813812, 0.1084743357270725, 0.1817964946403522, 0.0040720962619612555, 0.11264385019118588, 0.18632575752282585, 0.005397102711700084, 0.14433266845132622, 0.22975609025784902, 0.0040720962619612555, 0.13189641830940993, 0.2095547671252418, 0.005630289023601111]
[False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False]
['Pretrained-RAG', 0.10713399892052015, 0.2121585038304329, 0.01717154339118201, 0.11502747702621674, 0.18804668406991995, 0.005001847776876451, 0.12076302043472727, 0.1973427694497837, 0.00435001375085826, 0.15138914164687908, 0.23308546656653994, 0.004072096261961256, 0.14894982898563502, 0.22742815637251454, 0.004792967776141999]
[False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False]
['FFT', 0.27772917898371813, 0.46040987253189086, 0.9961982169052762, 0.1655147774141539, 0.28226156005736

In [43]:
from pylatex import Document, Section, Tabular, MultiColumn, MultiRow, Command, NoEscape

def format_as_math(number):
    """Return number formatted as LaTeX math."""
    return NoEscape(f"${number}$")

def get_colored_column():
    """Return a colored column formatter."""
    return NoEscape(">{\columncolor{gray!10}}")

def create_document(results_table, bold_table=None):
    doc = Document("results")

    columns_format = ""
    columns_format += "l"
    for i, user in enumerate(USERS):
        for metric in METRICS:
            if i % 2 == 0:
                columns_format += get_colored_column()
            columns_format += "c"
    # with doc.create(Tabular("c" * (1 + len(USERS) * len(METRICS)))) as table:
    with doc.create(Tabular(columns_format)) as table:
        table.add_hline()

        # Add user columns
        user_columns = []
        user_columns.append(" ")
        for i, user in enumerate(USERS):
            if False: #i % 2 == 0:
                user_columns.append(MultiColumn(3, align='c', data=Command('cellcolor', arguments=['gray!10'], extra_arguments=Command('texttt', user))))
            else:
                user_columns.append(MultiColumn(3, align='c', data=Command('texttt', user)))
        table.add_row(user_columns)
        table.add_hline()

        # Add metric columns
        metrics_columns = []
        metrics_columns.append("Method")
        for user in USERS:
            for metric in METRICS:
                metrics_columns.append(metric)
        table.add_row(metrics_columns)

        for i, line in enumerate(results_table, 1):
            # Add horizontal line after each model type
            if i > 1 and results_table[i-1][0][:3] != results_table[i-2][0][:3]:
                table.add_hline()

            clean_line = []
            for element in line:
                if isinstance(element, str):
                    element = Command('texttt', element)
                    clean_line.append(element)
                else:
                    element = float(element)
                    element = round(element, 3)
                    if table_bold and table_bold[i-1][len(clean_line)]:
                        element = NoEscape(f"\\textbf{{{element}}}")
                    element = format_as_math(element)
                    clean_line.append(element)
            table.add_row(clean_line)

            # Add horizontal line after each model type
            # if i % 4 == 0:
            #     table.add_hline()

        table.add_hline()

    return doc

doc = create_document(table)
print(doc.dumps())

\documentclass{article}%
\usepackage[T1]{fontenc}%
\usepackage[utf8]{inputenc}%
\usepackage{lmodern}%
\usepackage{textcomp}%
\usepackage{lastpage}%
%
%
%
\begin{document}%
\normalsize%
\begin{tabular}{l>{\columncolor{gray!10}}c>{\columncolor{gray!10}}c>{\columncolor{gray!10}}cccc>{\columncolor{gray!10}}c>{\columncolor{gray!10}}c>{\columncolor{gray!10}}cccc>{\columncolor{gray!10}}c>{\columncolor{gray!10}}c>{\columncolor{gray!10}}c}%
\hline%
 &\multicolumn{3}{c}{\texttt{David}}&\multicolumn{3}{c}{\texttt{Jeff}}&\multicolumn{3}{c}{\texttt{Kay}}&\multicolumn{3}{c}{\texttt{Sara}}&\multicolumn{3}{c}{\texttt{Tana}}\\%
\hline%
Method&BLEU&Rouge&Mauve&BLEU&Rouge&Mauve&BLEU&Rouge&Mauve&BLEU&Rouge&Mauve&BLEU&Rouge&Mauve\\%
\texttt{Pretrained}&$0.083$&$0.181$&$0.009$&$0.108$&$0.182$&$0.004$&$0.113$&$0.186$&$0.005$&$0.144$&$0.23$&$0.004$&$0.132$&$0.21$&$0.006$\\%
\texttt{Pretrained{-}RAG}&$0.107$&$0.212$&$0.017$&$0.115$&$0.188$&$0.005$&$0.121$&$0.197$&$0.004$&$0.151$&$0.233$&$0.004$&$0.149$&$0.227$