# Imports

In [1]:
%reload_ext autoreload
%autoreload 2


# import pickle
from plots import (
    MAX_TABLE_SIZE,
    make_table_avg,
    make_perf_table,
)
from utils import load_pickle
from generating_data.utils_for_notebooks import merge_methods
import pandas as pd
from IPython.display import display
import sys

# Functions

In [2]:
def make_table_1(data_for_table_1):
    """Assemble Table 1 (MAE and rank on MMLU and HELM) as a DataFrame and LaTeX.

    Args:
        data_for_table_1: Sequence of exactly four ``(frame, num_anchors)``
            pairs, in the order MMLU MAE, MMLU rank, HELM MAE, HELM rank.
            Every frame is read as ``frame.loc[row_label][column_label]``.
            Either HELM frame may be ``None``; it is then replaced by a
            NaN-filled copy of the matching MMLU frame. All four
            ``num_anchors`` values must be equal.

    Returns:
        ``(df, latex_str)``: the table as a DataFrame (two header rows
        followed by one row per approach/prediction combination) and the
        LaTeX string produced by ``make_table_1_latex`` for that frame.

    Raises:
        AssertionError: if there are not exactly four entries or the anchor
            counts differ.
    """

    def to_percent(value):
        """Scale a fraction to percent with two decimals; missing -> NaN."""
        # NaN never compares equal to anything (itself included), so a
        # `value == float('nan')` test can never fire; pd.isna detects it.
        if value is None or pd.isna(value):
            return float('nan')
        return round(value * 100, 2)

    assert len(data_for_table_1) == 4 # mae and rank for mmlu and helm

    mmlu_maes, num_anchors_mmlu_maes = data_for_table_1[0]
    mmlu_ranks, num_anchors_mmlu_ranks = data_for_table_1[1]
    helm_maes, num_anchors_helm_maes = data_for_table_1[2]
    helm_ranks, num_anchors_helm_ranks = data_for_table_1[3]
    assert num_anchors_mmlu_maes == num_anchors_mmlu_ranks == num_anchors_helm_maes == num_anchors_helm_ranks
    num_anchors = num_anchors_mmlu_maes

    # Missing HELM results become all-NaN frames with the MMLU layout so the
    # same row/column lookups below still succeed.
    if helm_maes is None:
        helm_maes = mmlu_maes.copy()
        helm_maes.loc[:,:] = float('nan')
    if helm_ranks is None:
        helm_ranks = mmlu_ranks.copy()
        helm_ranks.loc[:,:] = float('nan')

    rows = [
        [ # headers
            "Approach",
            "Condensation", # type
            "Condensation", # num_anchors
            "Prediction", # type
            "MMLU", # mae
            "MMLU", # rank
            "HELM", # mae
            "HELM", # rank
        ],
        [
            "",
            "type", # type
            "num_anchors", # num_anchors
            "type", # type
            "mae", # mae
            "rank", # rank
            "mae", # mae
            "rank", # rank
        ],
    ]

    # (approach, condensation type, prediction type, frame row label, frame column label)
    row_specs = [
        ("Baseline", "Random", "Eval", "random", "naive"),
        ("Baseline", "Random", "kNN", "random", "KNN"),
        ("Baseline", "Random", "linear", "random", "linear"),
        # tinyBenchmarks
        ("tinyBenchmarks", "anchor-correctness", "gp-IRT", "anchor", "gpirt"),
        ("tinyBenchmarks", "anchor-IRT", "gp-IRT", "anchor-irt", "gpirt"),
        ("tinyBenchmarks", "Random", "gp-IRT", "random", "gpirt"),
        ("DISCO (ours)", "High PDS", "kNN", "highest", "KNN"),
        ("DISCO (ours)", "High PDS", "linear", "highest", "linear"),
    ]

    # Metric columns appear in this order in every data row.
    metric_frames = (mmlu_maes, mmlu_ranks, helm_maes, helm_ranks)
    for approach, condensation, prediction, row_label, column_label in row_specs:
        metrics = [
            to_percent(frame.loc[row_label][column_label])
            for frame in metric_frames
        ]
        rows.append([approach, condensation, num_anchors, prediction] + metrics)

    df = pd.DataFrame(rows)

    latex_str = make_table_1_latex(df)

    return df, latex_str


def make_table_1_latex(df):
    """Render the Table 1 frame as a LaTeX ``table`` environment.

    Args:
        df: DataFrame with exactly eight columns in the order approach,
            condensation type, number of samples, prediction type, MMLU MAE,
            MMLU rank, HELM MAE, HELM rank. Rows whose approach cell is
            ``"Approach"`` or ``""`` are treated as header rows and skipped.

    Returns:
        The LaTeX source as a string.

    Side effects:
        ``df.columns`` is overwritten with the display headers and the LaTeX
        string is also stored in ``df.attrs['latex_table']``.
    """

    def fmt(value):
        """Format a metric with two decimals, or '-' when it is missing."""
        return "-" if pd.isna(value) else f"{float(value):.2f}"

    # Add column headers
    df.columns = ["Approach", "Type", "# Samples", "Type", "MAE", "Rank", "MAE", "Rank"]

    # Create LaTeX table content
    parts = [
        "\\begin{table}[H]\n",
        "\\centering\n\\small\n",
        "\\begin{tabular}{c|cc|c|cc|cc}\n",
        "\\toprule\n",
        "\\multicolumn{1}{c}{\\textbf{Approach}}&\\multicolumn{2}{c}{\\textbf{Condensation}} & \\multicolumn{1}{c}{\\textbf{Prediction}} & \\multicolumn{2}{c}{\\textbf{MMLU}}& \\multicolumn{2}{c}{\\textbf{HELM}} \\\\\n",
        "&Type & \\# \\negthinspace Samples & Type & {MAE}  &Rank& {MAE}  &Rank \\\\\n",
        "\\toprule\n",
    ]

    # Process each row. Rows are read positionally because the column labels
    # "Type", "MAE" and "Rank" each occur twice.
    current_approach = ""
    for (approach, condensation_type, num_samples, prediction_type,
         raw_mae_mmlu, raw_rank_mmlu, raw_mae_helm, raw_rank_helm) in df.itertuples(index=False, name=None):
        if approach == "Approach" or approach == "":
            continue
        if approach == current_approach:
            # Only the first row of a group prints the approach name.
            approach_str = ""
        else:
            approach_str = approach
            current_approach = approach

            # Add midrule before new approach except for the "Baseline" group
            if approach_str != "Baseline":
                parts.append("\\midrule\n")

        # Format numbers
        mae_mmlu = fmt(raw_mae_mmlu)
        rank_mmlu = fmt(raw_rank_mmlu)
        mae_helm = fmt(raw_mae_helm)
        rank_helm = fmt(raw_rank_helm)

        # Bold best results. The check uses the row's real approach label:
        # approach_str is blank for every repeated-approach row, so testing it
        # would never match the second "DISCO (ours)" row.
        if approach == "DISCO (ours)" and prediction_type == "linear":
            mae_mmlu = f"\\textbf{{{mae_mmlu}}}"
            rank_mmlu = f"\\textbf{{{rank_mmlu}}}"

        parts.append(
            f"{approach_str}&{condensation_type} & {num_samples} & {prediction_type} & {mae_mmlu} &{rank_mmlu} & {mae_helm} &{rank_helm} \\\\\n"
        )

    parts.append("\\bottomrule\n")
    parts.append("\\end{tabular}\n")
    parts.append("\\vspace{1em}\n")
    parts.append("\\caption{Mean Absolute Error (MAE) for different sampling and prediction strategies. For question answering task on MMLU dataset [FIX]. \\joon{Add computational complexity info\nAdd HELM results, add ranking metric, add method from HELM.\n}}\n")
    parts.append("\\label{tab:language-main}\n")
    parts.append("\\end{table}")
    latex_str = "".join(parts)

    # Store LaTeX code in DataFrame metadata
    df.attrs['latex_table'] = latex_str
    return latex_str


def extract_data_for_table_1(source_df, num_anchors, lower_better):
    """Reduce one per-anchor-count results frame to the entries Table 1 reads.

    Args:
        source_df: Indexable by ``num_anchors``; the selected item is a
            DataFrame with a 'PDS type' column and the five learned-predictor
            columns listed below.
        num_anchors: Key of the frame to reduce; returned unchanged.
        lower_better: If truthy the best value is the minimum, otherwise the
            maximum.

    Returns:
        ``(frame, num_anchors)``. Rows with a 'PDS type' are collapsed to one
        best row per type; rows without one are kept as they are. The five
        predictor columns are replaced by a single 'linear' column holding
        their row-wise best, and the 'stratified', '#guiding_models', 'cirt'
        and 'pirt' columns are removed when present.
    """
    # Name of the pandas reduction that picks the "best" value.
    best = "min" if lower_better else "max"

    table = source_df[num_anchors]
    pds_type = table['PDS type']
    rows_without_pds = table[pds_type.isna()]
    rows_with_pds = table[pds_type.notna()]

    # One best row per PDS type, followed by the untouched rows without one.
    best_per_pds = getattr(rows_with_pds.groupby('PDS type'), best)()
    combined = pd.concat([best_per_pds, rows_without_pds])

    # Collapse the learned predictors into a single best-of column.
    predictor_cols = ['MLP3_e700_lr0.001', 'Ridge_10', 'Lasso_e-4', 'RandomForestRegressor_100', 'GradientBoostingRegressor_200']
    combined['linear'] = getattr(combined[predictor_cols], best)(axis=1)
    combined = combined.drop(columns=predictor_cols)

    # These columns stop being meaningful once rows have been collapsed.
    obsolete = [
        name
        for name in ('stratified', '#guiding_models', 'cirt', 'pirt')
        if name in combined.columns
    ]
    combined = combined.drop(obsolete, axis=1)

    return combined, num_anchors


def make_df_with_results(table_avg, table_std, bench, split):
    """Build the per-sample-count performance tables for one bench/split.

    Args:
        table_avg: Nested mapping indexed as ``table_avg[bench][split]``; it
            must also contain a ``"mmlu_fields"`` entry for ``split``.
        table_std: Nested mapping indexed as ``table_std[bench][split]``.
        bench: Benchmark key.
        split: Split key.

    Returns:
        The object returned by ``make_perf_table`` (indexed by number of
        samples), with each frame's columns reordered, the 'all' marker in
        '#guiding_models' replaced by 382, and rows sorted.

    Side effects:
        Sets the pandas display options for max rows, max columns and max
        column width to ``MAX_TABLE_SIZE``.
    """
    # NOTE(review): the method list always comes from the "mmlu_fields" entry,
    # whatever `bench` is — confirm this is intentional.
    method_names = table_avg["mmlu_fields"][split].keys()

    tables = make_perf_table(
        table_avg[bench][split],
        table_std[bench][split],
        methods=method_names,
    )

    for option in ('display.max_rows', 'display.max_columns', "display.max_colwidth"):
        pd.set_option(option, MAX_TABLE_SIZE)

    leading_cols = ['#guiding_models', 'PDS type', 'stratified']
    sort_keys = ['PDS type', 'stratified', '#guiding_models']
    for sample_count in tables.keys():
        frame = tables[sample_count]

        # Move guiding models, PDS type and stratified to the front.
        trailing_cols = [name for name in frame.columns.tolist() if name not in leading_cols]
        frame = frame[leading_cols + trailing_cols]

        # Rows marked 'all' in #guiding_models get the numeric value 382.
        frame.loc[frame['#guiding_models'] == 'all', '#guiding_models'] = 382

        tables[sample_count] = frame.sort_values(sort_keys)

    return tables

# Read data

In [3]:
# Load the pickled results for every bench / split / method listed below and
# collect, per aggregation type, the reduced frames that make_table_1 consumes.
#
# Each leaf of results_suffixes is a filename suffix; it is interpolated into
# results/accs_{bench}_split-{split}_iterations-5{suffix}.pickle further down.
results_suffixes = {
    "mmlu_fields": {

        "iid": {
            "ours": "_disagreement_best_47",
            "irt": "_disagreement_compare_with_irt43"
        },
        "noniid": {
            "ours": "_disagreement_best_48",
            "irt": "_disagreement_compare_with_irt44"
        }
    }
    # No "helm" entry is configured here.
}
scenarios_to_skip = []
# Filled with one (frame, num_anchors) pair per aggregation type, in the order
# of the agg_type list below ("mae" first, then "rank").
table_1_data = []
for bench, per_bench in results_suffixes.items():
    # Forwarded to make_table_avg as `ordered`; True only for "mmlu_fields".
    ordered = bench == "mmlu_fields"
    for split, per_split in per_bench.items():
        for agg_type in ["mae", "rank"]:
            # Accumulators that merge the "ours" and "irt" results for this
            # bench / split / agg_type combination.
            table_avg_base = None
            table_std_base = None
            for method in [
                "ours",
                "irt"
            ]:
                filename_suffix = per_split[method]
                results_path = f'results/accs_{bench}_split-{split}_iterations-5{filename_suffix}.pickle'
                # NOTE(review): load_pickle presumably unpickles the file; only
                # use it on trusted result files.
                data = load_pickle(results_path)

                current_table_avg, current_table_std, current_model_perf = make_table_avg(
                    bench,
                    split,
                    filename_suffix,
                    data,
                    scenarios_to_skip=scenarios_to_skip,
                    ordered=ordered,
                    return_perf_table=True,
                    agg_type=agg_type
                )
                table_avg_base = merge_methods(table_avg_base, current_table_avg)
                table_std_base = merge_methods(table_std_base, current_table_std)
            # Only the noniid split contributes to table_1_data; the iid
            # results are still loaded and aggregated above but not used here.
            if split == "noniid":
                df = make_df_with_results(table_avg_base, table_std_base, bench, split)
                # MAE is better when lower; for "rank" the maximum is taken.
                table_1_data.append(extract_data_for_table_1(df, num_anchors=100, lower_better=(agg_type == "mae")))

# table_1_data now holds the MMLU mae and rank entries, each reduced to the
# best value across sampling methods by extract_data_for_table_1.

  rank_corrs[i,j,l] = stats.spearmanr(data.mean(axis=3)[i,j,:,l], scores.T.mean(axis=1)).statistic
  rank_corrs[i,j,l] = stats.spearmanr(data.mean(axis=3)[i,j,:,l], scores.T.mean(axis=1)).statistic


# Table 1

In [4]:
# table_1_data holds only the MMLU mae/rank entries, so two (None, 100)
# placeholders fill the HELM mae/rank slots: make_table_1 requires exactly four
# entries with equal anchor counts and turns None frames into NaN-filled ones.
table_1, latex_str = make_table_1(table_1_data + [(None, 100), (None, 100)])
display(table_1)
print(latex_str)

Unnamed: 0,Approach,Type,# Samples,Type.1,MAE,Rank,MAE.1,Rank.1
0,Approach,Condensation,Condensation,Prediction,MMLU,MMLU,HELM,HELM
1,,type,num_anchors,type,mae,rank,mae,rank
2,Baseline,Random,100,Eval,3.45,91.55,,
3,Baseline,Random,100,kNN,1.82,91.17,,
4,Baseline,Random,100,linear,1.59,94.23,,
5,tinyBenchmarks,anchor-correctness,100,gp-IRT,2.08,92.72,,
6,tinyBenchmarks,anchor-IRT,100,gp-IRT,3.25,92.18,,
7,tinyBenchmarks,Random,100,gp-IRT,2.79,92.2,,
8,DISCO (ours),High PDS,100,kNN,1.31,97.21,,
9,DISCO (ours),High PDS,100,linear,1.04,98.58,,


\begin{table}[H]
\centering
\small
\begin{tabular}{c|cc|c|cc|cc}
\toprule
\multicolumn{1}{c}{\textbf{Approach}}&\multicolumn{2}{c}{\textbf{Condensation}} & \multicolumn{1}{c}{\textbf{Prediction}} & \multicolumn{2}{c}{\textbf{MMLU}}& \multicolumn{2}{c}{\textbf{HELM}} \\
&Type & \# \negthinspace Samples & Type & {MAE}  &Rank& {MAE}  &Rank \\
\toprule
Baseline&Random & 100 & Eval & 3.45 &91.55 & - &- \\
&Random & 100 & kNN & 1.82 &91.17 & - &- \\
&Random & 100 & linear & 1.59 &94.23 & - &- \\
\midrule
tinyBenchmarks&anchor-correctness & 100 & gp-IRT & 2.08 &92.72 & - &- \\
&anchor-IRT & 100 & gp-IRT & 3.25 &92.18 & - &- \\
&Random & 100 & gp-IRT & 2.79 &92.20 & - &- \\
\midrule
DISCO (ours)&High PDS & 100 & kNN & 1.31 &97.21 & - &- \\
&High PDS & 100 & linear & 1.04 &98.58 & - &- \\
\bottomrule
\end{tabular}
\vspace{1em}
\caption{Mean Absolute Error (MAE) for different sampling and prediction strategies. For question answering task on MMLU dataset [FIX]. \joon{Add computational complexity