# Demo of core features

First, some imports and some plot settings.

In [2]:
import glob
import imageio.v2 as iio
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import seaborn as sns
import yaml

from collections import defaultdict
from importlib import reload
from itertools import product
from numpy.random import Generator
from pathlib import Path
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from time import time
from tqdm.auto import tqdm
from warnings import filterwarnings

from genexpy import lower_bounds as gu
from genexpy import kernels as ku
from genexpy import probability_distributions as prob
from genexpy import rankings_utils as ru
from genexpy import mmd as mmd

Next, we load the hyperparameters from the config.

In [3]:
# Parse config.yaml once; every constant below is read from the resulting dict.
with open("config.yaml", 'r') as file:
    config = yaml.safe_load(file)

# -- Where results and figures are written
_paths_cfg = config['paths']
OUTPUT_DIR, FIGURES_DIR = Path(_paths_cfg['output_dir']), Path(_paths_cfg['figures_dir'])

# -- Output format option
FORMAT = config['format']['output']

# -- Seed, quantile level and confidence level
_params_cfg = config['parameters']
SEED = _params_cfg['seed']
RNG = np.random.default_rng(SEED)  # shared generator used when sampling experimental conditions
ALPHA, LR_CONFIDENCE = _params_cfg['alpha'], _params_cfg['lr_confidence']
# Lower/upper quantile levels of a central interval with coverage LR_CONFIDENCE
CI_LOWER = (1 - LR_CONFIDENCE) / 2
CI_UPPER = LR_CONFIDENCE + CI_LOWER

# -- Dataset location and column roles
_data_cfg = config['data']
DATASET = Path(_data_cfg['dataset_path'])
EXPERIMENTAL_FACTORS, TARGET, ALTERNATIVES = (
    _data_cfg['experimental_factors'], _data_cfg['target'], _data_cfg['alternatives']
)

# -- Sampling options
_sampling_cfg = config['sampling']
SAMPLE_SIZE, DISJOINT, REPLACE = (
    _sampling_cfg['sample_size'], _sampling_cfg['disjoint'], _sampling_cfg['replace']
)

Then, we load the dataset into a pandas dataframe and we check that only one experimental factor is set to `None`, indicating that it is allowed to vary. 

In [50]:
# Validate the configuration first, so that a bad config fails with a clear message
# before any of its values are used.
# Exactly one experimental factor must be None: that is the factor allowed to vary.
assert sum(value is None for value in EXPERIMENTAL_FACTORS.values()) == 1, "Exactly one experimental factor must be null in config.yaml"

# Load the raw table; the reader is chosen from the file extension.
if DATASET.suffix == '.parquet':
    df = pd.read_parquet(DATASET)
elif DATASET.suffix == '.csv':
    df = pd.read_csv(DATASET)
else:
    raise ValueError("Please use a Parquet or CSV file as the format of your data")

# Keep only the rows where preferred_score equals score_key, then drop both helper columns.
df = df.query("preferred_score == score_key").drop(columns=["preferred_score", "score_key"])

columns_to_check = set(EXPERIMENTAL_FACTORS.keys()).union({TARGET, ALTERNATIVES})
# Check whether the factors listed in the config actually exist in the df
if not_in_df := columns_to_check - set(df.columns):
    raise ValueError(f"The following columns are missing from the dataframe: {not_in_df}")

# Check whether the factors listed in the config are exhaustive
if not_in_config := set(df.columns) - columns_to_check:
    raise ValueError(f"The following columns in the dataframe are not required: {not_in_config}")

# preprocessing: remove underrepresented models and tasks
rf = ru.get_rankings_from_df(df.reset_index(drop=True),
                             factors=list(EXPERIMENTAL_FACTORS.keys()),
                             alternatives=ALTERNATIVES,
                             target=TARGET,
                             lower_is_better=False, impute_missing=False)

# Maximum tolerated fraction of missing entries per column / per row of the rank matrix.
tol = 0.2
rf = rf.loc[:, rf.isna().sum(axis=0) <= rf.shape[0] * tol]  # drop columns with too many NaNs
rf = rf.loc[rf.isna().sum(axis=1) <= rf.shape[1] * tol, :]  # then drop rows with too many NaNs
# Keep only the df rows whose combination of factor levels survived as a column of rf.
df = df.loc[df.set_index(list(EXPERIMENTAL_FACTORS.keys())).index.isin(rf.columns)]

In [55]:
# Count rows per score_key among the rows where preferred_score equals score_key.
# NOTE(review): this re-reads DATASET with read_parquet unconditionally, although the loading
# cell also accepts CSV files — confirm the dataset is always Parquet when running this cell.
pd.read_parquet(DATASET).query("preferred_score == score_key")["score_key"].value_counts()

score_key
exact_str_match                          97167
multiple_choice_grade                    58029
rougeLsum                                16855
normalized_aggregate_score                4016
difference_score                          2131
bleu                                      1716
accuracy                                   608
numeric_match_with_0_1_relative_error      528
f1                                         468
macro_f1                                   212
alignment_score                            192
full                                       189
sequence_f1                                180
bleurt                                     180
log_likelihood                             176
custom_score                               160
targets_reached                            128
fairness                                   128
average_log_probability                     96
main_words_match                            96
bias_level                                  64
mea

In [67]:
# Number of rows per task in the current df.
df.task_name.value_counts()

task_name
mult_data_wrangling                        33345
multiemo                                   14985
natural_instructions                        8370
arithmetic                                  2772
bbq_lite_json                               2565
                                           ...  
auto_categorization                          132
simple_arithmetic_multiple_targets_json      132
codenames                                    132
object_counting                              132
simple_arithmetic_json                        45
Name: count, Length: 155, dtype: int64

We then build a query from the experimental factors and use it to filter `df`. Note that here we assume that the special value `_all` designates that we want to use all possible values for that column.

In [182]:
# query df for the fixed factor levels in config.yaml
# Factors set to None or "_all" contribute no condition. String levels are quoted,
# any other level is interpolated as-is.
query_string = " and ".join(f"{factor} == '{lvl}'" if isinstance(lvl, str) else f"{factor} == {lvl}"
                            for factor, lvl in EXPERIMENTAL_FACTORS.items()
                            if lvl not in [None, "_all"])
# An empty expression makes DataFrame.query raise ValueError, so only filter when at least
# one factor is pinned. Any other error from the query is left to propagate instead of
# being silently swallowed.
if query_string:
    df = df.query(query_string)

# for the not allowed-to-vary factors, get their combinations
all_level_factors = [factor for factor, lvl in EXPERIMENTAL_FACTORS.items() if lvl == "_all"]
if all_level_factors:
    # maps each observed combination of levels to the index labels of its rows
    groups = df.groupby(all_level_factors).groups
else:
    # No "_all" factor: treat the whole dataframe as one group.
    groups = {"None": df.index}

Next, we load the kernels that we defined in the config. We will later iterate over them, so as to perform our analysis for each of them.

In [183]:
def init_kernels(rank_matrix):
    kernels = {}
    for kernel_config in config['kernels']:
        kernel_func = getattr(ku, kernel_config['kernel'], None)
        
        if kernel_func:
            delta = kernel_config['delta']  # to get epsilon
            match kernel_config['kernel']:
                case "mallows_kernel":
                    eps = np.sqrt(2 * (1 - np.exp(-delta)))  # assumes nu = 1/binom(n, 2)
                case "jaccard_kernel":
                    eps = np.sqrt(2 * (1 - (1-delta)))
                case "borda_kernel":
                    eps = np.sqrt(2 * (1 - np.exp(-delta)))   # assumes nu = 1/n
                case _ :
                    raise ValueError(f"The kernel {kernel_config['kernel']} must be either the Jaccard, Mallows, or Borda kernel.")

            for param_key, param_values in kernel_config['params'].items():
                if isinstance(param_values, list):
                    for value in param_values:
                        params = {param_key: value}
                        if param_key == 'idx':
                            params[param_key] = rank_matrix.index.get_loc(value)

                        kernel_name = f"{kernel_config['kernel']}_{param_key}_{value}"
                        kernels[kernel_name] = (kernel_func, params, eps, delta)
                else:
                    params = {param_key: param_values}
                    if param_key == 'idx':
                        params[param_key] = rank_matrix.index.get_loc(param_values)

                    kernel_name = f"{kernel_config['kernel']}_{param_key}_{param_values}"
                    kernels[kernel_name] = (kernel_func, params, eps, delta)
        else:
            print(f"Kernel function '{kernel_config['kernel']}' not found in module 'kernels'.")
    return kernels

We create some directories for the different experiments we run.

In [184]:
def create_experiment_directory(kernel_name, factors, delta):
    """Create the output directories of one (factor levels, kernel) experiment.

    The layout is ``OUTPUT_DIR / <key=value pairs joined by "_"> / <kernel_name> / ...``.
    Factors whose value is ``None`` are left out of the directory name.

    Returns
    -------
    tuple of Path
        ``(nstar_dir, generalizability_dir, quantiles_dir)``, all created if missing.
        Only the nstar directory name depends on ``ALPHA``, ``delta`` and ``LR_CONFIDENCE``.
    """
    factor_tag = "_".join(f"{key}={value}" for key, value in factors.items() if value is not None)
    kernel_dir = OUTPUT_DIR / factor_tag / f"{kernel_name}"

    experiment_dirs = (
        kernel_dir / f"nstar_N_ALPHA={ALPHA}_delta={delta}_ci={LR_CONFIDENCE}",
        kernel_dir / "computed_generalizability",
        kernel_dir / "computed_quantiles",
    )
    for directory in experiment_dirs:
        directory.mkdir(parents=True, exist_ok=True)
    return experiment_dirs

We will now define a few methods that will be run in a loop until our `ec_pool` is empty.

We start by sampling from our `ec_pool` and converting our samples to the corresponding rankings.

In [185]:
def sample_ecs(ec_pool, sample_size):
    """Draw ``sample_size`` entries of ``ec_pool`` without replacement, using the shared ``RNG``.

    Fails an assertion when more entries are requested than the pool contains.
    """
    pool_size = len(ec_pool)
    assert sample_size <= pool_size, f"Sample size {sample_size} is larger than |ec_pool| = {pool_size}"

    sampled = RNG.choice(ec_pool, sample_size, replace=False)
    return sampled

def compute_rankings(ecs, rank_matrix):
    """Restrict ``rank_matrix`` to the columns in ``ecs`` and convert them to rankings.

    Returns
    -------
    tuple
        ``(rankings, nv)``: the ``ru.SampleAM`` built from the selected columns, and the
        number of selected columns.
    """
    selected = rank_matrix.loc[:, ecs]
    _, n_columns = selected.shape

    # Generate rankings from the data
    return ru.SampleAM.from_rank_function_dataframe(selected), n_columns

We then compute the variance, the variance lower bound, and the MMDs of these rankings.

In [186]:
def compute_variance_and_lower_bound(rankings, n, kbar, eps, kernel, kernelargs):
    """Compute the kernel variance of ``rankings`` and the derived lower bound.

    Parameters
    ----------
    rankings : sample of rankings accepted by ``ku.var``
    n : int
        Sample size passed to the lower bound.
    kbar : number
        Forwarded to ``gu.sample_mean_embedding_lowerbound`` as ``kbar``
        (presumably an upper bound on the kernel — confirm against that function's docs).
        It used to be ignored in favour of a hard-coded 1; the only visible caller
        passes 1, so results are unchanged.
    eps : float
        Threshold passed to the lower bound.
    kernel, kernelargs :
        Kernel function and its keyword arguments, forwarded to ``ku.var``.

    Returns
    -------
    tuple
        ``(variance, var_lower_bound)``.
    """
    variance = ku.var(rankings, use_rv=True, kernel=kernel, **kernelargs)
    var_lower_bound = gu.sample_mean_embedding_lowerbound(eps, n, kbar=kbar, v=variance)
    return variance, var_lower_bound

def calculate_mmds(rankings, nv, kernel, kernelargs):
    """Sample the MMD distribution for a range of subsample sizes.

    For every ``n`` from 2 up to (excluding) ``min(nv // 2 + 1, 50)``, calls
    ``mmd.subsample_mmd_distribution`` with 100 repetitions and the notebook-wide
    ``SEED``, ``DISJOINT`` and ``REPLACE`` settings.

    Returns
    -------
    dict
        ``n -> result of mmd.subsample_mmd_distribution`` for that subsample size.
    """
    stop = min(nv // 2 + 1, 50)  # exclusive upper bound on the subsample size
    mmds = {}
    for n in range(2, stop):
        mmds[n] = mmd.subsample_mmd_distribution(
            rankings, subsample_size=n, rep=100, use_rv=True, use_key=False,
            seed=SEED, disjoint=DISJOINT, replace=REPLACE, kernel=kernel, **kernelargs
        )
    return mmds

Then, we create dataframes consisting of Generalizability scores and Quantiles.

In [187]:
def create_generalizability_dataframe(mmds, logepss):
    """Evaluate ``mmd.generalizability`` on a grid of log-thresholds, in long format.

    Parameters
    ----------
    mmds : dict
        ``n -> MMD sample`` for each subsample size.
    logepss : iterable of float
        Log-thresholds; each is exponentiated before being passed to ``mmd.generalizability``.

    Returns
    -------
    pandas.DataFrame
        Columns ``log(eps)``, ``n`` (int) and ``generalizability``, with one row per
        (threshold, n) pair.
    """
    curves = {}
    for n, mmde in mmds.items():
        curves[n] = [mmd.generalizability(mmde, np.exp(logeps)) for logeps in logepss]

    wide = pd.DataFrame(curves, index=logepss).reset_index()
    long = wide.melt(id_vars='index', var_name='n', value_name='generalizability')
    long = long.rename(columns={'index': 'log(eps)'})
    long['n'] = long['n'].astype(int)
    return long

def create_quantiles_dataframe(mmds):
    """Tabulate the log of the ``ALPHA``-quantile of each MMD sample.

    Returns
    -------
    pandas.DataFrame
        One row per subsample size, with columns ``n``, ``log(eps)`` (log of the
        ALPHA-quantile of the sample for that ``n``) and ``log(n)``.
    """
    rows = [(n, np.log(np.quantile(mmde, ALPHA))) for n, mmde in mmds.items()]
    quantiles = pd.DataFrame(rows, columns=['n', 'log(eps)'])
    return quantiles.assign(**{'log(n)': np.log(quantiles['n'])})

We now use the quantiles dataframe to calculate `nstar`. To this end, we fit linear regression models on subsets.

In [188]:
def perform_linear_regression_with_cv(dfq):
    """Fit ``log(n) ~ log(eps)`` once per cross-validation fold.

    The number of folds equals the number of rows of ``dfq``, so each model is trained
    on all rows but one and evaluated on the row left out.

    Returns
    -------
    tuple
        ``(linear_predictors, residuals)``: the fitted ``LinearRegression`` of each fold,
        and the held-out residuals ``y - prediction`` collected across folds.
    """
    # Extracting features and target from DataFrame
    X = dfq[['log(eps)']].values
    y = dfq[['log(n)']].values

    linear_predictors, residuals = [], []
    for train_index, test_index in KFold(n_splits=len(y)).split(X):
        fold_model = LinearRegression().fit(X[train_index], y[train_index])
        linear_predictors.append(fold_model)

        held_out_error = y[test_index] - fold_model.predict(X[test_index])
        residuals.extend(held_out_error)

    return linear_predictors, residuals

def predict_nstar(logepss, linear_predictors, dfq, eps):
    """Predict ``n`` over the grid ``logepss`` and read off nstar at ``log(eps)``.

    Parameters
    ----------
    logepss : numpy.ndarray
        Grid of log-thresholds at which predictions are made.
    linear_predictors : list
        Per-fold fitted regressors of ``log(n)`` on ``log(eps)``.
    dfq : pandas.DataFrame
        Must contain the columns ``log(eps)`` and ``log(n)``; a regression on all rows
        gives the point prediction.
    eps : float
        Target threshold.

    Returns
    -------
    tuple
        ``(ns_pred, ns_pred_cv, nstar, nstar_lower, nstar_upper)``: predictions of the full
        model and of each fold model over the grid, the full-model prediction at the first
        grid point strictly above ``log(eps)``, and the ``CI_LOWER`` / ``CI_UPPER`` quantiles
        of the fold predictions at that same grid point.
    """
    X = dfq[['log(eps)']].values
    y = dfq[['log(n)']].values
    grid = logepss.reshape(-1, 1)

    ns_pred_cv = [np.exp(lr.predict(grid).reshape(-1)) for lr in linear_predictors]
    full_model = LinearRegression().fit(X, y)
    ns_pred = np.exp(full_model.predict(grid).reshape(-1))

    # Index of the first grid point strictly greater than log(eps)
    # (argmax gives 0 when no grid point qualifies).
    target_idx = np.argmax(logepss > np.log(eps))

    # Fold predictions that are identically zero are left out of the interval.
    nstar_cv = [pred[target_idx] for pred in ns_pred_cv if not np.all(pred == 0)]
    nstar = ns_pred[target_idx]

    nstar_lower, nstar_upper = np.quantile(nstar_cv, [CI_LOWER, CI_UPPER])

    return ns_pred, ns_pred_cv, nstar, nstar_lower, nstar_upper

Lastly, we plot our results.

In [189]:
# def plot_generalizability_and_quantiles(dfy, dfq, logepss, ns_pred, ns_pred_cv, nstar, nstar_upper, nstar_lower, kernel_dir, eps):
#     # Create figure and axes
#     fig, axes = plt.subplots(2, 1, sharex="all", figsize=(10, 8))
#
#     # Generalizability plot
#     ax = axes[0]
#     sns.lineplot(data=dfy, x="log(eps)", y="generalizability", hue="n", ax=ax, palette=palette)
#     ax.hlines(ALPHA, ls="--", xmin=np.min(logepss), xmax=np.max(logepss), color="black")
#     for n in dfq["n"].unique():
#         ax.vlines(dfq.loc[dfq.n==n, "log(eps)"].iloc[0], ymin=0, ymax=ALPHA, ls=":")
#     sns.despine(ax=ax)
#
#     # Quantiles plot
#     ax = axes[1]
#     ymax = max(ns_pred)
#     sns.lineplot(data=dfq, x="log(eps)", y="n", ax=ax, ls="", marker="o", hue="n", legend=False)
#     for n in dfq["n"].unique():
#         ax.vlines(dfq.loc[dfq.n==n, "log(eps)"].iloc[0], ymin=n, ymax=ymax, ls=":")
#
#     ax.vlines(np.log(eps), ymin=0.1, ymax=ymax, color="black", ls="--")
#     sns.lineplot(x=logepss, y=ns_pred, color="green", ls="-.", ax=ax)
#
#     for it, ns_tmp in enumerate(ns_pred_cv):
#         if np.max(ns_tmp) > 1000:
#             continue
#         sns.lineplot(x=logepss, y=ns_tmp, color="green", ls="-.", alpha=0.5, ax=ax)
#
#     ax.set_xlabel(r"$\log(\varepsilon)$")
#     ax.set_ylabel(r"$n$")
#
#     # N* Lines
#     ax.hlines(nstar, xmin=np.min(logepss), xmax=np.log(eps), ls="-", color="red")
#     ax.hlines(nstar_upper, xmin=np.min(logepss), xmax=np.log(eps), ls="-", color="red", alpha=0.3)
#     ax.hlines(nstar_lower, xmin=np.min(logepss), xmax=np.log(eps), ls="-", color="red", alpha=0.3)
#     ax.set_yscale("log")
#     sns.despine(ax=ax)
#
#     # Finalize and save
#     fig.suptitle(f"Generalizability for $N = {len(ecs):02d}$\n"
#                  fr"$n^* (\alpha={ALPHA}, \varepsilon={eps:.2f}) = {np.ceil(nstar)}$" + "\n"
#                  f"${LR_CONFIDENCE}$-confidence interval: $[{np.ceil(nstar_lower)}, {np.ceil(nstar_upper)}]$")
#     plt.tight_layout()
#     if FORMAT == "pdf" or FORMAT == "all":
#         plt.savefig(kernel_dir / f"N={len(ecs):02d}.pdf")
#     if FORMAT == "png" or FORMAT == "all":
#         plt.savefig(kernel_dir / f"N={len(ecs):02d}.png")
#     plt.close("all")

In [190]:
# plt.ioff()

# Main experiment loop: one pass per entry of `groups`, i.e. per set of df rows sharing
# one combination of levels of the factors set to "_all" (or the whole df if there is none).
# NOTE(review): `fixed_levels` (the group key) is not used in the body; the levels are
# re-derived from the data below.
for fixed_levels, idxs in tqdm(list(groups.items()), position=0, desc="Configurations", leave=True):
    idf = df.loc[idxs].reset_index(drop=True)

    if idf.empty:
        continue

    # fixed levels: the levels pinned in the config, plus the level that each "_all"
    # factor takes in this group (first unique value found in idf)
    factors_dict = {factor: lvl
                    for factor, lvl in EXPERIMENTAL_FACTORS.items()
                    if lvl not in [None, "_all"]}
    factors_dict.update({factor: idf[factor].unique()[0] for factor, lvl in EXPERIMENTAL_FACTORS.items()
                         if lvl == "_all"})

    # -- convert df to rank matrix
    rank_matrix = ru.get_rankings_from_df(idf, factors=list(EXPERIMENTAL_FACTORS.keys()), 
                                            alternatives=ALTERNATIVES,
                                            target=TARGET,
                                            lower_is_better=False, impute_missing=True)
    # any NaN left after imputation is replaced by the maximum of its column
    rank_matrix = rank_matrix.fillna(rank_matrix.max())

    # -- get all kernels
    kernels = init_kernels(rank_matrix)

    # -- set up the ec pool
    # the factor set to None in the config is the one whose levels are sampled
    ec_variable = next((key for key, value in EXPERIMENTAL_FACTORS.items() if value is None), None)
    ec_pool = idf[ec_variable].unique()
    ecs = np.array([])

    # for kernelname, (kernel, kernelargs, epsstar) in tqdm(kernels.items(), position=1, desc='Kernels', leave=False):
    for kernelname, (kernel, kernelargs, epsstar, deltastar) in kernels.items():
        # NOTE(review): the third parameter of create_experiment_directory is named `delta`
        # and is written into the directory name as "delta=...", but `epsstar` is passed
        # here — confirm whether `deltastar` was intended. Changing it renames the nstar
        # output directory.
        nstar_dir, gen_dir, quant_dir = create_experiment_directory(kernelname, factors_dict, epsstar)
        out = []
        # for i in tqdm(range(len(ec_pool) // SAMPLE_SIZE), desc=f'Using {kernelname}', leave=False):
        # Grow the sample in steps of SAMPLE_SIZE: iteration i uses (i+1)*SAMPLE_SIZE conditions.
        for i in range(len(ec_pool) // SAMPLE_SIZE):

            # NOTE(review): for a positive integer SAMPLE_SIZE this check cannot trigger, because
            # i < len(ec_pool) // SAMPLE_SIZE already implies (i+1)*SAMPLE_SIZE <= len(ec_pool).
            if (i+1)*SAMPLE_SIZE > len(ec_pool):
                break
            # never use more than 50 sampled conditions
            if (i+1)*SAMPLE_SIZE > 50:
                break

            # -- Sample new rankings from ec pool
            # (a fresh draw of the full size each time, not an extension of the previous sample)
            ecs = sample_ecs(ec_pool, (i+1)*SAMPLE_SIZE)
            rankings, nv = compute_rankings(ecs, rank_matrix)

            # -- Compute the lower bound
            variance, var_lower_bound = compute_variance_and_lower_bound(rankings, n=len(ecs), kbar=1, eps=epsstar, kernel=kernel, kernelargs=kernelargs)

            # -- We do not need to compute dfy and dfq again if we have already computed them for another alpha/epsstar
            if f"dfy_{len(ecs)}" in [x.stem for x in gen_dir.glob("*.parquet")] and f"dfmmd_{len(ecs)}" in [x.stem for x in quant_dir.glob("*.parquet")]:
                try:
                    dfy = pd.read_parquet(gen_dir / f"dfy_{len(ecs)}.parquet")
                    dfmmd = pd.read_parquet(quant_dir / f"dfmmd_{len(ecs)}.parquet")

                    # the ALPHA-quantiles are recomputed from the cached MMD samples
                    dfq = pd.DataFrame(dfmmd.groupby("n")["eps"].quantile(ALPHA)).reset_index()
                    dfq["log(eps)"] = np.log(dfq["eps"])
                    dfq["log(n)"] = np.log(dfq["n"])

                    logepss = dfy["log(eps)"].unique()
                except Exception as e:
                    # report which configuration failed before re-raising
                    print(factors_dict)
                    raise e
            else:
                # -- Compute mmds
                mmds = calculate_mmds(rankings, nv, kernel=kernel, kernelargs=kernelargs)
                dfmmd = pd.DataFrame(mmds).melt(var_name="n", value_name="eps")

                # -- Compute generalizability and quantiles

                # - Prepare log(eps) scale
                # 1000 points from log(epsstar) - 0.1 to the largest log ALPHA-quantile + 0.1
                logepss = np.linspace(np.log(epsstar) - 0.1, np.log(max(np.quantile(mmde, ALPHA) for mmde in mmds.values())) + 0.1, 1000)

                # - Dataframe for generalizability
                dfy = create_generalizability_dataframe(mmds, logepss)

                # - Dataframe for quantiles
                dfq = create_quantiles_dataframe(mmds)

            # -- Linear Regression with Cross-Validation
            try:
                linear_predictors, residuals = perform_linear_regression_with_cv(dfq)
                # -- Predictions
                ns_pred, ns_pred_cv, nstar, nstar_lower, nstar_upper = predict_nstar(logepss, linear_predictors, dfq, epsstar)
                singular = False
            except ValueError:
                # The regression could not be fitted or evaluated: fall back to nstar = 1 and
                # flag the row. ns_pred / ns_pred_cv are not (re)assigned on this path.
                nstar = nstar_lower = nstar_upper = 1
                singular = True

            # -- Plotting
            # plot_generalizability_and_quantiles(dfy, dfq, logepss, ns_pred, ns_pred_cv, nstar, nstar_upper, nstar_lower, nstar_dir, epsstar)

            # -- Storing
            result_dict = {
                "kernel": kernelname,
                "alpha": ALPHA,
                "eps": epsstar,
                "delta": deltastar,
                "disjoint": DISJOINT,
                "replace": REPLACE,
                "N": len(ecs),
                "nstar": nstar,
                "nstar_lower": nstar_lower,
                "nstar_upper": nstar_upper,
                "variance": variance,
                "var_lower_bound": var_lower_bound,
                "singular": singular
            }
            result_dict.update(factors_dict)
            out.append(result_dict)

            # dfy and dfmmd are written on every iteration, including when they were just
            # read back from these same files
            dfy.to_parquet(gen_dir / f"dfy_{len(ecs)}.parquet")
            # dfq.to_parquet(quant_dir / f"dfq_{len(ecs)}_{ALPHA}.parquet")
            dfmmd.to_parquet(quant_dir / f"dfmmd_{len(ecs)}.parquet")

        # if FORMAT == "gif" or FORMAT == "all":
        #     images = [iio.imread(image) for image in glob.glob(str(nstar_dir / "*.png"))]
        #     iio.mimwrite(nstar_dir / f"nstar.gif", images, duration=750, loop=0)
        # -- Store nstar predictions
        # one row per sample size N for this kernel and configuration
        out = pd.DataFrame(out)
        out.to_parquet(nstar_dir / "nstar.parquet")
# plt.ion()

Configurations:   0%|          | 0/463 [00:00<?, ?it/s]

# Plots

First of all, we collect the results for nstar into a single dataframe.

In [64]:
# Read every nstar.parquet found below OUTPUT_DIR and stack them into one frame.
nstar_files = list(OUTPUT_DIR.glob("**/**/**/nstar.parquet"))
df_nstars = [pd.read_parquet(x) for x in tqdm(nstar_files, desc="Loading dataframes")]
df_nstar = (
    pd.concat(df_nstars)
    .reset_index(drop=True)
    # round eps so that it can be compared and used as a merge key later on
    .assign(eps=lambda frame: frame["eps"].round(3))
)

# For every combination of levels of the "_all" factors, attach the largest N as column "Nmax"
# ("N" + the "max" suffix, added because the joined column clashes with the existing "N").
fixed_factors = [factor for factor, lvl in EXPERIMENTAL_FACTORS.items() if lvl == "_all"]
maxN = df_nstar.groupby(fixed_factors)["N"].max()
df_nstar = df_nstar.join(maxN, on=fixed_factors, rsuffix="max")

Now we need to prepare the dataframe for plotting.
First of all, we fix epsilon to the value corresponding to delta=0.05 and let alpha vary.
The corresponding rounded values for epsilon are 0.316 for the Jaccard kernel and 0.312 for the Mallows and Borda kernels.
Second, we make sure that N is at the maximum for every combination of levels of not allowed-to-vary factors.

Let's fix the plotting parameters.

In [14]:
# Global seaborn theme for all figures below.
sns.set(style="ticks", palette="flare_r", context="paper", font="times new roman")

# LaTeX packages loaded when matplotlib renders text through LaTeX (usetex below).
preamble = r"""
    \usepackage{mathptmx}
    \usepackage{amsmath}
"""
# NOTE(review): this selects the TkAgg backend explicitly; inside a notebook that replaces the
# default backend and needs a display — confirm this is intended.
mpl.use("TkAgg")
mpl.rcParams['text.usetex'] = True
mpl.rcParams['text.latex.preamble'] = preamble
mpl.rc('font', family='Times New Roman')

# pretty names
pc = {"alpha": r"$\alpha^*$", "eps": r"$\varepsilon^*$", "nstar": r"$n^*$", "delta": r"$\delta^*$", "N": r"$N$", "nstar_absrel_error": "relative error"}  # pretty columns
# pretty kernel names, keyed by the kernel names stored in the results
pk = {"borda_kernel_idx_GPT GPT-3 XL": r"$\kappa_b^{\text{GPT3}, 1/n}$", "mallows_kernel_nu_auto": r"$\kappa_m^{1/\binom{n}{2}}$", "jaccard_kernel_k_1": r"$\kappa_j^{1}$"}
# NOTE(review): this update uses the same three keys, so the labels defined on the previous
# line are all overwritten and only $g_1$, $g_2$, $g_3$ are ever used.
pk.update({"borda_kernel_idx_GPT GPT-3 XL": "$g_1$", "mallows_kernel_nu_auto": "$g_3$", "jaccard_kernel_k_1": "$g_2$"})

Let's now add the worst case scenario theoretical upper bound prediction.

In [7]:
@np.vectorize
def theoretical_nstar(alphastar, epsstar, kbar=1):
    """Theoretical worst-case prediction of nstar, applied element-wise.

    Evaluates ``exp(b0 - 2 * log(epsstar))`` with
    ``b0 = 2 * log(sqrt(2 * kbar) + sqrt(-4 * kbar * log(1 - alphastar)))``.

    Parameters
    ----------
    alphastar : float or array-like
    epsstar : float or array-like
    kbar : number, default 1

    Returns
    -------
    numpy.ndarray
        The predictions, broadcast over the inputs.
    """
    log_eps = np.log(epsstar)
    intercept = 2*np.log(np.sqrt(2*kbar) + np.sqrt(-4*kbar * np.log(1-alphastar)))
    slope = -2
    return np.exp(intercept + slope*log_eps)

## All plots together for alpha and delta

In [42]:
# Two-panel figure: boxplots of n* per kernel, against alpha (left) and against delta (right).
plt.close("all")
fig, axes = plt.subplots(1, 2, figsize=(5.5, 5.5/2.5), width_ratios=(1, 1), sharey=True)

# ----  ALPHA
ax = axes[0]
# results for delta == 0.05, restricted to the rows where N equals its maximum (Nmax)
dfplot = df_nstar.loc[(df_nstar["delta"] == 0.05) & (df_nstar["N"] == df_nstar["Nmax"])]

# Make dfplot pretty
dfplot = dfplot.rename(columns=pc)
dfplot["kernel"] = dfplot["kernel"].map(pk)
# NOTE(review): nstar_th is computed here but not drawn anywhere in this cell.
dfplot["nstar_th"] = theoretical_nstar(dfplot[pc["alpha"]], dfplot[pc["eps"]], kbar=1)

# plot
sns.boxplot(dfplot, x=pc["alpha"], y=pc["nstar"], ax=ax, hue="kernel", showfliers=False, palette="cubehelix",
            dodge=True, native_scale=False, fill=False, legend=False,
            width=0.75, boxprops={"linewidth": 1.2}, gap=0.25)
# ax.set(xticks=[0.7, 0.8, 0.9, 0.99])
ax.grid(color="grey", alpha=0.2)

# ----  DELTA
ax = axes[1]
# results for alpha == 0.95, restricted to the rows where N equals its maximum (Nmax)
dfplot = df_nstar.loc[(df_nstar["alpha"] == 0.95) & (df_nstar["N"] == df_nstar["Nmax"])]

# Make dfplot pretty
dfplot = dfplot.rename(columns=pc)
dfplot["kernel"] = dfplot["kernel"].map(pk)

# plot
sns.boxplot(dfplot, x=pc["delta"], y=pc["nstar"], ax=ax, hue="kernel", showfliers=False, palette="cubehelix",
            dodge=True, native_scale=False, fill=False, legend=False,
            width=0.75, boxprops={"linewidth": 1.2}, gap=0.25)
# ax.set(xticks=[0.01, 0.1, 0.2, 0.3])
ax.grid(color="grey", alpha=0.2)

# nice legend
# built by hand on the right panel, with a white frame edge
ax.legend(*ax.get_legend_handles_labels()).get_frame().set_edgecolor("w")

sns.despine(right=True, top=True)
plt.tight_layout(pad=.5)
plt.subplots_adjust(wspace=.12)
plt.savefig(FIGURES_DIR / "llms_nstar_alpha_delta.pdf")
plt.show()

## Varying $\alpha$

In [196]:
# Data for the alpha plots: delta fixed at 0.05, N at its maximum, with pretty column and
# kernel names and the theoretical prediction added as "nstar_th".
dfplot = (
    df_nstar
    .loc[(df_nstar["delta"] == 0.05) & (df_nstar["N"] == df_nstar["Nmax"])]
    .rename(columns=pc)
    .assign(kernel=lambda frame: frame["kernel"].map(pk))
    .assign(nstar_th=lambda frame: theoretical_nstar(frame[pc["alpha"]], frame[pc["eps"]], kbar=1))
)

In [197]:
# NOTE(review): this silences every warning for the rest of the session, not only the ones
# raised by the plotting calls below.
filterwarnings("ignore")

plt.close("all")
# alpha values in increasing order, shared by the box and swarm plots
order = np.sort(dfplot[pc["alpha"]].unique())

# one panel per kernel, n* against alpha
g = sns.FacetGrid(data=dfplot, col="kernel", sharey=True, aspect=1, height=5.5/3)
# add theoretical
# for ax, k in zip(g.axes.flat, ["borda", "jaccard", "mallows"]):
#     alphas = np.linspace(dfplot[pc["alpha"]].min(), dfplot[pc["alpha"]].max(), 100)
#     eps = 0.316 if k == "jaccard" else 0.312
#     nstars_th = theoretical_nstar(alphas, eps)
#     sns.lineplot(x=alphas, y=nstars_th, ax=ax, c="black", linewidth=0.5)

# translucent boxes with the individual points drawn on top
g.map(sns.boxplot, pc["alpha"], pc["nstar"], width=0.4, showfliers=False, native_scale=True, order=order,
      boxprops={"alpha": 0.3})
g.map(sns.swarmplot, pc["alpha"], pc["nstar"], native_scale=True, order=order, size=0.75)
g.map(plt.grid, color="grey", alpha=0.2)

g.set_titles("{col_name}")
# NOTE(review): tick positions are hard-coded; presumably they match the alpha values in the
# results — confirm.
g.set(xticks=[0.7, 0.8, 0.9, 0.99])

g.tight_layout(pad=0.5)
g.savefig(FIGURES_DIR / "llm_nstar_alpha.pdf")
plt.show()

## Varying $\delta$

In [198]:
# Data for the delta plots: alpha fixed at 0.95, N at its maximum, with pretty column and
# kernel names.
dfplot = (
    df_nstar
    .loc[(df_nstar["alpha"] == 0.95) & (df_nstar["N"] == df_nstar["Nmax"])]
    .rename(columns=pc)
    .assign(kernel=lambda frame: frame["kernel"].map(pk))
)

In [199]:
# NOTE(review): this silences every warning for the rest of the session, not only the ones
# raised by the plotting calls below.
filterwarnings("ignore")

plt.close("all")
# delta values in increasing order, shared by the box and swarm plots
order = np.sort(dfplot[pc["delta"]].unique())

# one panel per kernel, n* against delta
g = sns.FacetGrid(data=dfplot, col="kernel", sharey=True, aspect=1, height=5.5/3)

# # add theoretical
# for ax, k in zip(g.axes.flat, ["borda", "jaccard", "mallows"]):
#     a = 0.95
#     epss = np.linspace(dfplot[pc["eps"]].min(), dfplot[pc["eps"]].max(), 100)
#     nstars_th = theoretical_nstar(a, epss)
#     sns.lineplot(x=epss, y=nstars_th, ax=ax, c="black", linewidth=0.5)

# translucent boxes with the individual points drawn on top
g.map(sns.boxplot, pc["delta"], pc["nstar"], width=0.4, showfliers=False, native_scale=True, order=order,
      boxprops={"alpha": 0.3})
g.map(sns.swarmplot, pc["delta"], pc["nstar"], native_scale=True, order=order, size=0.75)
g.map(plt.grid, color="grey", alpha=0.2)

g.set_titles("{col_name}")
# g.set(xticks=[0.2, 0.4, 0.6, 0.8], yscale="log")
# NOTE(review): tick positions are hard-coded; presumably they match the delta values in the
# results — confirm.
g.set(xticks=[0.01, 0.1, 0.2, 0.3])

g.tight_layout(pad=0.5)
g.savefig(FIGURES_DIR / "llm_nstar_delta.pdf")
plt.show()

Let's see the weird combinations

In [None]:
# df_ = dfplot.copy()
# df_ = df_.loc[df_["kernel"] == pk["jaccard_kernel_k_1"]]
# df_ = df_.loc[df_[pc["eps"]] == df_[pc["eps"]].max()]
#
# df_.loc[df_[pc["nstar"]] == df_[pc["nstar"]].min()][[pc["nstar"], "model", "tuning", "scoring"]]

## nstar prediction from N

See how the prediction changes with N

In [48]:
# Reference ("true") n*: the estimate obtained at the largest N of each configuration.
true_nstar = df_nstar.loc[df_nstar["N"] == df_nstar["Nmax"]].drop(columns=["N", "Nmax"])
# NOTE(review): the factor columns are hard-coded here, whereas the loading cell derives them
# from EXPERIMENTAL_FACTORS (fixed_factors) — confirm they agree with the config in use.
keys = ["kernel", "alpha", "eps"] + ["task_name", "number_of_shots"]
# Pair every estimate with the reference of the same kernel/alpha/eps/task/shots.
df_ = pd.merge(df_nstar, true_nstar, left_on=keys, right_on=keys, suffixes=("", "_true"))[keys + ["nstar", "nstar_true", "N", "Nmax"]]
# signed, relative, absolute and absolute-relative deviations from the reference
df_["nstar_error"] = df_["nstar"] - df_["nstar_true"]
df_["nstar_relative_error"] = (df_["nstar"] - df_["nstar_true"]) / df_["nstar_true"]
df_["nstar_absolute_error"] = np.abs(df_["nstar"] - df_["nstar_true"])
df_["nstar_absrel_error"] = np.abs(df_["nstar"] - df_["nstar_true"]) / df_["nstar_true"]
# df_ = df_.loc[df_["N"] != df_["Nmax"]]
# number of (task, shots) configurations before and after keeping only those with Nmax == 50
print(len(df_.groupby(["task_name", "number_of_shots"]).groups))
df_ = df_.query("Nmax == 50")
print(len(df_.groupby(["task_name", "number_of_shots"]).groups))

# the reference rows themselves (N == Nmax) are left out of the plot
dfplot = df_.copy().query("N < Nmax").rename(columns=pc)
dfplot["kernel"] = dfplot["kernel"].map(pk)

y = pc["nstar_absrel_error"]

fig, ax = plt.subplots(1, 1, figsize=(5.5/2, 2))

# NOTE(review): showfliers=False is combined with fliersize=0.3; presumably the latter has no
# effect while fliers are hidden.
sns.boxplot(dfplot, x=pc["N"], y=y, showfliers=False, fliersize=0.3, hue="kernel", palette="cubehelix", ax=ax, legend=True, linewidth=1.2, fill=False, gap=0.2)

ax.grid(color="grey", alpha=.2)
ax.set_yticks([0, 0.2, 0.4, 0.6, 0.8])

# the y-axis label is deliberately left empty
ax.set_ylabel("")


# nice legend
ax.legend(*ax.get_legend_handles_labels()).get_frame().set_edgecolor("w")
sns.despine()
plt.tight_layout(pad=.5)

plt.savefig(FIGURES_DIR / "llms_nstar_absrel_error.pdf")
plt.show()

24
9


# Intuitive explanation of generalizability

Plot generalizability as function of 2n, for some fixed epsilon.
On the same plot, plot the average kernel within the 2n sampled experimental conditions.

# Special results

In [59]:
# Rows where n* equals 1. The main loop assigns nstar = 1 as the fallback when the
# regression raises ValueError (those rows have singular == True).
df_nstar.query("nstar == 1")

Unnamed: 0,kernel,alpha,eps,delta,disjoint,replace,N,nstar,nstar_lower,nstar_upper,variance,var_lower_bound,singular,task_name,number_of_shots,Nmax
72,jaccard_kernel_k_1,0.7,0.316,0.05,True,False,10,1.0,1.0,1.0,0.0,0.906679,True,arithmetic,1,20
73,jaccard_kernel_k_1,0.7,0.316,0.05,True,False,20,1.0,1.0,1.0,0.0,0.991291,True,arithmetic,1,20
74,jaccard_kernel_k_1,0.8,0.316,0.05,True,False,10,1.0,1.0,1.0,0.0,0.906679,True,arithmetic,1,20
75,jaccard_kernel_k_1,0.8,0.316,0.05,True,False,20,1.0,1.0,1.0,0.0,0.991291,True,arithmetic,1,20
76,jaccard_kernel_k_1,0.95,0.141,0.01,True,False,10,1.0,1.0,1.0,0.0,0.653773,True,arithmetic,1,20
77,jaccard_kernel_k_1,0.95,0.141,0.01,True,False,20,1.0,1.0,1.0,0.0,0.880127,True,arithmetic,1,20
78,jaccard_kernel_k_1,0.95,0.316,0.05,True,False,10,1.0,1.0,1.0,0.0,0.906679,True,arithmetic,1,20
79,jaccard_kernel_k_1,0.95,0.316,0.05,True,False,20,1.0,1.0,1.0,0.0,0.991291,True,arithmetic,1,20
80,jaccard_kernel_k_1,0.95,0.447,0.1,True,False,10,1.0,1.0,1.0,0.0,0.965059,True,arithmetic,1,20
81,jaccard_kernel_k_1,0.95,0.447,0.1,True,False,20,1.0,1.0,1.0,0.0,0.998779,True,arithmetic,1,20


In [66]:
# Number of result rows per task.
df_nstar.task_name.value_counts()

task_name
multiemo                405
mult_data_wrangling     405
natural_instructions    405
arithmetic              162
bbq_lite_json            81
conlang_translation      81
gem                      81
linguistic_mappings      81
Name: count, dtype: int64

In [68]:
# Show the configurations with the smallest and the largest n* for one kernel,
# at alpha = 0.95 and delta = 0.05, using only the rows where N is at its maximum.
# NOTE(review): `kernel` is also the loop variable of the main experiment loop (where it is
# bound to a kernel function), and `df_` is reused from the error analysis cell; both are
# rebound here.
kernel = "jaccard_kernel_k_1"

df_ = df_nstar.query("kernel==@kernel and alpha==0.95 and delta==0.05")
df_ = df_.loc[df_["N"] == df_["Nmax"]]
df_.loc[(df_["nstar"] == df_["nstar"].min()) | (df_["nstar"] == df_["nstar"].max())]

Unnamed: 0,kernel,alpha,eps,delta,disjoint,replace,N,nstar,nstar_lower,nstar_upper,variance,var_lower_bound,singular,task_name,number_of_shots,Nmax
79,jaccard_kernel_k_1,0.95,0.316,0.05,True,False,20,1.0,1.0,1.0,0.0,0.991291,True,arithmetic,1,20
133,jaccard_kernel_k_1,0.95,0.316,0.05,True,False,20,1.0,1.0,1.0,0.0,0.991291,True,arithmetic,2,20
255,jaccard_kernel_k_1,0.95,0.316,0.05,True,False,10,43.790123,27.412227,56.877656,0.777778,0.349889,False,conlang_translation,0,10


In [77]:
# Rebuild the un-imputed rank matrix to inspect the ranks behind one configuration.
# NOTE(review): this repeats the preprocessing of the loading cell on the current `df`,
# which other cells reassign (row filter, query) — the result depends on which cells ran.
rf = ru.get_rankings_from_df(df.reset_index(drop=True),
                             factors=list(EXPERIMENTAL_FACTORS.keys()),
                             alternatives=ALTERNATIVES,
                             target=TARGET,
                             lower_is_better=False, impute_missing=False)

# same missing-value tolerance as in the loading cell
tol = 0.2
rf = rf.loc[:, rf.isna().sum(axis=0) <= rf.shape[0] * tol]
rf = rf.loc[rf.isna().sum(axis=1) <= rf.shape[1] * tol, :]
# columns whose second level is "arithmetic" and third level is 1, for any first level
rf.loc(axis=1)[:, "arithmetic", 1]

Unnamed: 0_level_0,arithmetic,arithmetic:1_digit_addition,arithmetic:1_digit_division,arithmetic:1_digit_multiplication,arithmetic:1_digit_subtraction,arithmetic:2_digit_addition,arithmetic:2_digit_division,arithmetic:2_digit_multiplication,arithmetic:2_digit_subtraction,arithmetic:3_digit_addition,...,arithmetic:3_digit_multiplication,arithmetic:3_digit_subtraction,arithmetic:4_digit_addition,arithmetic:4_digit_division,arithmetic:4_digit_multiplication,arithmetic:4_digit_subtraction,arithmetic:5_digit_addition,arithmetic:5_digit_division,arithmetic:5_digit_multiplication,arithmetic:5_digit_subtraction
Unnamed: 0_level_1,arithmetic,arithmetic,arithmetic,arithmetic,arithmetic,arithmetic,arithmetic,arithmetic,arithmetic,arithmetic,...,arithmetic,arithmetic,arithmetic,arithmetic,arithmetic,arithmetic,arithmetic,arithmetic,arithmetic,arithmetic
Unnamed: 0_level_2,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
GPT GPT-3 Small,20.0,16.0,8.0,23.0,18.0,13.0,18.0,17.0,10.0,14.0,...,19.0,13.0,13.0,15.0,19.0,17.0,12.0,14.0,14.0,9.0
BIG-G T=1 2b,26.0,19.0,8.0,24.0,19.0,14.0,20.0,13.0,21.0,19.0,...,18.0,14.0,18.0,14.0,17.0,22.0,16.0,18.0,13.0,20.0
BIG-G T=1 27b,22.0,18.0,9.0,18.0,7.0,12.0,19.0,13.0,16.0,17.0,...,20.0,18.0,11.0,17.0,16.0,16.0,15.0,20.0,10.0,16.0
BIG-G sparse 2m,42.0,27.0,9.0,27.0,30.0,23.0,23.0,22.0,27.0,23.0,...,23.0,22.0,24.0,21.0,24.0,27.0,21.0,23.0,17.0,26.0
PaLM 8b,13.0,14.0,2.0,13.0,11.0,5.0,2.0,11.0,18.0,12.0,...,15.0,6.0,3.0,9.0,8.0,18.0,8.0,4.0,8.0,14.0
GPT GPT-3 200B,,,,,,,,,,,...,,,,,,,,,,
BIG-G T=1 244m,30.0,21.0,9.0,21.0,21.0,21.0,21.0,19.0,22.0,22.0,...,23.0,19.0,21.0,18.0,24.0,25.0,21.0,21.0,15.0,24.0
BIG-G T=0 244m,28.0,23.0,9.0,20.0,20.0,16.0,20.0,19.0,17.0,20.0,...,20.0,14.0,14.0,19.0,20.0,21.0,19.0,15.0,15.0,22.0
BIG-G sparse 125m,33.0,21.0,8.0,27.0,28.0,21.0,21.0,22.0,25.0,23.0,...,23.0,21.0,23.0,20.0,23.0,25.0,21.0,22.0,16.0,26.0
BIG-G sparse 422m,21.0,20.0,8.0,24.0,14.0,18.0,15.0,12.0,14.0,20.0,...,16.0,15.0,19.0,10.0,17.0,13.0,18.0,12.0,12.0,15.0
