# Demo of core features

First, some imports and some plot settings.

In [3]:
import glob
import imageio.v2 as iio
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import seaborn as sns
import yaml

from collections import defaultdict
from importlib import reload
from itertools import product
from numpy.random import Generator
from pathlib import Path
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from time import time
from tqdm import tqdm

from generalizability import lower_bounds as gu
from generalizability import kernels as ku
from generalizability import probability_distributions as prob
from generalizability import rankings_utils as ru
from generalizability import mmd as mmd

# Plot settings: pick the matplotlib backend first, then configure seaborn.
mpl.use("TkAgg")
palette = "flare_r"  # also passed as `palette=` when plotting the generalizability curves
sns.set_style("ticks")
sns.set_context("notebook")
sns.set_palette(palette)

Next, we load the hyperparameters from the config.

In [4]:
# Parse the YAML configuration once; everything below only unpacks it.
with open("config.yaml", "r") as file:
    config = yaml.safe_load(file)

# -- Paths
DATA_DIR, FIGURES_DIR = (Path(config["paths"][key]) for key in ("data_dir", "figures_dir"))

# -- Output format of the figures
FORMAT = config["format"]["output"]

# -- Numerical parameters
SEED, EPS, ALPHA, LR_CONFIDENCE = (
    config["parameters"][key] for key in ("seed", "epsilon", "alpha", "lr_confidence")
)
RNG = np.random.default_rng(SEED)
# Symmetric quantile levels spanning a central interval of mass LR_CONFIDENCE.
CI_LOWER = (1 - LR_CONFIDENCE) / 2
CI_UPPER = LR_CONFIDENCE + CI_LOWER

# -- Data description
DATA_SET = Path(config["data"]["dataset_path"])
EXPERIMENTAL_CONDITIONS, TARGET, ALTERNATIVES = (
    config["data"][key] for key in ("experimental_conditions", "target", "alternatives")
)

# -- Sampling options
DISJOINT, REPLACE = (config["sampling"][key] for key in ("disjoint", "replace"))

Then, we load the data set into a pandas data frame and check that exactly one value in the experimental conditions is set to `None`, indicating that it is the variable condition.

In [5]:
# Choose the reader from the file extension. The comparison is case-insensitive
# so that e.g. "data.CSV" or "data.PARQUET" is accepted as well.
if DATA_SET.suffix.lower() == '.parquet':
    df = pd.read_parquet(DATA_SET)
elif DATA_SET.suffix.lower() == '.csv':
    df = pd.read_csv(DATA_SET)
else:
    # ValueError is a subclass of Exception, so existing `except Exception`
    # handlers keep working while the error type is now specific.
    raise ValueError("Please use a Parquet or CSV file as the format of your data")

# Exactly one experimental condition must be left as None: it is the one that varies.
assert sum(value is None for value in EXPERIMENTAL_CONDITIONS.values()) == 1, "Exactly one element must be None"

We then build a query from the experimental conditions dict, use it to filter the data frame, and compute the rankings.

In [6]:
# One equality clause per fixed (non-None) experimental condition.
# String values are rendered with `!r`, which gives the same text as manual
# single-quoting for plain strings but stays a valid literal when the value
# itself contains quotes or backslashes. Other values are rendered as before.
query_string = " and ".join(f"{key} == {value!r}" if isinstance(value, str) else f"{key} == {value}"
                            for key, value in EXPERIMENTAL_CONDITIONS.items() if value is not None)

# Check if query params exist in the df
columns_to_check = set(EXPERIMENTAL_CONDITIONS.keys()).union({TARGET, ALTERNATIVES})
missing_columns = columns_to_check - set(df.columns)
if missing_columns:
    raise ValueError(f"The following columns are missing from the dataframe: {missing_columns}")

# Build query
# If the variable condition is the only configured key there is nothing to
# filter on, and `df.query("")` would raise; skip the filter in that case.
if query_string:
    df = df.query(query_string)
df = df.reset_index(drop=True)
rv = ru.get_rankings_from_df(df, factors=list(EXPERIMENTAL_CONDITIONS.keys()), alternatives=ALTERNATIVES,
                             target=TARGET,
                             lower_is_better=False, impute_missing=True)
# Replace any remaining NaN by the maximum of `rv` along its default axis
# (per column, assuming `rv` is a DataFrame -- TODO confirm against `ru`).
rv = rv.fillna(rv.max())

Next, we define some kernels that can be used for MMD.

In [7]:
# Candidate kernels, keyed by a short name. Each value is a pair
# (kernel function, keyword arguments for that kernel); the pair is unpacked
# into KERNEL and KERNELARGS once a kernel name is selected.
# The two Borda entries are disabled; they look up the row position of a
# specific alternative ("OHE" / "DE") in `rv.index`.
kernels = {
    "mallows_auto": (ku.mallows_kernel, {"nu": "auto"}),
    "jaccard_1": (ku.jaccard_kernel, {"k": 1}),
    #"borda_OHE": (ku.borda_kernel, {"idx": rv.index.get_loc("OHE")}),
    #"borda_DE": (ku.borda_kernel, {"idx": rv.index.get_loc("DE")})
}

In the following, we will use the mallows kernel.

In [8]:
# Key into `kernels`; also used as a directory name for the outputs.
KERNELNAME = "mallows_auto"
# KERNEL is the kernel function, KERNELARGS its keyword arguments.
KERNEL, KERNELARGS = kernels[KERNELNAME]

We create some directories for the different experiments we run.

In [9]:
# ---- Create directories
# Nested layout: <data selection> / <kernel> / <run parameters>.
exp0_dir_name_parts = [f"name={ALTERNATIVES}"]
exp0_dir_name_parts.extend(
    f"{key}={value}" for key, value in EXPERIMENTAL_CONDITIONS.items() if value is not None
)
EXP0_DIR = FIGURES_DIR / "_".join(exp0_dir_name_parts)
EXP1_DIR = EXP0_DIR / KERNELNAME
EXP2_DIR = EXP1_DIR / (
    f"nstar_N_ALPHA={ALPHA}_eps={EPS}_ci={LR_CONFIDENCE}"
    f"_disjoint={DISJOINT}_replace={REPLACE}"
)
for ED in (EXP0_DIR, EXP1_DIR, EXP2_DIR):
    ED.mkdir(parents=True, exist_ok=True)

We set up the pool of experimental conditions to simulate running new experiments.

In [10]:
# The experimental condition configured as None is the variable one
# (None is the fallback if no such key exists).
ec_variable = next(
    filter(lambda name: EXPERIMENTAL_CONDITIONS[name] is None, EXPERIMENTAL_CONDITIONS),
    None,
)
# Distinct values of the variable condition; values are removed from this pool
# as they are sampled, to simulate running new experiments.
ec_pool = df[ec_variable].unique()
# Conditions on which experiments have already been run (none yet).
ecs = np.array([])
# One result record per loop iteration is collected here.
out = []
plt.ioff()

<contextlib.ExitStack at 0x7e43677c88c0>

We will now define a few methods that will be run in a loop until our `ec_pool` is empty.

We start by sampling from our `ec_pool` and converting our samples to the corresponding rankings.

In [11]:
def sample_and_compute_rankings(ecs, ec_pool, rv, batch_size=10):
    """Draw a new batch of experimental conditions and build the rankings so far.

    Parameters
    ----------
    ecs : array-like
        Experimental conditions that have already been sampled.
    ec_pool : array-like
        Experimental conditions that are still available for sampling.
    rv : DataFrame-like
        Table whose columns are selected by experimental condition via
        ``rv.loc[:, ecs]``.
    batch_size : int, default 10
        Maximum number of conditions to draw; fewer are drawn when the pool
        holds fewer elements.

    Returns
    -------
    tuple
        ``(ecs, ec_pool, rankings, nv)``: the extended array of sampled
        conditions, the pool without the newly drawn conditions, the object
        returned by ``ru.SampleAM.from_rank_function_dataframe`` for the
        selected columns, and the number of selected columns.
    """
    # Never ask for more conditions than the pool still contains.
    sample_size = min(batch_size, len(ec_pool))

    # Draw without replacement using the notebook-level generator RNG.
    new_ecs = RNG.choice(ec_pool, sample_size, replace=False)

    # Track what has been sampled and shrink the pool accordingly.
    ecs = np.append(ecs, new_ecs)
    ec_pool = np.setdiff1d(ec_pool, new_ecs)

    # Keep only the columns of the conditions sampled so far.
    rv_ = rv.loc[:, ecs]
    nv = rv_.shape[1]

    rankings = ru.SampleAM.from_rank_function_dataframe(rv_)

    return ecs, ec_pool, rankings, nv

We then compute the variance, the variance lower bound, and the MMDs of these rankings.

In [12]:
def compute_variance_and_lower_bound(rankings, n=None):
    """Compute the kernel variance of ``rankings`` and the derived lower bound.

    Parameters
    ----------
    rankings
        Rankings object passed to ``ku.var``.
    n : int, optional
        Sample count passed to ``gu.sample_mean_embedding_lowerbound``.
        When omitted, ``len(ecs)`` of the notebook-level ``ecs`` is used,
        which is what this function always did before the parameter existed.

    Returns
    -------
    tuple
        ``(variance, var_lower_bound)`` where ``variance`` is the result of
        ``ku.var`` with the configured kernel and ``var_lower_bound`` is the
        result of ``gu.sample_mean_embedding_lowerbound(EPS, n, kbar=1, v=variance)``.
    """
    if n is None:
        # Backward-compatible default: rely on the notebook-level state.
        n = len(ecs)

    variance = ku.var(rankings, use_rv=True, kernel=KERNEL, **KERNELARGS)

    var_lower_bound = gu.sample_mean_embedding_lowerbound(EPS, n, kbar=1, v=variance)

    return variance, var_lower_bound

def calculate_mmds(rankings, nv):
    """Compute one subsample-MMD distribution per subsample size.

    Subsample sizes run from 2 up to and including ``nv // 2``. Every size uses
    100 repetitions and the same configured seed, sampling options and kernel.

    Returns a dict mapping each subsample size to the result of
    ``mmd.subsample_mmd_distribution`` for that size.
    """
    largest_subsample = nv // 2
    mmds = {}
    for n in range(2, largest_subsample + 1):
        mmds[n] = mmd.subsample_mmd_distribution(
            rankings,
            subsample_size=n,
            rep=100,
            use_rv=True,
            use_key=False,
            seed=SEED,
            disjoint=DISJOINT,
            replace=REPLACE,
            kernel=KERNEL,
            **KERNELARGS,
        )
    return mmds

Then, we create dataframes consisting of Generalizability scores and Quantiles.

In [13]:
def create_generalizability_dataframe(mmds, logepss):
    """Build a long-format table of generalizability values.

    For every subsample size ``n`` in ``mmds`` and every value in ``logepss``,
    ``mmd.generalizability`` is evaluated at ``exp(log eps)``.

    Returns a DataFrame with the columns ``log(eps)``, ``n`` (integer) and
    ``generalizability``, one row per (n, log eps) pair.
    """
    scores = {}
    for n, mmde in mmds.items():
        scores[n] = [mmd.generalizability(mmde, np.exp(logeps)) for logeps in logepss]

    # Wide table: one row per log(eps), one column per n.
    wide = pd.DataFrame(scores, index=logepss)

    # Long table: the former index becomes the 'log(eps)' column.
    dfy = (
        wide.reset_index()
        .melt(id_vars="index", var_name="n", value_name="generalizability")
        .rename(columns={"index": "log(eps)"})
    )
    dfy["n"] = dfy["n"].astype(int)
    return dfy

def create_quantiles_dataframe(mmds):
    """Tabulate the log of the ALPHA-quantile of each MMD distribution.

    Returns
    -------
    tuple
        ``(qs, dfq)``: ``qs`` maps each subsample size ``n`` to
        ``log(quantile(mmds[n], ALPHA))``; ``dfq`` holds the same values in the
        columns ``n`` and ``log(eps)`` plus a ``log(n)`` column.
    """
    qs = {}
    for n, mmde in mmds.items():
        qs[n] = np.log(np.quantile(mmde, ALPHA))

    dfq = pd.DataFrame(list(qs.items()), columns=["n", "log(eps)"])
    dfq = dfq.assign(**{"log(n)": np.log(dfq["n"])})
    return qs, dfq


We now use the quantiles dataframe to calculate `nstar`. To this end, we fit linear regression models on subsets.

In [14]:
def perform_linear_regression_with_cv(dfq):
    """Fit one linear model of log(n) on log(eps) per cross-validation fold.

    The number of folds equals the number of rows of ``dfq``, so each fold
    holds out a single row.

    Returns
    -------
    tuple
        ``(linear_predictors, residuals)``: the fitted model of every fold and
        the held-out residuals (observed minus predicted) collected over all folds.
    """
    features = dfq[["log(eps)"]].values
    targets = dfq[["log(n)"]].values

    splitter = KFold(n_splits=len(targets))

    linear_predictors = []
    residuals = []
    for fit_idx, held_out_idx in splitter.split(features):
        model = LinearRegression()
        model.fit(features[fit_idx], targets[fit_idx])

        held_out_prediction = model.predict(features[held_out_idx])
        residuals.extend(targets[held_out_idx] - held_out_prediction)

        linear_predictors.append(model)

    return linear_predictors, residuals

def predict_nstar(logepss, linear_predictors, dfq):
    """Predict n as a function of log(eps) and read off n* at the target EPS.

    Parameters
    ----------
    logepss : numpy.ndarray
        1-D grid of log(eps) values on which predictions are evaluated.
    linear_predictors : list
        Fitted per-fold models (each must offer ``predict``).
    dfq : pandas.DataFrame
        Table with the columns ``log(eps)`` and ``log(n)``; a model fitted on
        all of its rows provides the point prediction.

    Returns
    -------
    tuple
        ``(ns_pred, ns_pred_cv, nstar, nstar_lower, nstar_upper)``: the
        predicted n over the grid for the full model, the same for every fold
        model, the full-model prediction at the first grid point above
        ``log(EPS)``, and the ``CI_LOWER`` / ``CI_UPPER`` quantiles of the
        per-fold predictions at that grid point.
    """
    X = dfq[['log(eps)']].values
    y = dfq[['log(n)']].values
    grid = logepss.reshape(-1, 1)

    # The models predict log(n); exponentiate to get n.
    ns_pred_cv = [np.exp(lr.predict(grid).reshape(-1)) for lr in linear_predictors]
    ns_pred = np.exp(LinearRegression().fit(X, y).predict(grid).reshape(-1))

    # Index of the first grid point strictly above log(EPS).
    # NOTE: np.argmax returns 0 when no grid point satisfies the condition.
    eps_idx = np.argmax(logepss > np.log(EPS))

    nstar_cv = [pred[eps_idx] for pred in ns_pred_cv if not np.all(pred == 0)]
    nstar = ns_pred[eps_idx]

    # Use the configured confidence level instead of a fixed [0.05, 0.95], so
    # the interval matches the LR_CONFIDENCE value reported with the results.
    nstar_lower, nstar_upper = np.quantile(nstar_cv, [CI_LOWER, CI_UPPER])

    return ns_pred, ns_pred_cv, nstar, nstar_lower, nstar_upper

Lastly, we plot our results.

In [15]:
def plot_generalizability_and_quantiles(dfy, dfq, logepss, ns_pred, ns_pred_cv, mmds, qs, nstar, nstar_upper, nstar_lower):
    """Draw the two-panel summary figure for the current N and save it.

    The top panel shows generalizability against log(eps) for every n, the
    bottom panel shows the quantiles together with the fitted n(eps) curves
    and the n* estimate with its interval. The figure is written to EXP2_DIR
    as a PDF when FORMAT is "pdf" and as a PNG otherwise; all figures are
    closed afterwards.
    """
    fig, (ax_gen, ax_quant) = plt.subplots(2, 1, sharex="all", figsize=(10, 8))
    log_eps_min = np.min(logepss)
    log_eps_max = np.max(logepss)
    log_eps_target = np.log(EPS)

    # -- Top panel: generalizability curves, the ALPHA level and the quantiles
    sns.lineplot(data=dfy, x="log(eps)", y="generalizability", hue="n", ax=ax_gen, palette=palette)
    ax_gen.hlines(ALPHA, ls="--", xmin=log_eps_min, xmax=log_eps_max, color="black")
    for n in mmds:
        ax_gen.vlines(qs[n], ymin=0, ymax=ALPHA, ls=":")
    sns.despine(ax=ax_gen)

    # -- Bottom panel: quantile points and the regression curves
    ymax = max(ns_pred)
    sns.lineplot(data=dfq, x="log(eps)", y="n", ax=ax_quant, ls="", marker="o", hue="n", legend=False)
    for n in mmds:
        ax_quant.vlines(qs[n], ymin=n, ymax=ymax, ls=":")
    ax_quant.vlines(log_eps_target, ymin=0.1, ymax=ymax, color="black", ls="--")
    sns.lineplot(x=logepss, y=ns_pred, color="green", ls="-.", ax=ax_quant)
    for ns_tmp in ns_pred_cv:
        # Fold curves whose maximum exceeds 1000 are not drawn.
        if not np.max(ns_tmp) > 1000:
            sns.lineplot(x=logepss, y=ns_tmp, color="green", ls="-.", alpha=0.5, ax=ax_quant)

    # -- n* (solid) and its interval bounds (faded), drawn up to log(EPS)
    ax_quant.hlines(nstar, xmin=log_eps_min, xmax=log_eps_target, ls="-", color="red")
    for bound in (nstar_upper, nstar_lower):
        ax_quant.hlines(bound, xmin=log_eps_min, xmax=log_eps_target, ls="-", color="red", alpha=0.3)
    ax_quant.set_yscale("log")
    sns.despine(ax=ax_quant)

    # -- Title, layout and output
    fig.suptitle(f"Generalizability for N = {len(ecs):02d}\n"
                 f"n*(alpha={ALPHA}, eps={EPS}) = {np.ceil(nstar)}\n"
                 f"{LR_CONFIDENCE} confidence interval: [{np.ceil(nstar_lower)}, {np.ceil(nstar_upper)}]")
    plt.tight_layout()
    extension = "pdf" if FORMAT == "pdf" else "png"
    plt.savefig(EXP2_DIR / f"N={len(ecs):02d}.{extension}")
    plt.close("all")

In [None]:
# Number of passes needed to drain the pool in batches of up to 10 (ceiling division).
total_iterations = (len(ec_pool) + 9) // 10

for _ in tqdm(range(total_iterations)):
    # -- Draw new conditions and rebuild the rankings over everything sampled so far
    ecs, ec_pool, rankings, nv = sample_and_compute_rankings(ecs, ec_pool, rv)

    # -- Variance and its lower bound
    variance, var_lower_bound = compute_variance_and_lower_bound(rankings)

    # -- MMD distributions, one per subsample size
    mmds = calculate_mmds(rankings, nv)

    # -- log(eps) grid: from just below log(EPS) to just above the largest ALPHA-quantile
    logepss = np.linspace(
        np.log(EPS) - 0.1,
        np.log(max(np.quantile(mmde, ALPHA) for mmde in mmds.values())) + 0.1,
        1000,
    )

    # -- Generalizability and quantile tables
    dfy = create_generalizability_dataframe(mmds, logepss)
    qs, dfq = create_quantiles_dataframe(mmds)

    # -- Regression of log(n) on log(eps), one model per cross-validation fold
    linear_predictors, residuals = perform_linear_regression_with_cv(dfq)

    # -- n* and its interval
    ns_pred, ns_pred_cv, nstar, nstar_lower, nstar_upper = predict_nstar(logepss, linear_predictors, dfq)

    # -- Figure for the current N
    plot_generalizability_and_quantiles(dfy, dfq, logepss, ns_pred, ns_pred_cv, mmds, qs, nstar, nstar_upper, nstar_lower)

    # -- Record the results of this pass
    out.append(dict(
        kernel=KERNELNAME,
        alpha=ALPHA,
        eps=EPS,
        disjoint=DISJOINT,
        replace=REPLACE,
        N=len(ecs),
        nstar=nstar,
        nstar_lower=nstar_lower,
        nstar_upper=nstar_upper,
        variance=variance,
        var_lower_bound=var_lower_bound,
    ))

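Finally, if the output format is `gif`, we combine the saved frames into a GIF that shows how the results change as the number of sampled experimental conditions `N` grows. We also store the outputs to a file for debugging purposes.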

In [17]:
plt.ion()
if FORMAT == "gif":
    # glob.glob returns matches in arbitrary order, which would scramble the
    # animation. Sort the frames first; ordering by (length, name) keeps
    # "N=100.png" after "N=20.png" even once the zero-padding to two digits
    # is exceeded. All paths share the same directory prefix, so comparing
    # lengths is meaningful.
    # NOTE: every PNG in EXP2_DIR is picked up, including frames left over
    # from earlier runs with the same parameters.
    images = [iio.imread(image)
              for image in sorted(glob.glob(str(EXP2_DIR / "*.png")), key=lambda path: (len(path), path))]
    iio.mimwrite(EXP2_DIR / "nstar.gif", images, duration=750,
                 loop=0)

# -- Store nstar predictions
# NOTE: `out` is rebound from a list of records to a DataFrame here, so the
# loop cell cannot be re-run afterwards without first re-running the cell
# that resets `out`.
out = pd.DataFrame(out)
out.to_parquet(EXP2_DIR / "nstar.parquet")