In [None]:
import numpy as np 
import pandas as pd 
from einops import rearrange, pack

import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns

In [None]:
# Start from matplotlib's default style, then switch the fonts written to
# PDF and PostScript output to type 42 (TrueType, per the matplotlib rcParams docs).
plt.style.use("default")
matplotlib.rcParams.update({"pdf.fonttype": 42, "ps.fonttype": 42})

In [None]:
# --- Experiment configuration ------------------------------------------------
# Results are read from <result_dir><dataset>/ and figures are written under fig_dir.
dataset = "adult-reduced"
dataset_name = "Adult"  # display name; not referenced by the code visible in this notebook
result_dir = "../results/dp-experiment/"
fig_dir = "../figures/dp-experiment/"

# Metric column -> axis label. Accuracy and AUC are reported as their
# complements ("accuracy_comp", "auc_comp") rather than as raw scores.
metric_names = dict(
    brier="Brier Score",
    log_loss="Cross Entropy",
    accuracy_comp="1 - Accuracy",
    auc_comp="1 - AUC",
)
metrics = metric_names.keys()

# Full downstream-model name (as stored in the results) -> short tick label.
model_names = dict([
    ("Logistic Regression", "LogR"),
    ("1-NN", "1-NN"),
    ("5-NN", "5-NN"),
    ("Decision Tree", "DT"),
    ("Random Forest", "RF"),
    ("Gradient Boosting", "GB"),
    ("MLP", "MLP"),
    ("SVM", "SVM"),
])

# Load the synthetic-data results and the real-data baseline results.
df = pd.read_csv("{}{}/results.csv".format(result_dir, dataset), index_col=False)
real_data_df = pd.read_csv("{}{}/real-data-results.csv".format(result_dir, dataset), index_col=False)

# Short model labels for tick marks. Fail fast (KeyError, as with a plain dict
# lookup) if the results contain a model that has no short name configured.
unknown_models = set(df["model"]) - set(model_names)
if unknown_models:
    raise KeyError(
        "No short name configured for model(s): {}".format(sorted(map(str, unknown_models)))
    )
df["model_short"] = df["model"].map(model_names)

# Relabel the averaging mode: "Primal" -> "Prob. Avg.", anything else -> "Log Prob. Avg.".
df["primal"] = np.where(df["primal"] == "Primal", "Prob. Avg.", "Log Prob. Avg.")
# Combined "<method> - <averaging mode>" label, used as the facet column / table key.
df["method_primal"] = df["method"].astype(str) + " - " + df["primal"]

# Complement columns for accuracy and AUC (1 - score), computed vectorised.
df["accuracy_comp"] = 1 - df["accuracy"]
df["auc_comp"] = 1 - df["auc"]

real_data_df["accuracy_comp"] = 1 - real_data_df["accuracy"]
real_data_df["auc_comp"] = 1 - real_data_df["auc"]

# Long format: one row per (result row, metric). The metric name goes to
# "variable" and its value to "value" (the melt defaults, spelled out because
# the plotting code relies on these column names). value_vars is taken from
# `metrics` so the melted metrics cannot drift from the configured ones.
long_df = df.melt(
    id_vars=[
        "repeat_ind", "model", "model_short", "n_syn_datasets",
        "method", "dataset", "primal", "method_primal",
    ],
    value_vars=list(metrics),
    var_name="variable",
    value_name="value",
)

# Display order of the downstream models (full names), used for table rows.
model_order = [
    "1-NN",
    "5-NN",
    "Decision Tree",
    "Random Forest",
    "MLP",
    "Gradient Boosting",
    "SVM",
    "Logistic Regression",
]
# The same order expressed with the short labels used on plot axes.
model_order_short = [model_names[full_name] for full_name in model_order]

# Metrics in order of first appearance in the long-format frame.
metric_order = long_df["variable"].drop_duplicates().tolist()
# Number of distinct repeat indices in the results.
n_repeats = df["repeat_ind"].nunique(dropna=False)

# Reference value per metric: average the real-data results per model, then
# take the smallest of those per-model means. Drawn as a horizontal line in the plots.
min_real_data_metrics = {}
for metric in metrics:
    min_real_data_metrics[metric] = real_data_df.groupby("model")[metric].mean().min()

In [None]:
def plot_by_method(df, metric, save=False, selected_primal=None):
    """Bar-plot one metric per downstream model, one panel per "method_primal" value.

    Bars are grouped by ``n_syn_datasets`` (legend title "m"). A black horizontal
    line marks ``min_real_data_metrics[metric]`` and the y-axis starts at 90% of
    that value.

    Parameters
    ----------
    df : pandas.DataFrame
        Results with at least the columns "primal", "method_primal",
        "model_short", "n_syn_datasets" and ``metric``.
    metric : str
        Column to plot; must be a key of ``metric_names`` and
        ``min_real_data_metrics``.
    save : bool, default False
        If true, write the figure to
        ``<fig_dir><dataset>-<metric>-by-method.pdf``.
    selected_primal : str or list-like of str, optional
        Keep only rows whose "primal" value is one of these. ``None`` keeps all rows.
    """
    if selected_primal is not None:
        # A bare string means a single exact value. Testing `val in "<string>"`
        # would be a substring check, so "Log Prob. Avg." would also match
        # rows labelled "Prob. Avg.".
        if isinstance(selected_primal, str):
            selected_primal = [selected_primal]
        df = df[df["primal"].isin(selected_primal)]

    baseline = min_real_data_metrics[metric]

    g = sns.FacetGrid(df, col="method_primal", height=2.5, aspect=1.5)
    g.map_dataframe(
        sns.barplot,
        x="model_short",
        y=metric,
        order=model_order_short,
        hue="n_syn_datasets",
        palette="flare",
        errwidth=1.5,
    )
    for ax in g.axes.flatten():
        ax.axhline(baseline, color="black")
        # Keep the grid lines behind the bars.
        ax.set_axisbelow(True)
        ax.grid()
    g.set(ylim=(baseline * 0.9, None))
    g.set_xlabels("")
    g.set_ylabels(metric_names[metric])
    g.set_titles("{col_name}", fontweight="bold")
    g.tick_params("x", labelrotation=45)
    g.add_legend(title="m")
    if save:
        plt.savefig("{}{}-{}-by-method.pdf".format(fig_dir, dataset, metric), bbox_inches="tight")
    plt.show()

def plot_all_metrics(long_df, save=False):
    """Bar-plot every metric: one row per metric, one column per "method_primal" value.

    Each panel shows "value" per downstream model, grouped by ``n_syn_datasets``
    (legend title "m"). A black horizontal line marks
    ``min_real_data_metrics[<metric>]`` and the y-axis starts at 90% of it.
    Panels in the "log_loss" row use a logarithmic y-axis.

    Parameters
    ----------
    long_df : pandas.DataFrame
        Long-format results with columns "variable" (metric name), "value",
        "method_primal", "model_short" and "n_syn_datasets".
    save : bool, default False
        If true, write the figure to ``<fig_dir><dataset>-all-metrics.pdf``.
    """
    g = sns.FacetGrid(long_df, col="method_primal", row="variable", sharey="row", aspect=1.2)
    g.map_dataframe(
        sns.barplot,
        x="model_short",
        y="value",
        order=model_order_short,
        hue="n_syn_datasets",
        palette="flare",
        errwidth=1.9,
    )

    # Take each panel's metric from the grid itself (axes_dict keys are
    # (row value, column value)), not from a positional lookup in a global
    # list that was built from a possibly different frame.
    for (metric, _method), ax in g.axes_dict.items():
        baseline = min_real_data_metrics[metric]
        ax.axhline(baseline, color="black")
        ax.set_ylim((baseline * 0.9, None))
        # Keep the grid lines behind the bars.
        ax.set_axisbelow(True)
        ax.grid()
        if metric == "log_loss":
            ax.set_yscale("log")

    g.set_xlabels("")
    # Label only the left-most panel of every row with the readable metric name.
    for metric, ax in zip(g.row_names, g.axes[:, 0]):
        ax.set_ylabel(metric_names[metric])
    g.set_titles("{col_name}", fontweight="bold")
    g.tick_params("x", labelrotation=45)
    g.add_legend(title="m")
    if save:
        plt.savefig("{}{}-all-metrics.pdf".format(fig_dir, dataset), bbox_inches="tight")
    plt.show()

In [None]:
# Brier score per downstream model, restricted to the "Prob. Avg." rows;
# save=True also writes the figure to disk.
plot_by_method(df, metric="brier", selected_primal=["Prob. Avg."], save=True)

In [None]:
# Grid of all metrics (rows) by method/averaging combination (columns);
# save=True also writes the figure to disk.
plot_all_metrics(long_df=long_df, save=True)

In [None]:
# Mean and standard deviation of the Brier score for every
# (downstream model, method/averaging combination, number of synthetic datasets) group.
brier_stats = df.groupby(["model", "method_primal", "n_syn_datasets"])["brier"].aggregate(["mean", "std"])
# Raw string: "\p" is not a valid escape sequence, so the non-raw form triggers
# an invalid-escape warning. The resulting text is unchanged ("$\pm$" for LaTeX).
# The mean is printed with 2 decimals and the std with 3.
brier_stats["formatted"] = brier_stats.apply(
    lambda row: r"{:.2f} $\pm$ {:.3f}".format(row["mean"], row["std"]), axis=1
)
# One column per n_syn_datasets value ("m"); rows keyed by (Downstream, Generator)
# and ordered by model_order.
table = (
    brier_stats
    .reset_index("n_syn_datasets")
    .pivot(columns="n_syn_datasets", values="formatted")
    .rename_axis(index=["Downstream", "Generator"], columns="m")
    .reindex(model_order, level="Downstream", axis="index")
)
table
# table.style.to_latex(fig_dir + "{}-brier-table.tex".format(dataset), hrules=True, clines="skip-last;data")

In [None]:
# Subset of the Brier table: selected downstream models, "Prob. Avg." generators only.
downstream_subset = ["1-NN", "5-NN", "Decision Tree", "Random Forest", "Gradient Boosting"]
generator_subset = ["AIM - Prob. Avg.", "NAPSU-MQ - Prob. Avg."]
row_mask = (
    table.index.get_level_values("Downstream").isin(downstream_subset)
    & table.index.get_level_values("Generator").isin(generator_subset)
)
reduced_table = table[row_mask]
reduced_table