In [None]:
import numpy as np 
import pandas as pd 
from einops import rearrange, pack

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Reset to matplotlib's default style, then request font type 42 for both
# vector backends (PDF and PostScript) used when figures are saved.
plt.style.use("default")
matplotlib.rcParams.update({"pdf.fonttype": 42, "ps.fonttype": 42})

In [None]:
# Input (result CSVs) and output (figures/tables) locations, relative to the notebook.
result_dir = "../results/regression-datasets/"
fig_dir = "../figures/regression/"

# Dataset keys as they appear in the result directory names; every
# per-dataset loop below iterates in this order.
datasets = ["california-housing", "abalone", "insurance", "ACS2018"]

# Full downstream-model name -> abbreviation shown on plot axes.
model_names = dict([
    ("Linear Regression", "LR"),
    ("Ridge Regression", "RR"),
    ("1-NN", "1-NN"),
    ("5-NN", "5-NN"),
    ("Decision Tree", "DT"),
    ("Random Forest", "RF"),
    ("Gradient Boosting", "GB"),
    ("MLP", "MLP"),
    ("SVM", "SVM"),
])

# Generator identifier as stored in the CSVs -> display label.
method_names = dict([
    ("ddpm", "DDPM"),
    ("synthpop-proper", "SP-P"),
])

# Dataset key -> display label. The insertion order here also defines
# the facet order used when plotting across datasets.
dataset_names = dict([
    ("abalone", "Abalone"),
    ("ACS2018", "ACS2018"),
    ("california-housing", "California Housing"),
    ("insurance", "Insurance"),
])

# Reverse lookup (display label -> dataset key) and the label ordering.
inv_dataset_names = {label: key for key, label in dataset_names.items()}
dataset_order = [*dataset_names.values()]

def _read_labelled_results(path_template, dataset):
    """Read one synthetic-data results CSV and attach the plotting labels.

    ``path_template`` is formatted with ``result_dir`` and ``dataset``. The
    returned frame gains a ``model_short`` column, has its ``method`` column
    replaced by the display label, and gains a ``dataset`` display-name column.
    """
    frame = pd.read_csv(path_template.format(result_dir, dataset), index_col=False)
    frame["model_short"] = frame.model.apply(lambda m: model_names[m])
    frame.method = frame.method.apply(lambda m: method_names[m])
    frame["dataset"] = dataset_names[dataset]
    return frame


# Per-dataset frames: multiple-synthetic-dataset results, "one large"
# synthetic dataset results, and the real-data baseline results.
dfs = {}
one_large_dfs = {}
real_data_dfs = {}
for dataset in datasets:
    dfs[dataset] = _read_labelled_results("{}{}/results.csv", dataset)
    # The real-data results are loaded as-is, without the extra label columns.
    real_data_dfs[dataset] = pd.read_csv("{}{}/real-data-results.csv".format(result_dir, dataset), index_col=False)
    one_large_dfs[dataset] = _read_labelled_results("{}{}/one_large_results.csv", dataset)


# The orderings below are fixed by hand rather than derived from the
# unique values in the California Housing results.

# Full model names, in the order used for facets and table rows.
model_order = [
    "1-NN",
    "5-NN",
    "Decision Tree",
    "Random Forest",
    "MLP",
    "Gradient Boosting",
    "SVM",
    "Ridge Regression",
    "Linear Regression",
]

# Abbreviated names in the matching order; "LR" is not included here.
model_order_short = ["1-NN", "5-NN", "DT", "RF", "MLP", "GB", "SVM", "RR"]

# Number of distinct repeat indices, read off the California Housing results.
n_repeats = len(pd.unique(dfs["california-housing"].repeat_ind))

# Drop Linear Regression rows on ACS2018 whose test MSE is not below 1e6;
# rows of every other model are kept regardless of their MSE.
acs = dfs["ACS2018"]
is_not_lr = acs.model_short != "LR"
has_moderate_mse = acs.mse_test < 1e6
dfs["ACS2018"] = acs[is_not_lr | has_moderate_mse]

# Stack all per-dataset frames (after the filter above) into one long frame.
df_all_datasets = pd.concat(dfs, ignore_index=True)

# Real-data baselines per dataset: the mean MSE of each model (rows ordered
# by model_order) and the single lowest of those per-model means.
real_data_mses = dict()
min_real_data_mses = dict()
for dataset in datasets:
    mse_df = real_data_dfs[dataset][["model", "mse"]].groupby(["model"]).mean()
    best_position = mse_df.mse.argmin()
    real_data_mses[dataset] = mse_df.reindex(model_order)
    min_real_data_mses[dataset] = mse_df.mse.iloc[best_position]

In [None]:
# For each dataset, print the (model, method, n_syn_datasets) group with the
# lowest mean test MSE across repeats.
for dataset in datasets:
    # numeric_only=True is required: besides the grouping keys the frame also
    # holds the string column "dataset", and pandas >= 2.0 raises a TypeError
    # when asked to average it instead of silently dropping it.
    group_df = dfs[dataset].groupby(
        ["model", "model_short", "method", "n_syn_datasets"], as_index=False
    ).mean(numeric_only=True)
    print("Lowest MSE for {}".format(dataset_names[dataset]))
    print(group_df.iloc[group_df.mse_test.argmin()])
    print()

In [None]:
def _first_or_nan(values):
    """Return the first element of ``values``, or NaN when ``values`` is empty."""
    return values.iloc[0] if len(values) > 0 else np.nan


# For every (model, method, repeat) combination, record the test MSE measured
# with one synthetic dataset and the quantity 2 * (mse_1 - mse_2), which
# estimate_mse later uses to extrapolate to other numbers of datasets.
estimation_dfs = {}
for dataset in datasets:
    records = []
    df = dfs[dataset]
    for model in model_names:
        model_rows = df[df.model == model]
        for method in method_names.values():
            method_rows = model_rows[model_rows.method == method]
            for repeat_ind in range(n_repeats):
                sdf = method_rows[method_rows.repeat_ind == repeat_ind]
                # Missing measurements (no matching row) become NaN.
                mse_1 = _first_or_nan(sdf[sdf.n_syn_datasets == 1]["mse_test"])
                mse_2 = _first_or_nan(sdf[sdf.n_syn_datasets == 2]["mse_test"])
                records.append(dict(
                    model=model,
                    method=method,
                    repeat_ind=repeat_ind,
                    mse_1=mse_1,
                    red_estimate=2 * (mse_1 - mse_2),
                ))
    estimation_dfs[dataset] = pd.DataFrame.from_records(records)

def estimate_mse(row, dataset):
    """Predict the test MSE for ``row.n_syn_datasets`` synthetic datasets.

    Looks up the entry of ``estimation_dfs[dataset]`` matching the row's
    ``model``, ``method`` and ``repeat_ind`` and returns
    ``mse_1 - (1 - 1 / n_syn_datasets) * red_estimate``.

    Raises IndexError if no matching entry exists.
    """
    table = estimation_dfs[dataset]
    same_model = table.model == row.model
    same_method = table.method == row.method
    same_repeat = table.repeat_ind == row.repeat_ind
    match = table[same_model & same_method & same_repeat]

    baseline = match.mse_1.iloc[0]
    reduction = match.red_estimate.iloc[0]
    # Fraction of the estimated reduction that applies at this number of datasets.
    fraction = 1 - 1 / row.n_syn_datasets
    return baseline - fraction * reduction

# Attach the predicted MSE to every result row as a new "est_mse" column.
for dataset in datasets:
    measured_rows = dfs[dataset]
    predicted_mse = measured_rows.apply(lambda row: estimate_mse(row, dataset), axis=1)
    dfs[dataset] = measured_rows.assign(est_mse=predicted_mse)

In [None]:
def plot_by_model(df, dataset, save=False):
    """Bar-plot test MSE per generator method, one facet per downstream model.

    Each facet shows ``mse_test`` by ``method`` with ``n_syn_datasets`` as hue.
    Two horizontal reference lines are added per facet: a dashed grey line at
    that model's mean real-data MSE and a black line at the lowest real-data
    MSE of the dataset.

    Parameters
    ----------
    df : DataFrame with columns model_short, method, mse_test, n_syn_datasets.
    dataset : dataset key used to look up the title and real-data baselines.
    save : when True, write ``<fig_dir><dataset>-by-model.pdf``.
    """
    g = sns.FacetGrid(df, col="model_short", col_order=model_order_short, col_wrap=5)
    g.figure.suptitle(dataset_names[dataset])
    g.map_dataframe(sns.barplot, x="method", y="mse_test", hue="n_syn_datasets")

    best_real_mse = min_real_data_mses[dataset]
    # The real-data table is indexed by full model name and includes models
    # (Linear Regression) that have no facet, so facets are looked up by name
    # instead of by position; indexing g.axes positionally ran past the end.
    for model, mse in real_data_mses[dataset].mse.items():
        ax = g.axes_dict.get(model_names.get(model))
        if ax is None:
            continue
        ax.axhline(mse, linestyle="dashed", color="grey")
        ax.axhline(best_real_mse, color="black")

    for ax in g.axes.flatten():
        # Draw the grid behind the bars.
        ax.set_axisbelow(True)
        ax.grid()
    g.tick_params("x", labelrotation=90)
    g.set_ylabels("MSE")
    g.set_xlabels("")
    g.add_legend()
    if save:
        plt.savefig("{}{}-by-model.pdf".format(fig_dir, dataset), bbox_inches="tight")
    plt.show()

def plot_by_method(df, dataset, save=False, selected_method=None, file_suffix=""):
    """Bar-plot test MSE per downstream model, one facet per generator method.

    Bars are grouped by ``model_short`` (in ``model_order_short`` order) with
    ``n_syn_datasets`` as hue. A black horizontal line marks the lowest
    real-data MSE for ``dataset``; the y-axis starts at 90% of that value.

    Parameters
    ----------
    df : DataFrame with columns method, model_short, mse_test, n_syn_datasets.
    dataset : dataset key used for the baseline lookup and the file name.
    save : when True, write ``<fig_dir><dataset>-by-method<file_suffix>.pdf``.
    selected_method : optional collection of method labels to keep.
    file_suffix : appended to the output file name before the extension.
    """
    if selected_method is not None:
        keep = df.method.apply(lambda val: val in selected_method)
        df = df[keep]

    grid = sns.FacetGrid(df, col="method", height=2.2, aspect=1.2)
    grid.map_dataframe(
        sns.barplot,
        x="model_short",
        y="mse_test",
        order=model_order_short,
        hue="n_syn_datasets",
        palette="flare",
        errwidth=0.7,
    )

    baseline = min_real_data_mses[dataset]
    for ax in grid.axes.flatten():
        ax.axhline(baseline, color="black")
        # Keep the grid behind the bars.
        ax.set_axisbelow(True)
        ax.grid()
    grid.set(ylim=(baseline * 0.9, None))

    grid.tick_params("x", labelrotation=45)
    grid.set_ylabels("MSE")
    grid.set_xlabels("")
    grid.set_titles("{col_name}", fontweight="bold")
    grid.add_legend(title="m")
    if save:
        plt.savefig("{}{}-by-method{}.pdf".format(fig_dir, dataset, file_suffix), bbox_inches="tight")
    plt.show()

def plot_by_dataset(df, selected_method, save=False, file_suffix=""):
    """Bar-plot test MSE per downstream model, one facet per dataset.

    Only rows whose ``method`` is in ``selected_method`` are plotted. Facets
    have independent y-axes; each gets a black horizontal line at that
    dataset's lowest real-data MSE and a lower y-limit at 90% of it.

    Parameters
    ----------
    df : DataFrame with columns dataset, method, model_short, mse_test,
        n_syn_datasets, where ``dataset`` holds display names.
    selected_method : collection of method labels to keep.
    save : when True, write ``<fig_dir>by-dataset<file_suffix>.pdf``.
    file_suffix : appended to the output file name before the extension.
    """
    keep = df.method.apply(lambda val: val in selected_method)
    df = df[keep]

    grid = sns.FacetGrid(df, col="dataset", height=2.2, aspect=1.2, sharey=False, col_order=dataset_order)
    grid.map_dataframe(
        sns.barplot,
        x="model_short",
        y="mse_test",
        order=model_order_short,
        hue="n_syn_datasets",
        palette="flare",
        errwidth=1.2,
    )

    for display_name, ax in grid.axes_dict.items():
        # Facets are keyed by display name; baselines are keyed by dataset key.
        baseline = min_real_data_mses[inv_dataset_names[display_name]]
        ax.axhline(baseline, color="black")
        ax.set_ylim((baseline * 0.9, None))
        ax.set_axisbelow(True)
        ax.grid()

    grid.tick_params("x", labelrotation=45)
    grid.set_ylabels("MSE")
    grid.set_xlabels("")
    grid.add_legend(title="m")
    grid.set_titles("{col_name}", fontweight="bold")

    if save:
        plt.savefig("{}by-dataset{}.pdf".format(fig_dir, file_suffix), bbox_inches="tight")

    plt.show()

def plot_mse_est(df, dataset, save=False, plot_order=model_order, file_suffix=""):
    """Plot measured and predicted test MSE against the number of synthetic datasets.

    Draws one facet per ``model`` (in ``plot_order``), each with line plots of
    ``mse_test`` (measured) and ``est_mse`` (predicted) over
    ``n_syn_datasets``, coloured and styled by ``method``.

    Parameters
    ----------
    df : DataFrame with columns model, method, n_syn_datasets, mse_test, est_mse.
    dataset : dataset key, used only in the output file name.
    save : when True, write ``<fig_dir><dataset>-mse-est<file_suffix>.pdf``.
    plot_order : full model names to show, in facet order.
    file_suffix : appended to the output file name before the extension.

    The legend entries "DDPM Predicted" and "SP-P Predicted" are looked up by
    name, so a KeyError is raised if ``df`` lacks either method.
    """
    g = sns.FacetGrid(df, col="model", col_wrap=5, col_order=plot_order, height=2.2, aspect=1.1)
    # g.figure.suptitle(dataset_names[dataset])

    # First pass: measured MSE.
    g.map_dataframe(
        sns.lineplot, x="n_syn_datasets", y="mse_test", hue="method", style="method",
        err_style="band", markers=True,
    )
    # Snapshot the legend handles now, before the second pass, and label them as measured.
    legend_data = {"{} Measured".format(name): line for name, line in g._legend_data.items()}

    # Second pass: predicted MSE, dashed, with its own colours and markers.
    g.map_dataframe(
        sns.lineplot, x="n_syn_datasets", y="est_mse", hue="method", style="method",
        linestyle="dashed", palette=["C2", "C3"], err_style="band", markers=["^", "v"],
    )
    # Whatever g._legend_data holds after the second pass is labelled as predicted.
    legend_data.update({"{} Predicted".format(name): line for name, line in g._legend_data.items()})
    # Set the dashed style explicitly on the predicted legend handles.
    legend_data["DDPM Predicted"].set_linestyle("dashed")
    legend_data["SP-P Predicted"].set_linestyle("dashed")

    g.add_legend(legend_data, label_order=["DDPM Measured", "DDPM Predicted", "SP-P Measured", "SP-P Predicted"], ncol=6, loc="upper right", bbox_to_anchor=(0.5, 0))
    for ax in g.axes.flatten():
        ax.grid()

    g.set_ylabels("MSE")
    g.set_xlabels("m (# Synthetic Datasets)")
    g.set_titles("{col_name}", fontweight="bold")
    if save:
        plt.savefig("{}{}-mse-est{}.pdf".format(fig_dir, dataset, file_suffix), bbox_inches="tight")
    plt.show()

In [None]:
# Preview (not saved): measured vs. predicted MSE on ACS2018, all models.
plot_mse_est(df=dfs["ACS2018"], dataset="ACS2018")

In [None]:
# Preview (not saved): per-method bar plots for California Housing.
dataset = "california-housing"
plot_by_method(df=dfs[dataset], dataset=dataset)

In [None]:
# Preview (not saved): per-model bar plots for ACS2018.
dataset = "ACS2018"
plot_by_model(df=dfs[dataset], dataset=dataset)

In [None]:
# Export the cross-dataset figure for the SP-P generator.
plot_by_dataset(
    df=df_all_datasets,
    selected_method=["SP-P"],
    save=True,
    file_suffix="-synthpop",
)

In [None]:
# Export the cross-dataset figure for the DDPM generator.
plot_by_dataset(
    df=df_all_datasets,
    selected_method=["DDPM"],
    save=True,
    file_suffix="-ddpm",
)

In [None]:
# Export a reduced measured-vs-predicted figure for ACS2018 with five models only.
plot_mse_est(
    df=dfs["ACS2018"],
    dataset="ACS2018",
    plot_order=["1-NN", "5-NN", "Decision Tree", "Random Forest", "MLP"],
    save=True,
    file_suffix="-small",
)

In [None]:
# Export the per-dataset figures: one by-method plot per generator (the SP-P
# file has no suffix, the DDPM file gets "-ddpm") and the full
# measured-vs-predicted plot.
for dataset in datasets:
    results = dfs[dataset]
    plot_by_method(df=results, dataset=dataset, save=True, selected_method=["SP-P"])
    plot_by_method(df=results, dataset=dataset, save=True, selected_method=["DDPM"], file_suffix="-ddpm")
    plot_mse_est(df=results, dataset=dataset, save=True)

In [None]:
# Write one LaTeX table per dataset with "mean ± std" of the test MSE:
# rows are (downstream model, generator), columns are the number of
# synthetic datasets m.
for dataset, df in dfs.items():
    table = df.groupby(["model", "method", "n_syn_datasets"])["mse_test"].aggregate(["mean", "std"])
    # Raw string: "\p" is not a valid escape sequence, so the non-raw form
    # triggers an invalid-escape warning; the emitted text is unchanged.
    table["formatted"] = table.apply(lambda row: r"{:.2f} $\pm$ {:.3f}".format(row["mean"], row["std"]), axis=1)
    # Move n_syn_datasets out of the index so it can become the column axis.
    table = table.reset_index("n_syn_datasets").pivot(columns="n_syn_datasets", values="formatted")
    table.index.rename(["Downstream", "Generator"], inplace=True)
    table.columns.rename("m", inplace=True)
    table = table.reindex(model_order, level="Downstream", axis="index")
    table.style.to_latex(fig_dir + "{}-table.tex".format(dataset), hrules=True, clines="skip-last;data")

In [None]:
# Write one LaTeX table per dataset comparing measured (mse_test) and
# predicted (est_mse) MSE as "mean ± std": rows are (downstream model,
# generator, predicted/measured), columns are the number of synthetic datasets m.
pred_measured_col_name = "Predicted / Measured"
for dataset, df in dfs.items():
    # Long format: one row per (repeat, model, m, method, value kind).
    df = df.melt(id_vars=["repeat_ind", "model", "n_syn_datasets", "method"], value_vars=["mse_test", "est_mse"], var_name=pred_measured_col_name)
    table = df.groupby(["model", "n_syn_datasets", "method", pred_measured_col_name])["value"].aggregate(["mean", "std"])
    # Raw string: "\p" is not a valid escape sequence, so the non-raw form
    # triggers an invalid-escape warning; the emitted text is unchanged.
    table["formatted"] = table.apply(lambda row: r"{:.2f} $\pm$ {:.3f}".format(row["mean"], row["std"]), axis=1)
    # Move n_syn_datasets out of the index so it can become the column axis.
    table = table.reset_index("n_syn_datasets").pivot(columns="n_syn_datasets", values="formatted")
    table.index.rename(["Downstream", "Generator", pred_measured_col_name], inplace=True)
    table.columns.rename("m", inplace=True)
    table = table.reindex(model_order, level="Downstream", axis="index")
    table.rename(index={"est_mse": "Predicted", "mse_test": "Measured"}, inplace=True)
    table.style.to_latex(fig_dir + "{}-mse-est-table.tex".format(dataset), hrules=True, clines="skip-last;data")

In [None]:
def plot_one_large_by_method(df, dataset, save=False, selected_method=None, file_suffix=""):
    """Bar-plot test MSE per downstream model for the "one large" results.

    One facet per generator method; bars are grouped by ``model_short`` with
    ``size_mul`` as hue. A black horizontal line marks the lowest real-data
    MSE for ``dataset``; the y-axis starts at 90% of that value.

    Parameters
    ----------
    df : DataFrame with columns method, model_short, mse_test, size_mul.
    dataset : dataset key used for the baseline lookup and the file name.
    save : when True, write
        ``<fig_dir><dataset>-by-method-one-large<file_suffix>.pdf``.
    selected_method : optional collection of method labels to keep.
    file_suffix : appended to the output file name before the extension.
    """
    if selected_method is not None:
        keep = df.method.apply(lambda val: val in selected_method)
        df = df[keep]

    grid = sns.FacetGrid(df, col="method", height=2.2, aspect=1.2)
    grid.map_dataframe(
        sns.barplot,
        x="model_short",
        y="mse_test",
        order=model_order_short,
        hue="size_mul",
        palette="flare",
        errwidth=0.7,
    )

    baseline = min_real_data_mses[dataset]
    for ax in grid.axes.flatten():
        ax.axhline(baseline, color="black")
        # Keep the grid behind the bars.
        ax.set_axisbelow(True)
        ax.grid()
    grid.set(ylim=(baseline * 0.9, None))

    grid.tick_params("x", labelrotation=45)
    grid.set_ylabels("MSE")
    grid.set_xlabels("")
    grid.set_titles("{col_name}", fontweight="bold")
    grid.add_legend(title="m")
    if save:
        plt.savefig("{}{}-by-method-one-large{}.pdf".format(fig_dir, dataset, file_suffix), bbox_inches="tight")
    plt.show()

# Preview (not saved): "one large" results for California Housing.
dataset = "california-housing"
plot_one_large_by_method(df=one_large_dfs[dataset], dataset=dataset)

In [None]:
# Combine, per dataset, the "one large synthetic dataset" results with the
# "multiple synthetic datasets" results on a shared size_mul axis, tagging
# each row with the strategy it came from.
one_large_comparison_dfs = {}
for dataset in datasets:
    one_large_df = one_large_dfs[dataset].assign(Combination="One Large")
    multiple_df = (
        dfs[dataset]
        .assign(Combination="Multiple")
        # n_syn_datasets plays the role of the size multiplier here.
        .rename(columns={"n_syn_datasets": "size_mul"})
    )
    # Only multipliers up to 5 are kept from the multiple-dataset results.
    multiple_df = multiple_df[multiple_df["size_mul"] <= 5]
    one_large_comparison_dfs[dataset] = pd.concat([one_large_df, multiple_df], ignore_index=True)

one_large_comparison_dfs["abalone"]

In [None]:
def plot_one_large_comparison(df, dataset, save=False, selected_method=None, file_suffix=""):
    """Line-plot test MSE against size multiplier, "One Large" vs. "Multiple".

    One facet per downstream model (all of ``model_order`` except its last
    entry). Without ``selected_method`` the grid additionally has one row per
    method; with it, only the selected methods are plotted and facets wrap
    after four columns. The y-axis starts at 90% of the lowest real-data MSE
    for ``dataset``.

    Parameters
    ----------
    df : DataFrame with columns model, method, size_mul, mse_test, Combination.
    dataset : dataset key used for the baseline lookup and the file name.
    save : when True, write
        ``<fig_dir><dataset>-by-method-one-large<file_suffix>.pdf``.
    selected_method : optional method label, or collection of labels, to keep.
    file_suffix : appended to the output file name before the extension.
    """
    facet_kwargs = dict(col="model", height=2.0, aspect=1.7, col_order=model_order[:-1])
    if selected_method is None:
        facet_kwargs["row"] = "method"
    else:
        # A bare string would turn the membership test below into a substring
        # match (e.g. "SP" in "SP-P"); wrap it so labels are compared exactly.
        if isinstance(selected_method, str):
            selected_method = [selected_method]
        df = df[df.method.apply(lambda val: val in selected_method)]
        facet_kwargs["col_wrap"] = 4

    g = sns.FacetGrid(df, **facet_kwargs)
    # g.figure.suptitle(dataset_names[dataset])
    g.map_dataframe(sns.lineplot, x="size_mul", y="mse_test", hue="Combination", style="Combination", markers=True)
    g.set(ylim=(min_real_data_mses[dataset] * 0.9, None))
    for ax in g.axes.flatten():
        ax.set_axisbelow(True)
        ax.grid()
    g.set_ylabels("MSE")
    g.set_xlabels("Size Mul.")
    g.set_titles("{col_name}", fontweight="bold")
    g.add_legend(title="Combination")
    if save:
        # NOTE(review): this is the same file name pattern that
        # plot_one_large_by_method writes to; with equal suffixes one
        # figure overwrites the other.
        plt.savefig("{}{}-by-method-one-large{}.pdf".format(fig_dir, dataset, file_suffix), bbox_inches="tight")
    plt.show()

# Preview (not saved) for whichever dataset key is currently bound to
# `dataset` by an earlier cell; the key is printed so the figure can be identified.
print(dataset)
plot_one_large_comparison(
    df=one_large_comparison_dfs[dataset],
    dataset=dataset,
    selected_method="SP-P",
)

In [None]:
# Export the One Large vs. Multiple comparison for SP-P, one figure per dataset.
for dataset in datasets:
    plot_one_large_comparison(
        df=one_large_comparison_dfs[dataset],
        dataset=dataset,
        selected_method="SP-P",
        save=True,
    )