Let us now explore the experiment results for the first few models to narrow the hyperparameter search space for the KDE models:

In [1]:
import json
import pandas as pd

# Load every experiment record; the results file is JSON Lines (one JSON object per line).
records = []
with open("../results/experiments-results.jsonl", "r") as f:
    for line in f:
        # json.loads rejects empty input, so skip blank lines (e.g. a trailing newline at EOF).
        if line.strip():
            records.append(json.loads(line))
# Convert to DataFrame for easier analysis
res = pd.DataFrame(records)
# Cast the identifying string columns to pandas' category dtype in a single call.
categorical_cols = ["model_class", "categorical_estimator_class", "continuous_estimator_class", "dataset"]
res = res.astype({col: "category" for col in categorical_cols})
res.head()

Unnamed: 0,model_class,categorical_estimator_class,continuous_estimator_class,dataset,num_folds,fold_index,categorical_estimator_params,continuous_estimator_params,accuracy,b_recall,b_precision,b_f1_score,s_recall,s_precision,s_f1_score,ams_score
0,CategoricalAwareBespokeNB,RobustCategoricalEstimator,RobustHistogramEstimator,drop-columns,10,9,{},"{'laplace_smoothing': 1e-07, 'bins': 520}",0.685881,0.703805,0.795453,0.746828,0.651351,0.533039,0.586286,0.428922
1,CategoricalAwareBespokeNB,RobustCategoricalEstimator,RobustHistogramEstimator,drop-columns,10,5,{},"{'laplace_smoothing': 0.001, 'bins': 730}",0.682919,0.704658,0.791346,0.74549,0.640902,0.528918,0.57955,0.439115
2,BespokeNB,CategoricalEstimator,HistogramEstimator,drop-columns,10,8,{},{'bins': 350},0.746528,0.861086,0.778633,0.817786,0.523594,0.6595,0.583741,0.541244
3,CategoricalAwareBespokeNB,RobustCategoricalEstimator,RobustHistogramEstimator,original,10,5,{},"{'laplace_smoothing': 0, 'bins': 410}",0.731359,0.739348,0.834166,0.7839,0.715918,0.586966,0.645061,0.473895
4,BespokeNB,RobustCategoricalEstimator,RobustHistogramEstimator,drop-columns,10,4,{},"{'laplace_smoothing': 0.01, 'bins': 920}",0.658016,0.530869,0.9146,0.6718,0.904073,0.498952,0.643023,0.519674


### Final Parameter Selection

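Now that we have all the results, we can select the best parameters for each model: for every combination of model class, estimator classes, and dataset, we keep the hyperparameter setting with the highest average AMS score.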

In [2]:
# For each combination, find the best hyperparameters based on the highest average AMS score
import matplotlib.pyplot as plt
import seaborn as sns
import json

# Get all the unique combinations of model_class, continuous_estimator_class, categorical_estimator_class, and dataset
# NOTE(review): not read by the selection loop below; kept in case other cells rely on it.
unique_combinations = res[
    ["model_class", "continuous_estimator_class", "categorical_estimator_class", "dataset"]
].drop_duplicates()
# Box-drawing glyphs used to print the selection report as an indented tree.
tline = "├"
hline = "─"
vline = "│"

# One record per (model, continuous estimator, categorical estimator, dataset) combination,
# holding the continuous-estimator hyperparameters with the highest mean AMS.
final_experiments = []
# Each level filters the rows kept by the level above, so every mask is built once on a
# progressively smaller frame. Combinations with no rows are skipped before printing anything.
for model_class in res["model_class"].unique():
    model_rows = res[res["model_class"] == model_class]
    if model_rows.empty:
        continue
    print(f"Model Class: {model_class}")
    for continuous_estimator_class in res["continuous_estimator_class"].unique():
        continuous_rows = model_rows[model_rows["continuous_estimator_class"] == continuous_estimator_class]
        if continuous_rows.empty:
            continue
        print(f" {tline}{hline*2} Continuous Estimator Class: {continuous_estimator_class}")
        for categorical_estimator_class in res["categorical_estimator_class"].unique():
            categorical_rows = continuous_rows[
                continuous_rows["categorical_estimator_class"] == categorical_estimator_class
            ]
            if categorical_rows.empty:
                continue
            print(f" {vline}   Categorical Estimator Class: {categorical_estimator_class}")
            for dataset in res["dataset"].unique():
                subset = categorical_rows[categorical_rows["dataset"] == dataset]
                if subset.empty:
                    continue
                print(f" {vline}   {tline}{hline*2} Dataset: {dataset}")

                # Compute the average AMS for each hyperparameter setting.
                # Each parameter set is serialised to a canonical JSON string (sorted keys)
                # so that it can serve as a group key.
                # NOTE(review): only continuous_estimator_params defines a "setting" here;
                # categorical_estimator_params is ignored — confirm it never varies within a combination.
                param_keys = subset["continuous_estimator_params"].apply(lambda x: json.dumps(x, sort_keys=True))
                avg_ams = subset.groupby(param_keys)["ams_score"].mean()
                # Find the best hyperparameter setting
                best_hyperparam = avg_ams.idxmax()
                best_ams = avg_ams.max()
                best_params = json.loads(best_hyperparam)
                final_experiments.append(
                    {
                        "model_class": model_class,
                        "continuous_estimator_class": continuous_estimator_class,
                        "categorical_estimator_class": categorical_estimator_class,
                        "dataset": dataset,
                        "continuous_estimator_params": best_params,
                    }
                )
                for param_str, param_value in best_params.items():
                    print(f" {vline}   {vline}   {vline}   {param_str}: {param_value}")
                print(f" {vline}   {vline}   {vline}   -----------------------------")
                # Counts every row of the combination, across all hyperparameter settings.
                print(f" {vline}   {vline}   {vline}   Number of Experiments: {len(subset)}")
                print(f" {vline}   {vline}   {vline}   Best Average AMS: {best_ams:.4f}")

Model Class: CategoricalAwareBespokeNB
 ├── Continuous Estimator Class: RobustHistogramEstimator
 │   Categorical Estimator Class: RobustCategoricalEstimator
 │   ├── Dataset: drop-columns
 │   │   │   bins: 20
 │   │   │   laplace_smoothing: 0.0001
 │   │   │   -----------------------------
 │   │   │   Number of Experiments: 10000
 │   │   │   Best Average AMS: 0.4723
 │   ├── Dataset: original
 │   │   │   bins: 20
 │   │   │   laplace_smoothing: 0
 │   │   │   -----------------------------
 │   │   │   Number of Experiments: 9900
 │   │   │   Best Average AMS: 0.5134
 ├── Continuous Estimator Class: RobustGaussianEstimator
 │   Categorical Estimator Class: RobustCategoricalEstimator
 │   ├── Dataset: drop-columns
 │   │   │   laplace_smoothing: 0.0001
 │   │   │   -----------------------------
 │   │   │   Number of Experiments: 100
 │   │   │   Best Average AMS: 0.3983
 │   ├── Dataset: original
 │   │   │   laplace_smoothing: 0
 │   │   │   -----------------------------
 │   │   

In [3]:
# Persist the selected configuration of each combination as JSON Lines (one JSON object per line).
with open("../results/final-experiments.jsonl", "w") as f:
    f.writelines(json.dumps(exp) + "\n" for exp in final_experiments)
# Kept as the cell's last expression so the notebook displays how many configurations were written.
len(final_experiments)