In [1]:
# Load the data and prepare the clean variables.
import pandas as pd
import numpy as np


def _split_frame(df):
    """Split a raw frame into ``(X, y, weights, categorical_index)``.

    ``X`` is ``df`` without the ``Label`` and ``Weight`` columns. The position
    of ``PRI_jet_num`` is looked up in ``X`` (not in ``df``) because ``X`` is
    the frame that gets converted to the numpy array the index is applied to;
    an index taken from ``df`` would be shifted whenever ``Label`` or
    ``Weight`` precede ``PRI_jet_num`` in the CSV.
    """
    X = df.drop(columns=["Label", "Weight"])
    # NOTE: get_indexer yields -1 (not an error) if PRI_jet_num is absent.
    categorical = X.columns.get_indexer(["PRI_jet_num"]).tolist()
    return X, df["Label"], df["Weight"], categorical


# ---- Training set -----------------------------------------------------------
df_train = pd.read_csv("../data/processed/train_no_preprocess.csv")
X_train, y_train, weights_train, categorical_features = _split_frame(df_train)

# Variant "drop-rows": discard every row that has at least one missing feature.
df_train_drop_rows = df_train[~X_train.isna().any(axis=1)].reset_index(drop=True)
(
    X_train_drop_rows,
    y_train_drop_rows,
    weights_train_drop_rows,
    categorical_features_drop_rows,
) = _split_frame(df_train_drop_rows)

# Variant "drop-columns": discard every feature column that has a missing value
# in the training set. The same column list is reused for the test set below.
cols_dropped = X_train.columns[X_train.isna().any(axis=0)]
df_train_drop_cols = df_train.drop(columns=cols_dropped)
(
    X_train_drop_cols,
    y_train_drop_cols,
    weights_train_drop_cols,
    categorical_features_drop_cols,
) = _split_frame(df_train_drop_cols)


# ---- Test set ---------------------------------------------------------------
# The categorical_features* names are reassigned here, so the values stored in
# `datasets` are the ones derived from the test frames.
df_test = pd.read_csv("../data/processed/test_no_preprocess.csv")
X_test, y_test, weights_test, categorical_features = _split_frame(df_test)

df_test_drop_rows = df_test[~X_test.isna().any(axis=1)].reset_index(drop=True)
(
    X_test_drop_rows,
    y_test_drop_rows,
    weights_test_drop_rows,
    categorical_features_drop_rows,
) = _split_frame(df_test_drop_rows)

df_test_drop_cols = df_test.drop(columns=cols_dropped)
(
    X_test_drop_cols,
    y_test_drop_cols,
    weights_test_drop_cols,
    categorical_features_drop_cols,
) = _split_frame(df_test_drop_cols)

# Convert them to numpy and store them in a datasets dictionary for easy reference.
# Each entry is (X_train, y_train, weights_train, X_test, y_test, weights_test,
# categorical_feature_indices).
datasets = {
    "original": (
        X_train.to_numpy(),
        y_train.to_numpy(),
        weights_train.to_numpy(),
        X_test.to_numpy(),
        y_test.to_numpy(),
        weights_test.to_numpy(),
        categorical_features,
    ),
    "drop-rows": (
        X_train_drop_rows.to_numpy(),
        y_train_drop_rows.to_numpy(),
        weights_train_drop_rows.to_numpy(),
        X_test_drop_rows.to_numpy(),
        y_test_drop_rows.to_numpy(),
        weights_test_drop_rows.to_numpy(),
        categorical_features_drop_rows,
    ),
    "drop-columns": (
        X_train_drop_cols.to_numpy(),
        y_train_drop_cols.to_numpy(),
        weights_train_drop_cols.to_numpy(),
        X_test_drop_cols.to_numpy(),
        y_test_drop_cols.to_numpy(),
        weights_test_drop_cols.to_numpy(),
        categorical_features_drop_cols,
    ),
}
# Drop the intermediate pandas objects; the converted arrays stay reachable
# through `datasets`.
# NOTE(review): apart from weights_test_drop_cols, the weights_* Series (and
# cols_dropped) are not in this list and therefore remain in the namespace.
# The list is left unchanged in case another cell relies on them - confirm.
del (
    df_train,
    X_train,
    y_train,
    df_train_drop_rows,
    X_train_drop_rows,
    y_train_drop_rows,
    df_train_drop_cols,
    X_train_drop_cols,
    y_train_drop_cols,
    categorical_features,
    categorical_features_drop_rows,
    categorical_features_drop_cols,
    df_test,
    X_test,
    y_test,
    df_test_drop_rows,
    X_test_drop_rows,
    y_test_drop_rows,
    df_test_drop_cols,
    X_test_drop_cols,
    y_test_drop_cols,
    weights_test_drop_cols,
)  # Free memory

In [2]:
from typing import Optional, Union
import sys, os

# Put the parent of the current working directory on the import path so the
# `src.` package import below resolves.
sys.path.append(os.path.abspath("../"))

from src.naive_bayes.experiments import ExperimentBase, run_all_experiments, load_experiment_set


class FinalExperiment(ExperimentBase):
    """Experiment that looks its data up in a ``datasets`` mapping by ``self.dataset``."""

    def _get_train_test_data(self, datasets):
        """Return the seven-element entry stored under ``self.dataset``.

        The entry is unpacked into exactly seven names before being returned,
        so an entry with a different number of elements raises ``ValueError``.

        Returns:
            Tuple ``(X_train, y_train, weights_train, X_test, y_test,
            weights_test, categorical_features)``.
        """
        (
            train_features,
            train_labels,
            train_weights,
            test_features,
            test_labels,
            test_weights,
            categorical_idx,
        ) = datasets[self.dataset]
        return (
            train_features,
            train_labels,
            train_weights,
            test_features,
            test_labels,
            test_weights,
            categorical_idx,
        )

In [5]:
import warnings


final_experiments = load_experiment_set(FinalExperiment, "../results/final-experiments.jsonl")
# Silence numpy's divide-by-zero / invalid-value floating-point reports.
# np.seterr is not scoped to this cell: the setting persists after it runs.
np.seterr(divide="ignore", invalid="ignore")
with warnings.catch_warnings():
    # catch_warnings() on its own only saves and restores the filter state;
    # an explicit filter is required for warnings to actually be suppressed
    # while the experiments run. The previous filters are restored on exit.
    warnings.simplefilter("ignore")
    run_all_experiments(
        experiments=set(final_experiments),
        datasets=datasets,
        results_file="../results/final-experiments-results.jsonl",
        failed_file="../results/final-experiments-failed.jsonl",
    )

Running experiments: 0it [00:00, ?it/s]


For the report, let us print all the results in a $\LaTeX$ table.

In [7]:
import warnings

final_experiment_results = load_experiment_set(FinalExperiment, "../results/final-experiments-results.jsonl")

# Build one table row per (model, continuous estimator, dataset) combination,
# using the run with the highest AMS score within that combination.
# Each pair below is (identifier stored in the results, label printed in the table).
final_latex_table_lines = []
for model_class, model_name in [
    ("BespokeNB", ""),
    ("CategoricalAwareBespokeNB", "with Categorical Dependency"),
]:
    for continuous_estimator_class, estimator_name in [
        ("GaussianEstimator", "Gaussian"),
        ("HistogramEstimator", "Histogram"),
        ("EagerGaussianKDEstimator", "Eager Gaussian KDE"),
        ("YeoJohnsonGaussianEstimator", "Yeo-Johnson Gaussian"),
        ("RobustGaussianEstimator", "Robust Gaussian"),
        ("RobustHistogramEstimator", "Robust Histogram"),
        ("RobustEagerGaussianKDEstimator", "Robust Eager Gaussian KDE"),
        ("RobustYeoJohnsonGaussianEstimator", "Robust Yeo-Johnson Gaussian"),
    ]:
        for dataset, dataset_name in [
            ("original", ""),
            ("drop-rows", " (Drop Missing Rows)"),
            ("drop-columns", " (Drop Missing Columns)"),
        ]:
            if "Robust" in continuous_estimator_class and dataset != "original":
                # Ignore these results, as they make little sense
                continue
            subset = [
                ex
                for ex in final_experiment_results
                if ex.model_class == model_class
                and ex.continuous_estimator_class == continuous_estimator_class
                and ex.dataset == dataset
            ]
            if not subset:
                # Name the full combination (including the dataset) so every
                # missing combination yields a distinct message; identical
                # messages raised from the same line are reported only once
                # under Python's default warning filter.
                warnings.warn(
                    f"No results for {model_class} / {continuous_estimator_class} "
                    f"on dataset {dataset!r}"
                )
                continue
            best = max(subset, key=lambda ex: ex.result.ams_score)
            # Unweighted mean of the two per-class F1 scores.
            macro_f1 = (best.result.b_f1_score + best.result.s_f1_score) / 2
            final_latex_table_lines.append(
                f"{estimator_name} NB {model_name}{dataset_name}& "
                f"{best.result.ams_score:.2f} & "
                "- & "  # placeholder column, filled in by hand in the report
                f"{best.result.accuracy:.2f} & "
                f"{macro_f1:.2f} \\\\"
            )

print("LaTeX table lines for final experiment results:")
for line in final_latex_table_lines:
    print(line)

LaTeX table lines for final experiment results:
Gaussian NB  (Drop Missing Rows)& 0.43 & - & 0.74 & 0.74 \\
Gaussian NB  (Drop Missing Columns)& 0.61 & - & 0.72 & 0.68 \\
Histogram NB  (Drop Missing Rows)& 0.54 & - & 0.74 & 0.72 \\
Histogram NB  (Drop Missing Columns)& 0.72 & - & 0.67 & 0.67 \\
Eager Gaussian KDE NB  (Drop Missing Rows)& 0.50 & - & 0.79 & 0.79 \\
Eager Gaussian KDE NB  (Drop Missing Columns)& 0.75 & - & 0.74 & 0.71 \\
Yeo-Johnson Gaussian NB  (Drop Missing Rows)& 0.25 & - & 0.58 & 0.46 \\
Yeo-Johnson Gaussian NB  (Drop Missing Columns)& 0.03 & - & 0.66 & 0.40 \\
Robust Gaussian NB & 0.60 & - & 0.74 & 0.68 \\
Robust Histogram NB & 0.76 & - & 0.72 & 0.71 \\
Robust Eager Gaussian KDE NB & 0.60 & - & 0.72 & 0.69 \\
Robust Yeo-Johnson Gaussian NB & 0.20 & - & 0.67 & 0.43 \\
Robust Gaussian NB with Categorical Dependency& 0.59 & - & 0.74 & 0.69 \\
Robust Histogram NB with Categorical Dependency& 0.68 & - & 0.74 & 0.72 \\
Robust Eager Gaussian KDE NB with Categorical Dependen

