In [1]:
# Load the data and prepare the clean variables.
import pandas as pd
import numpy as np

df_train = pd.read_csv("../data/processed/train_no_preprocess.csv")

X_train = df_train.drop(columns=["Label", "Weight"])
y_train = df_train["Label"]
weights_train = df_train["Weight"]
categorical_features = df_train.columns.get_indexer(["PRI_jet_num"]).tolist()

df_train_drop_rows = df_train[~X_train.isna().any(axis=1)].reset_index(drop=True)
X_train_drop_rows = df_train_drop_rows.drop(columns=["Label", "Weight"])
y_train_drop_rows = df_train_drop_rows["Label"]
weights_train_drop_rows = df_train_drop_rows["Weight"]
categorical_features_drop_rows = df_train_drop_rows.columns.get_indexer(["PRI_jet_num"]).tolist()

cols_dropped = X_train.columns[X_train.isna().any(axis=0)]
df_train_drop_cols = df_train.drop(columns=cols_dropped)
X_train_drop_cols = df_train_drop_cols.drop(columns=["Label", "Weight"])
y_train_drop_cols = df_train_drop_cols["Label"]
weights_train_drop_cols = df_train_drop_cols["Weight"]
categorical_features_drop_cols = df_train_drop_cols.columns.get_indexer(["PRI_jet_num"]).tolist()


df_test = pd.read_csv("../data/processed/test_no_preprocess.csv")

X_test = df_test.drop(columns=["Label", "Weight"])
y_test = df_test["Label"]
weights_test = df_test["Weight"]
categorical_features = df_test.columns.get_indexer(["PRI_jet_num"]).tolist()

df_test_drop_rows = df_test[~X_test.isna().any(axis=1)].reset_index(drop=True)
X_test_drop_rows = df_test_drop_rows.drop(columns=["Label", "Weight"])
y_test_drop_rows = df_test_drop_rows["Label"]
weights_test_drop_rows = df_test_drop_rows["Weight"]
categorical_features_drop_rows = df_test_drop_rows.columns.get_indexer(["PRI_jet_num"]).tolist()

df_test_drop_cols = df_test.drop(columns=cols_dropped)
X_test_drop_cols = df_test_drop_cols.drop(columns=["Label", "Weight"])
y_test_drop_cols = df_test_drop_cols["Label"]
weights_test_drop_cols = df_test_drop_cols["Weight"]
categorical_features_drop_cols = df_test_drop_cols.columns.get_indexer(["PRI_jet_num"]).tolist()

# Convert them to numpy and store them in a datasets dictionary for easy reference.
datasets = {
    "original": (
        X_train.to_numpy(),
        y_train.to_numpy(),
        weights_train.to_numpy(),
        X_test.to_numpy(),
        y_test.to_numpy(),
        weights_test.to_numpy(),
        categorical_features,
    ),
    "drop-rows": (
        X_train_drop_rows.to_numpy(),
        y_train_drop_rows.to_numpy(),
        weights_train_drop_rows.to_numpy(),
        X_test_drop_rows.to_numpy(),
        y_test_drop_rows.to_numpy(),
        weights_test_drop_rows.to_numpy(),
        categorical_features_drop_rows,
    ),
    "drop-columns": (
        X_train_drop_cols.to_numpy(),
        y_train_drop_cols.to_numpy(),
        weights_train_drop_cols.to_numpy(),
        X_test_drop_cols.to_numpy(),
        y_test_drop_cols.to_numpy(),
        weights_test_drop_cols.to_numpy(),
        categorical_features_drop_cols,
    ),
}
del (
    df_train,
    X_train,
    y_train,
    df_train_drop_rows,
    X_train_drop_rows,
    y_train_drop_rows,
    df_train_drop_cols,
    X_train_drop_cols,
    y_train_drop_cols,
    categorical_features,
    categorical_features_drop_rows,
    categorical_features_drop_cols,
    df_test,
    X_test,
    y_test,
    df_test_drop_rows,
    X_test_drop_rows,
    y_test_drop_rows,
    df_test_drop_cols,
    X_test_drop_cols,
    y_test_drop_cols,
    weights_test_drop_cols,
    categorical_features_drop_cols,
)  # Free memory

NameError: name 'categorical_features_drop_cols' is not defined

In [2]:
from typing import Dict, Optional, Literal, Union
from pydantic import BaseModel


class FinalExperiment(BaseModel):
    model_class: Literal["BespokeNB", "CategoricalAwareBespokeNB"]
    categorical_estimator_class: Literal["CategoricalEstimator", "RobustCategoricalEstimator"]
    continuous_estimator_class: Literal[
        "GaussianEstimator",
        "RobustGaussianEstimator",
        "HistogramEstimator",
        "RobustHistogramEstimator",
        "EagerGaussianKDEstimator",
        "RobustEagerGaussianKDEstimator",
        "YeoJohnsonGaussianEstimator",
        "RobustYeoJohnsonGaussianEstimator",
    ]
    dataset: Literal["original", "drop-rows", "drop-columns"]
    categorical_estimator_params: Dict[str, Union[Optional[float], Optional[int]]] = {}
    continuous_estimator_params: Dict[str, Union[Optional[float], Optional[int]]] = {}

    def __hash__(self):
        return hash(
            (
                self.model_class,
                self.categorical_estimator_class,
                self.continuous_estimator_class,
                frozenset(self.categorical_estimator_params.items()),
                frozenset(self.continuous_estimator_params.items()),
                self.dataset,
            )
        )

    def __eq__(self, other):
        if not isinstance(other, FinalExperiment):
            return NotImplemented
        return (
            self.model_class == other.model_class
            and self.categorical_estimator_class == other.categorical_estimator_class
            and self.continuous_estimator_class == other.continuous_estimator_class
            and self.categorical_estimator_params == other.categorical_estimator_params
            and self.continuous_estimator_params == other.continuous_estimator_params
            and self.dataset == other.dataset
        )


class FinalExperimentResult(FinalExperiment):
    accuracy: float
    b_recall: float
    b_precision: float
    b_f1_score: float
    s_recall: float
    s_precision: float
    s_f1_score: float
    ams_score: float

In [11]:
import sys
import os
from typing import Type
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

sys.path.append(os.path.abspath("../"))

import src.naive_bayes
import src.evaluate


def _instantiate_estimator(
    estimator_cls: Type[src.naive_bayes.ProbabilityEstimator],
    estimator_params: Dict[str, Union[Optional[float], Optional[int]]],
) -> src.naive_bayes.ProbabilityEstimator:
    init_kwargs = {}
    for key in estimator_cls.__init__.__code__.co_varnames[1:]:
        if not key in estimator_params:
            raise ValueError(f"Missing value for estimator parameter: {key}")
        init_kwargs[key] = estimator_params[key]
    return estimator_cls(**init_kwargs)


def _get_estimator_instances(
    experiment: FinalExperiment, num_features: int, categorical_features: list[int]
) -> Dict[int, src.naive_bayes.ProbabilityEstimator]:
    categorical_estimator_cls = getattr(src.naive_bayes, experiment.categorical_estimator_class)
    continuous_estimator_cls = getattr(src.naive_bayes, experiment.continuous_estimator_class)
    instances = {}
    for feature in range(num_features):
        if feature in categorical_features:
            instances[feature] = _instantiate_estimator(
                categorical_estimator_cls, experiment.categorical_estimator_params
            )
        else:
            instances[feature] = _instantiate_estimator(
                continuous_estimator_cls, experiment.continuous_estimator_params
            )
    return instances


def _get_model_instance(
    experiment: FinalExperiment, num_features: int, categorical_features: list[int]
) -> src.naive_bayes.BespokeNB | src.naive_bayes.CategoricalAwareBespokeNB:
    model_cls = getattr(src.naive_bayes, experiment.model_class)
    estimators = _get_estimator_instances(
        experiment, num_features=num_features, categorical_features=categorical_features
    )
    if model_cls == src.naive_bayes.BespokeNB:
        return model_cls(estimators=estimators)
    elif model_cls == src.naive_bayes.CategoricalAwareBespokeNB:
        return model_cls(
            estimators=estimators,
            categorical_features=categorical_features,
        )
    else:
        raise ValueError(f"Unknown model class: {experiment.model_class}")


def run_experiment(experiment: FinalExperiment) -> FinalExperimentResult:
    X_train, y_train, weights_train, X_test, y_test, weights_test, categorical_features = datasets[experiment.dataset]

    model = _get_model_instance(experiment, num_features=X_train.shape[1], categorical_features=categorical_features)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    (b_precision, s_precision), (b_recall, s_recall), (b_f1_score, s_f1_score), _ = precision_recall_fscore_support(
        y_test, y_pred, labels=["b", "s"], average=None, zero_division=0
    )
    ams_score = src.evaluate.ams_score(y_true=y_test, y_pred=y_pred, weights=weights_test)
    return FinalExperimentResult(
        **experiment.model_dump(),
        accuracy=accuracy,
        b_recall=b_recall,
        b_precision=b_precision,
        b_f1_score=b_f1_score,
        s_recall=s_recall,
        s_precision=s_precision,
        s_f1_score=s_f1_score,
        ams_score=ams_score,
    )

In [12]:
from typing import Set, TypeVar, Type
import os

T = TypeVar("T", bound=Union[FinalExperimentResult, FinalExperiment])


def store_experiment_set(experiments: Set[FinalExperimentResult] | Set[FinalExperiment], filename: str) -> None:
    """Store a set of FinalExperimentResult to a JSONL file."""
    if not os.path.exists(os.path.dirname(filename)):
        os.makedirs(os.path.dirname(filename))
    with open(filename, "w") as f:
        for er in experiments:
            if not isinstance(er, (FinalExperimentResult, FinalExperiment)):
                raise ValueError(f"Invalid type in experiments set: {type(er)}")
            f.write(er.model_dump_json() + "\n")


def load_experiment_set(cls: Type[T], filename: str) -> Set[T]:
    """Load a set of ExperimentResult from a JSONL file."""
    if not os.path.exists(filename):
        return set()
    experiments: Set[T] = set()
    with open(filename, "r") as f:
        for line in f:
            experiments.add(cls.model_validate_json(line.strip()))
    return experiments

In [None]:
from typing import Set, Tuple
from tqdm import tqdm
import warnings
import pandas as pd


def run_all_experiments(
    experiments: Set[FinalExperiment], *, verbose: bool = True, results_directory: Optional[os.PathLike] = None
) -> Tuple[Set[FinalExperimentResult], Set[FinalExperiment]]:
    if results_directory is None:
        results: Set[FinalExperimentResult] = set()
        failed: Set[FinalExperiment] = set()
    else:
        results: Set[FinalExperimentResult] = load_experiment_set(
            FinalExperimentResult, os.path.join(results_directory, "final-experiments-results.jsonl")
        )
        failed: Set[FinalExperiment] = load_experiment_set(
            FinalExperiment, os.path.join(results_directory, "final-experiments-failed.jsonl")
        )
    missing_experiments = experiments - results
    # store_experiment_set( # For debugging only
    #     missing_experiments,
    #     os.path.join(results_directory, "experiments-missing.jsonl"),
    # )
    bar = tqdm(total=len(missing_experiments), desc="Running experiments", disable=not verbose)
    try:
        for experiment in missing_experiments:
            # Skip experiments that have already been run
            if experiment in results:
                bar.update(1)
                continue
            bar.set_postfix_str(
                f"{experiment.model_class} with {experiment.continuous_estimator_class} on {experiment.dataset}"
            )
            bar.refresh()
            try:
                results.add(run_experiment(experiment))
            except Exception as e:
                warnings.warn(f"Experiment failed: {e}")
                failed.add(experiment)
            bar.update(1)
            if results_directory is not None:
                # Save intermediate results
                store_experiment_set(
                    results,
                    os.path.join(results_directory, "final-experiments-results.jsonl"),
                )
                store_experiment_set(
                    failed,
                    os.path.join(results_directory, "final-experiments-failed.jsonl"),
                )
    finally:
        bar.close()
    return results, failed

In [14]:
final_experiments = load_experiment_set(FinalExperiment, "../results/final-experiments.jsonl")
np.seterr(divide="ignore", invalid="ignore")
with warnings.catch_warnings():
    run_all_experiments(experiments=set(final_experiments), results_directory="../results/")

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
Running experiments: 100%|██████████| 21/21 [39:20<00:00, 112.38s/it, BespokeNB with EagerGaussianKDEstimator on drop-columns]                  


For the report, let us print all the results in a $\LaTeX$ table.

In [23]:
import warnings

final_experiment_results = load_experiment_set(FinalExperimentResult, "../results/final-experiments-results.jsonl")
final_experiment_results = pd.DataFrame([er.model_dump() for er in final_experiment_results])

final_latex_table_lines = []
for model_class, model_name in [
    ("BespokeNB", ""),
    ("CategoricalAwareBespokeNB", "with Categorical Dependency"),
]:
    for continuous_estimator_class, estimator_name in [
        ("GaussianEstimator", "Gaussian"),
        ("HistogramEstimator", "Histogram"),
        ("EagerGaussianKDEstimator", "Eager Gaussian KDE"),
        ("YeoJohnsonGaussianEstimator", "Yeo-Johnson Gaussian"),
        ("RobustGaussianEstimator", "Robust Gaussian"),
        ("RobustHistogramEstimator", "Robust Histogram"),
        ("RobustEagerGaussianKDEstimator", "Robust Eager Gaussian KDE"),
        ("RobustYeoJohnsonGaussianEstimator", "Robust Yeo-Johnson Gaussian"),
    ]:
        for dataset, dataset_name in [
            ("original", ""),
            ("drop-rows", " (Drop Missing Rows)"),
            ("drop-columns", " (Drop Missing Columns)"),
        ]:
            if "Robust" in continuous_estimator_class and dataset != "original":
                # Ignore these results, as they make little sense
                continue
            subset = final_experiment_results[
                (final_experiment_results["model_class"] == model_class)
                & (final_experiment_results["continuous_estimator_class"] == continuous_estimator_class)
                & (final_experiment_results["dataset"] == dataset)
            ]
            if subset.empty:
                warnings.warn(f"No results for {estimator_name} NB with {model_name}")
                continue
            best = subset["ams_score"].max()
            best_subset = subset[subset["ams_score"] == best]
            for j, exp in best_subset.iterrows():
                macro_f1 = (exp["b_f1_score"] + exp["s_f1_score"]) / 2
                final_latex_table_lines.append(
                    f"{estimator_name} NB {model_name}{dataset_name}& "
                    f"{exp['ams_score']:.2f} & "
                    "- & "
                    f"{exp['accuracy']:.2f} & "
                    f"{macro_f1:.2f} \\\\"
                )

print("LaTeX table lines for final experiment results:")
for line in final_latex_table_lines:
    print(line)

LaTeX table lines for final experiment results:
Gaussian NB  (Drop Missing Rows)& 0.43 & - & 0.74 & 0.74 \\
Gaussian NB  (Drop Missing Columns)& 0.61 & - & 0.72 & 0.68 \\
Histogram NB  (Drop Missing Rows)& 0.54 & - & 0.74 & 0.72 \\
Histogram NB  (Drop Missing Columns)& 0.72 & - & 0.67 & 0.67 \\
Eager Gaussian KDE NB  (Drop Missing Rows)& 0.50 & - & 0.79 & 0.79 \\
Eager Gaussian KDE NB  (Drop Missing Columns)& 0.75 & - & 0.74 & 0.71 \\
Yeo-Johnson Gaussian NB  (Drop Missing Rows)& 0.25 & - & 0.58 & 0.46 \\
Yeo-Johnson Gaussian NB  (Drop Missing Columns)& 0.03 & - & 0.66 & 0.40 \\
Robust Gaussian NB & 0.60 & - & 0.74 & 0.68 \\
Robust Histogram NB & 0.76 & - & 0.72 & 0.71 \\
Robust Eager Gaussian KDE NB & 0.60 & - & 0.72 & 0.69 \\
Robust Yeo-Johnson Gaussian NB & 0.20 & - & 0.67 & 0.43 \\
Robust Gaussian NB with Categorical Dependency& 0.59 & - & 0.74 & 0.69 \\
Robust Histogram NB with Categorical Dependency& 0.68 & - & 0.74 & 0.72 \\
Robust Eager Gaussian KDE NB with Categorical Dependen

