In [1]:
import logging


logging.basicConfig(
    level=logging.INFO,
    format="%(levelname)s: %(message)s",
    force=True,
)

In [None]:
from src.datasets.cifar10_dataset import CIFAR10Dataset


X, y = CIFAR10Dataset.get_xy()

In [2]:
train, test = CIFAR10Dataset.get_dataloaders()

  self.X = torch.tensor(X, dtype=torch.float32)
  self.y = torch.tensor(y, dtype=torch.int64)


In [None]:
import numpy as np
from scipy.stats import wilcoxon


def compare_architectures(scores_a, scores_b, alpha=0.05, architecture_names=None):
    """
    Compare two lists of paired performance scores using Wilcoxon Signed-Rank Test.

    Parameters:
    -----------
    scores_a : array-like
        Performance scores for architecture A (e.g., accuracy, F1-score)
    scores_b : array-like
        Performance scores for architecture B (must be same length as scores_a)
    alpha : float, default=0.05
        Significance level for the statistical test
    architecture_names : tuple, default=None
        Optional names for the architectures (name_a, name_b)

    Returns:
    --------
    str
        One of: "A is better", "B is better", or "Can't tell"

    Raises:
    -------
    ValueError
        If input arrays have different lengths or other validation issues
    """

    # Convert to numpy arrays for easier handling
    scores_a = np.array(scores_a)
    scores_b = np.array(scores_b)

    # Validation
    if len(scores_a) != len(scores_b):
        raise ValueError("Both score lists must have the same length")

    if len(scores_a) < 3:
        raise ValueError("Need at least 3 paired observations for meaningful results")

    # Set default names if not provided
    if architecture_names is None:
        name_a, name_b = "A", "B"
    else:
        name_a, name_b = architecture_names

    try:
        # Perform Wilcoxon Signed-Rank Test
        # alternative='two-sided' tests if medians are different
        statistic, p_value = wilcoxon(scores_a, scores_b, alternative="two-sided")

        # If not significant, we can't tell which is better
        if p_value >= alpha:
            return "Can't tell"

        # If significant, determine which is better based on median difference
        median_diff = np.median(scores_a - scores_b)

        if median_diff > 0:
            return f"{name_a} is better"
        else:
            return f"{name_b} is better"

    except ValueError as e:
        # Handle cases where all differences are zero or other issues
        if "zero" in str(e).lower():
            return "Can't tell"
        else:
            raise e


# Example usage and demonstration
if __name__ == "__main__":
    # Example 1: Architecture A clearly better
    scores_arch_a = [
        75.80645161290323,
        80.64516129032258,
        48.38709677419355,
        41.935483870967744,
        85.48387096774194,
        75.80645161290323,
        79.03225806451613,
        72.58064516129032,
        72.58064516129032,
        66.12903225806451,
    ]
    scores_arch_b = [
        74.19354838709677,
        67.74193548387096,
        45.16129032258065,
        38.70967741935484,
        83.87096774193549,
        70.96774193548387,
        82.25806451612904,
        75.80645161290323,
        70.96774193548387,
        79.03225806451613,
    ]

    result = compare_architectures(scores_arch_a, scores_arch_b)
    print(f"Example 1: {result}")

Example 1: Can't tell


In [None]:
from src.datasets.mnist_dataset import MiniMNISTDataset
from src.experiments.experiment1 import get_LeNet5_params
from src.models.compression.enums import Activation, NNParamsCompMode
from src.models.eval import KFoldNNArchitectureEvaluator
from src.models.nn import ActivationParams


datapoints = []

for compression in NNParamsCompMode:
    for activation in Activation:
        print(f"Evaluating LeNet5 with {compression} and {activation}...")

        model_params = get_LeNet5_params(
            DatasetClass=MiniMNISTDataset,
            conv_weight_qmode=compression,
            conv_activation=ActivationParams(activation),
            fc_weight_qmode=compression,
            fc_activation=ActivationParams(activation),
        )
        evaluator = KFoldNNArchitectureEvaluator(model_params)

        try:
            stats = evaluator.evaluate_accuracy(1)
        except Exception as e:
            logging.error(f"Error evaluating {compression} with {activation}: {e}")
            continue

        datapoints.append(
            {
                "architecture": "LeNet5",
                "dataset": MiniMNISTDataset.__name__,
                "compression": compression,
                "activation": activation,
                "top-1": stats["max"],
                "mean": stats["mean"],
                "accuracies": stats["accuracies"],
            }
        )

Evaluating LeNet5 with NNParamsCompMode.NONE and Activation.NONE...
Evaluating LeNet5 with NNParamsCompMode.NONE and Activation.RELU...
Evaluating LeNet5 with NNParamsCompMode.NONE and Activation.BINARIZE...
Evaluating LeNet5 with NNParamsCompMode.NONE and Activation.BINARIZE_RESTE...
Evaluating LeNet5 with NNParamsCompMode.NONE and Activation.TERNARIZE...
Evaluating LeNet5 with NNParamsCompMode.NBITS and Activation.NONE...


NotImplementedError: NBITS compression mode is not implemented for convolutional layers

In [3]:
datapoints

[{'architecture': 'LeNet5',
  'dataset': 'MiniMNISTDataset',
  'compression': <NNParamsCompMode.NONE: 'none'>,
  'activation': <Activation.NONE: 'none'>,
  'top-1': 93.375,
  'mean': np.float64(92.525),
  'accuracies': [93.375, 92.75, 92.375, 91.125, 93.0]},
 {'architecture': 'LeNet5',
  'dataset': 'MiniMNISTDataset',
  'compression': <NNParamsCompMode.NONE: 'none'>,
  'activation': <Activation.RELU: 'relu'>,
  'top-1': 96.625,
  'mean': np.float64(96.1),
  'accuracies': [96.625, 96.0, 95.875, 95.625, 96.375]},
 {'architecture': 'LeNet5',
  'dataset': 'MiniMNISTDataset',
  'compression': <NNParamsCompMode.NONE: 'none'>,
  'activation': <Activation.BINARIZE: 'binary'>,
  'top-1': 90.375,
  'mean': np.float64(89.325),
  'accuracies': [89.625, 90.0, 90.375, 88.375, 88.25]},
 {'architecture': 'LeNet5',
  'dataset': 'MiniMNISTDataset',
  'compression': <NNParamsCompMode.NONE: 'none'>,
  'activation': <Activation.BINARIZE_RESTE: 'binary_ReSTE'>,
  'top-1': 93.125,
  'mean': np.float64(91.875