Centroid-based classifier using a QuantumFloat Hamming-distance engine.

This script:
    - loads the Iris dataset (petal length & width),
    - computes class centroids in the scaled 2D feature space,
    - sweeps the quantization precision msize (number of bits per scalar feature),
    - classifies each sample by nearest centroid using a Hamming-distance
      QuantumFloat kernel,
    - aggregates timing and qubit metrics,
    - estimates circuit resources per distance call for each precision,
    - saves confusion matrices, metrics, and per-precision predictions as CSVs.

In [None]:
import os
from typing import Dict, List, Tuple

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, confusion_matrix

from qfloat_hamming import (
    load_iris_two_features_scaled,
    hamming_distance_trainpoint,
    estimate_resources_per_distance_call,
)

# Directory (relative to the working directory) used for the CSV outputs.
BASE_OUT_DIR = "Outputs"
# Create it up front; exist_ok=True means an already-existing directory is fine.
os.makedirs(BASE_OUT_DIR, exist_ok=True)

In [None]:
def compute_class_centroids(
    X: np.ndarray,
    y: np.ndarray,
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Return the per-class mean feature vector for every distinct label in y.

    Args:
        X:
            Feature matrix with one row per sample.

        y:
            Label vector aligned with the rows of X.

    Returns:
        centroids:
            Array with one row per distinct label, holding the column-wise
            mean of the rows of X carrying that label.

        class_labels:
            The distinct labels (as returned by np.unique, i.e. sorted), in
            the same order as the rows of centroids.
    """
    class_labels = np.unique(y)

    # Boolean-mask the rows of each class and average them column-wise.
    per_class_means = [X[y == label].mean(axis=0) for label in class_labels]

    return np.asarray(per_class_means), class_labels


def _ratio_or_zero(total: float, count: float) -> float:
    """Return total / count, or 0.0 when count is zero (nothing was measured)."""
    return total / count if count else 0.0


def centroid_nn_hamming(
    X_test: np.ndarray,
    centroids: np.ndarray,
    centroid_labels: np.ndarray,
    msize: int,
    shots: int = 512,
    debug: bool = True,
    compute_resources: bool = True,
) -> Tuple[np.ndarray, Dict[str, float]]:
    """
    Nearest-centroid classifier using a QuantumFloat-based Hamming-distance engine.

    For each test point, the function:
        - quantizes the test point and class centroids by multiplying with
          2^msize - 1 and rounding to the nearest integer,
        - computes the Hamming distance to each centroid using
          hamming_distance_trainpoint,
        - assigns the label of the nearest centroid (on ties, the centroid
          that comes first in ``centroids`` wins).

    Args:
        X_test:
            Array of shape (n_test, 2) with scaled test features.

        centroids:
            Array of shape (n_centroids, 2) with class centroids in scaled space.

        centroid_labels:
            Array of shape (n_centroids,) giving the label for each centroid.

        msize:
            Number of bits for integer quantization per scalar feature.

        shots:
            Number of measurement shots, forwarded to
            hamming_distance_trainpoint.

        debug:
            If True, prints per-sample progress.

        compute_resources:
            If True, calls estimate_resources_per_distance_call(msize) once
            and merges its result into the returned metrics.

    Returns:
        y_pred:
            Array of shape (n_test,) with predicted class labels.

        metrics:
            Dictionary with aggregate timing, qubit, and resource metrics
            for this precision. When X_test is empty, no distance call is
            made and every average is reported as 0.0.

    Raises:
        ValueError:
            If there are no centroids, or if centroid_labels does not have
            exactly one entry per centroid.
    """
    n_test = X_test.shape[0]
    n_centroids = centroids.shape[0]

    # Fail early with a clear message instead of an IndexError deep in the loop.
    if n_centroids == 0:
        raise ValueError("centroid_nn_hamming requires at least one centroid.")
    # zip() below would silently drop the surplus entries on a mismatch.
    if len(centroid_labels) != n_centroids:
        raise ValueError(
            "centroid_labels must contain exactly one label per centroid "
            f"(got {len(centroid_labels)} labels for {n_centroids} centroids)."
        )

    # Map scaled floats onto the integer grid used by the distance kernel.
    scale = (2 ** msize) - 1
    X_test_int = np.round(X_test * scale).astype(int)
    centroids_int = np.round(centroids * scale).astype(int)

    y_pred = np.empty(n_test, dtype=int)

    total_encoding_time = 0.0
    total_encoding_qubits = 0.0
    total_classification_time = 0.0
    max_total_qubits = 0.0
    num_distance_calls = 0

    for i, x_int in enumerate(X_test_int):
        if debug:
            print(
                f"[qrisp][Centroid-Hamming][msize={msize}] "
                f"test sample {i+1}/{n_test}: x={X_test[i]}"
            )

        distances: List[Tuple[int, int]] = []

        for centroid_vec_int, label_k in zip(centroids_int, centroid_labels):
            d_ham, meta = hamming_distance_trainpoint(
                test_vec_int=x_int,
                train_vec_int=centroid_vec_int,
                msize=msize,
                shots=shots,
            )

            distances.append((d_ham, int(label_k)))

            total_encoding_time += meta["encoding_time"]
            total_encoding_qubits += meta["encoding_qubits"]
            total_classification_time += meta["total_time"]
            max_total_qubits = max(max_total_qubits, meta["total_qubits"])
            num_distance_calls += 1

        # min() returns the first minimal entry, so ties go to the earliest
        # centroid -- the same outcome as a stable sort followed by [0].
        _, best_label = min(distances, key=lambda pair: pair[0])
        y_pred[i] = best_label

    # Assumes each distance call encodes 2 features for both the test point
    # and the centroid (4 scalar encodes), consistent with the (n, 2) shapes
    # documented above -- TODO confirm against hamming_distance_trainpoint.
    scalar_encodes_per_call = 4.0
    num_scalar_encodes = num_distance_calls * scalar_encodes_per_call

    # With an empty test set all counters are zero; report 0.0 averages
    # instead of raising ZeroDivisionError.
    avg_encoding_time_per_feature = _ratio_or_zero(
        total_encoding_time, num_scalar_encodes
    )
    avg_encoding_qubits_per_call = _ratio_or_zero(
        total_encoding_qubits, num_distance_calls
    )
    avg_encoding_qubits_per_feature = (
        avg_encoding_qubits_per_call / scalar_encodes_per_call
    )
    avg_classification_time_per_sample = _ratio_or_zero(
        total_classification_time, n_test
    )
    avg_encoding_time_per_sample = _ratio_or_zero(total_encoding_time, n_test)

    metrics: Dict[str, float] = {
        "msize": float(msize),
        "avg_encoding_time_per_feature": float(avg_encoding_time_per_feature),
        "avg_encoding_qubits_per_call": float(avg_encoding_qubits_per_call),
        "avg_encoding_qubits_per_feature": float(avg_encoding_qubits_per_feature),
        "avg_classification_time_per_sample": float(avg_classification_time_per_sample),
        "avg_encoding_time_per_sample": float(avg_encoding_time_per_sample),
        "max_total_qubits_per_distance_call": float(max_total_qubits),
        "num_test_samples": float(n_test),
        "num_centroids": float(n_centroids),
        "num_distance_calls": float(num_distance_calls),
    }

    if compute_resources:
        res = estimate_resources_per_distance_call(msize)
        metrics.update(res)

    return y_pred, metrics


In [None]:
if __name__ == "__main__":
    # Load the data and derive one classical centroid per class.
    X_scaled, y_full, feature_names, target_names = load_iris_two_features_scaled()
    centroids, centroid_labels = compute_class_centroids(X_scaled, y_full)

    print("Centroids (scaled [0,1], classical):")
    for label, mu in zip(centroid_labels, centroids):
        print(
            f"  class {label} ({target_names[label]}): "
            f"{feature_names[0]}={mu[0]:.4f}, {feature_names[1]}={mu[1]:.4f}"
        )

    # Precisions (bits per scalar feature) to sweep: 3 through 8 inclusive.
    msizes = list(range(3, 9))

    metrics_rows: List[Dict[str, float]] = []

    # One row per sample; a prediction column is appended for each precision.
    base_columns = {
        "sample_index": np.arange(len(y_full), dtype=int),
        "true_label": y_full.astype(int),
    }
    pred_df = pd.DataFrame(base_columns)

    for msize in msizes:
        print(
            f"\nRunning centroid classifier (Hamming) with QuantumFloat msize={msize}"
        )

        # The full dataset doubles as the evaluation set.
        y_pred, metrics = centroid_nn_hamming(
            X_test=X_scaled,
            centroids=centroids,
            centroid_labels=centroid_labels,
            msize=msize,
            shots=512,
            debug=True,
            compute_resources=True,
        )

        acc = accuracy_score(y_full, y_pred)
        cm = confusion_matrix(y_full, y_pred, labels=[0, 1, 2])

        print(f"  Accuracy (msize={msize}): {acc:.3f}")
        print("  Confusion matrix (rows=true, cols=pred):")
        print(cm)

        # Leading keys fix the column order; the classifier metrics follow.
        metrics_row: Dict[str, float] = {
            "msize": float(msize),
            "accuracy": float(acc),
            **metrics,
        }
        metrics_rows.append(metrics_row)

        # Persist the confusion matrix for this precision with named axes.
        row_names = [f"true_{cls}" for cls in target_names]
        col_names = [f"pred_{cls}" for cls in target_names]
        cm_path = os.path.join(BASE_OUT_DIR, f"confusion_centroid_precision_{msize}.csv")
        cm_df = pd.DataFrame(cm, index=row_names, columns=col_names)
        cm_df.to_csv(cm_path, float_format="%.6e")

        pred_df[f"pred_m{msize}"] = y_pred.astype(int)

    # Aggregate metrics across all precisions into a single table.
    metrics_path = os.path.join(BASE_OUT_DIR, "centroid_metrics_and_resources_by_precision.csv")
    metrics_df = pd.DataFrame(metrics_rows)
    metrics_df.to_csv(metrics_path, index=False, float_format="%.6e")

    # Per-sample predictions, one column per precision.
    preds_path = os.path.join(BASE_OUT_DIR, "centroid_predictions_by_precision.csv")
    pred_df.to_csv(preds_path, index=False)
