k-Nearest Neighbors classifier using a QuantumFloat Hamming-distance engine.

This script:
    - loads the Iris dataset (petal length & width),
    - performs a train/test split,
    - sweeps the integer-quantization precision `msize` (bits per feature),
    - runs brute-force kNN with a QuantumFloat-based Hamming-distance kernel,
    - aggregates timing and qubit metrics,
    - estimates circuit resources per distance call for each precision,
    - saves one confusion matrix per precision and the aggregated metrics as CSV files.

In [None]:
import os
from typing import Dict, List, Tuple

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split

from qfloat_hamming import (
    load_iris_two_features_scaled,
    hamming_distance_trainpoint,
    estimate_resources_per_distance_call,
)

# Directory that receives the CSV files written by this script. It is created
# up front (exist_ok=True keeps re-runs from failing) so that the later
# to_csv calls never hit a missing folder.
BASE_OUT_DIR = "Outputs"
os.makedirs(BASE_OUT_DIR, exist_ok=True)


In [None]:
def qrisp_knn_hamming(
    X_test: np.ndarray,
    X_train: np.ndarray,
    y_train: np.ndarray,
    msize: int,
    k: int = 3,
    shots: int = 512,
    debug: bool = True,
    compute_resources: bool = True,
) -> Tuple[np.ndarray, Dict[str, float]]:
    """
    Brute-force kNN classifier using a QuantumFloat-based Hamming-distance engine.

    For each test point, the function:
        - quantizes the test and train features by multiplying with
          (2^msize - 1) and rounding to the nearest integer (inputs are
          expected to be scaled to [0, 1] so the result lies in
          [0, 2^msize - 1]; values are not clipped),
        - computes the Hamming distance to each train sample using
          hamming_distance_trainpoint,
        - selects the majority label among the k nearest neighbors.

    Ties in distance are broken by training-set order (stable sort); ties in
    the vote are broken in favour of the smallest label (np.argmax).

    Args:
        X_test:
            Array of shape (n_test, 2) with scaled test features.

        X_train:
            Array of shape (n_train, 2) with scaled train features.

        y_train:
            Array of shape (n_train,) with class labels in {0, 1, 2}.

        msize:
            Number of bits for integer quantization per scalar feature.

        k:
            Number of neighbors to consider for the majority vote. Must be
            at least 1. If k exceeds n_train, all train samples vote.

        shots:
            Number of measurement shots for the distance register.

        debug:
            If True, prints per-sample progress logs.

        compute_resources:
            If True, calls estimate_resources_per_distance_call(msize) once
            and merges its result into the returned metrics.

    Returns:
        y_pred:
            Array of shape (n_test,) with predicted class labels.

        metrics:
            Dictionary with aggregate timing, qubit, and resource metrics
            for this precision. When X_test is empty, no distance call is
            made and every averaged metric is reported as 0.0.

    Raises:
        ValueError:
            If k < 1 or if X_train contains no samples.
    """
    # A non-positive k would silently vote on an empty (k == 0) or truncated
    # (k < 0, via distances[:k]) neighbour list, so reject it explicitly.
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k}")

    n_test = X_test.shape[0]
    n_train = X_train.shape[0]
    if n_train == 0:
        raise ValueError("X_train must contain at least one sample")

    # Map scaled features onto the integer grid used by the distance kernel.
    scale = (2 ** msize) - 1
    X_test_int = np.round(X_test * scale).astype(int)
    X_train_int = np.round(X_train * scale).astype(int)

    # Labels do not depend on the test sample, so convert them only once.
    train_labels = [int(y_train[j]) for j in range(n_train)]

    y_pred = np.zeros(n_test, dtype=int)

    total_encoding_time = 0.0
    total_encoding_qubits = 0.0
    total_classification_time = 0.0
    max_total_qubits = 0.0
    num_distance_calls = 0

    for i, x_int in enumerate(X_test_int):
        distances: List[Tuple[int, int]] = []

        if debug:
            print(
                f"[qrisp][kNN-Hamming][msize={msize}] "
                f"test sample {i+1}/{n_test}: x={X_test[i]}"
            )

        for train_vec_int, label_j in zip(X_train_int, train_labels):
            d_ham, meta = hamming_distance_trainpoint(
                test_vec_int=x_int,
                train_vec_int=train_vec_int,
                msize=msize,
                shots=shots,
            )

            distances.append((d_ham, label_j))

            total_encoding_time += meta["encoding_time"]
            total_encoding_qubits += meta["encoding_qubits"]
            total_classification_time += meta["total_time"]
            max_total_qubits = max(max_total_qubits, meta["total_qubits"])
            num_distance_calls += 1

        # list.sort is stable, so equal distances keep training-set order.
        distances.sort(key=lambda pair: pair[0])
        labels_k = [lbl for (_, lbl) in distances[:k]]
        counts = np.bincount(labels_k, minlength=3)
        y_pred[i] = int(np.argmax(counts))

    def _mean(total: float, count: float) -> float:
        # Report 0.0 instead of dividing by zero when nothing was measured
        # (this happens when the test set is empty).
        return total / count if count else 0.0

    # Presumably 2 test + 2 train scalar features are encoded per distance
    # call (inputs are documented as two-feature arrays) — TODO confirm
    # against hamming_distance_trainpoint.
    scalars_per_call = 4.0
    num_scalar_encodes = num_distance_calls * scalars_per_call

    avg_encoding_time_per_feature = _mean(total_encoding_time, num_scalar_encodes)
    avg_encoding_qubits_per_call = _mean(total_encoding_qubits, num_distance_calls)
    avg_encoding_qubits_per_feature = avg_encoding_qubits_per_call / scalars_per_call
    avg_classification_time_per_sample = _mean(total_classification_time, n_test)
    avg_encoding_time_per_sample = _mean(total_encoding_time, n_test)

    metrics: Dict[str, float] = {
        "msize": float(msize),
        "k": float(k),
        "avg_encoding_time_per_feature": float(avg_encoding_time_per_feature),
        "avg_encoding_qubits_per_call": float(avg_encoding_qubits_per_call),
        "avg_encoding_qubits_per_feature": float(avg_encoding_qubits_per_feature),
        "avg_classification_time_per_sample": float(avg_classification_time_per_sample),
        "avg_encoding_time_per_sample": float(avg_encoding_time_per_sample),
        "max_total_qubits_per_distance_call": float(max_total_qubits),
        "num_test_samples": float(n_test),
        "num_train_samples": float(n_train),
        "num_distance_calls": float(num_distance_calls),
    }

    if compute_resources:
        res = estimate_resources_per_distance_call(msize)
        metrics.update(res)

    return y_pred, metrics

In [None]:
if __name__ == "__main__":
    # Driver: split the data once, then evaluate the kNN classifier at each
    # quantization precision, writing one confusion-matrix CSV per precision
    # and a single CSV with all metrics at the end.
    X_scaled, y_full, feature_names, target_names = load_iris_two_features_scaled()

    # Stratified 75/25 split with a fixed seed so repeated runs match.
    X_train, X_test, y_train, y_test = train_test_split(
        X_scaled, y_full, test_size=0.25, stratify=y_full, random_state=42
    )

    print(f"Train size: {X_train.shape[0]}, Test size: {X_test.shape[0]}")
    print(f"Target names: {target_names}\n")

    msizes = [3, 4, 5, 6, 7, 8]
    k = 3

    # Row/column captions of the confusion-matrix tables; they depend only on
    # the class names, so build them once instead of once per precision.
    row_labels = [f"true_{cls}" for cls in target_names]
    col_labels = [f"pred_{cls}" for cls in target_names]

    metrics_rows: List[Dict[str, float]] = []

    for msize in msizes:
        print(f"Running Qrisp kNN (Hamming) with QuantumFloat msize={msize}, k={k}")

        y_pred, metrics = qrisp_knn_hamming(
            X_test=X_test,
            X_train=X_train,
            y_train=y_train,
            msize=msize,
            k=k,
            shots=512,
            debug=True,
            compute_resources=True,
        )

        acc = accuracy_score(y_test, y_pred)
        cm = confusion_matrix(y_test, y_pred, labels=[0, 1, 2])

        print(f"  Accuracy (msize={msize}): {acc:.3f}")
        print("  Confusion matrix (rows=true, cols=pred):")
        print(cm)

        # Persist the confusion matrix for this precision.
        cm_path = os.path.join(BASE_OUT_DIR, f"confusion_knn_precision_{msize}.csv")
        cm_df = pd.DataFrame(cm, index=row_labels, columns=col_labels)
        cm_df.to_csv(cm_path, float_format="%.6e")

        # Accuracy first, then everything the classifier reported; keys that
        # appear in both take the classifier's value.
        metrics_row: Dict[str, float] = {
            "msize": float(msize),
            "k": float(k),
            "accuracy": float(acc),
            **metrics,
        }
        metrics_rows.append(metrics_row)

    # One row per precision, written as a single summary table.
    metrics_csv_path = os.path.join(
        BASE_OUT_DIR, "knn_metrics_and_resources_by_precision.csv"
    )
    metrics_df = pd.DataFrame(metrics_rows)
    metrics_df.to_csv(metrics_csv_path, index=False, float_format="%.6e")
