# Quantum Learning Procedure

In [2]:
import sys
from pathlib import Path
qml_path = (Path.cwd() / "../../QML").resolve()
sys.path.insert(0, str(qml_path))

from Qsun.Qkernels import *
from Qsun.Qgates import *
from Qsun.Qmeas import *
from Qsun.Qcircuit import *
from Qsun.Qwave import *
from Qsun.Qencodes import *
from Qsun.Qdata import *

import numpy as np
import matplotlib.pyplot as plt
from src.load_datasets import *
from src.kernel_evaluation import *
from tqdm import tqdm

np.random.seed(1234)

### Loading 9 ansatzes from Qencodes.py

In [3]:
# Registry of the available encodings, keyed by the name used throughout the
# notebook. Each entry stores the encoder callable ("fn") and two flags saying
# whether that encoder is called with a `params` argument ("has_params")
# and/or an `n_layers` argument ("has_layers").
ENCODING_REGISTER = dict(
    YZ_CX=dict(fn=YZ_CX_encode, has_params=True, has_layers=True),
    HighDim=dict(fn=HighDim_encode, has_params=False, has_layers=False),
    HZY_CZ=dict(fn=HZY_CZ_encode, has_params=True, has_layers=True),
    Chebyshev=dict(fn=Chebyshev_encode, has_params=True, has_layers=True),
    ParamZFeatureMap=dict(fn=ParamZFeatureMap_encode, has_params=True, has_layers=True),
    SeparableRX=dict(fn=SeparableRXEncoding_encode, has_params=False, has_layers=False),
    HardwareEfficientRx=dict(fn=HardwareEfficientEmbeddingRx_encode, has_params=False, has_layers=True),
    ZFeatureMap=dict(fn=ZFeatureMap_encode, has_params=False, has_layers=True),
    ZZFeatureMap=dict(fn=ZZFeatureMap_encode, has_params=False, has_layers=True),
)

def encode_sample(sample: np.ndarray, encoding_name: str, n_layers: int = 2, 
                  params: np.ndarray = None):
    """Encode one sample with the encoder registered under ``encoding_name``.

    The call is driven by the flags stored in ``ENCODING_REGISTER``: ``params``
    is forwarded only when the entry has ``has_params`` set, and ``n_layers``
    only when it has ``has_layers`` set. Registering a new encoding therefore
    needs no change here.

    Args:
        sample: The sample passed as first positional argument to the encoder.
        encoding_name: Key into ``ENCODING_REGISTER``.
        n_layers: Forwarded as ``n_layers=`` to encoders flagged ``has_layers``.
        params: Forwarded as ``params=`` to encoders flagged ``has_params``.

    Returns:
        Whatever the registered encoder function returns.

    Raises:
        ValueError: If ``encoding_name`` is not a key of ``ENCODING_REGISTER``.
    """
    if encoding_name not in ENCODING_REGISTER:
        raise ValueError(f"Unknown encoding: {encoding_name}")

    config = ENCODING_REGISTER[encoding_name]

    # Build the keyword arguments from the registry flags instead of
    # hard-coding one branch per encoding name.
    kwargs = {}
    if config["has_params"]:
        kwargs["params"] = params
    if config["has_layers"]:
        kwargs["n_layers"] = n_layers

    return config["fn"](sample, **kwargs)

### Quantum Embedding Kernels

In [4]:
def kernel_matrix(X_train, X_test,
                encoding_name, n_layers=2,
                params=None, random_state=42):
    """Build the train/train and test/train kernel matrices for one encoding.

    Every row of ``X_train`` and then every row of ``X_test`` is encoded with
    ``encode_sample``; each kernel entry is ``state_product(a, b) ** 2`` for
    the corresponding pair of encoded states.

    Args:
        X_train: Training samples; ``X_train.shape[0]`` rows are encoded.
        X_test: Test samples; ``X_test.shape[0]`` rows are encoded.
        encoding_name: Encoding name forwarded to ``encode_sample``.
        n_layers: Forwarded to ``encode_sample``.
        params: Forwarded unchanged to every ``encode_sample`` call.
        random_state: Accepted for interface compatibility; not used here.

    Returns:
        ``(K_train, K_test)`` with shapes ``(n_train, n_train)`` and
        ``(n_test, n_train)``. ``K_train`` is filled symmetrically.

    NOTE(review): with ``params=None`` the parametrised encoders presumably
    fall back to their own defaults. Confirm those defaults are identical
    for every sample, otherwise the kernel entries are not comparable.
    """
    n_train = X_train.shape[0]
    n_test = X_test.shape[0]

    # Encode the training rows first, then the test rows (same order as the
    # kernel loops below rely on).
    train_states = [
        encode_sample(X_train[idx], encoding_name, n_layers, params)
        for idx in range(n_train)
    ]
    test_states = [
        encode_sample(X_test[idx], encoding_name, n_layers, params)
        for idx in range(n_test)
    ]

    # Only the upper triangle is evaluated; the value is mirrored below it.
    gram_train = np.zeros((n_train, n_train))
    for row, left_state in enumerate(train_states):
        for col in range(row, n_train):
            overlap = state_product(left_state, train_states[col])**2
            gram_train[row, col] = gram_train[col, row] = overlap

    gram_test = np.zeros((n_test, n_train))
    for row, test_state in enumerate(test_states):
        for col, train_state in enumerate(train_states):
            gram_test[row, col] = state_product(test_state, train_state)**2

    return gram_train, gram_test

def total_kernels(X_train, X_test,
                encoding_names=None, n_layers=2,
                random_state=42):
    """Compute the kernel matrices for several encodings.

    Args:
        X_train: Training samples, forwarded to ``kernel_matrix``.
        X_test: Test samples, forwarded to ``kernel_matrix``.
        encoding_names: Encoding names to evaluate; defaults to every key of
            ``ENCODING_REGISTER``.
        n_layers: Forwarded to ``kernel_matrix``.
        random_state: Forwarded to ``kernel_matrix`` as ``random_state=``.

    Returns:
        Dict mapping encoding name to ``(K_train, K_test)``. An encoding whose
        kernel computation raises is reported on stdout and left out of the
        dict, so callers must not assume every requested name is present.
    """
    if encoding_names is None:
        encoding_names = list(ENCODING_REGISTER.keys())
    results = {}
    for name in encoding_names:
        try:
            K_train, K_test = kernel_matrix(
                X_train, X_test, name, n_layers,
                random_state=random_state)
        except Exception as e:
            # Best-effort: keep going with the other encodings, but say which
            # encoding failed and with what kind of error.
            print(f"  Error computing kernel for encoding '{name}': "
                  f"{type(e).__name__}: {e}")
            continue
        results[name] = (K_train, K_test)

    return results

def get_available_encodings():
    """Return the names registered in ``ENCODING_REGISTER`` as a new list."""
    return [*ENCODING_REGISTER]

In [5]:
# Load the dataset collection; the result is indexed by dataset name below.
datasets = load_datasets(
    data_dir="datasets", 
    max_qubit=4, 
    include_variants=True 
)

# Each entry unpacks into a (X_train, X_test, y_train, y_test) split.
X_train, X_test, y_train, y_test = datasets["Iris"]

print(f"\nIris dataset: Train {X_train.shape}, Test {X_test.shape}")
print(f"Available encodings: {get_available_encodings()}")

print("Sample encoding: ZFeatureMap")

# Smoke test: build the kernel matrices for a single encoding on Iris.
K_train, K_test = kernel_matrix(
        X_train, X_test, "ZFeatureMap", n_layers=2)



Iris dataset: Train (80, 4), Test (20, 4)
Available encodings: ['YZ_CX', 'HighDim', 'HZY_CZ', 'Chebyshev', 'ParamZFeatureMap', 'SeparableRX', 'HardwareEfficientRx', 'ZFeatureMap', 'ZZFeatureMap']
Sample encoding: ZFeatureMap


In [6]:
# Reload the dataset collection with the same arguments as the previous cell,
# so this cell can be run on its own.
datasets = load_datasets(
    data_dir="datasets", 
    max_qubit=4, 
    include_variants=True  
)
# Print the train/test shapes of every dataset in the collection.
for name, (X_tr, X_te, y_tr, y_te) in datasets.items():
    print(f"\n{name} dataset: Train {X_tr.shape}, Test {X_te.shape}")



Blobs_F2C2 dataset: Train (160, 2), Test (40, 2)

Blobs_F2C3 dataset: Train (160, 2), Test (40, 2)

Blobs_F2C4 dataset: Train (160, 2), Test (40, 2)

Blobs_F4C2 dataset: Train (160, 4), Test (40, 4)

Blobs_F4C3 dataset: Train (160, 4), Test (40, 4)

Blobs_F4C4 dataset: Train (160, 4), Test (40, 4)

Blobs_F2C2_std3 dataset: Train (160, 2), Test (40, 2)

Blobs_F2C2_std10 dataset: Train (160, 2), Test (40, 2)

Blobs_F4C2_std3 dataset: Train (160, 4), Test (40, 4)

Blobs_F4C2_std10 dataset: Train (160, 4), Test (40, 4)

Circle_n05_f5 dataset: Train (160, 2), Test (40, 2)

Circle_n10_f5 dataset: Train (160, 2), Test (40, 2)

Circle_n15_f5 dataset: Train (160, 2), Test (40, 2)

Circle_n10_f3 dataset: Train (160, 2), Test (40, 2)

Circle_n10_f8 dataset: Train (160, 2), Test (40, 2)

Moons_n05 dataset: Train (160, 2), Test (40, 2)

Moons_n10 dataset: Train (160, 2), Test (40, 2)

Moons_n15 dataset: Train (160, 2), Test (40, 2)

Moons_n25 dataset: Train (160, 2), Test (40, 2)

XOR dataset: Tra

### Model Execution

In [7]:
def _mean_and_std(scores):
    """Return ``(mean, std)`` of ``scores``, or ``(nan, nan)`` when it is empty."""
    if len(scores) == 0:
        # np.mean/np.std on an empty list also yield nan, but emit
        # RuntimeWarnings; make the "no data" case explicit and silent.
        return float("nan"), float("nan")
    return np.mean(scores), np.std(scores)


def total_runs(dataset_name="Iris", 
               encodings=None,
               n_layers=2,
               n_runs=10,
               test_size=0.2,
               random_state=42,
               include_variants=True):
    """Evaluate every encoding on one dataset over several seeded runs.

    For run ``r`` the datasets are reloaded with seed ``random_state + r``,
    the kernels of all requested encodings are computed with ``total_kernels``
    and each kernel is scored with ``evaluate_kernel``. Train and test
    accuracies are then averaged per encoding.

    Args:
        dataset_name: Key of the dataset inside the dict from ``load_datasets``.
        encodings: Encoding names to evaluate; defaults to
            ``get_available_encodings()``.
        n_layers: Forwarded to ``total_kernels``.
        n_runs: Number of runs (seeds) to average over.
        test_size: Forwarded to ``load_datasets``.
        random_state: Base seed; run ``r`` uses ``random_state + r``.
        include_variants: Forwarded to ``load_datasets``.

    Returns:
        ``{"results": {encoding_name: [KernelEvaluation, ...]}}`` with one
        ``KernelEvaluation`` per model. An encoding that produced no scores
        (for example because its kernel computation failed in every run) gets
        NaN for its means and standard deviations.
    """
    if encodings is None:
        encodings = get_available_encodings()
    print(f"Dataset: {dataset_name}")

    # Single source of truth for the evaluated models.
    model_names = ("SVM",)

    results_accumulator = {
        enc: {m: {"train": [], "test": []} for m in model_names}
        for enc in encodings
    }

    for run in tqdm(range(n_runs)):
        seed = random_state + run
        datasets = load_datasets(
            data_dir="datasets",
            random_state=seed,
            test_size=test_size,
            max_qubit=4,
            include_variants=include_variants
        )
        X_train, X_test, y_train, y_test = datasets[dataset_name]
        kernels = total_kernels(X_train, X_test, encodings, n_layers, seed)

        # `kernels` may lack encodings whose computation failed.
        for enc_name, (K_train, K_test) in kernels.items():
            for model_name in model_names:
                result = evaluate_kernel(
                    K_train, K_test, y_train, y_test, enc_name, model_name
                )
                scores = results_accumulator[enc_name][model_name]
                scores["train"].append(result.train_accuracy)
                scores["test"].append(result.test_accuracy)

    all_results = {}
    for enc_name in encodings:
        enc_results = []
        for model_name in model_names:
            scores = results_accumulator[enc_name][model_name]
            train_mean, train_std = _mean_and_std(scores["train"])
            test_mean, test_std = _mean_and_std(scores["test"])
            enc_results.append(KernelEvaluation(
                model_name=model_name,
                encoding_name=enc_name,
                train_accuracy=train_mean,
                test_accuracy=test_mean,
                train_std=train_std,
                test_std=test_std
            ))
        all_results[enc_name] = enc_results

    return {"results": all_results}


def run_all_datasets(encodings=None, n_layers=2, n_runs=10, 
                     test_size=0.2, random_state=42, include_variants=True):
    """Call ``total_runs`` once per dataset returned by ``load_datasets``.

    The datasets are loaded here only to obtain their names; ``total_runs``
    loads the data itself. Returns a dict mapping each dataset name to the
    value returned by ``total_runs`` for it.
    """
    available = load_datasets(
        data_dir="datasets",
        max_qubit=4,
        include_variants=include_variants,
    )
    names = [*available]

    print(f"Total datasets to process: {len(names)}")
    print("=" * 60)

    # Settings shared by every per-dataset experiment.
    shared_settings = dict(
        encodings=encodings,
        n_layers=n_layers,
        n_runs=n_runs,
        test_size=test_size,
        random_state=random_state,
        include_variants=include_variants,
    )

    collected = {}
    for current_name in names:
        collected[current_name] = total_runs(dataset_name=current_name, **shared_settings)
        print()

    return collected

In [8]:
def summary(all_results):
    """Print a per-encoding accuracy table for a single dataset.

    Args:
        all_results: Dict mapping encoding name to a list of result objects
            exposing ``encoding_name``, ``model_name``, ``train_accuracy``,
            ``train_std``, ``test_accuracy`` and ``test_std``.

    The last line reports the entry with the highest test accuracy (the first
    one wins on ties). When there is nothing to compare, for example an empty
    input or only NaN accuracies, a placeholder line is printed instead.
    """
    print(f"{'Encoding':<22} {'Model':<6} {'Train':<18} {'Test':<18}")
    print("-" * 75)

    # Start below any real accuracy so a best entry with accuracy 0.0 is
    # still selected; NaN never compares greater and is therefore skipped.
    best_test_acc = float("-inf")
    best_config = None

    for encoding_name, results in all_results.items():
        for r in results:
            train_str = f"{r.train_accuracy:.4f} ± {r.train_std:.4f}"
            test_str = f"{r.test_accuracy:.4f} ± {r.test_std:.4f}"
            print(f"{r.encoding_name:<22} {r.model_name:<6} {train_str:<18} {test_str:<18}")
            if r.test_accuracy > best_test_acc:
                best_test_acc = r.test_accuracy
                best_config = r

    print("-" * 75)
    if best_config is None:
        print("Best: n/a (no results with a valid test accuracy)")
    else:
        print(f"Best: {best_config.encoding_name} + {best_config.model_name} = {best_test_acc:.4f} ± {best_config.test_std:.4f}")


def summary_all_datasets(all_dataset_results, model_name="SVM"):
    """Print and return the best kernel (by test accuracy) for each dataset.

    Args:
        all_dataset_results: Dict mapping dataset name to a dict with a
            ``"results"`` entry, itself mapping encoding name to a list of
            result objects (``model_name``, ``test_accuracy``, ``test_std``).
        model_name: Only results with this ``model_name`` are considered.

    Returns:
        ``pd.DataFrame`` with columns ``Dataset``, ``Best_Kernel``,
        ``Accuracy`` and ``Std``, one row per dataset. A dataset without any
        usable result for ``model_name`` gets ``None`` as kernel and NaN for
        accuracy and std, and is printed with ``-`` as kernel.
    """
    print(f"{'Dataset':<25} {'Best Kernel':<22} {'Test Accuracy':<18}")
    print("=" * 70)

    summary_data = []

    for dataset_name, result_dict in all_dataset_results.items():
        results = result_dict["results"]

        # Start below any real accuracy so a 0.0 accuracy can still win;
        # NaN never compares greater and is therefore never selected.
        best_acc = float("-inf")
        best_kernel = None
        best_std = float("nan")

        for enc_name, enc_results in results.items():
            for r in enc_results:
                if r.model_name == model_name and r.test_accuracy > best_acc:
                    best_acc = r.test_accuracy
                    best_kernel = enc_name
                    best_std = r.test_std

        if best_kernel is None:
            # Nothing matched: report "no value" rather than -inf.
            best_acc = float("nan")

        summary_data.append({
            "Dataset": dataset_name,
            "Best_Kernel": best_kernel,
            "Accuracy": best_acc,
            "Std": best_std
        })

        acc_str = f"{best_acc:.4f} ± {best_std:.4f}"
        # None does not accept a width format spec, so print a placeholder.
        kernel_label = best_kernel if best_kernel is not None else "-"
        print(f"{dataset_name:<25} {kernel_label:<22} {acc_str:<18}")

    print("=" * 70)
    print(f"Total: {len(all_dataset_results)} datasets")

    return pd.DataFrame(summary_data)

# Summary table of test accuracy for all datasets

In [9]:
def create_summary_table(all_dataset_results, 
                         model_name="SVM",
                         show_std=False) -> pd.DataFrame:
    """Build a dataset x encoding table of test accuracies.

    Args:
        all_dataset_results: Dict mapping dataset name to a dict with a
            ``"results"`` entry, itself mapping encoding name to a list of
            result objects (``model_name``, ``test_accuracy``, ``test_std``).
        model_name: The first result with this ``model_name`` fills the cell.
        show_std: If true, cells are strings ``"acc ± std"``; otherwise the
            raw test accuracy.

    Returns:
        ``pd.DataFrame`` indexed by ``Dataset`` with one column per key of
        ``ENCODING_REGISTER`` (in registry order). A cell with no matching
        result holds ``"-"`` when ``show_std`` is true and a missing value
        otherwise. An empty input yields an empty frame with those columns.
    """
    encodings = list(ENCODING_REGISTER.keys())
    missing = "-" if show_std else None

    table_data = []
    for dataset_name, result_dict in all_dataset_results.items():
        row = {"Dataset": dataset_name}
        results = result_dict["results"]

        for enc_name in encodings:
            candidates = results[enc_name] if enc_name in results else ()
            match = next((r for r in candidates if r.model_name == model_name), None)
            if match is None:
                # Covers both "encoding absent" and "no result for this model".
                row[enc_name] = missing
            elif show_std:
                row[enc_name] = f"{match.test_accuracy:.4f} ± {match.test_std:.4f}"
            else:
                row[enc_name] = match.test_accuracy

        table_data.append(row)

    # Fixing the columns keeps their order stable and lets set_index work
    # even when there are no rows.
    df = pd.DataFrame(table_data, columns=["Dataset", *encodings])
    df = df.set_index("Dataset")

    return df

In [10]:
# Load the dataset collection; only its keys (dataset names) are used below,
# because total_runs loads the data itself for every run.
datasets = load_datasets(
    data_dir="datasets", 
    max_qubit=4,
    include_variants=True
)

print(f"Total datasets: {len(datasets)}")
print("=" * 60)

# Maps dataset name -> value returned by total_runs for that dataset.
all_results = {}

# Same per-dataset loop as run_all_datasets, with different progress messages.
for dataset_name in datasets.keys():
    print(f"\nProcessing: {dataset_name}")
    print("-" * 60)
    
    result = total_runs(
        dataset_name=dataset_name,
        n_layers=2,
        n_runs=10,
        random_state=42,
        include_variants=True
    )
    all_results[dataset_name] = result

print("\n" + "=" * 80)
print("SUMMARY TABLE (Test Accuracy - SVM)")
print("=" * 80)
# One row per dataset, one column per registered encoding (mean test accuracy).
df_svm = create_summary_table(all_results, model_name="SVM", show_std=False)
print(df_svm.to_string())


Total datasets: 28

Processing: Blobs_F2C2
------------------------------------------------------------
Dataset: Blobs_F2C2


100%|██████████| 10/10 [00:03<00:00,  3.28it/s]



Processing: Blobs_F2C3
------------------------------------------------------------
Dataset: Blobs_F2C3


100%|██████████| 10/10 [00:03<00:00,  3.15it/s]



Processing: Blobs_F2C4
------------------------------------------------------------
Dataset: Blobs_F2C4


100%|██████████| 10/10 [00:03<00:00,  3.12it/s]



Processing: Blobs_F4C2
------------------------------------------------------------
Dataset: Blobs_F4C2


100%|██████████| 10/10 [00:06<00:00,  1.62it/s]



Processing: Blobs_F4C3
------------------------------------------------------------
Dataset: Blobs_F4C3


100%|██████████| 10/10 [00:06<00:00,  1.64it/s]



Processing: Blobs_F4C4
------------------------------------------------------------
Dataset: Blobs_F4C4


100%|██████████| 10/10 [00:05<00:00,  1.67it/s]



Processing: Blobs_F2C2_std3
------------------------------------------------------------
Dataset: Blobs_F2C2_std3


100%|██████████| 10/10 [00:02<00:00,  3.41it/s]



Processing: Blobs_F2C2_std10
------------------------------------------------------------
Dataset: Blobs_F2C2_std10


100%|██████████| 10/10 [00:03<00:00,  3.26it/s]



Processing: Blobs_F4C2_std3
------------------------------------------------------------
Dataset: Blobs_F4C2_std3


100%|██████████| 10/10 [00:06<00:00,  1.50it/s]



Processing: Blobs_F4C2_std10
------------------------------------------------------------
Dataset: Blobs_F4C2_std10


100%|██████████| 10/10 [00:06<00:00,  1.60it/s]



Processing: Circle_n05_f5
------------------------------------------------------------
Dataset: Circle_n05_f5


100%|██████████| 10/10 [00:03<00:00,  3.29it/s]



Processing: Circle_n10_f5
------------------------------------------------------------
Dataset: Circle_n10_f5


100%|██████████| 10/10 [00:03<00:00,  3.23it/s]



Processing: Circle_n15_f5
------------------------------------------------------------
Dataset: Circle_n15_f5


100%|██████████| 10/10 [00:02<00:00,  3.37it/s]



Processing: Circle_n10_f3
------------------------------------------------------------
Dataset: Circle_n10_f3


100%|██████████| 10/10 [00:03<00:00,  3.23it/s]



Processing: Circle_n10_f8
------------------------------------------------------------
Dataset: Circle_n10_f8


100%|██████████| 10/10 [00:02<00:00,  3.35it/s]



Processing: Moons_n05
------------------------------------------------------------
Dataset: Moons_n05


100%|██████████| 10/10 [00:03<00:00,  3.20it/s]



Processing: Moons_n10
------------------------------------------------------------
Dataset: Moons_n10


100%|██████████| 10/10 [00:03<00:00,  3.33it/s]



Processing: Moons_n15
------------------------------------------------------------
Dataset: Moons_n15


100%|██████████| 10/10 [00:03<00:00,  3.22it/s]



Processing: Moons_n25
------------------------------------------------------------
Dataset: Moons_n25


100%|██████████| 10/10 [00:02<00:00,  3.34it/s]



Processing: XOR
------------------------------------------------------------
Dataset: XOR


100%|██████████| 10/10 [00:03<00:00,  3.25it/s]



Processing: Spiral
------------------------------------------------------------
Dataset: Spiral


100%|██████████| 10/10 [00:03<00:00,  3.30it/s]



Processing: Checkerboard_2x2
------------------------------------------------------------
Dataset: Checkerboard_2x2


100%|██████████| 10/10 [00:03<00:00,  3.17it/s]



Processing: Iris
------------------------------------------------------------
Dataset: Iris


100%|██████████| 10/10 [00:02<00:00,  3.70it/s]



Processing: Wine
------------------------------------------------------------
Dataset: Wine


100%|██████████| 10/10 [00:03<00:00,  2.88it/s]



Processing: BreastCancer
------------------------------------------------------------
Dataset: BreastCancer


100%|██████████| 10/10 [00:27<00:00,  2.73s/it]



Processing: Pima
------------------------------------------------------------
Dataset: Pima


100%|██████████| 10/10 [00:44<00:00,  4.41s/it]



Processing: Banknote
------------------------------------------------------------
Dataset: Banknote


100%|██████████| 10/10 [01:59<00:00, 11.91s/it]



Processing: Haberman
------------------------------------------------------------
Dataset: Haberman


100%|██████████| 10/10 [00:07<00:00,  1.31it/s]


SUMMARY TABLE (Test Accuracy - SVM)
                     YZ_CX   HighDim    HZY_CZ  Chebyshev  ParamZFeatureMap  SeparableRX  HardwareEfficientRx  ZFeatureMap  ZZFeatureMap
Dataset                                                                                                                                 
Blobs_F2C2        0.542500  0.995000  1.000000   0.652500          0.837500     0.997500             0.997500     0.985000      0.960000
Blobs_F2C3        0.322500  1.000000  1.000000   0.487500          0.702500     1.000000             0.995000     1.000000      0.995000
Blobs_F2C4        0.265000  0.995000  1.000000   0.370000          0.587500     0.997500             0.997500     0.997500      0.987500
Blobs_F4C2        0.452500  1.000000  1.000000   0.657500          0.912500     1.000000             1.000000     0.997500      0.987500
Blobs_F4C3        0.340000  1.000000  1.000000   0.410000          0.750000     1.000000             1.000000     1.000000      0.995000
Blob




In [13]:
def label_dataframe(all_dataset_results, 
                    model_name="SVM",
                    threshold=0.01):
    """Label each dataset with its best kernel(s) by test accuracy.

    Args:
        all_dataset_results: Dict mapping dataset name to a dict with a
            ``"results"`` entry, itself mapping encoding name to a list of
            result objects (``model_name``, ``test_accuracy``).
        model_name: The first result with this ``model_name`` is used for
            each encoding.
        threshold: If ``None``, only the single best kernel is returned.
            Otherwise every kernel whose accuracy is within ``threshold`` of
            the best one is also listed as "tied".

    Returns:
        If ``threshold`` is ``None``: ``df`` with one row per dataset
        (index ``Dataset``, columns ``Best_Kernel`` and ``Accuracy``).
        Otherwise: ``(df, df_tied)`` where ``df_tied`` has one row per tied
        kernel. Datasets without any usable accuracy are skipped; with no
        usable datasets at all the frames are empty but keep their columns.
    """
    encodings = list(ENCODING_REGISTER.keys())
    columns = ["Dataset", "Best_Kernel", "Accuracy"]

    table_data = []
    tied_data = []

    for dataset_name, result_dict in all_dataset_results.items():
        results = result_dict["results"]

        kernel_accs = {}
        for enc_name in encodings:
            if enc_name in results:
                for r in results[enc_name]:
                    if r.model_name == model_name:
                        # A NaN accuracy (e.g. an encoding that never produced
                        # a score) would make max() order-dependent; skip it.
                        if not pd.isna(r.test_accuracy):
                            kernel_accs[enc_name] = r.test_accuracy
                        break

        if not kernel_accs:
            continue

        # Single best kernel (first one in registry order wins on ties).
        best_kernel = max(kernel_accs, key=kernel_accs.get)
        best_acc = kernel_accs[best_kernel]

        table_data.append({
            "Dataset": dataset_name,
            "Best_Kernel": best_kernel,
            "Accuracy": best_acc
        })

        # Tied kernels: everything within `threshold` of the best accuracy.
        if threshold is not None:
            for enc_name, acc in kernel_accs.items():
                if acc >= best_acc - threshold:
                    tied_data.append({
                        "Dataset": dataset_name,
                        "Best_Kernel": enc_name,
                        "Accuracy": acc
                    })

    # Explicit columns keep set_index working when there are no rows.
    df = pd.DataFrame(table_data, columns=columns).set_index("Dataset")

    if threshold is None:
        return df

    # Always return a pair when a threshold is given, so callers can unpack
    # the result regardless of the data.
    df_tied = pd.DataFrame(tied_data, columns=columns).set_index("Dataset")
    return df, df_tied


# Use Task 1-A (single best kernel per dataset)
df_best = label_dataframe(all_results, model_name="SVM", threshold=None)
print("Task 1-A: Single Best Kernel")
print(df_best)

# Use Task 1-B (all kernels tied within the 1% threshold)
df_best, df_tied = label_dataframe(all_results, model_name="SVM", threshold=0.01)
print(f"\nTask 1-A samples: {len(df_best)}")
print(f"Task 1-B samples: {len(df_tied)}")
print("\nTask 1-B: All Tied Best Kernels")
print(df_tied)

Task 1-A: Single Best Kernel
                          Best_Kernel  Accuracy
Dataset                                        
Blobs_F2C2                     HZY_CZ  1.000000
Blobs_F2C3                    HighDim  1.000000
Blobs_F2C4                     HZY_CZ  1.000000
Blobs_F4C2                    HighDim  1.000000
Blobs_F4C3                    HighDim  1.000000
Blobs_F4C4                    HighDim  1.000000
Blobs_F2C2_std3                HZY_CZ  1.000000
Blobs_F2C2_std10               HZY_CZ  0.997500
Blobs_F4C2_std3                HZY_CZ  1.000000
Blobs_F4C2_std10              HighDim  1.000000
Circle_n05_f5             SeparableRX  1.000000
Circle_n10_f5             SeparableRX  0.992500
Circle_n15_f5     HardwareEfficientRx  0.930000
Circle_n10_f3                  HZY_CZ  1.000000
Circle_n10_f8             ZFeatureMap  0.800000
Moons_n05                     HighDim  0.990000
Moons_n10                     HighDim  0.985000
Moons_n15                     HighDim  0.972500
Moons_n25  