In [None]:
import os
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.decomposition import KernelPCA

from qiskit.circuit.library import ZZFeatureMap, PauliFeatureMap
from qiskit_machine_learning.kernels import FidelityQuantumKernel
from qiskit_machine_learning.algorithms import QSVC


# ============================================================
# CONFIG
# ============================================================

# Directory holding the two descriptor CSV files that
# load_dataset_with_splits() reads.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
DATA_DIR = "/Users/denniswayo/gpaw-DFT/Q-UCSpec/data"

# Columns selected from the combined table as model inputs. The number of
# entries also becomes the feature dimension handed to the feature map.
FEATURE_COLS = [
    "Energy (eV)",
    "OscStrength",
    "Normalized f",
    "Peak Intensity",
    "Spectral Area",
    "Mean Energy",
    "Variance",
    "ε₁ (Real dielectric)",
    "ε₂ (Imag dielectric)",
    "n (Refractive index)",
    "κ (Extinction coeff)",
    "α (Absorption cm^-1)"
]

# Seed shared by both train/val/test splits and by every random subsample
# drawn for the quantum models.
SPLIT_SEED = 42

# Upper bound on the number of training rows used to fit the final QSVM.
MAX_QSVM_SAMPLES = 60
# NOTE(review): not referenced in the code visible here — presumably a cap for
# a kernel-matrix computation defined elsewhere; confirm or remove.
MAX_KERNEL_POINTS = 40

# NOTE(review): not referenced in the code visible here — presumably toggles
# plotting code defined elsewhere; confirm or remove.
DO_PLOTS = False


# ============================================================
# A) Load / Split / Scale
# ============================================================

def load_dataset_with_splits():
    """Load both descriptor tables and return standardized 80/10/10 splits.

    The two CSV files under ``DATA_DIR`` are read, their column names are
    printed, and each table receives a binary ``label`` (0 for the CaF2 file,
    1 for the Er file). The rows are concatenated, the ``FEATURE_COLS``
    columns are extracted, and the data is split with stratification into
    train (80%), validation (10%) and test (10%) parts using ``SPLIT_SEED``.
    A ``StandardScaler`` is fitted on the training part only and applied to
    all three parts.

    Returns:
        Tuple ``(X_train, X_val, X_test, y_train, y_val, y_test)``.
    """
    started = time.perf_counter()

    caf2_frame = pd.read_csv(os.path.join(DATA_DIR, "caf2_qml_full_descriptors.csv"))
    er_frame = pd.read_csv(os.path.join(DATA_DIR, "caf2_er_qml_full_descriptors.csv"))

    # Show the schema of each input so column mismatches are easy to spot.
    for heading, frame in (("CAF2", caf2_frame), ("ER", er_frame)):
        print(f"\n=== {heading} columns ===")
        print(list(frame.columns))

    # Class 0 = rows from the CaF2 file, class 1 = rows from the Er file.
    caf2_frame["label"] = 0
    er_frame["label"] = 1
    combined = pd.concat([caf2_frame, er_frame], ignore_index=True)

    features = combined[FEATURE_COLS].values
    labels = combined["label"].values

    # First carve off 20% as a hold-out, then halve it into val and test.
    X_train, X_holdout, y_train, y_holdout = train_test_split(
        features,
        labels,
        test_size=0.20,
        stratify=labels,
        random_state=SPLIT_SEED,
    )
    X_val, X_test, y_val, y_test = train_test_split(
        X_holdout,
        y_holdout,
        test_size=0.50,
        stratify=y_holdout,
        random_state=SPLIT_SEED,
    )

    # Fit the scaler on training data only so no statistics leak from val/test.
    scaler = StandardScaler().fit(X_train)
    X_train, X_val, X_test = (
        scaler.transform(part) for part in (X_train, X_val, X_test)
    )

    elapsed = time.perf_counter() - started
    print(
        f"[INFO] Train: {len(X_train)}, Val: {len(X_val)}, Test: {len(X_test)} (Total = {len(features)})"
    )
    print(f"[TIMER] Data prep + scaling: {elapsed:.2f} s")

    return X_train, X_val, X_test, y_train, y_val, y_test


# ============================================================
# B) Feature Map Builder
# ============================================================

def build_feature_map(map_type, feature_dim, reps=1, entanglement="linear"):
    """Construct the feature-map circuit selected by ``map_type``.

    Args:
        map_type: ``"zz"`` for a ``ZZFeatureMap`` or ``"pauli"`` for a
            ``PauliFeatureMap`` built with the ``["Z", "ZZ"]`` Pauli strings.
        feature_dim: Value passed as ``feature_dimension``.
        reps: Value passed as ``reps``.
        entanglement: Value passed as ``entanglement``.

    Returns:
        The constructed feature-map object.

    Raises:
        ValueError: If ``map_type`` is neither ``"zz"`` nor ``"pauli"``.
    """
    if map_type == "zz":
        return ZZFeatureMap(
            feature_dimension=feature_dim, reps=reps, entanglement=entanglement
        )

    if map_type == "pauli":
        return PauliFeatureMap(
            feature_dimension=feature_dim,
            reps=reps,
            paulis=["Z", "ZZ"],
            entanglement=entanglement,
        )

    # Anything else is a caller error.
    raise ValueError("Unknown feature map type: 'zz' or 'pauli'.")


# ============================================================
# C) Train QSVM
# ============================================================

def train_qsvm(X_train, y_train, X_val, y_val, X_test, y_test, feature_map):
    """Fit a QSVC with a fidelity kernel and report accuracy on every split.

    When the training set has more than ``MAX_QSVM_SAMPLES`` rows, a random
    subset of that size (seeded with ``SPLIT_SEED``, drawn without
    replacement) is used for fitting. Accuracy is still evaluated on the
    complete training, validation and test sets. Timings and accuracies are
    printed.

    Args:
        X_train, y_train: Training features and labels.
        X_val, y_val: Validation features and labels.
        X_test, y_test: Test features and labels.
        feature_map: Feature map handed to ``FidelityQuantumKernel``.

    Returns:
        Tuple ``(qsvc, quantum_kernel)``: the fitted classifier and the
        kernel object it was built with.
    """
    n_available = len(X_train)
    if n_available <= MAX_QSVM_SAMPLES:
        X_fit, y_fit = X_train, y_train
    else:
        # Limit the number of rows used for fitting.
        picker = np.random.RandomState(SPLIT_SEED)
        chosen = picker.choice(n_available, size=MAX_QSVM_SAMPLES, replace=False)
        X_fit, y_fit = X_train[chosen], y_train[chosen]
        print(f"[INFO] QSVM uses {len(X_fit)} subsampled points")

    quantum_kernel = FidelityQuantumKernel(feature_map=feature_map)
    qsvc = QSVC(quantum_kernel=quantum_kernel)

    fit_started = time.perf_counter()
    qsvc.fit(X_fit, y_fit)
    fit_elapsed = time.perf_counter() - fit_started
    print(f"[TIMER] QSVM training: {fit_elapsed:.2f} s")

    # Predict on the full splits (train, val, test — in that order).
    predict_started = time.perf_counter()
    pred_train, pred_val, pred_test = (
        qsvc.predict(part) for part in (X_train, X_val, X_test)
    )
    predict_elapsed = time.perf_counter() - predict_started
    print(f"[TIMER] QSVM inference: {predict_elapsed:.2f} s")

    acc_train, acc_val, acc_test = (
        accuracy_score(truth, guess)
        for truth, guess in (
            (y_train, pred_train),
            (y_val, pred_val),
            (y_test, pred_test),
        )
    )

    print("\n QSVM Accuracy:")
    print(f"   Train: {acc_train:.3f}")
    print(f"   Val:   {acc_val:.3f}")
    print(f"   Test:  {acc_test:.3f}")

    return qsvc, quantum_kernel


# ============================================================
# D) Classical Baseline
# ============================================================

def classical_baseline(X_train, y_train, X_val, y_val, X_test, y_test):
    """Fit an RBF-kernel SVC as a classical reference and print its accuracy.

    The classifier uses ``C=1.0`` and ``gamma="scale"``. Training time and
    the accuracy on the train, validation and test splits are printed.

    Args:
        X_train, y_train: Training features and labels.
        X_val, y_val: Validation features and labels.
        X_test, y_test: Test features and labels.

    Returns:
        The fitted ``SVC`` instance.
    """
    rbf_svm = SVC(kernel="rbf", C=1.0, gamma="scale")

    began = time.perf_counter()
    rbf_svm.fit(X_train, y_train)
    elapsed = time.perf_counter() - began
    print(f"[TIMER] Classical SVM training: {elapsed:.3f} s")

    # Score the splits in train, val, test order.
    acc_train, acc_val, acc_test = (
        rbf_svm.score(X_part, y_part)
        for X_part, y_part in (
            (X_train, y_train),
            (X_val, y_val),
            (X_test, y_test),
        )
    )

    print("\n Classical RBF SVM Accuracy:")
    print(f"   Train: {acc_train:.3f}")
    print(f"   Val:   {acc_val:.3f}")
    print(f"   Test:  {acc_test:.3f}")

    return rbf_svm


# ============================================================
# E) SCALE SWEEP (NEW)
# ============================================================

def scale_sweep(X_train, X_val, y_train, y_val, feature_map,
                scale_list=(10, 30, 100, 300, 500, 1000)):
    """Pick the input scale factor that maximises QSVC validation accuracy.

    For every factor in ``scale_list`` the features are multiplied by that
    factor, a QSVC with a ``FidelityQuantumKernel`` is fitted on a small
    random training subset, and the accuracy on the (equally scaled) full
    validation set is recorded. Progress and a summary are printed.

    Args:
        X_train, y_train: Training features and labels; a subset of at most
            40 rows (seeded with ``SPLIT_SEED``) is used for fitting.
        X_val, y_val: Validation features and labels used for scoring.
        feature_map: Feature map handed to ``FidelityQuantumKernel``.
        scale_list: Iterable of multiplicative scale factors to try.

    Returns:
        The scale factor with the highest validation accuracy. On ties the
        factor that appears first in ``scale_list`` wins.

    Raises:
        ValueError: If ``scale_list`` is empty.
    """
    # Materialise once: supports any iterable and lets us fail fast, before
    # any expensive kernel evaluation, when there is nothing to sweep.
    scales = list(scale_list)
    if not scales:
        raise ValueError("scale_list must contain at least one scale factor.")

    print("\n Running Quantum SCALE Sweep...\n")

    results = []

    # Clamp the subsample size: sampling without replacement raises when more
    # rows are requested than exist. With >= 40 rows the draw is unchanged.
    subsample_size = min(40, len(X_train))
    rng = np.random.RandomState(SPLIT_SEED)
    idx = rng.choice(len(X_train), size=subsample_size, replace=False)
    X_small = X_train[idx]
    y_small = y_train[idx]

    for SCALE in scales:
        print(f" → Testing SCALE = {SCALE}")

        X_small_scaled = X_small * SCALE
        X_val_scaled   = X_val   * SCALE

        kernel = FidelityQuantumKernel(feature_map=feature_map)
        qsvc = QSVC(quantum_kernel=kernel)

        t0 = time.perf_counter()
        qsvc.fit(X_small_scaled, y_small)
        t1 = time.perf_counter()

        y_val_pred = qsvc.predict(X_val_scaled)
        acc_val = accuracy_score(y_val, y_val_pred)

        print(f"    ✓ val_acc={acc_val:.3f} | train_time={t1 - t0:.2f}s")
        results.append((SCALE, acc_val))

    print("\n SCALE RESULTS:")
    for s, a in results:
        print(f"   SCALE={s} → val_acc={a:.3f}")

    # max() keeps the first entry among equal accuracies.
    best = max(results, key=lambda x: x[1])
    print(f"\n Best SCALE = {best[0]}  (Val Acc = {best[1]:.3f})\n")

    return best[0]


# ============================================================
# F) Fast QSVM Sweep
# ============================================================

def fast_qsvm_sweep(X_train, y_train, X_val, y_val):
    """Grid-search feature-map settings for a QSVC on a small training subset.

    Every combination of feature-map type (``"zz"``, ``"pauli"``),
    repetitions (1, 2, 3) and entanglement (``"linear"``, ``"full"``) is
    fitted on a random subset of at most 40 training rows (seeded with
    ``SPLIT_SEED``) and scored on the full validation set. The results are
    sorted by validation accuracy, written to
    ``qsvm_hyperparam_results_fast.csv`` in the current working directory,
    and the top five rows are printed.

    Args:
        X_train, y_train: Training features and labels.
        X_val, y_val: Validation features and labels.

    Returns:
        ``pandas.DataFrame`` with columns ``map``, ``reps``, ``ent``,
        ``val_acc`` and ``train_time``, sorted by ``val_acc`` descending.
    """
    # Clamp the subsample size: sampling without replacement raises when more
    # rows are requested than exist. With >= 40 rows the draw is unchanged.
    SUB = min(40, len(X_train))
    rng = np.random.RandomState(SPLIT_SEED)
    idx = rng.choice(len(X_train), size=SUB, replace=False)
    Xq = X_train[idx]
    yq = y_train[idx]

    MAPS = ["zz", "pauli"]
    REPS = [1, 2, 3]
    ENTS = ["linear", "full"]

    # Flattened grid, in the same order nested map -> reps -> ent loops give.
    grid = [(m, r, e) for m in MAPS for r in REPS for e in ENTS]
    n_features = X_train.shape[1]

    results = []

    print("\n Running Fast QSVM Sweep (MVP mode)...\n")

    for m, r, e in grid:
        print(f" → Testing map={m}, reps={r}, ent={e}")

        fmap = build_feature_map(m, n_features, reps=r, entanglement=e)
        kernel = FidelityQuantumKernel(feature_map=fmap)
        qsvc = QSVC(quantum_kernel=kernel)

        t0 = time.perf_counter()
        qsvc.fit(Xq, yq)
        t1 = time.perf_counter()

        y_val_pred = qsvc.predict(X_val)
        acc_val = accuracy_score(y_val, y_val_pred)

        print(f"    ✓ val_acc={acc_val:.3f} | time={t1 - t0:.2f}s")

        results.append({
            "map": m,
            "reps": r,
            "ent": e,
            "val_acc": acc_val,
            "train_time": t1 - t0,
        })

    df = pd.DataFrame(results).sort_values("val_acc", ascending=False)
    df.to_csv("qsvm_hyperparam_results_fast.csv", index=False)

    print("\n Top QSVM configs:")
    print(df.head(5))
    print("\n Saved: qsvm_hyperparam_results_fast.csv")

    return df


# ============================================================
# MASTER EXECUTION
# ============================================================

def run_full_pipeline(map_type="zz", reps=1):
    """Run the complete workflow: load data, tune scale, train, compare, sweep.

    Steps, in order:
      1. Load and standardize the dataset splits.
      2. Build the feature map for ``map_type`` / ``reps``.
      3. Sweep input scale factors and keep the best one.
      4. Train the final QSVM on the features multiplied by that factor.
      5. Train the classical RBF SVM baseline on the unscaled features.
      6. Run the fast feature-map hyper-parameter sweep on the scaled features.

    Args:
        map_type: Feature-map type forwarded to ``build_feature_map``.
        reps: Repetition count forwarded to ``build_feature_map``.

    Returns:
        None. All results are printed (and the sweep writes its CSV).
    """
    X_train, X_val, X_test, y_train, y_val, y_test = load_dataset_with_splits()

    n_features = X_train.shape[1]
    feature_map = build_feature_map(map_type, n_features, reps=reps)

    # Choose the multiplicative input scale on the validation set.
    best_scale = scale_sweep(X_train, X_val, y_train, y_val, feature_map)
    print(f"[INFO] Using BEST SCALE = {best_scale}")

    # Apply the chosen factor to every split used by the quantum models.
    X_train_qs, X_val_qs, X_test_qs = (
        split * best_scale for split in (X_train, X_val, X_test)
    )

    # Final quantum model on the rescaled features.
    train_qsvm(
        X_train_qs, y_train,
        X_val_qs, y_val,
        X_test_qs, y_test,
        feature_map,
    )

    # The classical reference works on the standardized, unscaled features.
    classical_baseline(X_train, y_train, X_val, y_val, X_test, y_test)

    # Feature-map hyper-parameter sweep on the rescaled features.
    fast_qsvm_sweep(X_train_qs, y_train, X_val_qs, y_val)


# ============================================================
# EXECUTE
# ============================================================

# Start the full workflow with the ZZ feature map and a single repetition.
# NOTE: this runs every time the cell/module is executed.
run_full_pipeline(map_type="zz", reps=1)


=== CAF2 columns ===
['Energy (eV)', 'OscStrength', 'Normalized f', 'Peak Intensity', 'Spectral Area', 'Mean Energy', 'Variance', 'ε₁ (Real dielectric)', 'ε₂ (Imag dielectric)', 'n (Refractive index)', 'κ (Extinction coeff)', 'α (Absorption cm^-1)', 'System']

=== ER columns ===
['Energy (eV)', 'OscStrength', 'Normalized f', 'Peak Intensity', 'Spectral Area', 'Mean Energy', 'Variance', 'ε₁ (Real dielectric)', 'ε₂ (Imag dielectric)', 'n (Refractive index)', 'κ (Extinction coeff)', 'α (Absorption cm^-1)', 'System']
[INFO] Train: 2304, Val: 288, Test: 289 (Total = 2881)
[TIMER] Data prep + scaling: 0.06 s

 Running Quantum SCALE Sweep...

 → Testing SCALE = 10
