In [23]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, cohen_kappa_score
import pennylane as qml
from tqdm import tqdm
import time
import joblib  # Added joblib import

# Define classifiers
CLASSIFIERS = {
    'RandomForest': RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)  # Added n_jobs=-1
    #'ExtraTrees': ExtraTreesClassifier(n_estimators=100, random_state=42, n_jobs=-1),  # Added n_jobs=-1
    #'KNN': KNeighborsClassifier(n_neighbors=5, n_jobs=-1)  # Added n_jobs=-1
}

def load_and_preprocess_data(filepath):
    data = pd.read_csv(filepath)
    df=data

    from sklearn.utils import resample
    
    # Check the unique counts in the label column
    print(df['class'].value_counts())
    df = df.drop_duplicates()
    df = df.dropna()
    # Separate classes
    class_counts = df['class'].value_counts()
    minority_class = class_counts.idxmin()  # Class with fewer samples
    majority_class = class_counts.idxmax()  # Class with more samples
    
    df_minority = df[df['class'] == minority_class]
    df_majority = df[df['class'] == majority_class]
    
    # Undersample both classes to 70 records each
    df_minority_undersampled = resample(df_minority, 
                                        replace=False, 
                                        n_samples=500, 
                                        random_state=42)
    
    df_majority_undersampled = resample(df_majority, 
                                        replace=False, 
                                        n_samples=500, 
                                        random_state=42)

    # Combine the undersampled data
    df_balanced = pd.concat([df_minority_undersampled, df_majority_undersampled])
    
    # Shuffle the dataset to mix the classes
    df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)
    
    # Check new class distribution
    print(df_balanced['class'].value_counts())

    df=df_balanced
    
    from sklearn.model_selection import train_test_split
    X = df.iloc[:, :-1]
    y = df.iloc[:, -1]

    le = LabelEncoder()
    y = le.fit_transform(y)
    
    return X, y

def normalize_for_amplitude_embedding(X):
    """Normalize data for amplitude embedding (L2 norm = 1)"""
    X_normalized = []
    for x in X:
        # Calculate L2 norm
        norm = np.linalg.norm(x)
        if norm > 0:
            # Normalize the vector
            x_normalized = x / norm
        else:
            x_normalized = x
        X_normalized.append(x_normalized)
    return np.array(X_normalized)

def create_quantum_embedding_circuit(n_qubits):
    # Calculate how many qubits we need for amplitude embedding
    # For n features, we need log2(n) qubits
    n_required_qubits = int(np.ceil(np.log2(n_qubits)))
    dev = qml.device("default.qubit", wires=n_required_qubits)
    
    @qml.qnode(dev)
    def quantum_circuit(inputs):
        # Amplitude embedding for normalized features
        # This will encode the data into the amplitudes of the quantum state
        qml.AmplitudeEmbedding(inputs, wires=range(n_required_qubits), normalize=True, pad_with=0.0)
        
        # Add entangling layers for quantum advantage
        for i in range(2):  # Two layers of entanglement
            for j in range(n_required_qubits-1):
                qml.CNOT(wires=[j, j+1])
            qml.RY(np.pi/4, wires=n_required_qubits-1)
            
        # Measure in computational basis
        return [qml.expval(qml.PauliZ(i)) for i in range(n_required_qubits)]
    
    return quantum_circuit

# Helper function for parallel processing of quantum embedding
def process_quantum_sample(args):
    sample, quantum_circuit, n_qubits, n_amplitudes = args
    
    # Truncate or pad the sample to match required amplitudes
    sample_padded = np.zeros(n_amplitudes)
    sample_padded[:min(len(sample), n_amplitudes)] = sample[:min(len(sample), n_amplitudes)]
    
    # Renormalize after padding
    norm = np.linalg.norm(sample_padded)
    if norm > 0:
        sample_padded = sample_padded / norm
        
    embedded_sample = quantum_circuit(sample_padded)
    return embedded_sample

def apply_quantum_embedding(X, n_qubits):
    # For amplitude embedding, we need to pad the data to a power of 2
    n_required_qubits = int(np.ceil(np.log2(n_qubits)))
    n_amplitudes = 2**n_required_qubits
    
    # Normalize data for amplitude embedding
    X_normalized = normalize_for_amplitude_embedding(X)
    
    quantum_circuit = create_quantum_embedding_circuit(n_qubits)
    
    # Prepare arguments for parallel processing
    args_list = [(sample, quantum_circuit, n_qubits, n_amplitudes) for sample in X_normalized]
    
    # Use joblib for parallel processing
    X_quantum = joblib.Parallel(n_jobs=-1)(
        joblib.delayed(process_quantum_sample)(arg) for arg in tqdm(args_list, desc=f"Quantum embedding (qubits={n_required_qubits})")
    )
    
    return np.array(X_quantum)

# Process a single quantum distance calculation
def process_quantum_distance(args):
    sample1, sample2, state_prep, n_amplitudes = args
    
    # Pad inputs to power of 2 for amplitude embedding
    x1_padded = np.zeros(n_amplitudes)
    x1_padded[:min(len(sample1), n_amplitudes)] = sample1[:min(len(sample1), n_amplitudes)]
    norm1 = np.linalg.norm(x1_padded)
    if norm1 > 0:
        x1_padded = x1_padded / norm1
        
    x2_padded = np.zeros(n_amplitudes)
    x2_padded[:min(len(sample2), n_amplitudes)] = sample2[:min(len(sample2), n_amplitudes)]
    norm2 = np.linalg.norm(x2_padded)
    if norm2 > 0:
        x2_padded = x2_padded / norm2
    
    state1 = state_prep(x1_padded)
    state2 = state_prep(x2_padded)
    
    # Calculate fidelity (overlap) between quantum states
    fidelity = np.abs(np.vdot(state1, state2))**2
    
    # Convert to distance (1 - fidelity)
    # Higher fidelity = more similar = less distance
    return 1 - fidelity

# Improved Quantum KNN implementation with fidelity-based distance
class QuantumKNN:
    def __init__(self, n_neighbors=5, n_qubits=None):
        self.n_neighbors = n_neighbors
        self.n_qubits = n_qubits
        self.X_train = None
        self.y_train = None
        
    def create_fidelity_circuit(self):
        # Calculate required qubits for amplitude embedding
        n_required_qubits = int(np.ceil(np.log2(self.n_qubits)))
        
        # Create a device with the required number of qubits
        dev = qml.device("default.qubit", wires=n_required_qubits)
        
        @qml.qnode(dev)
        def state_preparation(x):
            # Prepare quantum state using amplitude embedding
            qml.AmplitudeEmbedding(x, wires=range(n_required_qubits), normalize=True, pad_with=0.0)
            # Return state
            return qml.state()
        
        return state_preparation
    
    def quantum_distance(self, x1, x2):
        # Calculate quantum distance based on fidelity between states
        state_prep = self.create_fidelity_circuit()
        
        # Pad inputs to power of 2 for amplitude embedding
        n_required_qubits = int(np.ceil(np.log2(self.n_qubits)))
        n_amplitudes = 2**n_required_qubits
        
        return process_quantum_distance((x1, x2, state_prep, n_amplitudes))
    
    def fit(self, X, y):
        if self.n_qubits is None:
            self.n_qubits = X.shape[1]
        self.X_train = X
        self.y_train = y
        return self
    
    def predict(self, X):
        if self.n_qubits is None:
            self.n_qubits = X.shape[1]
        
        y_pred = []
        
        # Create state preparation circuit once for efficiency
        state_prep = self.create_fidelity_circuit()
        n_required_qubits = int(np.ceil(np.log2(self.n_qubits)))
        n_amplitudes = 2**n_required_qubits
        
        for test_sample in tqdm(X, desc="Quantum KNN prediction"):
            # Prepare args for parallel distance calculation
            args_list = [(test_sample, train_sample, state_prep, n_amplitudes) 
                        for train_sample in self.X_train]
            
            # Calculate distances in parallel
            distances = joblib.Parallel(n_jobs=-1)(
                joblib.delayed(process_quantum_distance)(arg) for arg in args_list
            )
            
            # Find k nearest neighbors
            indices = np.argsort(distances)[:self.n_neighbors]
            neighbors_classes = [self.y_train[i] for i in indices]
            
            # Majority vote
            if len(neighbors_classes) > 0:
                y_pred.append(max(set(neighbors_classes), key=neighbors_classes.count))
            else:
                # Fallback if no neighbors found
                y_pred.append(0)
        
        return np.array(y_pred)

# Fix for the metrics calculation to handle class imbalance
def fixed_evaluate_model(model, X_train, X_test, y_train, y_test):
    start_time = time.time()
    model.fit(X_train, y_train)
    fit_time = time.time() - start_time
    
    start_time = time.time()
    y_pred = model.predict(X_test)
    predict_time = time.time() - start_time
    
    # Check if we have enough classes for metrics
    unique_classes = np.unique(np.concatenate([y_test, y_pred]))
    
    # Set default metrics
    metrics = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'Fit Time': fit_time,
        'Predict Time': predict_time
    }
    
    # Only calculate other metrics if we have multiple classes
    if len(unique_classes) > 1:
        metrics.update({
            'Precision': precision_score(y_test, y_pred, zero_division=0),
            'Recall': recall_score(y_test, y_pred, zero_division=0),
            'F1': f1_score(y_test, y_pred, zero_division=0),
            'Kappa': cohen_kappa_score(y_test, y_pred)
        })
        
        # Only calculate ROC AUC if both true and pred have multiple classes
        if len(np.unique(y_test)) > 1 and len(np.unique(y_pred)) > 1:
            metrics['ROC AUC'] = roc_auc_score(y_test, y_pred)
        else:
            metrics['ROC AUC'] = np.nan
    else:
        metrics.update({
            'Precision': np.nan,
            'Recall': np.nan,
            'F1': np.nan,
            'ROC AUC': np.nan,
            'Kappa': np.nan
        })
    
    return metrics

def main():
    X, y = load_and_preprocess_data('Defacement_Infogain.csv')
    
    pca_dims = [15]  # [11, 15]
    results = []
    
    for pca_dim in pca_dims:
        print(f"\nProcessing PCA dimension: {pca_dim}")
        
        X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.7,random_state=42)
        
        from sklearn.preprocessing import MaxAbsScaler
        scaler = MaxAbsScaler()
        X_train = scaler.fit_transform(X_train)
        
        X_test = scaler.transform(X_test)
        X_test = np.clip(X_test,0,1)

        # Apply quantum embedding
        X_train_quantum = apply_quantum_embedding(X_train[:, :pca_dim], pca_dim)
        X_test_quantum = apply_quantum_embedding(X_test[:, :pca_dim], pca_dim)
        
        for clf_name, clf in CLASSIFIERS.items():
            # Classical version
            clf_classical = clf.__class__(**clf.get_params())
            metrics = fixed_evaluate_model(clf_classical, X_train[:, :pca_dim], X_test[:, :pca_dim], 
                                        y_train, y_test)
            metrics['Model'] = f'{clf_name}_Classical_PCA{pca_dim}'
            metrics['PCA_Dim'] = pca_dim
            results.append(metrics)
            
            # Quantum version with classical ML models
            clf_quantum = clf.__class__(**clf.get_params())
            metrics = fixed_evaluate_model(clf_quantum, X_train_quantum, X_test_quantum, 
                                        y_train, y_test)
            metrics['Model'] = f'{clf_name}_Quantum_PCA{pca_dim}'
            metrics['PCA_Dim'] = pca_dim
            results.append(metrics)
        
        # True Quantum KNN
        print("Running True Quantum KNN with quantum distance...")
        qknn = QuantumKNN(n_neighbors=5, n_qubits=pca_dim)
        
        # Use 2000 records for training instead of 100
        sample_size = min(200000, len(X_train_quantum))
        indices = np.random.choice(len(X_train_quantum), sample_size, replace=False)
        
        # Get class distribution
        classes, counts = np.unique(y_train[indices], return_counts=True)
        print(f"Class distribution in training sample: {dict(zip(classes, counts))}")
        
        X_train_quantum_sample = X_train_quantum[indices]
        y_train_sample = y_train[indices]
        
        # Use 400 records for testing instead of 50
        test_sample_size = min(200000, len(X_test_quantum))
        test_indices = np.random.choice(len(X_test_quantum), test_sample_size, replace=False)
        X_test_quantum_sample = X_test_quantum[test_indices]
        y_test_sample = y_test[test_indices]
        
        # Print class distribution for test set
        classes, counts = np.unique(y_test_sample, return_counts=True)
        print(f"Class distribution in test sample: {dict(zip(classes, counts))}")
        
        try:
            metrics = fixed_evaluate_model(qknn, X_train_quantum_sample, X_test_quantum_sample, 
                                        y_train_sample, y_test_sample)
            metrics['Model'] = f'TrueQuantumKNN_PCA{pca_dim}'
            metrics['PCA_Dim'] = pca_dim
            # Add sample size info to metrics
            metrics['Train_Samples'] = len(X_train_quantum_sample)
            metrics['Test_Samples'] = len(X_test_quantum_sample)
            results.append(metrics)
        except Exception as e:
            print(f"Error running True Quantum KNN: {e}")
            # Add fallback metrics with error note
            metrics = {
                'Accuracy': np.nan, 'Precision': np.nan, 'Recall': np.nan,
                'F1': np.nan, 'ROC AUC': np.nan, 'Kappa': np.nan,
                'Fit Time': np.nan, 'Predict Time': np.nan,
                'Model': f'TrueQuantumKNN_PCA{pca_dim}_Error',
                'PCA_Dim': pca_dim
            }
            results.append(metrics)
    
    results_df = pd.DataFrame(results)
    print("\nResults:")
    print(results_df.to_string())
    
    # Save results with timestamp
    timestamp = time.strftime("%Y%m%d-%H%M%S")
    #results_df.to_csv(f'quantum_ml_results_{timestamp}.csv', index=False)

if __name__ == "__main__":
    main()

class
Defacement    7930
benign        7781
Name: count, dtype: int64
class
benign        500
Defacement    500
Name: count, dtype: int64

Processing PCA dimension: 15




[A[Am embedding (qubits=4):   0%|                                                                                                                                                                                | 0/700 [00:00<?, ?it/s]

[A[Am embedding (qubits=4):   5%|███████▌                                                                                                                                                              | 32/700 [00:00<00:03, 174.35it/s]

[A[Am embedding (qubits=4):   7%|███████████▊                                                                                                                                                          | 50/700 [00:00<00:04, 158.53it/s]

[A[Am embedding (qubits=4):  11%|██████████████████▉                                                                                                                                                   | 80/700 [00:00<00:03, 173.16it/s]

[A[Am embedding (qubits=4):  16%|███████████████

Running True Quantum KNN with quantum distance...
Class distribution in training sample: {0: 344, 1: 356}
Class distribution in test sample: {0: 156, 1: 144}




[A[Am KNN prediction:   0%|                                                                                                                                                                                      | 0/300 [00:00<?, ?it/s]

[A[Am KNN prediction:   0%|▌                                                                                                                                                                             | 1/300 [00:00<03:50,  1.30it/s]

[A[Am KNN prediction:   1%|█▏                                                                                                                                                                            | 2/300 [00:01<03:48,  1.30it/s]

[A[Am KNN prediction:   1%|█▋                                                                                                                                                                            | 3/300 [00:02<03:44,  1.33it/s]

[A[Am KNN prediction:   1%|██▎                  


Results:
   Accuracy  Fit Time  Predict Time  Precision    Recall        F1     Kappa   ROC AUC                         Model  PCA_Dim  Train_Samples  Test_Samples
0  0.970000  0.173514      0.028190   0.972028  0.965278  0.968641  0.939888  0.969818  RandomForest_Classical_PCA15       15            NaN           NaN
1  0.826667  0.165343      0.025940   0.838235  0.791667  0.814286  0.652034  0.825321    RandomForest_Quantum_PCA15       15            NaN           NaN
2  0.823333  0.000002    278.311512   0.801325  0.840278  0.820339  0.646761  0.823985          TrueQuantumKNN_PCA15       15          700.0         300.0






[Antum KNN prediction:   1%|█▏                                                                                                                                                                        | 18/2700 [02:04<5:21:46,  7.20s/it]
[Antum KNN prediction:   1%|█▏                                                                                                                                                                        | 19/2700 [02:13<5:45:50,  7.74s/it]
[Antum KNN prediction:   1%|█▎                                                                                                                                                                        | 20/2700 [02:21<5:50:49,  7.85s/it]
[Antum KNN prediction:   1%|█▎                                                                                                                                                                        | 21/2700 [02:29<5:57:25,  8.01s/it]
[Antum KNN prediction:   1%|█▍                        