In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, cohen_kappa_score
import pennylane as qml
from tqdm import tqdm
import time
from joblib import Parallel, delayed

# Define classifiers
CLASSIFIERS = {
    'RandomForest': RandomForestClassifier(n_estimators=100, random_state=42),
    #'ExtraTrees': ExtraTreesClassifier(n_estimators=100, random_state=42),
    #'KNN': KNeighborsClassifier(n_neighbors=5)
}

def load_and_preprocess_data(filepath):
    data = pd.read_csv(filepath)
    df=data
    drop_cols = ["sourcePayloadAsBase64", "sourcePayloadAsUTF", 
                 "destinationPayloadAsBase64", "destinationPayloadAsUTF"]

    df = df.drop(columns=drop_cols, errors='ignore')
    
    from sklearn.utils import resample
    df = df.drop_duplicates()
    
    
    # Check the unique counts in the label column
    print(df['Label'].value_counts())
    
    # Separate classes
    class_counts = df['Label'].value_counts()
    minority_class = class_counts.idxmin()  # Class with fewer samples
    majority_class = class_counts.idxmax()  # Class with more samples
    
    df_minority = df[df['Label'] == minority_class]
    df_majority = df[df['Label'] == majority_class]
    
    # Undersample both classes to 70 records each
    df_minority_undersampled = resample(df_minority, 
                                        replace=False, 
                                        n_samples=200, 
                                        random_state=42)
    
    df_majority_undersampled = resample(df_majority, 
                                        replace=False, 
                                        n_samples=200, 
                                        random_state=42)

    # Combine the undersampled data
    df_balanced = pd.concat([df_minority_undersampled, df_majority_undersampled])
    
    # Shuffle the dataset to mix the classes
    df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)
    
    # Check new class distribution
    print(df_balanced['Label'].value_counts())

    df=df_balanced
    df.drop(columns=['generated', 'startDateTime', 'stopDateTime', 'source', 'destination'], inplace=True)

    from sklearn.preprocessing import LabelEncoder
    
    
    # List of binary categorical columns
    binary_cols = ['direction', 'sourceTCPFlagsDescription', 'destinationTCPFlagsDescription', 'Label','appName', 'protocolName']
    le = LabelEncoder()
    
    # Apply Label Encoding for binary categorical features
    for col in binary_cols:
        df[col] = le.fit_transform(df[col])

    from sklearn.preprocessing import MaxAbsScaler

    X = df.iloc[:, :-1]
    y = df.iloc[:, -1]
    
    le = LabelEncoder()
    y = le.fit_transform(y)
    
    # Normalize to unit vectors for amplitude encoding
    # (We'll normalize properly in the quantum embedding function)
    
    return X, y

def calculate_required_qubits(n_features):
    """Calculate number of qubits needed for amplitude encoding"""
    return int(np.ceil(np.log2(n_features)))

def normalize_for_amplitude_encoding(X):
    """Normalize data for amplitude encoding"""
    # Ensure all values are positive by shifting if needed
    if np.min(X) < 0:
        X = X - np.min(X)
    
    # Normalize each sample to have L2 norm = 1 (unit vector)
    norms = np.linalg.norm(X, axis=1, keepdims=True)
    # Avoid division by zero
    norms[norms == 0] = 1.0
    X_normalized = X / norms
    
    return X_normalized

def pad_to_power_of_two(X):
    """Pad features to make the total number a power of 2"""
    n_features = X.shape[1]
    n_qubits = calculate_required_qubits(n_features)
    target_size = 2**n_qubits
    
    if n_features < target_size:
        padding = np.zeros((X.shape[0], target_size - n_features))
        X_padded = np.hstack((X, padding))
        # Re-normalize after padding
        X_padded = normalize_for_amplitude_encoding(X_padded)
        return X_padded
    return X

def create_quantum_embedding_circuit(n_qubits):
    """Create quantum circuit for amplitude encoding"""
    dev = qml.device("lightning.qubit", wires=n_qubits)
    
    @qml.qnode(dev)
    def quantum_circuit(inputs):
        # Amplitude embedding - encodes normalized feature vector into amplitudes
        qml.AmplitudeEmbedding(inputs, wires=range(n_qubits), normalize=True, pad_with=0.0)
        
        # Add entangling layers for quantum advantage
        for i in range(2):  # Two layers of entanglement
            for j in range(n_qubits-1):
                qml.CNOT(wires=[j, j+1])
            qml.RY(np.pi/4, wires=n_qubits-1)
            
        # Measure in computational basis
        return [qml.expval(qml.PauliZ(i)) for i in range(n_qubits)]
    
    return quantum_circuit

def apply_quantum_embedding(X, pca_dim):
    """Apply amplitude encoding to data"""
    # Select features according to pca_dim
    X_selected = X[:, :pca_dim]
    
    # Calculate required qubits for amplitude encoding
    n_qubits = calculate_required_qubits(pca_dim)
    
    # For amplitude encoding, we need 2^n_qubits amplitudes
    target_size = 2**n_qubits
    
    # Pad and normalize data for amplitude encoding
    if X_selected.shape[1] < target_size:
        # Pad with zeros to reach power of 2
        padding = np.zeros((X_selected.shape[0], target_size - X_selected.shape[1]))
        X_padded = np.hstack((X_selected, padding))
    else:
        X_padded = X_selected[:, :target_size]
    
    # Normalize for amplitude encoding
    X_normalized = normalize_for_amplitude_encoding(X_padded)
    
    quantum_circuit = create_quantum_embedding_circuit(n_qubits)
    X_quantum = []
    
    for sample in tqdm(X_normalized, desc=f"Quantum embedding (qubits={n_qubits})"):
        embedded_sample = quantum_circuit(sample)
        X_quantum.append(embedded_sample)
    
    return np.array(X_quantum)

# Improved Quantum KNN implementation with fidelity-based distance
class QuantumKNN:
    def __init__(self, n_neighbors=5, n_qubits=None):
        self.n_neighbors = n_neighbors
        self.n_qubits = n_qubits
        self.X_train = None
        self.y_train = None
        self.data_dim = None
    
    def prepare_data_for_circuit(self, x):
        """Prepare data for the quantum circuit"""
        # Ensure data is properly formatted for amplitude encoding
        target_size = 2**self.n_qubits
        
        if len(x) < target_size:
            # Pad with zeros
            x_padded = np.zeros(target_size)
            x_padded[:len(x)] = x
        else:
            x_padded = x[:target_size]
        
        # Normalize
        norm = np.linalg.norm(x_padded)
        if norm > 0:
            x_normalized = x_padded / norm
        else:
            x_normalized = x_padded
            
        return x_normalized
        
    def create_fidelity_circuit(self):
        # Create a device with the required number of qubits
        dev = qml.device("lightning.qubit", wires=self.n_qubits)
        
        @qml.qnode(dev)
        def state_preparation(x):
            # Prepare quantum state based on data point using amplitude encoding
            qml.AmplitudeEmbedding(x, wires=range(self.n_qubits), normalize=True, pad_with=0.0)
            # Return state
            return qml.state()
        
        return state_preparation
    
    def quantum_distance(self, x1, x2):
        # Prepare data for the quantum circuit
        x1_prepared = self.prepare_data_for_circuit(x1)
        x2_prepared = self.prepare_data_for_circuit(x2)
        
        # Get the state preparation circuit
        state_prep = self.create_fidelity_circuit()
        
        # Calculate quantum states
        state1 = state_prep(x1_prepared)
        state2 = state_prep(x2_prepared)
        
        # Calculate fidelity (overlap) between quantum states
        fidelity = np.abs(np.vdot(state1, state2))**2
        
        # Convert to distance (1 - fidelity)
        # Higher fidelity = more similar = less distance
        return 1 - fidelity
    
    def fit(self, X, y):
        # Store the original data dimensionality
        self.data_dim = X.shape[1]
        
        if self.n_qubits is None:
            # Calculate required qubits for amplitude encoding
            self.n_qubits = calculate_required_qubits(self.data_dim)
            
        self.X_train = X
        self.y_train = y
        return self
    
    def predict(self, X):
        if self.n_qubits is None:
            self.n_qubits = calculate_required_qubits(X.shape[1])
        
        def process_sample(test_sample):
            distances = []
            for train_sample in self.X_train:
                # Calculate quantum distance
                dist = self.quantum_distance(test_sample, train_sample)
                distances.append(dist)
            # Find k nearest neighbors
            indices = np.argsort(distances)[:self.n_neighbors]
            neighbors_classes = [self.y_train[i] for i in indices]
            
            # Majority vote
            if len(neighbors_classes) > 0:
                return max(set(neighbors_classes), key=neighbors_classes.count)
            else:
                # Fallback if no neighbors found
                return 0
        
        # Use joblib for parallel processing with tqdm for progress tracking
        y_pred = Parallel(n_jobs=-1)(
            delayed(process_sample)(x) for x in tqdm(X, desc="Predicting")
        )
        
        return np.array(y_pred)

# Fix for the metrics calculation to handle class imbalance
def fixed_evaluate_model(model, X_train, X_test, y_train, y_test):
    start_time = time.time()
    model.fit(X_train, y_train)
    fit_time = time.time() - start_time
    
    start_time = time.time()
    y_pred = model.predict(X_test)
    predict_time = time.time() - start_time
    
    # Check if we have enough classes for metrics
    unique_classes = np.unique(np.concatenate([y_test, y_pred]))
    
    # Set default metrics
    metrics = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'Fit Time': fit_time,
        'Predict Time': predict_time
    }
    
    # Only calculate other metrics if we have multiple classes
    if len(unique_classes) > 1:
        metrics.update({
            'Precision': precision_score(y_test, y_pred, zero_division=0),
            'Recall': recall_score(y_test, y_pred, zero_division=0),
            'F1': f1_score(y_test, y_pred, zero_division=0),
            'Kappa': cohen_kappa_score(y_test, y_pred)
        })
        
        # Only calculate ROC AUC if both true and pred have multiple classes
        if len(np.unique(y_test)) > 1 and len(np.unique(y_pred)) > 1:
            metrics['ROC AUC'] = roc_auc_score(y_test, y_pred)
        else:
            metrics['ROC AUC'] = np.nan
    else:
        metrics.update({
            'Precision': np.nan,
            'Recall': np.nan,
            'F1': np.nan,
            'ROC AUC': np.nan,
            'Kappa': np.nan
        })
    
    return metrics

def main():
    X, y = load_and_preprocess_data('TestbedSunJun13Flows.csv')
    pca_dims = [11]#,2 15]
    results = []
    
    for pca_dim in pca_dims:
        print(f"\nProcessing PCA dimension: {pca_dim}")
        # Use stratified split to ensure class balance
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

        from sklearn.preprocessing import MaxAbsScaler
        scaler = MaxAbsScaler()
        X_train = scaler.fit_transform(X_train)
        
        X_test = scaler.transform(X_test)
        X_test = np.clip(X_test, 0, 1)
        
        # Calculate required qubits for amplitude encoding
        n_qubits = calculate_required_qubits(pca_dim)
        print(f"Using {n_qubits} qubits for {pca_dim} features with amplitude encoding")
        
        # Apply quantum embedding
        X_train_quantum = apply_quantum_embedding(X_train, pca_dim)
        X_test_quantum = apply_quantum_embedding(X_test, pca_dim)
        
        for clf_name, clf in CLASSIFIERS.items():
            # Classical version
            clf_classical = clf.__class__(**clf.get_params())
            metrics = fixed_evaluate_model(clf_classical, X_train[:, :pca_dim], X_test[:, :pca_dim], 
                                        y_train, y_test)
            metrics['Model'] = f'{clf_name}_Classical_PCA{pca_dim}'
            metrics['PCA_Dim'] = pca_dim
            results.append(metrics)
            
            # Quantum version with classical ML models
            clf_quantum = clf.__class__(**clf.get_params())
            metrics = fixed_evaluate_model(clf_quantum, X_train_quantum, X_test_quantum, 
                                        y_train, y_test)
            metrics['Model'] = f'{clf_name}_Quantum_PCA{pca_dim}'
            metrics['PCA_Dim'] = pca_dim
            results.append(metrics)
        
        # True Quantum KNN
        print("Running True Quantum KNN with quantum distance...")
        qknn = QuantumKNN(n_neighbors=5, n_qubits=n_qubits)
        
        # Use smaller samples to speed up computation
        sample_size = min(100000, len(X_train_quantum))
        indices = np.random.choice(len(X_train_quantum), sample_size, replace=False)
        
        # Get class distribution
        classes, counts = np.unique(y_train[indices], return_counts=True)
        print(f"Class distribution in training sample: {dict(zip(classes, counts))}")
        
        X_train_quantum_sample = X_train_quantum[indices]
        y_train_sample = y_train[indices]
        
        # Use smaller test set
        test_sample_size = min(200000, len(X_test_quantum))
        test_indices = np.random.choice(len(X_test_quantum), test_sample_size, replace=False)
        X_test_quantum_sample = X_test_quantum[test_indices]
        y_test_sample = y_test[test_indices]
        
        # Print class distribution for test set
        classes, counts = np.unique(y_test_sample, return_counts=True)
        print(f"Class distribution in test sample: {dict(zip(classes, counts))}")
        
        try:
            metrics = fixed_evaluate_model(qknn, X_train_quantum_sample, X_test_quantum_sample, 
                                        y_train_sample, y_test_sample)
            metrics['Model'] = f'TrueQuantumKNN_PCA{pca_dim}'
            metrics['PCA_Dim'] = pca_dim
            # Add sample size info to metrics
            metrics['Train_Samples'] = len(X_train_quantum_sample)
            metrics['Test_Samples'] = len(X_test_quantum_sample)
            results.append(metrics)
        except Exception as e:
            print(f"Error running True Quantum KNN: {e}")
            # Add fallback metrics with error note
            metrics = {
                'Accuracy': np.nan, 'Precision': np.nan, 'Recall': np.nan,
                'F1': np.nan, 'ROC AUC': np.nan, 'Kappa': np.nan,
                'Fit Time': np.nan, 'Predict Time': np.nan,
                'Model': f'TrueQuantumKNN_PCA{pca_dim}_Error',
                'PCA_Dim': pca_dim
            }
            results.append(metrics)
    
    results_df = pd.DataFrame(results)
    print("\nResults:")
    print(results_df.to_string())
    
    # Save results with timestamp
    timestamp = time.strftime("%Y%m%d-%H%M%S")
    #results_df.to_csv(f'quantum_ml_results_{timestamp}.csv', index=False)

if __name__ == "__main__":
    main()

Label
Normal    126430
Attack     10139
Name: count, dtype: int64
Label
Normal    200
Attack    200
Name: count, dtype: int64

Processing PCA dimension: 11
Using 4 qubits for 11 features with amplitude encoding




[A[Am embedding (qubits=4):   0%|                                                                                                                                                                                | 0/280 [00:00<?, ?it/s]

[A[Am embedding (qubits=4):  10%|█████████████████▏                                                                                                                                                    | 29/280 [00:00<00:00, 286.04it/s]

[A[Am embedding (qubits=4):  21%|██████████████████████████████████▍                                                                                                                                   | 58/280 [00:00<00:01, 221.71it/s]

[A[Am embedding (qubits=4):  29%|████████████████████████████████████████████████▌                                                                                                                     | 82/280 [00:00<00:00, 202.60it/s]

[A[Am embedding (qubits=4):  37%|███████████████

Running True Quantum KNN with quantum distance...
Class distribution in training sample: {0: 140, 1: 140}
Class distribution in test sample: {0: 60, 1: 60}




[A[Ating:   0%|                                                                                                                                                                                                  | 0/120 [00:00<?, ?it/s]



[A[Ating:  13%|████████████████████████▋                                                                                                                                                                | 16/120 [00:06<00:52,  1.98it/s]

[A[Ating:  20%|█████████████████████████████████████                                                                                                                                                    | 24/120 [00:09<00:37,  2.54it/s]

[A[Ating:  22%|████████████████████████████████████████                                                                                                                                                 | 26/120 [00:09<00:32,  2.88it/s]

[A[Ating:  27%|███████████████████████████████


Results:
   Accuracy  Fit Time  Predict Time  Precision    Recall        F1     Kappa   ROC AUC                         Model  PCA_Dim  Train_Samples  Test_Samples
0  0.966667  0.216213      0.010128   0.937500  1.000000  0.967742  0.933333  0.966667  RandomForest_Classical_PCA11       11            NaN           NaN
1  0.916667  0.147529      0.007782   0.867647  0.983333  0.921875  0.833333  0.916667    RandomForest_Quantum_PCA11       11            NaN           NaN
2  0.850000  0.000002     36.665165   0.808824  0.916667  0.859375  0.700000  0.850000          TrueQuantumKNN_PCA11       11          280.0         120.0
