In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, cohen_kappa_score
import pennylane as qml
from tqdm import tqdm
import time

# Define classifiers
CLASSIFIERS = {
    'RandomForest': RandomForestClassifier(n_estimators=100, random_state=42),
    #'ExtraTrees': ExtraTreesClassifier(n_estimators=100, random_state=42),
    #'KNN': KNeighborsClassifier(n_neighbors=5)
}

def load_and_preprocess_data(filepath):
    data = pd.read_csv(filepath)
    df=data
    drop_cols = ["sourcePayloadAsBase64", "sourcePayloadAsUTF", 
                 "destinationPayloadAsBase64", "destinationPayloadAsUTF"]

    df = df.drop(columns=drop_cols, errors='ignore')
    
    from sklearn.utils import resample
    df = df.drop_duplicates()
    
    
    # Check the unique counts in the label column
    print(df['Label'].value_counts())
    
    # Separate classes
    class_counts = df['Label'].value_counts()
    minority_class = class_counts.idxmin()  # Class with fewer samples
    majority_class = class_counts.idxmax()  # Class with more samples
    
    df_minority = df[df['Label'] == minority_class]
    df_majority = df[df['Label'] == majority_class]
    
    # Undersample both classes to 70 records each
    df_minority_undersampled = resample(df_minority, 
                                        replace=False, 
                                        n_samples=5000, 
                                        random_state=42)
    
    df_majority_undersampled = resample(df_majority, 
                                        replace=False, 
                                        n_samples=5000, 
                                        random_state=42)

    # Combine the undersampled data
    df_balanced = pd.concat([df_minority_undersampled, df_majority_undersampled])
    
    # Shuffle the dataset to mix the classes
    df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)
    
    # Check new class distribution
    print(df_balanced['Label'].value_counts())

    df=df_balanced
    df.drop(columns=['generated', 'startDateTime', 'stopDateTime', 'source', 'destination'], inplace=True)

    from sklearn.preprocessing import LabelEncoder
    
    
    # List of binary categorical columns
    binary_cols = ['direction', 'sourceTCPFlagsDescription', 'destinationTCPFlagsDescription', 'Label','appName', 'protocolName']
    le = LabelEncoder()
    
    # Apply Label Encoding for binary categorical features
    for col in binary_cols:
        df[col] = le.fit_transform(df[col])

    from sklearn.preprocessing import MaxAbsScaler

    #scaler = MaxAbsScaler()
    #df_scaled = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)
    
    # Check processed data
    #print(df_scaled.head())

    from sklearn.model_selection import train_test_split
    X = df.iloc[:, :-1]
    y = df.iloc[:, -1]


    #X = data.iloc[:, :23]
    #y = data.iloc[:, 23]
    
    le = LabelEncoder()
    y = le.fit_transform(y)
    
    #scaler = StandardScaler()
    #X = scaler.fit_transform(X)
    
    # Normalize to range [-π, π] for angle embedding
    X = np.pi * (X - X.min()) / (X.max() - X.min())
    
    return X, y

def create_quantum_embedding_circuit(n_qubits):
    dev = qml.device("lightning.qubit", wires=n_qubits)
    
    @qml.qnode(dev)
    def quantum_circuit(inputs):
        # Angle embedding for normalized features
        qml.AngleEmbedding(inputs, wires=range(n_qubits), rotation='X')
        
        # Add entangling layers for quantum advantage
        for i in range(2):  # Two layers of entanglement
            for j in range(n_qubits-1):
                qml.CNOT(wires=[j, j+1])
            qml.RY(np.pi/4, wires=n_qubits-1)
            
        # Measure in computational basis
        return [qml.expval(qml.PauliZ(i)) for i in range(n_qubits)]
    
    return quantum_circuit

def apply_quantum_embedding(X, n_qubits):
    quantum_circuit = create_quantum_embedding_circuit(n_qubits)
    X_quantum = []
    
    for sample in tqdm(X, desc=f"Quantum embedding (qubits={n_qubits})"):
        # Ensure we only use the first n_qubits features
        sample_truncated = sample[:n_qubits]
        embedded_sample = quantum_circuit(sample_truncated)
        X_quantum.append(embedded_sample)
    
    return np.array(X_quantum)

# Improved Quantum KNN implementation with fidelity-based distance
class QuantumKNN:
    def __init__(self, n_neighbors=5, n_qubits=None):
        self.n_neighbors = n_neighbors
        self.n_qubits = n_qubits
        self.X_train = None
        self.y_train = None
        
    def create_fidelity_circuit(self):
        # Create a device with the required number of qubits
        # We'll use this to compute quantum state fidelity
        dev = qml.device("lightning.qubit", wires=self.n_qubits)
        
        @qml.qnode(dev)
        def state_preparation(x):
            # Prepare quantum state based on data point
            qml.AngleEmbedding(x, wires=range(self.n_qubits), rotation='X')
            # Return state
            return qml.state()
        
        return state_preparation
    
    def quantum_distance(self, x1, x2):
        # Calculate quantum distance based on fidelity between states
        state_prep = self.create_fidelity_circuit()
        state1 = state_prep(x1)
        state_prep = self.create_fidelity_circuit()
        state2 = state_prep(x2)
        
        # Calculate fidelity (overlap) between quantum states
        fidelity = np.abs(np.vdot(state1, state2))**2
        
        # Convert to distance (1 - fidelity)
        # Higher fidelity = more similar = less distance
        return 1 - fidelity
    
    def fit(self, X, y):
        if self.n_qubits is None:
            self.n_qubits = X.shape[1]
        self.X_train = X
        self.y_train = y
        return self
    
    def predict(self, X):
        if self.n_qubits is None:
            self.n_qubits = X.shape[1]
        
        from joblib import Parallel, delayed
        from tqdm import tqdm
        
        def process_sample(test_sample):
            distances = []
            for train_sample in self.X_train:
                # Calculate quantum distance
                dist = self.quantum_distance(test_sample, train_sample)
                distances.append(dist)
            # Find k nearest neighbors
            indices = np.argsort(distances)[:self.n_neighbors]
            neighbors_classes = [self.y_train[i] for i in indices]
            
            # Majority vote
            if len(neighbors_classes) > 0:
                return max(set(neighbors_classes), key=neighbors_classes.count)
            else:
                # Fallback if no neighbors found
                return 0
        
        # Use joblib for parallelization with tqdm for progress tracking
        y_pred = Parallel(n_jobs=-1)(
            delayed(process_sample)(x) for x in tqdm(X, desc="Predicting")
        )
        
        return np.array(y_pred)

# Fix for the metrics calculation to handle class imbalance
def fixed_evaluate_model(model, X_train, X_test, y_train, y_test):
    start_time = time.time()
    model.fit(X_train, y_train)
    fit_time = time.time() - start_time
    
    start_time = time.time()
    y_pred = model.predict(X_test)
    predict_time = time.time() - start_time
    
    # Check if we have enough classes for metrics
    unique_classes = np.unique(np.concatenate([y_test, y_pred]))
    
    # Set default metrics
    metrics = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'Fit Time': fit_time,
        'Predict Time': predict_time
    }
    
    # Only calculate other metrics if we have multiple classes
    if len(unique_classes) > 1:
        metrics.update({
            'Precision': precision_score(y_test, y_pred, zero_division=0),
            'Recall': recall_score(y_test, y_pred, zero_division=0),
            'F1': f1_score(y_test, y_pred, zero_division=0),
            'Kappa': cohen_kappa_score(y_test, y_pred)
        })
        
        # Only calculate ROC AUC if both true and pred have multiple classes
        if len(np.unique(y_test)) > 1 and len(np.unique(y_pred)) > 1:
            metrics['ROC AUC'] = roc_auc_score(y_test, y_pred)
        else:
            metrics['ROC AUC'] = np.nan
    else:
        metrics.update({
            'Precision': np.nan,
            'Recall': np.nan,
            'F1': np.nan,
            'ROC AUC': np.nan,
            'Kappa': np.nan
        })
    
    return metrics

def main():
    X, y = load_and_preprocess_data('TestbedSunJun13Flows.csv')
    pca_dims = [11]#,2 15]
    results = []
    
    for pca_dim in pca_dims:
        print(f"\nProcessing PCA dimension: {pca_dim}")
        # Use stratified split to ensure class balance
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

        from sklearn.preprocessing import MaxAbsScaler
        scaler = MaxAbsScaler()
        X_train = scaler.fit_transform(X_train)
        
        X_test = scaler.transform(X_test)
        X_test = np.clip(X_test,0,1)
        #X_train = X_train.to_numpy()  # Convert to NumPy if it's a DataFrame
        #X_test = X_test.to_numpy()

        
        # Apply quantum embedding
        X_train_quantum = apply_quantum_embedding(X_train[:, :pca_dim], pca_dim)
        X_test_quantum = apply_quantum_embedding(X_test[:, :pca_dim], pca_dim)
        
        for clf_name, clf in CLASSIFIERS.items():
            # Classical version
            clf_classical = clf.__class__(**clf.get_params())
            metrics = fixed_evaluate_model(clf_classical, X_train[:, :pca_dim], X_test[:, :pca_dim], 
                                        y_train, y_test)
            metrics['Model'] = f'{clf_name}_Classical_PCA{pca_dim}'
            metrics['PCA_Dim'] = pca_dim
            results.append(metrics)
            
            # Quantum version with classical ML models
            clf_quantum = clf.__class__(**clf.get_params())
            metrics = fixed_evaluate_model(clf_quantum, X_train_quantum, X_test_quantum, 
                                        y_train, y_test)
            metrics['Model'] = f'{clf_name}_Quantum_PCA{pca_dim}'
            metrics['PCA_Dim'] = pca_dim
            results.append(metrics)
        
        # True Quantum KNN
        print("Running True Quantum KNN with quantum distance...")
        qknn = QuantumKNN(n_neighbors=5, n_qubits=pca_dim)
        
        # Use 2000 records for training instead of 100
        sample_size = min(200000, len(X_train_quantum))
        indices = np.random.choice(len(X_train_quantum), sample_size, replace=False)
        
        # Get class distribution
        classes, counts = np.unique(y_train[indices], return_counts=True)
        print(f"Class distribution in training sample: {dict(zip(classes, counts))}")
        
        X_train_quantum_sample = X_train_quantum[indices]
        y_train_sample = y_train[indices]
        
        # Use 400 records for testing instead of 50
        test_sample_size = min(200000, len(X_test_quantum))
        test_indices = np.random.choice(len(X_test_quantum), test_sample_size, replace=False)
        X_test_quantum_sample = X_test_quantum[test_indices]
        y_test_sample = y_test[test_indices]
        
        # Print class distribution for test set
        classes, counts = np.unique(y_test_sample, return_counts=True)
        print(f"Class distribution in test sample: {dict(zip(classes, counts))}")
        
        try:
            metrics = fixed_evaluate_model(qknn, X_train_quantum_sample, X_test_quantum_sample, 
                                        y_train_sample, y_test_sample)
            metrics['Model'] = f'TrueQuantumKNN_PCA{pca_dim}'
            metrics['PCA_Dim'] = pca_dim
            # Add sample size info to metrics
            metrics['Train_Samples'] = len(X_train_quantum_sample)
            metrics['Test_Samples'] = len(X_test_quantum_sample)
            results.append(metrics)
        except Exception as e:
            print(f"Error running True Quantum KNN: {e}")
            # Add fallback metrics with error note
            metrics = {
                'Accuracy': np.nan, 'Precision': np.nan, 'Recall': np.nan,
                'F1': np.nan, 'ROC AUC': np.nan, 'Kappa': np.nan,
                'Fit Time': np.nan, 'Predict Time': np.nan,
                'Model': f'TrueQuantumKNN_PCA{pca_dim}_Error',
                'PCA_Dim': pca_dim
            }
            results.append(metrics)
    
    results_df = pd.DataFrame(results)
    print("\nResults:")
    print(results_df.to_string())
    
    # Save results with timestamp
    timestamp = time.strftime("%Y%m%d-%H%M%S")
    #results_df.to_csv(f'quantum_ml_results_{timestamp}.csv', index=False)

if __name__ == "__main__":
    main()

Label
Normal    126430
Attack     10139
Name: count, dtype: int64
Label
Normal    5000
Attack    5000
Name: count, dtype: int64

Processing PCA dimension: 11



[Antum embedding (qubits=11):   0%|                                                                                                                                                                              | 0/7000 [00:00<?, ?it/s]
[Antum embedding (qubits=11):   0%|▎                                                                                                                                                                   | 11/7000 [00:00<01:04, 108.86it/s]
[Antum embedding (qubits=11):   0%|▌                                                                                                                                                                   | 23/7000 [00:00<01:01, 113.29it/s]
[Antum embedding (qubits=11):   1%|▉                                                                                                                                                                   | 39/7000 [00:00<00:51, 133.91it/s]
[Antum embedding (qubits=11):   1%|█▎                 

Running True Quantum KNN with quantum distance...
Class distribution in training sample: {0: 3461, 1: 3539}
Class distribution in test sample: {0: 1539, 1: 1461}



[Adicting:   0%|                                                                                                                                                                                                 | 0/3000 [00:00<?, ?it/s]

[Adicting:   0%|▍                                                                                                                                                                                        | 8/3000 [00:17<01:29, 33.42it/s]
[Adicting:   1%|▉                                                                                                                                                                                     | 16/3000 [01:04<3:56:29,  4.76s/it]
[Adicting:   1%|█▍                                                                                                                                                                                    | 24/3000 [02:02<4:51:45,  5.88s/it]
[Adicting:   1%|█▉                                   

KeyboardInterrupt: 