In [14]:
import pandas as pd
import numpy as np
import pennylane as qml
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, cohen_kappa_score
import xgboost as xgb
from tqdm import tqdm
import time

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from tqdm import tqdm
import time
import xgboost as xgb

# Load and preprocess data

def load_and_preprocess(filepath):
    # Columns to drop
    drop_cols = ["sourcePayloadAsBase64", "sourcePayloadAsUTF", 
                 "destinationPayloadAsBase64", "destinationPayloadAsUTF"]
    
    df = pd.read_csv(filepath)
    df = df.drop(columns=drop_cols, errors='ignore')
    
    # Convert datetime columns
    datetime_cols = df.select_dtypes(include=['object']).columns
    for col in datetime_cols:
        try:
            df[col] = pd.to_datetime(df[col], format='%m/%d/%Y %H:%M')
            df[col] = df[col].astype(np.int64) // 10**9
        except:
            continue
    
    # Handle categorical columns
    categorical_cols = df.select_dtypes(include=['object']).columns
    le = LabelEncoder()
    for col in categorical_cols:
        df[col] = le.fit_transform(df[col].astype(str))
    
    # Separate features and target
    X = df.iloc[:, :-1]
    y = df.iloc[:, -1]
    
    # Scale features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    return X_scaled, le.fit_transform(y)


# Quantum circuit for angle embedding
def quantum_circuit(features, n_qubits):
    dev = qml.device("default.qubit", wires=n_qubits)
    
    @qml.qnode(dev)
    def circuit(x):
        # Angle embedding
        qml.AngleEmbedding(x, wires=range(n_qubits))
        
        # Entangling layers
        for i in range(n_qubits-1):
            qml.CNOT(wires=[i, i+1])
            
        # Measure all qubits
        return [qml.expval(qml.PauliZ(i)) for i in range(n_qubits)]
    
    return circuit(features)

# Quantum feature mapping
def quantum_feature_mapping(X, n_qubits):
    X_quantum = []
    for sample in tqdm(X, desc="Quantum Feature Mapping"):
        # Pad or truncate features to match n_qubits
        features = sample[:n_qubits]
        quantum_features = quantum_circuit(features, n_qubits)
        X_quantum.append(quantum_features)
    return np.array(X_quantum)

def evaluate_classifiers(X_train, X_test, y_train, y_test, classifiers):
    results = []
    
    for name, clf in tqdm(classifiers.items(), desc="Training Classifiers"):
        # Training
        start_fit = time.time()
        clf.fit(X_train, y_train)
        fit_time = time.time() - start_fit
        
        # Prediction
        start_predict = time.time()
        y_pred = clf.predict(X_test)
        predict_time = time.time() - start_predict
        
        # Metrics
        metrics = {
            'Classifier': name,
            'Accuracy': accuracy_score(y_test, y_pred),
            'Precision': precision_score(y_test, y_pred),
            'Recall': recall_score(y_test, y_pred),
            'F1': f1_score(y_test, y_pred),
            'ROC AUC': roc_auc_score(y_test, y_pred),
            'Kappa': cohen_kappa_score(y_test, y_pred),
            'Fit Time': fit_time,
            'Predict Time': predict_time
        }
        results.append(metrics)
    
    return pd.DataFrame(results)



In [15]:
def main():
    # Load and preprocess data
    X, y = load_and_preprocess('TestbedSunJun13Flows.csv')
    from imblearn.under_sampling import RandomUnderSampler

    # Add this after X, y = load_and_preprocess_data('TestbedSunJun13Flows.csv')
    # and before train_test_split
    
    print("Original class distribution:")
    print(pd.Series(y).value_counts())
    
    # Perform undersampling
    undersampler = RandomUnderSampler(random_state=42)
    X_balanced, y_balanced = undersampler.fit_resample(X, y)
    
    print("Balanced class distribution:")
    print(pd.Series(y_balanced).value_counts())
    
    # Update your X and y variables
    X = X_balanced
    y = y_balanced
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Define classifiers
    classifiers = {
        'Extra Trees': ExtraTreesClassifier(n_estimators=100, random_state=42),
        'XGBoost': xgb.XGBClassifier(random_state=42),
        'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
        'CatBoost': CatBoostClassifier(random_state=42, verbose=False),
        'AdaBoost': AdaBoostClassifier(random_state=42),    
        'Gradient Boosting': GradientBoostingClassifier(random_state=42),
        'Logistic Regression': LogisticRegression(random_state=42),
        'KNN': KNeighborsClassifier(),
        'Decision Tree': DecisionTreeClassifier(random_state=42)
    }
    
    # Classical ML evaluation
    print("Classical ML Evaluation:")
    classical_results = evaluate_classifiers(X_train, X_test, y_train, y_test, classifiers)
    
    # Quantum Feature Mapping
    n_qubits = 8  # Adjust based on your quantum computer capacity
    X_train_quantum = quantum_feature_mapping(X_train, n_qubits)
    X_test_quantum = quantum_feature_mapping(X_test, n_qubits)
    
    # Quantum ML evaluation
    print("\nQuantum ML Evaluation:")
    quantum_results = evaluate_classifiers(X_train_quantum, X_test_quantum, y_train, y_test, classifiers)

    results_df = pd.DataFrame(quantum_results)
    results_df1 = pd.DataFrame(classical_results)
    
    # Save results
    results_df.to_csv('5th dataset angle embedding_quantum.csv', index=False)
    results_df1.to_csv('5th dataset angle embedding_classical.csv', index=False)
    # Display results
    print("\nClassical ML Results:")
    print(classical_results)
    print("\nQuantum ML Results:")
    print(quantum_results)

if __name__ == "__main__":
    main()


Original class distribution:
1    255170
0     20358
Name: count, dtype: int64
Balanced class distribution:
0    20358
1    20358
Name: count, dtype: int64
Classical ML Evaluation:


Training Classifiers: 100%|██████████████████████████████████████████████████████████████| 9/9 [01:00<00:00,  6.72s/it]
Quantum Feature Mapping: 100%|███████████████████████████████████████████████████| 32572/32572 [06:50<00:00, 79.37it/s]
Quantum Feature Mapping: 100%|█████████████████████████████████████████████████████| 8144/8144 [02:09<00:00, 62.92it/s]



Quantum ML Evaluation:


Training Classifiers: 100%|██████████████████████████████████████████████████████████████| 9/9 [01:21<00:00,  9.05s/it]


Classical ML Results:
            Classifier  Accuracy  Precision    Recall        F1   ROC AUC  \
0          Extra Trees  0.999140   0.999026  0.999269  0.999148  0.999139   
1              XGBoost  0.999263   0.999513  0.999026  0.999269  0.999265   
2        Random Forest  0.999263   1.000000  0.998539  0.999269  0.999269   
3             CatBoost  0.999140   0.999756  0.998539  0.999147  0.999146   
4             AdaBoost  0.997544   0.998050  0.997077  0.997563  0.997548   
5    Gradient Boosting  0.999140   1.000000  0.998295  0.999147  0.999148   
6  Logistic Regression  0.978757   0.980689  0.977107  0.978895  0.978771   
7                  KNN  0.995825   0.994415  0.997321  0.995866  0.995813   
8        Decision Tree  0.998772   0.999025  0.998539  0.998782  0.998774   

      Kappa   Fit Time  Predict Time  
0  0.998281   2.708005      0.132425  
1  0.998526   1.208272      0.026608  
2  0.998526   5.281624      0.087290  
3  0.998281  16.074282      0.077011  
4  0.995088


