In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
import xgboost as xgb
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, cohen_kappa_score
import pennylane as qml
from tqdm import tqdm
import time

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from tqdm import tqdm
import time
import xgboost as xgb

# Load and preprocess data
def load_and_preprocess_data(filepath):
    # Columns to drop
    drop_cols = ["sourcePayloadAsBase64", "sourcePayloadAsUTF", 
                 "destinationPayloadAsBase64", "destinationPayloadAsUTF"]
    
    df = pd.read_csv(filepath)
    df = df.drop(columns=drop_cols, errors='ignore')
    
    # Convert datetime columns
    datetime_cols = df.select_dtypes(include=['object']).columns
    for col in datetime_cols:
        try:
            df[col] = pd.to_datetime(df[col], format='%m/%d/%Y %H:%M')
            df[col] = df[col].astype(np.int64) // 10**9
        except:
            continue
    
    # Handle categorical columns
    categorical_cols = df.select_dtypes(include=['object']).columns
    le = LabelEncoder()
    for col in categorical_cols:
        df[col] = le.fit_transform(df[col].astype(str))
    
    # Separate features and target
    X = df.iloc[:, :-1]
    y = df.iloc[:, -1]
    
    # Scale features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    return X_scaled, le.fit_transform(y)

# Quantum amplitude embedding circuit
def create_quantum_circuit(n_qubits):
    dev = qml.device("default.qubit", wires=n_qubits)
    
    @qml.qnode(dev)
    def quantum_circuit(inputs):
        # Amplitude embedding
        qml.AmplitudeEmbedding(inputs, wires=range(n_qubits), normalize=True)
        
        # Rotation layers
        for i in range(n_qubits):
            qml.RY(np.pi/2, wires=i)
            qml.RZ(np.pi/4, wires=i)
        
        # Entangling layers
        for i in range(n_qubits-1):
            qml.CNOT(wires=[i, i+1])
        
        return [qml.expval(qml.PauliZ(i)) for i in range(n_qubits)]
    
    return quantum_circuit

# Quantum feature extraction
def quantum_feature_extraction(X, n_qubits):
    quantum_circuit = create_quantum_circuit(n_qubits)
    quantum_features = []
    
    for sample in tqdm(X, desc="Quantum Feature Extraction"):
        # Calculate required padding
        target_size = 2**n_qubits
        # Normalize and pad the sample to match required qubit size
        normalized_sample = sample / np.linalg.norm(sample)
        padded_sample = np.zeros(target_size)
        padded_sample[:len(normalized_sample)] = normalized_sample
        # Renormalize after padding
        padded_sample = padded_sample / np.linalg.norm(padded_sample)
        quantum_features.append(quantum_circuit(padded_sample))
    
    return np.array(quantum_features)

# Evaluation function
def evaluate_classifier(clf, X_train, X_test, y_train, y_test, name):
    start_fit = time.time()
    clf.fit(X_train, y_train)
    fit_time = time.time() - start_fit
    
    start_pred = time.time()
    y_pred = clf.predict(X_test)
    pred_time = time.time() - start_pred
    
    # Modified metrics calculation for multi-class
    metrics = {
        'Classifier': name,
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred, average='weighted'),
        'Recall': recall_score(y_test, y_pred, average='weighted'),
        'F1': f1_score(y_test, y_pred, average='weighted'),
        #'ROC AUC': roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1]) if len(np.unique(y)) == 2 else roc_auc_score(y_test, clf.predict_proba(X_test), multi_class='ovr', average='weighted'),
        'Kappa': cohen_kappa_score(y_test, y_pred),
        'Fit Time': fit_time,
        'Predict Time': pred_time
    }
    return metrics



In [14]:
def main():
    # Load and preprocess data
    X, y = load_and_preprocess_data('TestbedSunJun13Flows.csv')
    from imblearn.under_sampling import RandomUnderSampler

    # Add this after X, y = load_and_preprocess_data('TestbedSunJun13Flows.csv')
    # and before train_test_split
    
    print("Original class distribution:")
    print(pd.Series(y).value_counts())
    
    # Perform undersampling
    undersampler = RandomUnderSampler(random_state=42)
    X_balanced, y_balanced = undersampler.fit_resample(X, y)
    
    print("Balanced class distribution:")
    print(pd.Series(y_balanced).value_counts())
    
    # Update your X and y variables
    X = X_balanced
    y = y_balanced
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Define classifiers
    classifiers = [
        ('Extra Trees', ExtraTreesClassifier(n_estimators=100, random_state=42)),
        ('XGBoost', xgb.XGBClassifier(random_state=42)),
        ('Random Forest', RandomForestClassifier(n_estimators=100, random_state=42)),
        ('CatBoost', CatBoostClassifier(random_state=42, verbose=False)),
        ('AdaBoost', AdaBoostClassifier(random_state=42)),    
        ('Gradient Boosting', GradientBoostingClassifier(random_state=42)),
        ('Logistic Regression', LogisticRegression(random_state=42)),
        ('KNN', KNeighborsClassifier()),
        ('Decision Tree', DecisionTreeClassifier(random_state=42))
        
        
    ]
    
    # Classical ML evaluation
    classical_results = []
    print("Evaluating Classical ML Models...")
    for name, clf in tqdm(classifiers):
        metrics = evaluate_classifier(clf, X_train, X_test, y_train, y_test, f"Classical {name}")
        classical_results.append(metrics)
    
    # Quantum feature extraction
     # Calculate minimum required qubits
    n_features = X.shape[1]
    n_qubits = int(np.ceil(np.log2(n_features)))
    
    
    n_qubits = int(np.ceil(np.log2(X.shape[1])))
    print(f"Using {n_qubits} qubits for quantum circuit")
    
    X_train_quantum = quantum_feature_extraction(X_train, n_qubits)
    X_test_quantum = quantum_feature_extraction(X_test, n_qubits)
    
    # Quantum ML evaluation
    quantum_results = []
    print("Evaluating Quantum ML Models...")
    for name, clf in tqdm(classifiers):
        metrics = evaluate_classifier(clf, X_train_quantum, X_test_quantum, y_train, y_test, f"Quantum {name}")
        quantum_results.append(metrics)
    
    # Combine and display results
    all_results = pd.DataFrame(classical_results + quantum_results)
    print("\nResults:")
    print(all_results.to_string(index=False))
    
    
    # Save results
    # Save results
    all_results.to_csv('5th dataset amplitude.csv', index=False)

if __name__ == "__main__":
    main()


Original class distribution:
1    255170
0     20358
Name: count, dtype: int64
Balanced class distribution:
0    20358
1    20358
Name: count, dtype: int64
Evaluating Classical ML Models...


100%|████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:43<00:00,  4.85s/it]


Using 4 qubits for quantum circuit


Quantum Feature Extraction: 100%|███████████████████████████████████████████████| 32572/32572 [04:14<00:00, 127.95it/s]
Quantum Feature Extraction: 100%|█████████████████████████████████████████████████| 8144/8144 [01:03<00:00, 128.36it/s]


Evaluating Quantum ML Models...


100%|████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:48<00:00,  5.35s/it]


Results:
                   Classifier  Accuracy  Precision   Recall       F1    Kappa  Fit Time  Predict Time
        Classical Extra Trees  0.999140   0.999140 0.999140 0.999140 0.998281  1.269210      0.096744
            Classical XGBoost  0.999263   0.999263 0.999263 0.999263 0.998526  0.294805      0.015619
      Classical Random Forest  0.999263   0.999264 0.999263 0.999263 0.998526  2.354936      0.047309
           Classical CatBoost  0.999140   0.999141 0.999140 0.999140 0.998281 28.756250      0.081601
           Classical AdaBoost  0.997544   0.997545 0.997544 0.997544 0.995088  1.799892      0.033443
  Classical Gradient Boosting  0.999140   0.999142 0.999140 0.999140 0.998281  7.181824      0.000000
Classical Logistic Regression  0.978757   0.978765 0.978757 0.978758 0.957513  0.099254      0.015662
                Classical KNN  0.995825   0.995829 0.995825 0.995825 0.991650  0.016457      1.333194
      Classical Decision Tree  0.998772   0.998772 0.998772 0.998772 0.9


