In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
import xgboost as xgb
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, cohen_kappa_score
import pennylane as qml
from tqdm import tqdm
import time

# Define classifiers
CLASSIFIERS = {
    'RandomForest': RandomForestClassifier(n_estimators=100, random_state=42),
    'ExtraTrees': ExtraTreesClassifier(n_estimators=100, random_state=42),
    'XGBoost': xgb.XGBClassifier(n_estimators=100, random_state=42)
    
    'CatBoost': CatBoostClassifier(random_state=42, verbose=False),
    'AdaBoost': AdaBoostClassifier(random_state=42),
    
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    
    'Logistic Regression': LogisticRegression(random_state=42),
    'KNN': KNeighborsClassifier(),
    'Decision Tree': DecisionTreeClassifier(random_state=42)
}

def load_and_preprocess_data(filepath):
    data = pd.read_csv(filepath)
    X = data.iloc[:, :23]
    y = data.iloc[:, 23]
    
    le = LabelEncoder()
    y = le.fit_transform(y)
    
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    
    # Normalize to range [-π, π] for angle embedding
    X = np.pi * (X - X.min()) / (X.max() - X.min())
    
    return X, y

def create_quantum_embedding_circuit(n_qubits):
    dev = qml.device("default.qubit", wires=n_qubits)
    
    @qml.qnode(dev)
    def quantum_circuit(inputs):
        # Angle embedding for normalized features
        qml.AngleEmbedding(inputs, wires=range(n_qubits), rotation='X')
        
        # Add entangling layers for quantum advantage
        for i in range(2):  # Two layers of entanglement
            for j in range(n_qubits-1):
                qml.CNOT(wires=[j, j+1])
            qml.RY(np.pi/4, wires=n_qubits-1)
            
        # Measure in computational basis
        return [qml.expval(qml.PauliZ(i)) for i in range(n_qubits)]
    
    return quantum_circuit

def apply_quantum_embedding(X, n_qubits):
    quantum_circuit = create_quantum_embedding_circuit(n_qubits)
    X_quantum = []
    
    for sample in tqdm(X, desc=f"Quantum embedding (qubits={n_qubits})"):
        # Ensure we only use the first n_qubits features
        sample_truncated = sample[:n_qubits]
        embedded_sample = quantum_circuit(sample_truncated)
        X_quantum.append(embedded_sample)
    
    return np.array(X_quantum)

def evaluate_model(model, X_train, X_test, y_train, y_test):
    start_time = time.time()
    model.fit(X_train, y_train)
    fit_time = time.time() - start_time
    
    start_time = time.time()
    y_pred = model.predict(X_test)
    predict_time = time.time() - start_time
    
    metrics = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred),
        'F1': f1_score(y_test, y_pred),
        'ROC AUC': roc_auc_score(y_test, y_pred),
        'Kappa': cohen_kappa_score(y_test, y_pred),
        'Fit Time': fit_time,
        'Predict Time': predict_time
    }
    
    return metrics



In [4]:
def main():
    X, y = load_and_preprocess_data('data-30s.csv')
    pca_dims = [2,10,15]
    results = []
    
    for pca_dim in pca_dims:
        print(f"\nProcessing PCA dimension: {pca_dim}")
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        
        # Apply quantum embedding
        X_train_quantum = apply_quantum_embedding(X_train[:, :pca_dim], pca_dim)
        X_test_quantum = apply_quantum_embedding(X_test[:, :pca_dim], pca_dim)
        
        for clf_name, clf in CLASSIFIERS.items():
            # Classical version
            clf_classical = clf.__class__(**clf.get_params())
            metrics = evaluate_model(clf_classical, X_train[:, :pca_dim], X_test[:, :pca_dim], 
                                  y_train, y_test)
            metrics['Model'] = f'{clf_name}_Classical_PCA{pca_dim}'
            metrics['PCA_Dim'] = pca_dim
            results.append(metrics)
            
            # Quantum version
            clf_quantum = clf.__class__(**clf.get_params())
            metrics = evaluate_model(clf_quantum, X_train_quantum, X_test_quantum, 
                                  y_train, y_test)
            metrics['Model'] = f'{clf_name}_Quantum_PCA{pca_dim}'
            metrics['PCA_Dim'] = pca_dim
            results.append(metrics)
    
    results_df = pd.DataFrame(results)
    print("\nResults:")
    print(results_df.to_string())
    
    # Save results with timestamp
    timestamp = time.strftime("%Y%m%d-%H%M%S")
    results_df.to_csv(f'quantum_ml_results_{timestamp}.csv', index=False)

if __name__ == "__main__":
    main()



Processing PCA dimension: 2


Quantum embedding (qubits=2): 100%|█████████████████████████████████████████████| 11720/11720 [00:39<00:00, 300.17it/s]
Quantum embedding (qubits=2): 100%|███████████████████████████████████████████████| 2931/2931 [00:09<00:00, 296.16it/s]



Results:
   Accuracy  Precision    Recall        F1   ROC AUC     Kappa  Fit Time  Predict Time                        Model  PCA_Dim
0  0.737291   0.738247  0.764474  0.751131  0.736241  0.473131  1.925661      0.066837  RandomForest_Classical_PCA2        2
1  0.619243   0.626566  0.657895  0.641849  0.617750  0.235955  2.107771      0.096200    RandomForest_Quantum_PCA2        2
2  0.734220   0.734345  0.763816  0.748791  0.733077  0.466878  1.135880      0.110052    ExtraTrees_Classical_PCA2        2
3  0.618901   0.627129  0.653947  0.640258  0.617548  0.235484  1.278682      0.081920      ExtraTrees_Quantum_PCA2        2
4  0.748550   0.741220  0.791447  0.765511  0.746893  0.495082  0.136180      0.015631       XGBoost_Classical_PCA2        2
5  0.657455   0.666667  0.678947  0.672751  0.656625  0.313472  0.109375      0.015625         XGBoost_Quantum_PCA2        2


In [6]:
df=pd.read_csv('data-30s.csv')

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14651 entries, 0 to 14650
Data columns (total 24 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   duration            14651 non-null  float64
 1   total_fiat          14651 non-null  float64
 2   total_biat          14651 non-null  float64
 3   min_fiat            14651 non-null  float64
 4   min_biat            14651 non-null  float64
 5   max_fiat            14651 non-null  float64
 6   max_biat            14651 non-null  float64
 7   mean_fiat           14651 non-null  float64
 8   mean_biat           14651 non-null  float64
 9   flowPktsPerSecond   14651 non-null  float64
 10  flowBytesPerSecond  14651 non-null  float64
 11  min_flowiat         14651 non-null  float64
 12  max_flowiat         14651 non-null  float64
 13  mean_flowiat        14651 non-null  float64
 14  std_flowiat         14651 non-null  float64
 15  min_active          14651 non-null  float64
 16  mean