In [1]:
import pennylane as qml
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder, Normalizer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, cohen_kappa_score
import time
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

def preprocess_data(filepath, n_features=20):
    df = pd.read_csv(filepath)
    
    feature_cols = df.columns[:n_features]
    label_col = df.columns[-1]
    
    X = df[feature_cols].copy()
    y = df[label_col].copy()
    
    for col in X.columns:
        if X[col].dtype == 'object':
            if X[col].str.contains('.').any() and X[col].str.match(r'^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$').any():
                X[col] = X[col].apply(lambda x: sum(int(octet) * (256 ** (3-i)) for i, octet in enumerate(x.split('.'))))
            else:
                X[col] = pd.to_numeric(X[col], errors='coerce')
    
    X = X.replace([np.inf, -np.inf], np.nan)
    X = X.fillna(X.mean())
    
    # L2 normalization for amplitude embedding (unit vector)
    normalizer = Normalizer(norm='l2')
    X = normalizer.fit_transform(X)
    
    le = LabelEncoder()
    y = le.fit_transform(y)
    
    return X, y, le.classes_

def create_amplitude_embedding_circuit(n_qubits):
    # For amplitude embedding, we need 2^n_qubits features
    dev = qml.device("default.qubit", wires=n_qubits)
    
    @qml.qnode(dev)
    def amplitude_embedding_circuit(features):
        # Amplitude embedding
        qml.AmplitudeEmbedding(features, wires=range(n_qubits), normalize=True)
        
        # Add entangling layers
        for i in range(n_qubits-1):
            qml.CNOT(wires=[i, i+1])
        
        # Add parametric gates
        for i in range(n_qubits):
            qml.RY(np.pi/2, wires=i)
            qml.RZ(np.pi/4, wires=i)
        
        return [qml.expval(qml.PauliZ(i)) for i in range(n_qubits)]
    
    return amplitude_embedding_circuit

def pad_features(X, n_qubits):
    target_size = 2**n_qubits
    if X.shape[1] < target_size:
        padding = np.zeros((X.shape[0], target_size - X.shape[1]))
        return np.hstack((X, padding))
    return X[:, :target_size]

def quantum_feature_extraction(X, n_qubits):
    circuit = create_amplitude_embedding_circuit(n_qubits)
    quantum_features = []
    batch_size = 100
    
    # Pad features to match required dimension for amplitude embedding
    X_padded = pad_features(X, n_qubits)
    
    for i in tqdm(range(0, len(X_padded), batch_size), desc="Quantum Processing"):
        batch = X_padded[i:i + batch_size]
        batch_features = []
        
        for sample in batch:
            # Ensure normalization for quantum state preparation
            normalized_sample = sample / np.linalg.norm(sample)
            quantum_output = circuit(normalized_sample)
            batch_features.append(quantum_output)
            
        quantum_features.extend(batch_features)
    
    return np.array(quantum_features)

def evaluate_classifier(clf, X_train, X_test, y_train, y_test):
    start_fit = time.time()
    clf.fit(X_train, y_train)
    fit_time = time.time() - start_fit
    
    start_pred = time.time()
    y_pred = clf.predict(X_test)
    y_pred_proba = clf.predict_proba(X_test)
    pred_time = time.time() - start_pred
    
    metrics = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred, average='weighted', zero_division=1),
        'Recall': recall_score(y_test, y_pred, average='weighted', zero_division=1),
        'F1': f1_score(y_test, y_pred, average='weighted', zero_division=1),
        'ROC AUC': roc_auc_score(y_test, y_pred_proba, multi_class='ovr'),
        'Kappa': cohen_kappa_score(y_test, y_pred),
        'Fit Time': fit_time,
        'Predict Time': pred_time
    }
    return metrics



In [2]:
def main():
    X, y, classes = preprocess_data("Scenario-B-merged_5s.csv")
    
    classifiers = {
        'Extra Trees': ExtraTreesClassifier(n_estimators=100, random_state=42, class_weight='balanced'),
        'XGBoost': XGBClassifier(random_state=42, scale_pos_weight=1),
        'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
    }
    
    # For amplitude embedding, we need log2(features) qubits
    pca_dimensions = [2, 10, 15]
    results = []
    
    for n_components in pca_dimensions:
        pca = PCA(n_components=n_components)
        X_pca = pca.fit_transform(X)
        
        X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=42)
        
        for clf_name, clf in classifiers.items():
            metrics = evaluate_classifier(clf, X_train, X_test, y_train, y_test)
            metrics.update({
                'Method': 'Classical',
                'PCA_dim': n_components,
                'Classifier': clf_name
            })
            results.append(metrics)
        
        # Quantum evaluation with amplitude embedding
        n_qubits = int(np.ceil(np.log2(n_components)))  # Number of qubits needed
        X_quantum = quantum_feature_extraction(X_pca, n_qubits)
        X_train_q, X_test_q, y_train_q, y_test_q = train_test_split(X_quantum, y, test_size=0.2, random_state=42)
        
        for clf_name, clf in classifiers.items():
            metrics = evaluate_classifier(clf, X_train_q, X_test_q, y_train_q, y_test_q)
            metrics.update({
                'Method': 'Quantum-Amplitude',
                'PCA_dim': n_components,
                'Classifier': clf_name
            })
            results.append(metrics)
    
    results_df = pd.DataFrame(results)
    print(results_df.to_string())
    results_df.to_csv('quantum_amplitude_classical_comparison_results.csv', index=False)

if __name__ == "__main__":
    main()


Quantum Processing: 100%|████████████████████████████████████████████████████████████| 146/146 [00:48<00:00,  3.02it/s]
Quantum Processing: 100%|████████████████████████████████████████████████████████████| 146/146 [02:03<00:00,  1.18it/s]
Quantum Processing: 100%|████████████████████████████████████████████████████████████| 146/146 [01:27<00:00,  1.68it/s]


    Accuracy  Precision    Recall        F1   ROC AUC     Kappa   Fit Time  Predict Time             Method  PCA_dim     Classifier
0   0.795658   0.795349  0.795658  0.795106  0.913196  0.749992   2.670015      1.805645          Classical        2    Extra Trees
1   0.832874   0.831859  0.832874  0.827287  0.974197  0.794296   9.741848      0.065489          Classical        2        XGBoost
2   0.796003   0.795393  0.796003  0.795449  0.948774  0.750353   2.033080      0.122273          Classical        2  Random Forest
3   0.756030   0.757108  0.756030  0.756527  0.851107  0.701936   1.004988      0.186307  Quantum-Amplitude        2    Extra Trees
4   0.817712   0.818615  0.817712  0.806175  0.964846  0.775309   0.942881      0.032178  Quantum-Amplitude        2        XGBoost
5   0.758787   0.759796  0.758787  0.759211  0.909187  0.705287   1.743805      0.095838  Quantum-Amplitude        2  Random Forest
6   0.900414   0.901724  0.900414  0.900558  0.981738  0.878182   1.176296  

In [4]:


X, y, classes = preprocess_data("Scenario-B-merged_5s.csv")


pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)
        
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=42)

In [8]:
len(X_test)

2902