In [1]:
import pandas as pd
import numpy as np
import socket
import struct
import pennylane as qml
import base64
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier, 
                            ExtraTreesClassifier, GradientBoostingClassifier)
# from lightgbm import LGBMClassifier
# from xgboost import XGBClassifier

from pathlib import Path
import json
from collections import defaultdict
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, roc_auc_score

In [2]:
df = pd.read_csv(r"CSV/TestbedSunJun13Flows.csv")
df.shape


(275528, 21)

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 275528 entries, 0 to 275527
Data columns (total 21 columns):
 #   Column                          Non-Null Count   Dtype 
---  ------                          --------------   ----- 
 0   generated                       275528 non-null  object
 1   appName                         275528 non-null  object
 2   totalSourceBytes                275528 non-null  int64 
 3   totalDestinationBytes           275528 non-null  int64 
 4   totalDestinationPackets         275528 non-null  int64 
 5   totalSourcePackets              275528 non-null  int64 
 6   sourcePayloadAsBase64           123604 non-null  object
 7   sourcePayloadAsUTF              123388 non-null  object
 8   destinationPayloadAsBase64      118724 non-null  object
 9   destinationPayloadAsUTF         118696 non-null  object
 10  direction                       275528 non-null  object
 11  sourceTCPFlagsDescription       220704 non-null  object
 12  destinationTCPFlagsDescription

In [4]:
# Drop payload columns
payload_columns = [
    "sourcePayloadAsBase64", "sourcePayloadAsUTF",
    "destinationPayloadAsBase64", "destinationPayloadAsUTF"
]
df.drop(columns=payload_columns, inplace=True)

In [5]:
# Convert labels
df["Label"] = df["Label"].map({"Normal": 0, "Attack": 1})

In [6]:
# import pandas as pd
# import numpy as np
# from sklearn.utils import resample

# def undersample_dataset(df, class_column):
#     """
#     Undersample the majority classes to match the minority class size.
    
#     Parameters:
#     -----------
#     df : pandas.DataFrame
#         The imbalanced dataset
#     class_column : str
#         The name of the column containing class labels
        
#     Returns:
#     --------
#     pandas.DataFrame
#         The balanced dataset
#     """
#     # Get the class distribution
#     class_counts = df[class_column].value_counts()
    
#     # Find the minority class and its count
#     minority_class = class_counts.idxmin()
#     minority_count = class_counts.min()
    
#     print(f"Minority class: {minority_class} with {minority_count} samples")
    
#     # Create a list to store the balanced data
#     balanced_dfs = []
    
#     # Add all samples from the minority class
#     minority_df = df[df[class_column] == minority_class]
#     balanced_dfs.append(minority_df)
    
#     # Undersample each majority class
#     for cls in class_counts.index:
#         if cls != minority_class:
#             # Get all samples from this class
#             class_df = df[df[class_column] == cls]
#             # Undersample to match minority class size
#             undersampled_df = resample(class_df, 
#                                       replace=False,  # sample without replacement
#                                       n_samples=minority_count,  # match minority class
#                                       random_state=42)  # reproducible results
#             balanced_dfs.append(undersampled_df)
    
#     # Combine all balanced classes
#     balanced_df = pd.concat(balanced_dfs)
    
#     # Shuffle the dataset
#     balanced_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)
    
#     print(f"Original dataset shape: {df.shape}")
#     print(f"Balanced dataset shape: {balanced_df.shape}")
#     print("New class distribution:")
#     print(balanced_df[class_column].value_counts())
    
#     return balanced_df

# # Example usage
# if __name__ == "__main__":
    
    
    
    
#     # Print original class distribution
#     print("Original class distribution:")
#     print(df['Label'].value_counts())
    
#     # Undersample to balance the dataset
#     balanced_df = undersample_dataset(df, 'Label')
#     df = balanced_df
#     # Save the balanced dataset if needed
#     # balanced_df.to_csv('balanced_dataset.csv', index=False)


In [7]:
import pandas as pd
from sklearn.utils import resample


# Check the unique counts in the label column
print(df['Label'].value_counts())

# Separate classes
class_counts = df['Label'].value_counts()
minority_class = class_counts.idxmin()  # Class with fewer samples
majority_class = class_counts.idxmax()  # Class with more samples

df_minority = df[df['Label'] == minority_class]
df_majority = df[df['Label'] == majority_class]

# Undersample both classes to 70 records each
df_minority_undersampled = resample(df_minority, 
                                    replace=False, 
                                    n_samples=100, 
                                    random_state=46)

df_majority_undersampled = resample(df_majority, 
                                    replace=False, 
                                    n_samples=100, 
                                    random_state=48)

# Combine the undersampled data
df_balanced = pd.concat([df_minority_undersampled, df_majority_undersampled])

# Shuffle the dataset to mix the classes
df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

# Check new class distribution
print(df_balanced['Label'].value_counts())


df = df_balanced

Label
0    255170
1     20358
Name: count, dtype: int64
Label
1    100
0    100
Name: count, dtype: int64


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 17 columns):
 #   Column                          Non-Null Count  Dtype 
---  ------                          --------------  ----- 
 0   generated                       200 non-null    object
 1   appName                         200 non-null    object
 2   totalSourceBytes                200 non-null    int64 
 3   totalDestinationBytes           200 non-null    int64 
 4   totalDestinationPackets         200 non-null    int64 
 5   totalSourcePackets              200 non-null    int64 
 6   direction                       200 non-null    object
 7   sourceTCPFlagsDescription       177 non-null    object
 8   destinationTCPFlagsDescription  170 non-null    object
 9   source                          200 non-null    object
 10  protocolName                    200 non-null    object
 11  sourcePort                      200 non-null    int64 
 12  destination                     200 non-null    ob

In [9]:
# Select numeric features
numeric_cols = df.select_dtypes(include=['int64']).columns
X = df[numeric_cols].drop(columns="Label")
y = df['Label']


In [10]:
# import pennylane as qml
# n_features = X.shape[1]
# N = int(np.ceil(np.log2(n_features)))
# wires = range(N)
# dev = qml.device('lightning.qubit', wires)    

# @qml.qnode(dev)
# def circuit(f=None):
#     qml.AmplitudeEmbedding(f, wires=wires,pad_with=0,normalize=True)
#     return qml.state()
# X_norm = X.values
# X_quantum = circuit(X_norm)
# X_real = np.real(np.array(X_quantum))
# # Create column names based on index
# column_names = [f'feature_{i}' for i in range(X_real.shape[1])]
# X_real = pd.DataFrame(X_real, columns=column_names)

# X = X_real


In [11]:
# import pennylane as qml
# from pennylane import numpy as np

# N = X.shape[1]
# wires = range(N)
# dev = qml.device("default.qubit", wires)

# @qml.qnode(dev)
# def circuit(val_list):
#     qml.AngleEmbedding(val_list, wires, rotation="Y")
#     return [qml.expval(qml.PauliZ(w)) for w in wires]

# # Function to process DataFrame through quantum circuit
# def quantum_transform(df):
#     # Convert DataFrame to numpy array
#     values = df.values
#     # Process each row through quantum circuit
#     quantum_features = np.array([circuit(row) for row in values])
#     # Remove tensor properties and convert to regular numpy array
#     quantum_features = np.array(quantum_features).astype(float)
#     return quantum_features
# # Transform your data
# X_real = quantum_transform(X)

# quantum_cols = [f'quantum_state_{i}' for i in range(len(X_real[0]))]
# X_real = pd.DataFrame(X_real, columns=quantum_cols)
# X_real.head()
# X = X_real

In [12]:
X.shape

(200, 6)

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)  

In [14]:
# from sklearn.preprocessing import StandardScaler
# scaler = StandardScaler()

# # Fit the scaler on the training data and transform it
# X_train = scaler.fit_transform(X_train)

# # Transform the test data using the same scaler
# X_test = scaler.transform(X_test)

In [15]:
import time
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score, roc_auc_score, cohen_kappa_score
from sklearn.model_selection import train_test_split
# import xgboost as xgb
# import lightgbm as lgb
import seaborn as sns
import matplotlib.pyplot as plt
# Create a dictionary of models to evaluate
models = {
    "SVM (Linear)": SVC(kernel="linear", random_state=42),
    "SVM (Poly)": SVC(kernel="poly", random_state=42),
    "SVM (RBF)": SVC(kernel="rbf", random_state=42),
    "SVM (Sigmoid)": SVC(kernel="sigmoid", random_state=42),
    # "KNN": KNeighborsClassifier(),
    # "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    # "AdaBoost": AdaBoostClassifier(random_state=42),
    # "Extra Trees": ExtraTreesClassifier(n_estimators=100, random_state=42),
    # "XGBoost": xgb.XGBClassifier(random_state=42),
    # "LightGBM": lgb.LGBMClassifier(random_state=42),
    # "Gradient Boosting": GradientBoostingClassifier(random_state=42)
}

# Function to calculate metrics
def evaluate_model(model, X_train, X_test, y_train, y_test):
    start_time = time.time()  # Track model fitting time
    
    # Train the model
    model.fit(X_train, y_train)
    
    # Predict
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1] if hasattr(model, 'predict_proba') else None

    # Get performance metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_proba) if y_proba is not None else None
    cohen_kappa = cohen_kappa_score(y_test, y_pred)
    
    # Running time
    end_time = time.time()
    runtime = end_time - start_time
    
    # Detailed classification report
    class_report = classification_report(y_test, y_pred)
    
    
    # Return all metrics
    return {
        "Model": model.__class__.__name__,
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1 Score": f1,
        "ROC AUC": roc_auc,
        "Cohen’s Kappa": cohen_kappa,
        "Running Time (s)": runtime,
    }

# Evaluating all models and storing results
results = []

for name, model in models.items():
    print(f"Evaluating model: {name}")
    result = evaluate_model(model, X_train, X_test, y_train, y_test)
    results.append(result)

# Convert results into a DataFrame
results_df = pd.DataFrame(results)

# Display all the results
print(results_df)

Evaluating model: SVM (Linear)
Evaluating model: SVM (Poly)
Evaluating model: SVM (RBF)
Evaluating model: SVM (Sigmoid)
  Model  Accuracy  Precision   Recall  F1 Score ROC AUC  Cohen’s Kappa  \
0   SVC  0.900000   0.882353  0.93750  0.909091    None       0.798206   
1   SVC  0.716667   0.826087  0.59375  0.690909    None       0.442013   
2   SVC  0.900000   0.882353  0.93750  0.909091    None       0.798206   
3   SVC  0.900000   0.882353  0.93750  0.909091    None       0.798206   

   Running Time (s)  
0        182.950845  
1          0.008114  
2          0.007639  
3          0.007912  


In [16]:
# results_df.to_csv('TestbedThuJun17Flows_ang.csv', index=False)


In [17]:
results_df


Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score,ROC AUC,Cohen’s Kappa,Running Time (s)
0,SVC,0.9,0.882353,0.9375,0.909091,,0.798206,182.950845
1,SVC,0.716667,0.826087,0.59375,0.690909,,0.442013,0.008114
2,SVC,0.9,0.882353,0.9375,0.909091,,0.798206,0.007639
3,SVC,0.9,0.882353,0.9375,0.909091,,0.798206,0.007912


In [18]:
import numpy as np

# Method 1: Using numpy unique with return_counts
unique_values, counts = np.unique(y, return_counts=True)
print(dict(zip(unique_values, counts)))

# Method 2: Using value_counts if y is a pandas series
print(pd.Series(y).value_counts())



{0: 100, 1: 100}
Label
1    100
0    100
Name: count, dtype: int64
