# **1. Data Loading**


In [2]:
!pip install scikit-learn



In [3]:
!pip install imbalanced-learn



In [4]:
import pickle
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

# Load monitored and unmonitored data
print("Loading data...")
with open("mon_standard.pkl", 'rb') as fi:
    mon_data = pickle.load(fi)

with open("unmon_standard10_3000.pkl", 'rb') as fi:
    unmon_data = pickle.load(fi)

Loading data...


# **2. Data Preprocessing**


In [6]:
def process_data(data, label_value):
    """
    Preprocess the data to extract features and labels.
    data: The raw data (dictionary or list).
    label_value: The label for the data (e.g., 1 for monitored, -1 for unmonitored).
    return: (X1, X2, y)
    """
    X1, X2, y = [], [], []
    if isinstance(data, dict):
        for samples in data.values():
            for sample in samples:
                X1.append([abs(c) for c in sample])
                X2.append([(1 if c > 0 else -1) * 512 for c in sample])
                y.append(label_value)
    elif isinstance(data, list):
        for sample in data:
            X1.append([abs(c) for c in sample])
            X2.append([(1 if c > 0 else -1) * 512 for c in sample])
            y.append(label_value)
    return X1, X2, y

# Preprocess monitored and unmonitored data
X1_mon, X2_mon, y_mon = process_data(mon_data, label_value=1)
X1_unmon, X2_unmon, y_unmon = process_data(unmon_data, label_value=-1)

# Combine monitored and unmonitored data
X1 = X1_mon + X1_unmon
X2 = X2_mon + X2_unmon
y = y_mon + y_unmon

# **3. Feature Engineering**


In [8]:
from itertools import groupby

# Initialize a list to store newly generated features
features = []

# Compute features for each data sample
for i in range(len(X2)):
    size_seq = X2[i]
    time_seq = X1[i]
    
    # 1. Number of incoming packets
    num_incoming_packets = sum(1 for size in size_seq if size < 0)
    
    # 2. Total number of packets
    num_total_packets = len(size_seq)
    
    # 3. Number of outgoing packets
    num_outgoing_packets = sum(1 for size in size_seq if size > 0)
    
    # 4. Ratio of outgoing packets to total packets
    ratio_outgoing = num_outgoing_packets / num_total_packets if num_total_packets > 0 else 0
    
    # 5. Ratio of incoming packets to total packets
    ratio_incoming = num_incoming_packets / num_total_packets if num_total_packets > 0 else 0
    
    # 6. Ratio of incoming packets in the first 30 packets to total incoming packets
    incoming_first_30 = sum(1 for size in size_seq[:30] if size < 0) / 30 if num_total_packets >= 30 else 0
    
    # 7. Ratio of incoming packets in the first 30 packets to total packets
    incoming_first_30_all = sum(1 for size in size_seq[:30] if size < 0) / 30 if num_total_packets >= 30 else 0
    
    # 8. Maximum number of packets in an outgoing burst
    burst_outgoing_max = max(len(list(g)) for k, g in groupby(size_seq) if k > 0) if num_outgoing_packets > 0 else 0
    
    # 9. Ratio of outgoing packets in the first 30 packets to total packets
    outgoing_first_30 = sum(1 for size in size_seq[:30] if size > 0) / 30 if num_total_packets >= 30 else 0
    
    # 10. Ratio of outgoing packets in the first 30 packets to total outgoing packets
    outgoing_first_30_all = sum(1 for size in size_seq[:30] if size > 0) / 30 if num_total_packets >= 30 else 0
    
    # 11. Number of packets in the last 5 seconds
    last_5_seconds_packets = len([size for size in size_seq[-5:] if size != 0])
    
    # 12. Standard deviation of the number of packets in outgoing bursts
    outgoing_burst_std = np.std([len(list(g)) for k, g in groupby(size_seq) if k > 0])
    
    # 13. Average number of packets in outgoing bursts
    outgoing_burst_avg = np.mean([len(list(g)) for k, g in groupby(size_seq) if k > 0])
    
    # 14. Total number of outgoing bursts
    num_outgoing_bursts = len([1 for k, g in groupby(size_seq) if k > 0])
    
    # 15. Total number of incoming bursts
    num_incoming_bursts = len([1 for k, g in groupby(size_seq) if k < 0])
    
    # 16. Number of incoming packets in bursts during the first 5 seconds
    incoming_burst_first_5_seconds = sum(1 for size in size_seq[:5] if size < 0)
    
    # 17. Number of outgoing packets in bursts during the first 5 seconds
    outgoing_burst_first_5_seconds = sum(1 for size in size_seq[:5] if size > 0)
    
    # 18. Average number of incoming packets per second
    incoming_per_second_avg = num_incoming_packets / (len(time_seq) / 60) if len(time_seq) > 0 else 0
    
    # 19. Ratio of incoming packets in the first 5 packets to total packets
    incoming_first_5 = sum(1 for size in size_seq[:5] if size < 0) / 5 if num_total_packets >= 5 else 0
    
    # 20. Ratio of outgoing packets in the first 5 packets to total packets
    outgoing_first_5 = sum(1 for size in size_seq[:5] if size > 0) / 5 if num_total_packets >= 5 else 0
    
    # 21. Total loading time of the website
    total_time = sum(time_seq)
    
    # 22. Ratio of incoming packets in the first 5 packets to total incoming packets
    incoming_first_5_all = sum(1 for size in size_seq[:5] if size < 0) / 5 if num_total_packets >= 5 else 0
    
    # 23. Average time interval between packets during the last 5 seconds
    last_5_seconds_time_intervals = np.mean([time_seq[i+1] - time_seq[i] for i in range(len(time_seq)-1) if time_seq[i] > 0]) if len(time_seq) > 1 else 0
    
    # 24. Ratio of outgoing packets in the first 5 packets to total outgoing packets
    outgoing_first_5_all = sum(1 for size in size_seq[:5] if size > 0) / 5 if num_total_packets >= 5 else 0

    # Append all calculated features for the current sample to the features list
    features.append([
        num_incoming_packets, num_total_packets, num_outgoing_packets,
        ratio_outgoing, ratio_incoming, incoming_first_30, incoming_first_30_all,
        burst_outgoing_max, outgoing_first_30, outgoing_first_30_all, last_5_seconds_packets,
        outgoing_burst_std, outgoing_burst_avg, num_outgoing_bursts, num_incoming_bursts,
        incoming_burst_first_5_seconds, outgoing_burst_first_5_seconds, incoming_per_second_avg,
        incoming_first_5, outgoing_first_5, total_time, incoming_first_5_all,
        last_5_seconds_time_intervals, outgoing_first_5_all
    ])

# 'features' contains the dataset with 24 calculated features
X = np.array(features)

# 'y' contains the labels for each sample
y = np.array(y)


# **4. Data Splitting**


In [10]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


In [11]:
# Data Distribution
print("Training Data Distribution:")
print(pd.Series(y_train).value_counts())
print("Test Data Distribution:")
print(pd.Series(y_test).value_counts())

Training Data Distribution:
 1    15200
-1     2400
Name: count, dtype: int64
Test Data Distribution:
 1    3800
-1     600
Name: count, dtype: int64


# **5. Scaling the Data**


In [13]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# **6. Model training**


In [15]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier

# Configure the default Random Forest model (before hyperparameter tuning)
rf_model = rf_model_default = RandomForestClassifier(random_state=42)

# Configure the default Gradient Boosting model (before hyperparameter tuning)
gb_model = gb_model_default = GradientBoostingClassifier(random_state=42)

# Combine the models using VotingClassifier
ensemble_model_default = VotingClassifier(
    estimators=[('Random Forest', rf_model_default), ('Gradient Boosting', gb_model_default)],
    voting='soft'
)

# Train the ensemble model with default settings
print("Training the ensemble model with default hyperparameters...")
ensemble_model_default.fit(X_train, y_train)

Training the ensemble model with default hyperparameters...


# **7. Model Evaluation**


In [17]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Evaluate the ensemble model
print("Evaluating the ensemble model...")
y_pred = ensemble_model_default.predict(X_test)

# Display evaluation metrics
print("\nFinal Evaluation:")
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(f"Confusion Matrix:\n{confusion_matrix(y_test, y_pred)}")
print(f"Classification Report:\n{classification_report(y_test, y_pred)}")


Evaluating the ensemble model...

Final Evaluation:
Accuracy: 0.9072727272727272
Confusion Matrix:
[[ 212  388]
 [  20 3780]]
Classification Report:
              precision    recall  f1-score   support

          -1       0.91      0.35      0.51       600
           1       0.91      0.99      0.95      3800

    accuracy                           0.91      4400
   macro avg       0.91      0.67      0.73      4400
weighted avg       0.91      0.91      0.89      4400



In [18]:
from sklearn.metrics import precision_score, recall_score, roc_auc_score, roc_curve, precision_recall_curve, auc

fpr, tpr, thresholds = roc_curve(y_test, y_pred, pos_label=1)
precision = precision_score(y_test, y_pred, pos_label=1)
recall = recall_score(y_test, y_pred, pos_label=1)
roc_auc = auc(fpr, tpr)
precision_values, recall_values, thresholds_pr = precision_recall_curve(y_test, y_pred, pos_label=1)
pr_auc = auc(recall_values, precision_values)

print(f"True Positive Rate (TPR): {tpr[1]}")
print(f"False Positive Rate (FPR): {fpr[1]}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"ROC AUC: {roc_auc}")
print(f"Precision-Recall AUC: {pr_auc}")

True Positive Rate (TPR): 0.9947368421052631
False Positive Rate (FPR): 0.6466666666666666
Precision: 0.9069097888675623
Recall: 0.9947368421052631
ROC AUC: 0.6740350877192982
Precision-Recall AUC: 0.95309604275914
