## 1. Data Load

In [2]:
import pickle

with open("mon_standard.pkl", 'rb') as fi:
    mon_data = pickle.load(fi)
    

with open("unmon_standard10_3000.pkl", 'rb') as fi:
    unmon_data = pickle.load(fi)
    
print("Data has been successfully loaded.")

Data has been successfully loaded.


## 2. Data Preprocess

In [3]:
# Function to process monitored data
def process_monitored(data, total_urls=950, use_sublabel=False, url_per_site=10):
    X1, X2, y = [], [], []
    for i in range(total_urls):
        label = i if use_sublabel else i // url_per_site
        for sample in data[i]:
            size_seq = []
            time_seq = []
            for c in sample:
                dr = 1 if c > 0 else -1
                time_seq.append(abs(c))
                size_seq.append(dr * 512)
            X1.append(time_seq)
            X2.append(size_seq)
            y.append(label)
    return X1, X2, y

# Function to process unmonitored data
def process_unmonitored(data, total_urls=3000):
    X1, X2 = [], []
    for i in range(total_urls):
        size_seq = []
        time_seq = []
        for c in data[i]:
            dr = 1 if c > 0 else -1
            time_seq.append(abs(c))
            size_seq.append(dr * 512)
        X1.append(time_seq)
        X2.append(size_seq)
    y = [-1] * len(X1)  # Label unmonitored data as -1
    return X1, X2, y

# Process monitored data
X1_mon, X2_mon, y_mon = process_monitored(
    mon_data, total_urls=950, use_sublabel=False, url_per_site=10
)

# Process unmonitored data
X1_unmon, X2_unmon, y_unmon = process_unmonitored(
    unmon_data, total_urls=3000
)

# Combine data
X1 = X1_mon + X1_unmon
X2 = X2_mon + X2_unmon
y = y_mon + y_unmon


## 3. Feature Extraction

In [4]:
import numpy as np
from itertools import groupby

# Initialize a list to store the newly generated features
features = []

# Compute each feature
for i in range(len(X2)):
    size_seq = X2[i]
    time_seq = X1[i]
    
    # 1. Number of incoming packets
    num_incoming_packets = sum(1 for size in size_seq if size < 0)
    
    # 2. Total number of packets
    num_total_packets = len(size_seq)
    
    # 3. Number of outgoing packets
    num_outgoing_packets = sum(1 for size in size_seq if size > 0)
    
    # 4. Ratio of outgoing packets to total packets
    ratio_outgoing = num_outgoing_packets / num_total_packets if num_total_packets > 0 else 0
    
    # 5. Ratio of incoming packets to total packets
    ratio_incoming = num_incoming_packets / num_total_packets if num_total_packets > 0 else 0
    
    # 6. Ratio of the first 30 incoming packets to 30 (relative to 30 packets)
    incoming_first_30 = sum(1 for size in size_seq[:30] if size < 0) / 30 if num_total_packets >= 30 else 0
    
    # 7. Ratio of the first 30 incoming packets to 30 (relative to all incoming packets)
    incoming_first_30_all = sum(1 for size in size_seq[:30] if size < 0) / 30 if num_total_packets >= 30 else 0
    
    # 8. Maximum number of packets in an outgoing burst
    burst_outgoing_max = max(len(list(g)) for k, g in groupby(size_seq) if k > 0) if num_outgoing_packets > 0 else 0
    
    # 9. Ratio of the first 30 outgoing packets to 30 (relative to 30 packets)
    outgoing_first_30 = sum(1 for size in size_seq[:30] if size > 0) / 30 if num_total_packets >= 30 else 0
    
    # 10. Ratio of the first 30 outgoing packets to 30 (relative to all outgoing packets)
    outgoing_first_30_all = sum(1 for size in size_seq[:30] if size > 0) / 30 if num_total_packets >= 30 else 0
    
    # 11. Number of packets in the last 5 seconds
    last_5_seconds_packets = len([size for size in size_seq[-5:] if size != 0])
    
    # 12. Standard deviation of packet counts in outgoing bursts
    outgoing_burst_std = np.std([len(list(g)) for k, g in groupby(size_seq) if k > 0])
    
    # 13. Average number of packets in outgoing bursts
    outgoing_burst_avg = np.mean([len(list(g)) for k, g in groupby(size_seq) if k > 0])
    
    # 14. Number of outgoing bursts
    num_outgoing_bursts = len([1 for k, g in groupby(size_seq) if k > 0])
    
    # 15. Number of incoming bursts
    num_incoming_bursts = len([1 for k, g in groupby(size_seq) if k < 0])
    
    # 16. Number of packets in incoming bursts during the first 5 seconds
    incoming_burst_first_5_seconds = sum(1 for size in size_seq[:5] if size < 0)
    
    # 17. Number of packets in outgoing bursts during the first 5 seconds
    outgoing_burst_first_5_seconds = sum(1 for size in size_seq[:5] if size > 0)
    
    # 18. Average number of incoming packets per second
    incoming_per_second_avg = num_incoming_packets / (len(time_seq) / 60) if len(time_seq) > 0 else 0
    
    # 19. Ratio of the first 5 incoming packets to 5 (relative to 5 packets)
    incoming_first_5 = sum(1 for size in size_seq[:5] if size < 0) / 5 if num_total_packets >= 5 else 0
    
    # 20. Ratio of the first 5 outgoing packets to 5 (relative to 5 packets)
    outgoing_first_5 = sum(1 for size in size_seq[:5] if size > 0) / 5 if num_total_packets >= 5 else 0
    
    # 21. Total loading time of the website (total time)
    total_time = sum(time_seq)
    
    # 22. Ratio of the first 5 incoming packets to 5 (relative to all incoming packets)
    incoming_first_5_all = sum(1 for size in size_seq[:5] if size < 0) / 5 if num_total_packets >= 5 else 0
    
    # 23. Average time interval of packets in the last 5 seconds
    last_5_seconds_time_intervals = np.mean([time_seq[i+1] - time_seq[i] for i in range(len(time_seq)-1) if time_seq[i] > 0]) if len(time_seq) > 1 else 0
    
    # 24. Ratio of the first 5 outgoing packets to 5 (relative to all outgoing packets)
    outgoing_first_5_all = sum(1 for size in size_seq[:5] if size > 0) / 5 if num_total_packets >= 5 else 0

    # Add all computed features to the features list
    features.append([
        num_incoming_packets, num_total_packets, num_outgoing_packets,
        ratio_outgoing, ratio_incoming, incoming_first_30, incoming_first_30_all,
        burst_outgoing_max, outgoing_first_30, outgoing_first_30_all, last_5_seconds_packets,
        outgoing_burst_std, outgoing_burst_avg, num_outgoing_bursts, num_incoming_bursts,
        incoming_burst_first_5_seconds, outgoing_burst_first_5_seconds, incoming_per_second_avg,
        incoming_first_5, outgoing_first_5, total_time, incoming_first_5_all,
        last_5_seconds_time_intervals, outgoing_first_5_all
    ])

# 'features' contains the dataset with 24 features
features = np.array(features)

# 'y' contains the labels for each sample
y = np.array(y)


## 4. Data split and downsampling

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample
import pandas as pd

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    features, y, test_size=0.2, random_state=42, stratify=y
)

# Separate monitored and unmonitored data from the training set
train_data = pd.DataFrame({
    'features': list(X_train),  # Convert features to list for DataFrame compatibility
    'label': y_train           # Use corresponding labels
})
train_monitored = train_data[train_data['label'] != -1]  # Filter monitored data
train_unmonitored = train_data[train_data['label'] == -1]  # Filter unmonitored data

# Downsample the monitored data to balance the dataset
train_monitored_downsampled = resample(
    train_monitored,
    replace=True,  # Allow sampling with replacement
    n_samples=len(train_unmonitored),  # Match the number of unmonitored samples
    random_state=42  # Ensure reproducibility
)

# Recombine the downsampled monitored and unmonitored data
balanced_train_data = pd.concat([train_monitored_downsampled, train_unmonitored])

# Extract balanced features and labels
X_train_balanced = np.array(balanced_train_data['features'].tolist())  # Convert back to NumPy array
y_train_balanced = balanced_train_data['label'].values  # Extract labels as a NumPy array

# Scale the features using StandardScaler
scaler = StandardScaler()
X_train_balanced = scaler.fit_transform(X_train_balanced)  # Fit and transform the training set
X_test = scaler.transform(X_test)  # Transform the test set using the same scaler


  from pandas.core import (


## 5. Train the Random Forest model

In [6]:

from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(
    n_estimators=1000, 
    min_samples_split=2, 
    min_samples_leaf=1, 
    max_features=0.7, 
    max_depth=30, 
    class_weight='balanced' 
)

rf_model.fit(X_train_balanced, y_train_balanced)

## 6. Test and evaluate the model

In [7]:
from sklearn.metrics import accuracy_score, auc, confusion_matrix, precision_score, precision_recall_curve, roc_curve, recall_score, f1_score, classification_report

y_pred = rf_model.predict(X_test)

# calculate Accuracy, Precison, Recall, F1 Score
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average = 'weighted')
recall = recall_score(y_test, y_pred, average = 'weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# calculate Confusion Matrix, ROC AUC, PR AUC
conf_matrix = confusion_matrix(y_test, y_pred)

# create Classification Report
class_report = classification_report(y_test, y_pred, target_names=[f'Class {i}' for i in range(-1, 95)])

# print results
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision (Weighted): {precision:.4f}")
print(f"Recall (Weighted): {recall:.4f}")
print(f"F1 Score (Weighted): {f1:.4f}")

print(f"\nConfusion Matrix\n{conf_matrix}")
print("\nClassification Report:\n", class_report)

Accuracy: 0.4845
Precision (Weighted): 0.6661
Recall (Weighted): 0.4845
F1 Score (Weighted): 0.4845

Confusion Matrix
[[568   0   0 ...   1   0   0]
 [ 30   3   0 ...   0   0   0]
 [ 26   0  10 ...   0   0   0]
 ...
 [ 34   0   0 ...   1   0   0]
 [  5   0   0 ...   0  33   0]
 [ 27   0   0 ...   0   1   3]]

Classification Report:
               precision    recall  f1-score   support

    Class -1       0.25      0.95      0.39       600
     Class 0       0.50      0.07      0.13        40
     Class 1       1.00      0.25      0.40        40
     Class 2       0.93      0.35      0.51        40
     Class 3       0.87      0.33      0.47        40
     Class 4       0.58      0.35      0.44        40
     Class 5       0.71      0.55      0.62        40
     Class 6       0.74      0.72      0.73        40
     Class 7       0.80      0.70      0.75        40
     Class 8       0.74      0.42      0.54        40
     Class 9       0.79      0.47      0.59        40
    Class 10    

In [8]:
# check data distribution
print("\nOriginal Training Data Distribution:")
print(f"Label -1 count: {sum(y_train == -1)}")
print(f"Other labels count: {sum(y_train != -1)}")

print("\nBalanced Training Data Distribution:")
print(f"Label -1 count: {sum(y_train_balanced == -1)}")
print(f"Other labels count: {sum(y_train_balanced != -1)}")

print("\nTest Data Distribution:")
print(f"Label -1 count: {sum(y_test == -1)}")
print(f"Other labels count: {sum(y_test != -1)}")



Original Training Data Distribution:
Label -1 count: 2400
Other labels count: 15200

Balanced Training Data Distribution:
Label -1 count: 2400
Other labels count: 2400

Test Data Distribution:
Label -1 count: 600
Other labels count: 3800
