## 1. Data Load

In [1]:
import pickle

# Load the monitored traces.
# NOTE: pickle.load can execute arbitrary code; only use files from a trusted source.
with open("mon_standard.pkl", 'rb') as mon_file:
    mon_data = pickle.load(mon_file)

# Load the unmonitored traces.
with open("unmon_standard10_3000.pkl", 'rb') as unmon_file:
    unmon_data = pickle.load(unmon_file)

print("Data has been successfully loaded.")

Data has been successfully loaded.


## 2. Data Preprocess

In [2]:
# Function to process monitored data
def process_monitored(data, total_urls=950, use_sublabel=False, url_per_site=10):
    """Flatten monitored traces into timestamp sequences, size sequences and labels.

    Args:
        data: Indexable by URL index ``0 .. total_urls - 1``; each entry is an
            iterable of traces, and each trace is a sequence of signed numbers.
        total_urls: Number of URL indices to read from ``data``.
        use_sublabel: If True the label is the URL index itself; otherwise it is
            ``url_index // url_per_site``.
        url_per_site: Number of consecutive URL indices sharing one label when
            ``use_sublabel`` is False.

    Returns:
        ``(X1, X2, y)``: per-trace lists of absolute values, per-trace lists of
        ``+512`` / ``-512`` (``+512`` where the value is > 0, ``-512`` otherwise),
        and the label of each trace.
    """
    abs_sequences, signed_sizes, labels = [], [], []
    for url_idx in range(total_urls):
        if use_sublabel:
            label = url_idx
        else:
            label = url_idx // url_per_site
        for trace in data[url_idx]:
            # Magnitude of each entry (the notebook treats it as the timestamp).
            abs_sequences.append([abs(value) for value in trace])
            # Fixed 512 size carrying the sign; zero counts as negative.
            signed_sizes.append([512 if value > 0 else -512 for value in trace])
            labels.append(label)
    return abs_sequences, signed_sizes, labels

# Function to process unmonitored data
def process_unmonitored(data, total_urls=3000):
    """Convert unmonitored traces into timestamp sequences, size sequences and labels.

    Args:
        data: Indexable by trace index ``0 .. total_urls - 1``; each entry is a
            sequence of signed numbers.
        total_urls: Number of traces to read from ``data``.

    Returns:
        ``(X1, X2, y)``: per-trace lists of absolute values, per-trace lists of
        ``+512`` / ``-512`` (``+512`` where the value is > 0, ``-512`` otherwise),
        and a label list in which every trace is labelled ``-1``.
    """
    abs_sequences = []
    signed_sizes = []
    for trace_idx in range(total_urls):
        trace = data[trace_idx]
        # Magnitude of each entry (the notebook treats it as the timestamp).
        abs_sequences.append([abs(value) for value in trace])
        # Fixed 512 size carrying the sign; zero counts as negative.
        signed_sizes.append([512 if value > 0 else -512 for value in trace])
    # Every unmonitored trace shares the single label -1.
    labels = [-1 for _ in abs_sequences]
    return abs_sequences, signed_sizes, labels

# Monitored traces: 950 URL indices, labelled by site (url_index // 10).
X1_mon, X2_mon, y_mon = process_monitored(mon_data, total_urls=950, use_sublabel=False, url_per_site=10)

# Unmonitored traces: 3000 traces, all labelled -1.
X1_unmon, X2_unmon, y_unmon = process_unmonitored(unmon_data, total_urls=3000)

# Monitored samples come first, followed by the unmonitored ones.
X1 = [*X1_mon, *X1_unmon]
X2 = [*X2_mon, *X2_unmon]
y = [*y_mon, *y_unmon]


## 3. Feature Extraction

In [3]:
import numpy as np
from itertools import groupby

# Turn every trace into a fixed-length row of 24 summary statistics.
# X2[i] is the direction-signed size sequence and X1[i] the timestamp sequence of trace i.


def _burst_lengths(size_seq):
    """Return ``(outgoing, incoming)`` lengths of runs of consecutive equal values.

    Runs whose value is positive count as outgoing bursts, runs whose value is
    negative as incoming bursts.
    """
    outgoing, incoming = [], []
    for value, run in groupby(size_seq):
        run_length = sum(1 for _ in run)
        if value > 0:
            outgoing.append(run_length)
        elif value < 0:
            incoming.append(run_length)
    return outgoing, incoming


def _head_ratio(count, window, num_total_packets):
    """Return ``count / window``, or 0 when the trace has fewer than ``window`` packets."""
    return count / window if num_total_packets >= window else 0


def extract_features(size_seq, time_seq):
    """Compute the 24 hand-crafted features of a single trace.

    Args:
        size_seq: Direction-signed packet sizes (positive = outgoing, negative = incoming).
        time_seq: Timestamp sequence of the same trace.

    Returns:
        A list of 24 numbers, in the column order used by the rest of the notebook.
        Empty or degenerate traces yield 0 for the affected statistics instead of NaN,
        so downstream resampling / scaling / training never sees NaN values.
    """
    num_total_packets = len(size_seq)
    num_incoming_packets = sum(1 for size in size_seq if size < 0)
    num_outgoing_packets = sum(1 for size in size_seq if size > 0)

    ratio_outgoing = num_outgoing_packets / num_total_packets if num_total_packets > 0 else 0
    ratio_incoming = num_incoming_packets / num_total_packets if num_total_packets > 0 else 0

    # Direction mix of the first 30 packets (0 when the trace is shorter than 30).
    head_30 = size_seq[:30]
    incoming_first_30 = _head_ratio(sum(1 for size in head_30 if size < 0), 30, num_total_packets)
    outgoing_first_30 = _head_ratio(sum(1 for size in head_30 if size > 0), 30, num_total_packets)
    # NOTE(review): columns 7 and 10 duplicate columns 6 and 9; kept so the
    # 24-column layout stays unchanged.
    incoming_first_30_all = incoming_first_30
    outgoing_first_30_all = outgoing_first_30

    # Burst statistics: group once and reuse. The empty case is guarded because
    # np.std / np.mean of an empty list return NaN.
    outgoing_bursts, incoming_bursts = _burst_lengths(size_seq)
    burst_outgoing_max = max(outgoing_bursts) if outgoing_bursts else 0
    outgoing_burst_std = float(np.std(outgoing_bursts)) if outgoing_bursts else 0.0
    outgoing_burst_avg = float(np.mean(outgoing_bursts)) if outgoing_bursts else 0.0
    num_outgoing_bursts = len(outgoing_bursts)
    num_incoming_bursts = len(incoming_bursts)

    # NOTE(review): despite the "seconds" in the names, the next features are
    # packet-indexed: they look at the last / first 5 packets, not a 5-second window.
    last_5_seconds_packets = sum(1 for size in size_seq[-5:] if size != 0)
    head_5 = size_seq[:5]
    incoming_burst_first_5_seconds = sum(1 for size in head_5 if size < 0)
    outgoing_burst_first_5_seconds = sum(1 for size in head_5 if size > 0)

    # NOTE(review): this divides by (packet count / 60), not by elapsed time, so it
    # is not a true per-second rate; formula kept unchanged.
    incoming_per_second_avg = num_incoming_packets / (len(time_seq) / 60) if len(time_seq) > 0 else 0

    # Direction mix of the first 5 packets (0 when the trace is shorter than 5).
    incoming_first_5 = _head_ratio(incoming_burst_first_5_seconds, 5, num_total_packets)
    outgoing_first_5 = _head_ratio(outgoing_burst_first_5_seconds, 5, num_total_packets)
    # NOTE(review): columns 22 and 24 duplicate columns 19 and 20.
    incoming_first_5_all = incoming_first_5
    outgoing_first_5_all = outgoing_first_5

    # NOTE(review): this is the sum of all timestamps; if time_seq holds cumulative
    # timestamps the page-load time would be time_seq[-1] - confirm.
    total_time = sum(time_seq)

    # Mean gap between consecutive timestamps, skipping gaps that start at a
    # non-positive timestamp. Computed over the whole trace, not only the last
    # 5 seconds. Guarded so an empty selection yields 0 instead of NaN.
    intervals = [
        later - earlier
        for earlier, later in zip(time_seq, time_seq[1:])
        if earlier > 0
    ]
    last_5_seconds_time_intervals = float(np.mean(intervals)) if intervals else 0

    return [
        num_incoming_packets, num_total_packets, num_outgoing_packets,
        ratio_outgoing, ratio_incoming, incoming_first_30, incoming_first_30_all,
        burst_outgoing_max, outgoing_first_30, outgoing_first_30_all, last_5_seconds_packets,
        outgoing_burst_std, outgoing_burst_avg, num_outgoing_bursts, num_incoming_bursts,
        incoming_burst_first_5_seconds, outgoing_burst_first_5_seconds, incoming_per_second_avg,
        incoming_first_5, outgoing_first_5, total_time, incoming_first_5_all,
        last_5_seconds_time_intervals, outgoing_first_5_all
    ]


# The three inputs must describe the same samples, in the same order.
assert len(X1) == len(X2) == len(y), "X1, X2 and y must have the same number of samples"

# 'features' contains one row of 24 features per sample
features = np.array([extract_features(size_seq, time_seq) for size_seq, time_seq in zip(X2, X1)])
assert not np.isnan(features).any(), "feature matrix contains NaN values"

# 'y' contains the labels for each sample
y = np.array(y)


## 4. Data split and oversampling (SMOTE)

In [4]:
# Alternative setup (run in a terminal, not in the notebook):
# conda install -c conda-forge scikit-learn imbalanced-learn

In [5]:
pip uninstall imbalanced-learn scikit-learn -y


Found existing installation: imbalanced-learn 0.10.1
Uninstalling imbalanced-learn-0.10.1:
  Successfully uninstalled imbalanced-learn-0.10.1
Found existing installation: scikit-learn 1.2.2
Uninstalling scikit-learn-1.2.2:
  Successfully uninstalled scikit-learn-1.2.2
Note: you may need to restart the kernel to use updated packages.


In [6]:
!pip install scikit-learn==1.2.2 imbalanced-learn==0.10.1


[33mDEPRECATION: Loading egg at /Users/sojeonglee/anaconda3/lib/python3.11/site-packages/dlib-19.24.99-py3.11-macosx-10.9-x86_64.egg is deprecated. pip 25.1 will enforce this behaviour change. A possible replacement is to use pip for package installation. Discussion can be found at https://github.com/pypa/pip/issues/12330[0m[33m
[0mCollecting scikit-learn==1.2.2
  Using cached scikit_learn-1.2.2-cp311-cp311-macosx_10_9_x86_64.whl.metadata (11 kB)
Collecting imbalanced-learn==0.10.1
  Using cached imbalanced_learn-0.10.1-py3-none-any.whl.metadata (8.2 kB)
Using cached scikit_learn-1.2.2-cp311-cp311-macosx_10_9_x86_64.whl (9.0 MB)
Using cached imbalanced_learn-0.10.1-py3-none-any.whl (226 kB)
Installing collected packages: scikit-learn, imbalanced-learn
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
mlxtend 0.23.3 requires scikit-learn>=1.3.1, but you h

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
import pandas as pd
import numpy as np

# Split the data into training and test sets (stratified, so the class
# proportions are the same in both splits)
X_train, X_test, y_train, y_test = train_test_split(
    features, y, test_size=0.2, random_state=42, stratify=y
)

# Separate the Monitored and Unmonitored data in the training set
# (kept as DataFrames for inspection; the model input below uses the arrays directly)
train_data = pd.DataFrame({
    'features': list(X_train),
    'label': y_train
})
train_monitored = train_data[train_data['label'] != -1]  # Monitored data
train_unmonitored = train_data[train_data['label'] == -1]  # Unmonitored data

# Arrays fed to SMOTE; X_train is already a 2-D array, so no DataFrame round-trip is needed
X_train_list = np.asarray(X_train)
y_train_array = np.asarray(y_train)

# Use SMOTE to oversample the Unmonitored class (-1) until it has as many samples
# as all Monitored classes together. The target is derived from the split instead
# of being hard-coded, and never drops below the current Unmonitored count because
# SMOTE can only add samples.
n_monitored = len(train_monitored)
n_unmonitored = len(train_unmonitored)
unmonitored_target = max(n_monitored, n_unmonitored)

# NOTE(review): SMOTE runs on the unscaled features here, so its neighbour search
# is dominated by large-valued columns; consider scaling before resampling.
smote = SMOTE(random_state=42, sampling_strategy={-1: unmonitored_target})
X_train_balanced, y_train_balanced = smote.fit_resample(X_train_list, y_train_array)

# Apply scaling to the data: fit on the training data only, then reuse for the test data
scaler = StandardScaler()
X_train_balanced = scaler.fit_transform(X_train_balanced)  # Scale the training data
X_test = scaler.transform(X_test)  # Scale the test data using the same scaler

# Print the distribution of classes in the balanced training set
print("Balanced Training Data Distribution:")
print(pd.Series(y_train_balanced).value_counts())

# Print the distribution of classes in the test set
print("Test Data Distribution:")
print(pd.Series(y_test).value_counts())


  from pandas.core import (


Balanced Training Data Distribution:
-1     15200
 12      160
 73      160
 67      160
 52      160
       ...  
 82      160
 26      160
 23      160
 31      160
 81      160
Name: count, Length: 96, dtype: int64
Test Data Distribution:
-1     600
 6      40
 20     40
 62     40
 29     40
      ... 
 75     40
 28     40
 73     40
 34     40
 42     40
Name: count, Length: 96, dtype: int64




## 5. Train the Random Forest model

In [8]:
from sklearn.ensemble import RandomForestClassifier

# Random Forest over the balanced, scaled training data.
# NOTE(review): class_weight='balanced' re-weights classes inversely to their
# frequency, which largely counteracts the SMOTE oversampling of label -1 -
# confirm that this combination is intended.
rf_model = RandomForestClassifier(
    n_estimators=2000,
    min_samples_split=5,
    min_samples_leaf=2,
    max_features=0.7,
    max_depth=30,
    class_weight='balanced',
    random_state=42,  # fixed seed so the reported metrics are reproducible
    n_jobs=-1,  # train the 2000 trees on all available cores
)

rf_model.fit(X_train_balanced, y_train_balanced)

## 6. Test and evaluate the model

In [9]:
from sklearn.metrics import accuracy_score, auc, confusion_matrix, precision_score, precision_recall_curve, roc_curve, recall_score, f1_score, classification_report

y_pred = rf_model.predict(X_test)

# Calculate Accuracy, Precision, Recall, F1 Score.
# 'weighted' averages the per-class scores by class support; zero_division=0 keeps
# the default value for classes that are never predicted but silences the warning.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

# Labels that actually occur in the ground truth or the predictions, sorted.
# Passing them explicitly keeps the matrix rows/columns and the report names
# aligned even if some class is missing from this split.
report_labels = np.unique(np.concatenate([y_test, y_pred]))

# Calculate Confusion Matrix (rows = true label, columns = predicted label)
conf_matrix = confusion_matrix(y_test, y_pred, labels=report_labels)

# Create Classification Report
class_report = classification_report(
    y_test,
    y_pred,
    labels=report_labels,
    target_names=[f'Class {label}' for label in report_labels],
    zero_division=0,
)

# Print results
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision (Weighted): {precision:.4f}")
print(f"Recall (Weighted): {recall:.4f}")
print(f"F1 Score (Weighted): {f1:.4f}")

print(f"\nConfusion Matrix\n{conf_matrix}")
print("\nClassification Report:\n", class_report)

Accuracy: 0.7077
Precision (Weighted): 0.7234
Recall (Weighted): 0.7077
F1 Score (Weighted): 0.7057

Confusion Matrix
[[452   1   2 ...   4   0   0]
 [  6  21   0 ...   1   0   0]
 [  5   0  29 ...   0   0   0]
 ...
 [ 10   0   0 ...  17   0   2]
 [  1   0   0 ...   0  36   0]
 [  2   0   0 ...   0   1  20]]

Classification Report:
               precision    recall  f1-score   support

    Class -1       0.52      0.75      0.61       600
     Class 0       0.81      0.53      0.64        40
     Class 1       0.85      0.72      0.78        40
     Class 2       0.81      0.85      0.83        40
     Class 3       0.73      0.68      0.70        40
     Class 4       0.71      0.72      0.72        40
     Class 5       0.89      0.78      0.83        40
     Class 6       0.82      0.93      0.87        40
     Class 7       0.80      0.90      0.85        40
     Class 8       0.67      0.78      0.72        40
     Class 9       0.69      0.62      0.66        40
    Class 10    

In [10]:
# Compare the unmonitored (-1) vs. monitored sample counts of each split.
train_unmonitored_count = (y_train == -1).sum()
train_monitored_count = (y_train != -1).sum()
print("\nOriginal Training Data Distribution:")
print(f"Label -1 count: {train_unmonitored_count}")
print(f"Other labels count: {train_monitored_count}")

balanced_unmonitored_count = (y_train_balanced == -1).sum()
balanced_monitored_count = (y_train_balanced != -1).sum()
print("\nBalanced Training Data Distribution:")
print(f"Label -1 count: {balanced_unmonitored_count}")
print(f"Other labels count: {balanced_monitored_count}")

test_unmonitored_count = (y_test == -1).sum()
test_monitored_count = (y_test != -1).sum()
print("\nTest Data Distribution:")
print(f"Label -1 count: {test_unmonitored_count}")
print(f"Other labels count: {test_monitored_count}")



Original Training Data Distribution:
Label -1 count: 2400
Other labels count: 15200

Balanced Training Data Distribution:
Label -1 count: 15200
Other labels count: 15200

Test Data Distribution:
Label -1 count: 600
Other labels count: 3800
