## 1. Data Load

In [1]:
import pickle

# NOTE: pickle.load can execute arbitrary code embedded in the file, so these
# datasets must come from a trusted source.
def _load_pickle(path):
    """Open ``path`` in binary mode and return the unpickled object."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


mon_data = _load_pickle("mon_standard.pkl")
unmon_data = _load_pickle("unmon_standard10_3000.pkl")

print("Data has been successfully loaded.")

Data has been successfully loaded.


## 4. Data split and downsampling

## 2. Data Preprocess

In [None]:
# Function to process monitored data
def process_monitored(data, total_urls=950, use_sublabel=False, url_per_site=10):
    """Flatten the monitored dataset into per-trace sequences and labels.

    Args:
        data: Container indexable by URL index ``0 .. total_urls - 1``; each
            entry is an iterable of traces, and each trace is a sequence of
            signed numbers.
        total_urls: Number of URL indices to read from ``data``.
        use_sublabel: When True every URL index is its own label; otherwise
            the label is ``url_index // url_per_site``.
        url_per_site: Number of consecutive URL indices sharing one label when
            ``use_sublabel`` is False.

    Returns:
        A tuple ``(X1, X2, y)`` of parallel lists, one entry per trace:
        ``X1`` holds the absolute values of the trace, ``X2`` holds ``+512``
        for every strictly positive value and ``-512`` for every other value
        (zero included), and ``y`` holds the trace label.
    """
    timestamps, signed_sizes, labels = [], [], []
    for url_idx in range(total_urls):
        if use_sublabel:
            label = url_idx
        else:
            label = url_idx // url_per_site
        for trace in data[url_idx]:
            # The sign of each value encodes direction, its magnitude the time.
            timestamps.append([abs(value) for value in trace])
            signed_sizes.append([512 if value > 0 else -512 for value in trace])
            labels.append(label)
    return timestamps, signed_sizes, labels

# Function to process unmonitored data
def process_unmonitored(data, total_urls=3000):
    """Convert the unmonitored dataset into per-trace sequences and labels.

    Args:
        data: Container indexable by ``0 .. total_urls - 1``; each entry is a
            single trace, i.e. a sequence of signed numbers.
        total_urls: Number of traces to read from ``data``.

    Returns:
        A tuple ``(X1, X2, y)`` of parallel lists: ``X1`` holds the absolute
        values of each trace, ``X2`` holds ``+512`` for every strictly
        positive value and ``-512`` otherwise, and ``y`` is ``-1`` for every
        trace.
    """
    timestamps, signed_sizes = [], []
    for trace in (data[idx] for idx in range(total_urls)):
        # The sign of each value encodes direction, its magnitude the time.
        timestamps.append([abs(value) for value in trace])
        signed_sizes.append([512 if value > 0 else -512 for value in trace])
    # Every unmonitored trace shares the single label -1.
    labels = [-1] * len(timestamps)
    return timestamps, signed_sizes, labels

# Monitored traces: 950 URL indices, 10 per site, labelled by site index
# (use_sublabel is off).
X1_mon, X2_mon, y_mon = process_monitored(mon_data, 950, False, 10)

# Unmonitored traces: 3000 entries, all labelled -1.
X1_unmon, X2_unmon, y_unmon = process_unmonitored(unmon_data, 3000)

# Merge both sources, monitored first, keeping the three lists aligned.
X1 = [*X1_mon, *X1_unmon]  # time sequences
X2 = [*X2_mon, *X2_unmon]  # signed size sequences
y = [*y_mon, *y_unmon]     # labels


## 3. Feature Extraction

In [None]:
import numpy as np
from itertools import groupby

def _burst_lengths(size_seq, outgoing=True):
    """Return the lengths of the runs of consecutive equal values in one direction.

    A run counts as outgoing when its value is > 0 and incoming when it is < 0.
    """
    if outgoing:
        return [len(list(run)) for key, run in groupby(size_seq) if key > 0]
    return [len(list(run)) for key, run in groupby(size_seq) if key < 0]


def extract_features(size_seq, time_seq):
    """Compute the 24 hand-crafted features of a single trace.

    Args:
        size_seq: Signed packet sizes of the trace (negative = incoming,
            positive = outgoing).
        time_seq: Time values of the same trace.

    Returns:
        A list of 24 numbers in the fixed column order used by the model.
        Statistics that have no data to aggregate (no outgoing burst, no
        usable time interval) are reported as 0 instead of NaN so that the
        downstream resampling and scaling steps never receive NaN values.
    """
    # 2. Total number of packets
    num_total_packets = len(size_seq)

    # 1. / 3. Number of incoming / outgoing packets
    num_incoming_packets = sum(1 for size in size_seq if size < 0)
    num_outgoing_packets = sum(1 for size in size_seq if size > 0)

    # 4. / 5. Share of outgoing / incoming packets among all packets
    ratio_outgoing = num_outgoing_packets / num_total_packets if num_total_packets > 0 else 0
    ratio_incoming = num_incoming_packets / num_total_packets if num_total_packets > 0 else 0

    # Direction counts over the first 30 and the first 5 packets.
    head_30 = size_seq[:30]
    head_5 = size_seq[:5]
    incoming_head_30 = sum(1 for size in head_30 if size < 0)
    outgoing_head_30 = sum(1 for size in head_30 if size > 0)
    incoming_head_5 = sum(1 for size in head_5 if size < 0)
    outgoing_head_5 = sum(1 for size in head_5 if size > 0)

    # 6. / 9. Share of incoming / outgoing packets among the first 30 packets
    # (0 for traces shorter than 30 packets).
    has_30 = num_total_packets >= 30
    incoming_first_30 = incoming_head_30 / 30 if has_30 else 0
    outgoing_first_30 = outgoing_head_30 / 30 if has_30 else 0
    # 7. / 10. Currently identical to features 6 / 9; kept so that the column
    # layout of the feature matrix does not change.
    incoming_first_30_all = incoming_first_30
    outgoing_first_30_all = outgoing_first_30

    # 19. / 20. Share of incoming / outgoing packets among the first 5 packets
    # (0 for traces shorter than 5 packets).
    has_5 = num_total_packets >= 5
    incoming_first_5 = incoming_head_5 / 5 if has_5 else 0
    outgoing_first_5 = outgoing_head_5 / 5 if has_5 else 0
    # 22. / 24. Currently identical to features 19 / 20; kept for column layout.
    incoming_first_5_all = incoming_first_5
    outgoing_first_5_all = outgoing_first_5

    # 16. / 17. Raw incoming / outgoing counts among the first 5 packets
    # (a packet window, not a 5-second time window).
    incoming_burst_first_5_seconds = incoming_head_5
    outgoing_burst_first_5_seconds = outgoing_head_5

    # 11. Number of non-zero entries among the last 5 packets
    # (a packet window, not a 5-second time window).
    last_5_seconds_packets = len([size for size in size_seq[-5:] if size != 0])

    # Burst lengths are computed once and shared by features 8, 12, 13, 14.
    outgoing_bursts = _burst_lengths(size_seq, outgoing=True)

    # 14. / 15. Number of outgoing / incoming bursts
    num_outgoing_bursts = len(outgoing_bursts)
    num_incoming_bursts = len(_burst_lengths(size_seq, outgoing=False))

    if outgoing_bursts:
        # 8. Longest outgoing burst
        burst_outgoing_max = max(outgoing_bursts)
        # 12. / 13. Standard deviation / mean of outgoing burst lengths
        outgoing_burst_std = np.std(outgoing_bursts)
        outgoing_burst_avg = np.mean(outgoing_bursts)
    else:
        # np.std / np.mean of an empty list would yield NaN (and a warning).
        burst_outgoing_max = 0
        outgoing_burst_std = 0.0
        outgoing_burst_avg = 0.0

    # 18. Incoming packets divided by (number of time entries / 60)
    incoming_per_second_avg = num_incoming_packets / (len(time_seq) / 60) if len(time_seq) > 0 else 0

    # 21. Sum of all entries of the time sequence
    total_time = sum(time_seq)

    # 23. Mean difference between consecutive time entries, taken over every
    # pair whose first entry is > 0 (the whole trace, not only its last
    # 5 seconds). Falls back to 0 when no pair qualifies instead of NaN.
    intervals = [
        later - earlier
        for earlier, later in zip(time_seq, time_seq[1:])
        if earlier > 0
    ]
    last_5_seconds_time_intervals = np.mean(intervals) if intervals else 0

    return [
        num_incoming_packets, num_total_packets, num_outgoing_packets,
        ratio_outgoing, ratio_incoming, incoming_first_30, incoming_first_30_all,
        burst_outgoing_max, outgoing_first_30, outgoing_first_30_all, last_5_seconds_packets,
        outgoing_burst_std, outgoing_burst_avg, num_outgoing_bursts, num_incoming_bursts,
        incoming_burst_first_5_seconds, outgoing_burst_first_5_seconds, incoming_per_second_avg,
        incoming_first_5, outgoing_first_5, total_time, incoming_first_5_all,
        last_5_seconds_time_intervals, outgoing_first_5_all
    ]


# The size and time sequences must describe the same traces, in the same order.
assert len(X1) == len(X2), "X1 (time) and X2 (size) must have the same number of traces"

# 'features' is a dataset containing 24 calculated features per trace
features = np.array([extract_features(size_seq, time_seq) for size_seq, time_seq in zip(X2, X1)])

# 'y' is the label array corresponding to each sample
y = np.array(y)


## 4. Data split and oversampling (SMOTE)

In [None]:
# conda install -c conda-forge scikit-learn imbalanced-learn

In [5]:
# Remove any existing installs so the pinned versions can be installed cleanly.
# The explicit %pip magic is used because the bare `pip ...` form relies on
# IPython automagic, which only applies to single-line cells and can be disabled.
%pip uninstall -y imbalanced-learn scikit-learn


Found existing installation: imbalanced-learn 0.10.1
Uninstalling imbalanced-learn-0.10.1:
  Successfully uninstalled imbalanced-learn-0.10.1
Found existing installation: scikit-learn 1.2.2
Uninstalling scikit-learn-1.2.2:
  Successfully uninstalled scikit-learn-1.2.2
Note: you may need to restart the kernel to use updated packages.


In [6]:
# %pip (unlike !pip) installs into the environment of the running kernel.
# Restart the kernel afterwards so the pinned versions are the ones imported.
%pip install scikit-learn==1.2.2 imbalanced-learn==0.10.1


[33mDEPRECATION: Loading egg at /Users/sojeonglee/anaconda3/lib/python3.11/site-packages/dlib-19.24.99-py3.11-macosx-10.9-x86_64.egg is deprecated. pip 25.1 will enforce this behaviour change. A possible replacement is to use pip for package installation. Discussion can be found at https://github.com/pypa/pip/issues/12330[0m[33m
[0mCollecting scikit-learn==1.2.2
  Using cached scikit_learn-1.2.2-cp311-cp311-macosx_10_9_x86_64.whl.metadata (11 kB)
Collecting imbalanced-learn==0.10.1
  Using cached imbalanced_learn-0.10.1-py3-none-any.whl.metadata (8.2 kB)
Using cached scikit_learn-1.2.2-cp311-cp311-macosx_10_9_x86_64.whl (9.0 MB)
Using cached imbalanced_learn-0.10.1-py3-none-any.whl (226 kB)
Installing collected packages: scikit-learn, imbalanced-learn
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
mlxtend 0.23.3 requires scikit-learn>=1.3.1, but you h

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
import pandas as pd
import numpy as np

# Split the dataset into training and test sets (80% train, 20% test),
# stratified so every class keeps the same share in both splits.
X_train, X_test, y_train, y_test = train_test_split(features, y, test_size=0.2, random_state=42, stratify=y)

# Count monitored vs. unmonitored (-1) samples in the training split.
UNMONITORED_LABEL = -1
is_unmonitored = y_train == UNMONITORED_LABEL
n_unmonitored = int(is_unmonitored.sum())
n_monitored = int((~is_unmonitored).sum())

# Oversample the unmonitored class with SMOTE until it matches the total number
# of monitored training samples (15,200 for the standard dataset). SMOTE can
# only add samples, so the target is never set below the current count.
smote_target = max(n_monitored, n_unmonitored)
smote = SMOTE(random_state=42, sampling_strategy={UNMONITORED_LABEL: smote_target})
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

# NOTE(review): SMOTE interpolates between nearest neighbours on the unscaled
# features here, so large-valued columns dominate the distance. Consider
# scaling before resampling; the original order is kept to preserve results.

# Scale the features: fit on the balanced training set only, then apply the
# same transformation to the test set.
scaler = StandardScaler()
X_train_balanced = scaler.fit_transform(X_train_balanced)
X_test = scaler.transform(X_test)

# Check the distribution of classes after balancing
print("Balanced Training Data Distribution:")
print(pd.Series(y_train_balanced).value_counts())  # Count each class in the balanced training set

print("Test Data Distribution:")
print(pd.Series(y_test).value_counts())  # Count each class in the test set


  from pandas.core import (


Balanced Training Data Distribution:
-1     15200
 12      160
 73      160
 67      160
 52      160
       ...  
 82      160
 26      160
 23      160
 31      160
 81      160
Name: count, Length: 96, dtype: int64
Test Data Distribution:
-1     600
 6      40
 20     40
 62     40
 29     40
      ... 
 75     40
 28     40
 73     40
 34     40
 42     40
Name: count, Length: 96, dtype: int64




## 5. Train XGBoost model

In [16]:
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

# Shift the class labels so that they start from 0: subtract the smallest
# label found in either split (the unmonitored class is labelled -1).
min_class = min(y_train_balanced.min(), y_test.min())
y_train_balanced_transformed = y_train_balanced - min_class
y_test_transformed = y_test - min_class

# Hyper-parameters of the multi-class XGBoost model.
xgb_params = dict(
    objective='multi:softprob',                        # per-class probability output
    num_class=len(set(y_train_balanced_transformed)),  # number of distinct classes
    max_depth=15,                                      # maximum depth of each tree
    learning_rate=0.1,                                 # learning rate
    n_estimators=3000,                                 # number of boosting rounds
    subsample=0.9,                                     # fraction of rows sampled per tree
    colsample_bytree=0.8,                              # fraction of features sampled per tree
    random_state=42,
)

# Build and train the model on the balanced, scaled training set.
xgb_model = XGBClassifier(**xgb_params)
xgb_model.fit(X_train_balanced, y_train_balanced_transformed)

# Predict on the test set, then undo the label shift to recover the original
# class values.
y_pred_transformed = xgb_model.predict(X_test)
y_pred = y_pred_transformed + min_class

print("XGBoost Classification Report:")
print(classification_report(y_test, y_pred))


XGBoost Classification Report:
              precision    recall  f1-score   support

          -1       0.61      0.75      0.67       600
           0       0.69      0.45      0.55        40
           1       0.72      0.82      0.77        40
           2       0.84      0.80      0.82        40
           3       0.76      0.72      0.74        40
           4       0.75      0.75      0.75        40
           5       0.78      0.72      0.75        40
           6       0.86      0.90      0.88        40
           7       0.80      0.88      0.83        40
           8       0.68      0.75      0.71        40
           9       0.65      0.75      0.70        40
          10       0.79      0.55      0.65        40
          11       0.74      0.78      0.76        40
          12       0.87      0.82      0.85        40
          13       0.42      0.28      0.33        40
          14       0.49      0.42      0.45        40
          15       0.72      0.70      0.71       

## 6. Test and evaluate the model

In [None]:
# Evaluate the XGBoost model trained in section 5 on the held-out test set.
# The model is reused as-is; retraining the identical 3000-round model here
# would only repeat the previous section.
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)

# The model was trained on labels shifted to start from 0, so predictions are
# shifted back by the same offset to recover the original class values.
min_class = min(y_train_balanced.min(), y_test.min())
y_pred_transformed = xgb_model.predict(X_test)
y_pred = y_pred_transformed + min_class

# Overall metrics; the weighted average weights each class by its support.
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Precision (Weighted): {precision_score(y_test, y_pred, average='weighted'):.4f}")
print(f"Recall (Weighted): {recall_score(y_test, y_pred, average='weighted'):.4f}")
print(f"F1 Score (Weighted): {f1_score(y_test, y_pred, average='weighted'):.4f}")

# Rows are true classes, columns are predicted classes, in sorted label order.
print("\nConfusion Matrix")
print(confusion_matrix(y_test, y_pred))

# Per-class report, using every label seen in the truth or the predictions.
class_labels = np.unique(np.concatenate([y_test, y_pred]))
print("\nClassification Report:")
print(classification_report(
    y_test,
    y_pred,
    labels=class_labels,
    target_names=[f"Class {label}" for label in class_labels],
))

Accuracy: 0.7139
Precision (Weighted): 0.7176
Recall (Weighted): 0.7139
F1 Score (Weighted): 0.7110

Confusion Matrix
[[448   1   3 ...   4   0   1]
 [  5  18   1 ...   1   0   0]
 [  3   0  33 ...   0   0   0]
 ...
 [  7   0   0 ...  21   0   2]
 [  1   0   0 ...   0  37   0]
 [  1   0   0 ...   0   0  18]]

Classification Report:
               precision    recall  f1-score   support

    Class -1       0.61      0.75      0.67       600
     Class 0       0.69      0.45      0.55        40
     Class 1       0.72      0.82      0.77        40
     Class 2       0.84      0.80      0.82        40
     Class 3       0.76      0.72      0.74        40
     Class 4       0.75      0.75      0.75        40
     Class 5       0.78      0.72      0.75        40
     Class 6       0.86      0.90      0.88        40
     Class 7       0.80      0.88      0.83        40
     Class 8       0.68      0.75      0.71        40
     Class 9       0.65      0.75      0.70        40
    Class 10    

In [None]:
# Report, for each split, how many samples are unmonitored (label -1) and how
# many belong to any monitored class.
for split_title, split_labels in (
    ("\nOriginal Training Data Distribution:", y_train),
    ("\nBalanced Training Data Distribution:", y_train_balanced),
    ("\nTest Data Distribution:", y_test),
):
    print(split_title)
    print(f"Label -1 count: {sum(split_labels == -1)}")
    print(f"Other labels count: {sum(split_labels != -1)}")



Original Training Data Distribution:
Label -1 count: 2400
Other labels count: 15200

Balanced Training Data Distribution:
Label -1 count: 15200
Other labels count: 15200

Test Data Distribution:
Label -1 count: 600
Other labels count: 3800
