## 1. Data Load

In [16]:
import pickle


def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``.

    Security note: ``pickle.load`` can execute arbitrary code embedded in the
    file, so only call this on dataset files that come from a trusted source.
    """
    with open(path, 'rb') as fi:
        return pickle.load(fi)


# Monitored traces (indexed by URL id in the preprocessing step).
mon_data = _load_pickle("mon_standard.pkl")

# Unmonitored traces (indexed by sample position in the preprocessing step).
unmon_data = _load_pickle("unmon_standard10_3000.pkl")

print("Data has been successfully loaded.")

Data has been successfully loaded.


## 4. Data split and downsampling

## 2. Data Preprocess

In [17]:
# Preprocessing function for the monitored data
def process_monitored(data, total_urls=950, use_sublabel=False, url_per_site=10):
    """Flatten the monitored traces into parallel per-sample sequences.

    Parameters
    ----------
    data : indexable
        ``data[i]`` yields the samples recorded for URL index ``i``; each
        sample is an iterable of signed numbers.
    total_urls : int
        Number of URL indices (``0 .. total_urls - 1``) to read from ``data``.
    use_sublabel : bool
        When true the label is the URL index itself; otherwise it is the
        URL index integer-divided by ``url_per_site``.
    url_per_site : int
        Number of consecutive URL indices that share one label.

    Returns
    -------
    tuple of (list, list, list)
        ``X1`` holds the absolute value of every entry of each sample,
        ``X2`` holds ``512`` for entries greater than zero and ``-512`` for
        all other entries, and ``y`` holds one label per sample.
    """
    X1, X2, y = [], [], []
    for url_idx in range(total_urls):
        label = url_idx // url_per_site if not use_sublabel else url_idx
        for sample in data[url_idx]:
            # Single pass over the sample: (magnitude, signed 512 marker) per entry
            encoded = [(abs(c), 512 if c > 0 else -512) for c in sample]
            X1.append([magnitude for magnitude, _ in encoded])
            X2.append([marker for _, marker in encoded])
            y.append(label)
    return X1, X2, y

# Preprocessing function for the unmonitored data
def process_unmonitored(data, total_urls=3000):
    """Convert the unmonitored traces into parallel per-sample sequences.

    Parameters
    ----------
    data : indexable
        ``data[i]`` is one sample: an iterable of signed numbers.
    total_urls : int
        Number of samples (``0 .. total_urls - 1``) to read from ``data``.

    Returns
    -------
    tuple of (list, list, list)
        ``X1`` holds the absolute value of every entry of each sample,
        ``X2`` holds ``512`` for entries greater than zero and ``-512`` for
        all other entries, and ``y`` is ``-1`` repeated once per sample.
    """
    X1, X2 = [], []
    for trace in (data[idx] for idx in range(total_urls)):
        # Single pass over the trace: (magnitude, signed 512 marker) per entry
        encoded = [(abs(c), 512 if c > 0 else -512) for c in trace]
        X1.append([magnitude for magnitude, _ in encoded])
        X2.append([marker for _, marker in encoded])
    # Every unmonitored sample gets the label -1
    y = [-1 for _ in X1]
    return X1, X2, y

# Preprocess the monitored data (one label per group of 10 URL indices)
X1_mon, X2_mon, y_mon = process_monitored(mon_data, total_urls=950, use_sublabel=False, url_per_site=10)

# Preprocess the unmonitored data (every sample is labelled -1)
X1_unmon, X2_unmon, y_unmon = process_unmonitored(unmon_data, total_urls=3000)

# Combine: monitored samples first, unmonitored samples after them
X1 = [*X1_mon, *X1_unmon]
X2 = [*X2_mon, *X2_unmon]
y = [*y_mon, *y_unmon]

## 3. Feature Extraction

In [18]:
import numpy as np
from itertools import groupby

def extract_features(size_seq, time_seq):
    """Compute the 25 summary features of a single trace.

    Parameters
    ----------
    size_seq : list
        Signed per-packet values; entries greater than zero are counted as
        outgoing packets and entries below zero as incoming packets.
    time_seq : list
        Non-negative per-packet time values, aligned with ``size_seq``.

    Returns
    -------
    list
        The feature values in the fixed column order of the feature matrix.
        Any statistic taken over an empty collection (for example a trace
        without outgoing bursts) is reported as 0 instead of NaN.
    """
    # 1-3. Packet counts: incoming, total, outgoing
    num_total_packets = len(size_seq)
    num_incoming_packets = sum(1 for size in size_seq if size < 0)
    num_outgoing_packets = sum(1 for size in size_seq if size > 0)

    # 4-5. Proportion of outgoing / incoming packets
    if num_total_packets > 0:
        ratio_outgoing = num_outgoing_packets / num_total_packets
        ratio_incoming = num_incoming_packets / num_total_packets
    else:
        ratio_outgoing = ratio_incoming = 0

    # 6-7. Proportion of incoming / outgoing packets in the first 30 packets
    # (reported as 0 for traces shorter than 30 packets)
    if num_total_packets >= 30:
        first_30 = size_seq[:30]
        incoming_first_30 = sum(1 for size in first_30 if size < 0) / 30
        outgoing_first_30 = sum(1 for size in first_30 if size > 0) / 30
    else:
        incoming_first_30 = outgoing_first_30 = 0

    # Run-length encode the sequence once; a burst is a run of equal values
    bursts = [(value, len(list(run))) for value, run in groupby(size_seq)]
    outgoing_bursts = [length for value, length in bursts if value > 0]

    # 8-10. Max / standard deviation / mean of the outgoing burst lengths.
    # Guarded together: np.std / np.mean of an empty list would yield NaN.
    if outgoing_bursts:
        burst_outgoing_max = max(outgoing_bursts)
        outgoing_burst_std = np.std(outgoing_bursts)
        outgoing_burst_avg = np.mean(outgoing_bursts)
    else:
        burst_outgoing_max = outgoing_burst_std = outgoing_burst_avg = 0

    # 11-12. Number of outgoing / incoming bursts
    num_outgoing_bursts = len(outgoing_bursts)
    num_incoming_bursts = sum(1 for value, _ in bursts if value < 0)

    # NOTE(review): features 13-20 treat time_seq inconsistently. The interval
    # feature differences consecutive entries (as if they were timestamps),
    # while total_time sums them and the per-packet rates invert them (as if
    # they were inter-packet gaps). The formulas are kept unchanged so the
    # feature values stay comparable with earlier runs; confirm the intended
    # meaning before relying on these columns.
    num_times = len(time_seq)

    # 13. Incoming packets divided by (number of time entries / 60)
    # NOTE(review): the divisor is a packet count, not a duration; confirm.
    incoming_per_second_avg = num_incoming_packets / (num_times / 60) if num_times > 0 else 0

    # 14. Sum of all time values
    total_time = sum(time_seq)

    # 15. Mean difference between consecutive time values, skipping pairs
    # whose first value is 0. Despite the variable name, no "last 5 seconds"
    # window is applied.
    gaps = [later - earlier for earlier, later in zip(time_seq, time_seq[1:]) if earlier > 0]
    last_5_seconds_time_intervals = np.mean(gaps) if gaps else 0

    # 16. Sum of packet sizes
    sum_packets = sum(size_seq)

    # 17. Packet count divided by the sum of the time values
    packets_per_second = num_total_packets / total_time if total_time > 0 else 0

    # 18-19. Mean and standard deviation of 1 / t over the positive time
    # values (the reciprocal list itself is not a feature column)
    packets_per_second_values = [1 / t for t in time_seq if t > 0]
    if packets_per_second_values:
        mean_packets_per_second = np.mean(packets_per_second_values)
        std_packets_per_second = np.std(packets_per_second_values)
    else:
        mean_packets_per_second = std_packets_per_second = 0

    # 20-23. Quartiles of the time values (25th, 50th, 75th, 100th percentile)
    if num_times > 0:
        transmission_time_Q1 = np.percentile(time_seq, 25)
        transmission_time_Q2 = np.percentile(time_seq, 50)
        transmission_time_Q3 = np.percentile(time_seq, 75)
        transmission_time_Q4 = np.percentile(time_seq, 100)
    else:
        transmission_time_Q1 = transmission_time_Q2 = 0
        transmission_time_Q3 = transmission_time_Q4 = 0

    # 24-25. Mean and standard deviation of the positions of outgoing packets
    outgoing_order = [position for position, size in enumerate(size_seq) if size > 0]
    if outgoing_order:
        average_outgoing_ordering = np.mean(outgoing_order)
        std_dev_outgoing_ordering = np.std(outgoing_order)
    else:
        average_outgoing_ordering = std_dev_outgoing_ordering = 0

    return [
        num_incoming_packets, num_total_packets, num_outgoing_packets, ratio_outgoing,
        ratio_incoming, incoming_first_30, outgoing_first_30, burst_outgoing_max,
        outgoing_burst_std, outgoing_burst_avg, num_outgoing_bursts,
        num_incoming_bursts, incoming_per_second_avg, total_time,
        last_5_seconds_time_intervals,
        sum_packets, packets_per_second, mean_packets_per_second, std_packets_per_second,
        transmission_time_Q1, transmission_time_Q2, transmission_time_Q3, transmission_time_Q4,
        average_outgoing_ordering, std_dev_outgoing_ordering
    ]


# Feature matrix: one row per sample, in the same order as X1 / X2
features = np.array([
    extract_features(size_seq, time_seq)
    for time_seq, size_seq in zip(X1, X2)
])

# Labels for each sample
y = np.array(y)

## 4. Data split and oversampling (SMOTE)

In [19]:
# conda install -c conda-forge scikit-learn imbalanced-learn


In [20]:
pip uninstall imbalanced-learn scikit-learn -y


Found existing installation: imbalanced-learn 0.10.1
Uninstalling imbalanced-learn-0.10.1:
  Successfully uninstalled imbalanced-learn-0.10.1
Found existing installation: scikit-learn 1.2.2
Uninstalling scikit-learn-1.2.2:
  Successfully uninstalled scikit-learn-1.2.2
Note: you may need to restart the kernel to use updated packages.


In [21]:
!pip install scikit-learn==1.2.2 imbalanced-learn==0.10.1


[33mDEPRECATION: Loading egg at /Users/sojeonglee/anaconda3/lib/python3.11/site-packages/dlib-19.24.99-py3.11-macosx-10.9-x86_64.egg is deprecated. pip 25.1 will enforce this behaviour change. A possible replacement is to use pip for package installation. Discussion can be found at https://github.com/pypa/pip/issues/12330[0m[33m
[0mCollecting scikit-learn==1.2.2
  Using cached scikit_learn-1.2.2-cp311-cp311-macosx_10_9_x86_64.whl.metadata (11 kB)
Collecting imbalanced-learn==0.10.1
  Using cached imbalanced_learn-0.10.1-py3-none-any.whl.metadata (8.2 kB)
Using cached scikit_learn-1.2.2-cp311-cp311-macosx_10_9_x86_64.whl (9.0 MB)
Using cached imbalanced_learn-0.10.1-py3-none-any.whl (226 kB)
Installing collected packages: scikit-learn, imbalanced-learn
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
mlxtend 0.23.3 requires scikit-learn>=1.3.1, but you h

In [22]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
import pandas as pd
import numpy as np

# Train/test split, stratified on the label
X_train, X_test, y_train, y_test = train_test_split(features, y, test_size=0.2, random_state=42, stratify=y)

# Separate the monitored (label != -1) and unmonitored (label == -1) training rows
train_data = pd.DataFrame({
    'features': list(X_train),
    'label': y_train
})
train_monitored = train_data[train_data['label'] != -1]
train_unmonitored = train_data[train_data['label'] == -1]

# Oversample with SMOTE
X_train_list = np.array(train_data['features'].tolist())
y_train_array = train_data['label'].values

# Raise the unmonitored class (-1) to the total number of monitored training
# samples. The target is derived from the split instead of being hard-coded,
# and never drops below the current unmonitored count, because SMOTE cannot
# reduce a class.
smote_target = max(len(train_monitored), len(train_unmonitored))

# NOTE(review): SMOTE runs on the unscaled features here, so its
# nearest-neighbour search is dominated by the large-magnitude columns;
# consider whether scaling should happen before resampling.
smote = SMOTE(random_state=42, sampling_strategy={-1: smote_target})
X_train_balanced, y_train_balanced = smote.fit_resample(X_train_list, y_train_array)

# Scale the features: fit on the balanced training data only, then apply the
# same transform to the test data
scaler = StandardScaler()
X_train_balanced = scaler.fit_transform(X_train_balanced)
X_test = scaler.transform(X_test)

# Show the resulting label distributions
print("Balanced Training Data Distribution:")
print(pd.Series(y_train_balanced).value_counts())

print("Test Data Distribution:")
print(pd.Series(y_test).value_counts())


Balanced Training Data Distribution:
-1     15200
 12      160
 73      160
 67      160
 52      160
       ...  
 82      160
 26      160
 23      160
 31      160
 81      160
Name: count, Length: 96, dtype: int64
Test Data Distribution:
-1     600
 6      40
 20     40
 62     40
 29     40
      ... 
 75     40
 28     40
 73     40
 34     40
 42     40
Name: count, Length: 96, dtype: int64




## 5. Train the Random Forest model

In [23]:
from sklearn.ensemble import RandomForestClassifier

# Random Forest configuration.
# random_state is fixed (same seed as the split and SMOTE) so repeated runs
# build the same forest; n_jobs=-1 trains the 2000 trees on all CPU cores.
rf_model = RandomForestClassifier(
    n_estimators=2000,
    min_samples_split=5,
    min_samples_leaf=2,
    max_features=0.7,
    max_depth=30,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1,
)

rf_model.fit(X_train_balanced, y_train_balanced)

## 6. Test and evaluate the model

In [24]:
from sklearn.metrics import accuracy_score, auc, confusion_matrix, precision_score, precision_recall_curve, roc_curve, recall_score, f1_score, classification_report

y_pred = rf_model.predict(X_test)

# Accuracy plus weighted-average precision, recall and F1 score
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# Sorted labels that actually occur in the ground truth or the predictions.
# Passing them explicitly keeps the confusion-matrix rows and the report names
# aligned with the data instead of assuming the labels are exactly -1..94.
report_labels = np.union1d(y_test, y_pred)

# Confusion matrix (rows: true label, columns: predicted label)
conf_matrix = confusion_matrix(y_test, y_pred, labels=report_labels)

# Per-class classification report
class_report = classification_report(
    y_test,
    y_pred,
    labels=report_labels,
    target_names=[f'Class {label}' for label in report_labels],
)

# Print the results
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision (Weighted): {precision:.4f}")
print(f"Recall (Weighted): {recall:.4f}")
print(f"F1 Score (Weighted): {f1:.4f}")

print(f"\nConfusion Matrix\n{conf_matrix}")
print("\nClassification Report:\n", class_report)

Accuracy: 0.7543
Precision (Weighted): 0.7784
Recall (Weighted): 0.7543
F1 Score (Weighted): 0.7547

Confusion Matrix
[[490   1   2 ...   3   0   3]
 [  5  27   0 ...   1   0   1]
 [  5   0  31 ...   0   0   0]
 ...
 [ 16   0   0 ...  16   0   0]
 [  2   0   0 ...   0  37   0]
 [  3   0   0 ...   0   0  25]]

Classification Report:
               precision    recall  f1-score   support

    Class -1       0.52      0.82      0.64       600
     Class 0       0.87      0.68      0.76        40
     Class 1       0.86      0.78      0.82        40
     Class 2       0.90      0.88      0.89        40
     Class 3       0.86      0.78      0.82        40
     Class 4       0.79      0.78      0.78        40
     Class 5       0.94      0.75      0.83        40
     Class 6       0.82      0.90      0.86        40
     Class 7       0.82      0.93      0.87        40
     Class 8       0.82      0.78      0.79        40
     Class 9       0.83      0.72      0.77        40
    Class 10    

In [25]:
# Sanity check: count unmonitored (-1) versus all other labels in each label set
for header, labels in (
    ("\nOriginal Training Data Distribution:", y_train),
    ("\nBalanced Training Data Distribution:", y_train_balanced),
    ("\nTest Data Distribution:", y_test),
):
    print(header)
    print(f"Label -1 count: {sum(labels == -1)}")
    print(f"Other labels count: {sum(labels != -1)}")



Original Training Data Distribution:
Label -1 count: 2400
Other labels count: 15200

Balanced Training Data Distribution:
Label -1 count: 15200
Other labels count: 15200

Test Data Distribution:
Label -1 count: 600
Other labels count: 3800
