## 1. Data load

In [1]:
import pickle

# --- Configuration -----------------------------------------------------------
USE_SUBLABEL = False   # True: label every URL index separately; False: one label per site
URL_PER_SITE = 10      # consecutive URL indices that share one site label
TOTAL_URLS   = 950     # number of URL indices read from the loaded data

# --- Read the pickled dataset ------------------------------------------------
# NOTE(review): pickle.load can execute arbitrary code; only open a trusted file.
print("Loading datafile...")
with open("mon_standard.pkl", 'rb') as fi:
    data = pickle.load(fi)

# Three parallel lists, one entry per sample:
#   X1 - absolute value of every entry of the sample
#        (per the original notes these are packet timestamps - TODO confirm)
#   X2 - +512 where the entry is positive, -512 otherwise
#        (per the original notes: direction * fixed packet size)
#   y  - label of the sample
X1 = []
X2 = []
y = []

for url_index in range(TOTAL_URLS):
    # With sub-labels disabled, integer division maps every URL of a site
    # onto the same site label.
    label = url_index if USE_SUBLABEL else url_index // URL_PER_SITE
    for sample in data[url_index]:
        X1.append([abs(c) for c in sample])
        X2.append([512 if c > 0 else -512 for c in sample])
        y.append(label)
size = len(y)

print(f'Total samples: {size}') # Output: 19000


Loading datafile...
Total samples: 19000


## 2. Feature Extraction

In [2]:
import numpy as np

def create_features(X1, X2):
    """Build one 8-value summary feature vector per sample.

    Parameters
    ----------
    X1 : sequence of sequences
        Per-sample numeric values whose mean becomes feature 1 (the caller
        passes the per-packet time values).
    X2 : sequence of sequences
        Per-sample signed values (the caller passes direction * size).
        Must have at least as many entries as ``X1``.

    Returns
    -------
    numpy.ndarray
        One row per entry of ``X1`` with the columns, in order:
        0. sum of the signed values in ``X2[i]``
        1. mean of ``X1[i]`` (0 when ``X1[i]`` is empty)
        2. sum of the absolute values in ``X2[i]``
        3. number of non-zero values in ``X2[i]``
           (called ``burst_lengths`` in the original code)
        4. number of positive values in ``X2[i]``
        5. share of positive values in ``X2[i]`` (0 when it is empty)
        6. number of negative values in ``X2[i]``
        7. total number of values in ``X2[i]``
    """
    rows = []
    for idx, times in enumerate(X1):
        sizes = X2[idx]

        n_packets = len(sizes)
        n_positive = sum(1 for s in sizes if s > 0)
        n_negative = sum(1 for s in sizes if s < 0)
        n_nonzero = sum(1 for s in sizes if s != 0)

        signed_total = sum(sizes)
        absolute_total = np.sum([abs(s) for s in sizes])

        # Guard against empty samples so neither the mean nor the ratio
        # divides by zero.
        mean_time = np.mean(times) if times else 0
        if n_packets > 0:
            positive_ratio = n_positive / n_packets
        else:
            positive_ratio = 0

        rows.append([
            signed_total,
            mean_time,
            absolute_total,
            n_nonzero,
            n_positive,
            positive_ratio,
            n_negative,
            n_packets,
        ])
    return np.array(rows)

# Convert the label list to a NumPy array and build the feature matrix
# (one row of summary features per sample).
y = np.array(y)
X = create_features(X1, X2)

## 3. Data split and scaling

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Hold out 20% of the samples for testing; stratifying on y keeps the label
# proportions the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Standardize the features: the scaler is fitted on the training split only
# and the same fitted scaler is then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

## 4. Train the ensemble model

In [5]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier

# Base learner 1: Random Forest.
rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=20,
    max_leaf_nodes=500,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features='sqrt',
    bootstrap=True,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1,
)

# Base learner 2: Gradient Boosting.
gb_model = GradientBoostingClassifier(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=3,
    random_state=42,
)

# Combine both learners with soft voting.
base_learners = [
    ('Random Forest', rf_model),
    ('Gradient Boosting', gb_model),
]
ensemble_model = VotingClassifier(estimators=base_learners, voting='soft')

# Train the ensemble on the scaled training split (kept as the last
# expression so the notebook displays the fitted estimator).
ensemble_model.fit(X_train, y_train)

VotingClassifier(estimators=[('Random Forest',
                              RandomForestClassifier(class_weight='balanced',
                                                     max_depth=20,
                                                     max_features='sqrt',
                                                     max_leaf_nodes=500,
                                                     n_jobs=-1,
                                                     random_state=42)),
                             ('Gradient Boosting',
                              GradientBoostingClassifier(n_estimators=200,
                                                         random_state=42))],
                 voting='soft')

## 5. Test and evaluate model

In [7]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Predict a label for every sample of the held-out test split.
y_pred = ensemble_model.predict(X_test)

# Accuracy plus support-weighted precision, recall and F1 score.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average = 'weighted')
recall = recall_score(y_test, y_pred, average = 'weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# Per-class report. The row names are derived from the labels that actually
# occur in y_test / y_pred instead of a hard-coded range(95), so the report
# also works when USE_SUBLABEL is enabled or a class is missing.
class_ids = np.unique(np.concatenate([y_test, y_pred]))
class_report = classification_report(
    y_test,
    y_pred,
    target_names=[f'Class {i}' for i in class_ids],
)

# Print the results.
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision (Weighted): {precision:.4f}")
print(f"Recall (Weighted): {recall:.4f}")
print(f"F1 Score (Weighted): {f1:.4f}")
print("\nClassification Report:\n", class_report)

Accuracy: 0.4705
Precision (Weighted): 0.4644
Recall (Weighted): 0.4705
F1 Score (Weighted): 0.4626

Classification Report:
               precision    recall  f1-score   support

     Class 0       0.31      0.20      0.24        40
     Class 1       0.26      0.17      0.21        40
     Class 2       0.60      0.60      0.60        40
     Class 3       0.51      0.55      0.53        40
     Class 4       0.28      0.28      0.28        40
     Class 5       0.30      0.35      0.33        40
     Class 6       0.60      0.68      0.64        40
     Class 7       0.38      0.45      0.41        40
     Class 8       0.51      0.57      0.54        40
     Class 9       0.41      0.35      0.38        40
    Class 10       0.35      0.28      0.31        40
    Class 11       0.32      0.33      0.32        40
    Class 12       0.73      0.80      0.76        40
    Class 13       0.32      0.25      0.28        40
    Class 14       0.53      0.42      0.47        40
    Class 

## 6. Hyperparameter Tuning

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, auc, precision_score, recall_score, precision_recall_curve
from scipy.stats import randint

# Sampling space for the Random Forest search.
# scipy's randint(low, high) excludes `high`, e.g. max_depth is drawn from 5..19.
rf_param_dist = dict(
    n_estimators=[1000, 2000, 3000, 4000],
    max_depth=randint(5, 20),
    max_features=['sqrt', 'log2'],
    min_samples_split=randint(2, 10),
    min_samples_leaf=randint(1, 5),
)

# Sampling space for the Gradient Boosting search.
gb_param_dist = dict(
    n_estimators=[1000, 2000, 3000, 4000],
    learning_rate=[0.01, 0.05, 0.1],
    max_depth=randint(3, 10),
    min_samples_split=randint(2, 10),
    min_samples_leaf=randint(1, 5),
)

# Settings shared by both searches: 50 sampled candidates, 3-fold
# cross-validation, accuracy as the selection metric, all CPU cores.
search_settings = dict(
    n_iter=50,
    cv=3,
    scoring='accuracy',
    random_state=42,
    n_jobs=-1,
)

# 1. Randomized search for the Random Forest.
rf_model = RandomForestClassifier(random_state=42)
rf_random_search = RandomizedSearchCV(
    estimator=rf_model,
    param_distributions=rf_param_dist,
    **search_settings,
)
rf_random_search.fit(X_train, y_train)

rf_best_model = rf_random_search.best_estimator_
rf_best_params = rf_random_search.best_params_
print(f"Best Random Forest Parameters: {rf_best_params}")

# 2. Randomized search for Gradient Boosting.
gb_model = GradientBoostingClassifier(random_state=42)
gb_random_search = RandomizedSearchCV(
    estimator=gb_model,
    param_distributions=gb_param_dist,
    **search_settings,
)
gb_random_search.fit(X_train, y_train)

gb_best_model = gb_random_search.best_estimator_
gb_best_params = gb_random_search.best_params_
print(f"Best Gradient Boosting Parameters: {gb_best_params}")

# 3. Rebuild the soft-voting ensemble from the two best estimators.
# Note: this rebinds `ensemble_model`, replacing the earlier ensemble.
tuned_learners = [
    ('Random Forest', rf_best_model),
    ('Gradient Boosting', gb_best_model),
]
ensemble_model = VotingClassifier(estimators=tuned_learners, voting='soft')

# Train the tuned ensemble on the training split.
ensemble_model.fit(X_train, y_train)

10 fits failed out of a total of 150.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
4 fits failed with the following error:
Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\ensemble\_forest.py", line 450, in fit
    trees = Parallel(
  File "C:\ProgramData\Anaconda3\lib\site-packages\joblib\parallel.py", line 1046, in __call__
    while self.dispatch_one_batch(iterator):
  File "C:\ProgramData\Anaconda3\lib\site-packages\joblib\parallel.py", line 861, in dispatch_one_batch
    self._dispatch(tasks)
  File "C:\ProgramData\Ana

Best Random Forest Parameters: {'class_weight': 'balanced', 'max_depth': 15, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 4, 'n_estimators': 759}


## 7. Visualize the performance

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, roc_curve, auc, precision_recall_curve, f1_score
import numpy as np

# 1. Confusion matrix visualization
def plot_confusion_matrix(cm, labels, title="Confusion Matrix", fmt=None):
    """Draw a confusion matrix as an annotated heatmap.

    Parameters
    ----------
    cm : array-like
        Square confusion matrix (raw counts or normalized rates).
    labels : sequence of str
        Tick labels used for both axes.
    title : str
        Figure title.
    fmt : str or None
        Format code for the cell annotations. When None it is chosen from
        the data: "d" for integer matrices, ".2f" otherwise. The original
        always used "d", which raises a formatting error as soon as a float
        (e.g. row-normalized) matrix is passed.
    """
    if fmt is None:
        is_integer = np.issubdtype(np.asarray(cm).dtype, np.integer)
        fmt = "d" if is_integer else ".2f"

    plt.figure(figsize=(12, 10))
    sns.heatmap(cm, annot=True, fmt=fmt, cmap="Blues", xticklabels=labels, yticklabels=labels, cbar=False)
    plt.title(title)
    plt.xlabel('Predicted Labels')
    plt.ylabel('True Labels')
    plt.xticks(rotation=90)
    plt.yticks(rotation=0)
    plt.show()
    
# Recompute the predictions from the *current* ensemble_model. Without this,
# y_pred may still hold the predictions of an earlier ensemble if
# ensemble_model was rebound after y_pred was computed, and the confusion
# matrix would describe a different model than the probability-based plots.
y_pred = ensemble_model.predict(X_test)

# Use the labels that actually occur instead of a hard-coded range(95) so the
# matrix rows/columns and the tick labels always line up.
class_ids = np.unique(np.concatenate([y_test, y_pred]))
cm = confusion_matrix(y_test, y_pred, labels=class_ids)

# Row-normalize (each row sums to 1); rows without any true sample stay 0
# instead of producing NaN through a division by zero.
row_totals = cm.sum(axis=1, keepdims=True)
cm_normalized = np.divide(
    cm.astype('float'),
    row_totals,
    out=np.zeros(cm.shape, dtype=float),
    where=row_totals != 0,
)
labels = [f'Class {i}' for i in class_ids]

# Plot the raw-count confusion matrix.
plot_confusion_matrix(cm, labels)

# Plot the row-normalized confusion matrix.
plot_confusion_matrix(cm_normalized, labels, title="Normalized Confusion Matrix")

In [None]:
# Get the predicted class probabilities of the current ensemble_model for the
# test split; the result is passed to the plotting calls that follow.
y_scores = ensemble_model.predict_proba(X_test)

# 2. ROC curve visualization
def plot_roc_curve(y_test, y_scores, n_classes):
    """Plot one one-vs-rest ROC curve per class on a single figure.

    Parameters
    ----------
    y_test : array-like
        True labels; compared with each class index via ``y_test == i``.
    y_scores : 2-D array
        Score matrix; column ``i`` is used as the score for class ``i``.
    n_classes : int
        Number of classes (curves) to draw, starting at class 0.
    """
    plt.figure(figsize=(8, 6))

    for cls in range(n_classes):
        is_cls = (y_test == cls)
        false_pos_rate, true_pos_rate, _ = roc_curve(is_cls, y_scores[:, cls])
        area = auc(false_pos_rate, true_pos_rate)
        plt.plot(
            false_pos_rate,
            true_pos_rate,
            label=f'Class {cls} (AUC = {area:.2f})',
        )

    # Dashed diagonal as a visual reference line.
    plt.plot([0, 1], [0, 1], color='gray', linestyle='--')

    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic Curve')
    plt.legend(loc='lower right')
    plt.show()

# Take the class count from the score matrix (one column per class) instead
# of hard-coding 95, so the call stays valid if the label set changes.
plot_roc_curve(y_test, y_scores, y_scores.shape[1])

In [None]:
# 3. Precision-recall curve visualization
def plot_precision_recall_curve(y_test, y_scores, n_classes):
    """Plot one one-vs-rest precision-recall curve per class on a single figure.

    Parameters
    ----------
    y_test : array-like
        True labels; compared with each class index via ``y_test == i``.
    y_scores : 2-D array
        Score matrix; column ``i`` is used as the score for class ``i``.
    n_classes : int
        Number of classes (curves) to draw, starting at class 0.
    """
    plt.figure(figsize=(8, 6))

    for cls in range(n_classes):
        is_cls = (y_test == cls)
        prec, rec, _ = precision_recall_curve(is_cls, y_scores[:, cls])
        # Area under the PR curve, integrated over recall.
        area = auc(rec, prec)
        plt.plot(rec, prec, label=f'Class {cls} (AUC = {area:.2f})')

    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall Curve')
    plt.legend(loc='lower left')
    plt.show()
    
# Take the class count from the score matrix (one column per class) instead
# of hard-coding 95, so the call stays valid if the label set changes.
plot_precision_recall_curve(y_test, y_scores, y_scores.shape[1])

In [None]:
# 4. Performance as a function of the decision threshold
def plot_threshold_performance(y_test, y_scores):
    """Plot weighted precision, recall and F1 against a probability threshold.

    For every threshold each class is treated one-vs-rest: a sample counts as
    predicted for class k when ``y_scores[:, k] >= threshold``.

    Parameters
    ----------
    y_test : array-like of shape (n_samples,)
        True class labels.
    y_scores : array-like of shape (n_samples, n_classes)
        Predicted class probabilities.
        Assumes column k holds the probability of class label k, which is
        true when the labels are 0..n_classes-1 - confirm if labels change.
    """
    y_scores = np.asarray(y_scores)
    n_classes = y_scores.shape[1]

    # One-hot encode the true labels. The thresholded scores form an
    # (n_samples, n_classes) indicator matrix, and the metric functions
    # reject a mix of 1-D multiclass targets and 2-D indicator predictions,
    # which is what the original code passed.
    y_true_onehot = (
        np.asarray(y_test)[:, np.newaxis] == np.arange(n_classes)
    ).astype(int)

    # 0.00, 0.05, ..., 1.00. The original np.arange(0.0, 1.1, 0.05) also
    # produced 1.05, which is not a valid probability threshold.
    thresholds = np.linspace(0.0, 1.0, 21)
    precision_vals = []
    recall_vals = []
    f1_vals = []

    for threshold in thresholds:
        y_pred_onehot = (y_scores >= threshold).astype(int)
        # zero_division=0 keeps the default score of 0 for classes without
        # any predicted/true sample but suppresses the repeated warnings.
        precision_vals.append(
            precision_score(y_true_onehot, y_pred_onehot, average='weighted', zero_division=0)
        )
        recall_vals.append(
            recall_score(y_true_onehot, y_pred_onehot, average='weighted', zero_division=0)
        )
        f1_vals.append(
            f1_score(y_true_onehot, y_pred_onehot, average='weighted', zero_division=0)
        )

    plt.figure(figsize=(8, 6))
    plt.plot(thresholds, precision_vals, label='Precision', marker='o')
    plt.plot(thresholds, recall_vals, label='Recall', marker='o')
    plt.plot(thresholds, f1_vals, label='F1-Score', marker='o')
    plt.xlabel('Threshold')
    plt.ylabel('Score')
    plt.title('Threshold vs Performance')
    plt.legend()
    plt.show()
    
# Plot the threshold/performance curves for the test labels and the
# predicted class probabilities.
plot_threshold_performance(y_test, y_scores)