# 1. 데이터 로드
`mon_standard.pkl`과 `unmon_standard10_3000.pkl` 파일에서 데이터를 로드합니다.  
각 파일은 Monitored와 Unmonitored 데이터를 포함하며, pickle 형식으로 저장되어 있습니다.
.


In [1]:
import pickle

# Load the pickle files
print("Loading data...")
with open("mon_standard.pkl", 'rb') as fi:  # Monitored 데이터 로드
    mon_data = pickle.load(fi)

with open("unmon_standard10_3000.pkl", 'rb') as fi:  # Unmonitored 데이터 로드
    unmon_data = pickle.load(fi)


Loading data...


# 2. 데이터 전처리
Monitored와 Unmonitored 데이터를 각각 처리하여 X1 (timestamps), X2 (direction * size), y (label) 배열로 변환합니다.  
데이터 구조에 따라 `process_data` 함수가 `dict`와 `list` 데이터를 처리합니다.


In [4]:
def process_data(data, label_value):
    """
    데이터 전처리 함수
    :param data: dict 또는 list 형태의 데이터
    :param label_value: 라벨 값 (monitored=1, unmonitored=-1)
    :return: 전처리된 데이터 (X1, X2, y)
    """
    X1, X2, y = [], [], []

    if isinstance(data, dict):  # Monitored 데이터
        for samples in data.values():
            for sample in samples:
                X1.append([abs(c) for c in sample])
                X2.append([(1 if c > 0 else -1) * 512 for c in sample])
                y.append(label_value)
    elif isinstance(data, list):  # Unmonitored 데이터
        for sample in data:
            if isinstance(sample, list):
                X1.append([abs(c) for c in sample])
                X2.append([(1 if c > 0 else -1) * 512 for c in sample])
                y.append(label_value)

    return X1, X2, y

# Monitored 데이터 처리
X1_mon, X2_mon, y_mon = process_data(mon_data, label_value=1)

# Unmonitored 데이터 처리
X1_unmon, X2_unmon, y_unmon = process_data(unmon_data, label_value=-1)

# 데이터 결합
X1 = X1_mon + X1_unmon
X2 = X2_mon + X2_unmon
y = y_mon + y_unmon


# 3. 피처 생성
X1 (timestamps)와 X2 (direction * size)를 기반으로 피처를 생성합니다.  
각 샘플의 주요 특성을 나타내는 8개의 피처를 추출하여 학습에 사용합니다.


In [7]:
import numpy as np

def create_features(X1, X2):
    X = []
    for i in range(len(X1)):
        packet_size_direction = sum(X2[i])
        cumulative_packet_size = np.sum([abs(c) for c in X2[i]])
        burst_lengths = len([c for c in X2[i] if c != 0])

        num_incoming_packets = len([c for c in X2[i] if c > 0])
        ratio_incoming_packets = num_incoming_packets / len(X2[i]) if len(X2[i]) > 0 else 0
        num_outgoing_packets = len([c for c in X2[i] if c < 0])
        total_packet_count = len(X2[i])

        feature_vector = [
            packet_size_direction,
            np.mean(X1[i]) if X1[i] else 0,
            cumulative_packet_size,
            burst_lengths,
            num_incoming_packets,
            ratio_incoming_packets,
            num_outgoing_packets,
            total_packet_count
        ]
        X.append(feature_vector)
    return np.array(X)

# 피처 생성
X = create_features(X1, X2)
y = np.array(y)  # 레이블을 numpy array로 변환


# 4. 데이터 분할 및 스케일링
전체 데이터를 학습용과 테스트용으로 나누고, StandardScaler를 사용하여 데이터를 정규화합니다.


In [10]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# 데이터 분할
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# 스케일링
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


# 5. 모델 생성 및 학습
Random Forest와 Gradient Boosting 모델을 생성하고, VotingClassifier로 앙상블을 구성하여 학습합니다.


In [13]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier

# Random Forest 모델
rf_model = RandomForestClassifier(
    n_estimators=3000, max_depth=400, max_leaf_nodes=3000,
    min_samples_split=2, min_samples_leaf=1, max_features='sqrt',
    bootstrap=True, class_weight='balanced', random_state=42, n_jobs=-1
)

# Gradient Boosting 모델
gb_model = GradientBoostingClassifier(
    n_estimators=1000, learning_rate=0.1, max_depth=5, random_state=42
)

# 앙상블 모델 생성
ensemble_model = VotingClassifier(
    estimators=[('Random Forest', rf_model), ('Gradient Boosting', gb_model)],
    voting='soft'
)

# 모델 학습
ensemble_model.fit(X_train, y_train)


# 6. 모델 평가
학습된 모델을 평가합니다.  


In [16]:
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, auc, precision_score, recall_score, precision_recall_curve

# 예측 결과
y_pred = ensemble_model.predict(X_test)

# 1. Accuracy와 Confusion Matrix
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"Confusion Matrix\n{conf_matrix}")

# 2. TPR, FPR, Precision, Recall, ROC AUC, Precision-Recall AUC
fpr, tpr, _ = roc_curve(y_test, y_pred, pos_label=1)
roc_auc = auc(fpr, tpr)

precision = precision_score(y_test, y_pred, pos_label=1)
recall = recall_score(y_test, y_pred, pos_label=1)

precision_values, recall_values, _ = precision_recall_curve(y_test, y_pred, pos_label=1)
pr_auc = auc(recall_values, precision_values)

# TPR과 FPR 계산
true_positive_rate = tpr[1] if len(tpr) > 1 else 0  # TPR
false_positive_rate = fpr[1] if len(fpr) > 1 else 0  # FPR

print("\nAdditional Metrics:")
print(f"True Positive Rate (TPR): {true_positive_rate}")
print(f"False Positive Rate (FPR): {false_positive_rate}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"ROC AUC: {roc_auc}")
print(f"Precision-Recall AUC: {pr_auc}")

Accuracy: 0.8645454545454545
Confusion Matrix
[[  60  540]
 [  56 3744]]

Additional Metrics:
True Positive Rate (TPR): 0.9852631578947368
False Positive Rate (FPR): 0.9
Precision: 0.8739495798319328
Recall: 0.9852631578947368
ROC AUC: 0.5426315789473684
Precision-Recall AUC: 0.9359700052269713


In [None]:
RandomizedSearchCV 하이퍼파라미터 튜닝
이 코드는 Random Forest와 Gradient Boosting 모델의 하이퍼파라미터를 최적화하기 위해 RandomizedSearchCV를 사용합니다.
최적화된 모델을 결합한 앙상블 모델(VotingClassifier)을 구성하고, 이를 학습 및 평가합니다.



In [19]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, auc, precision_score, recall_score, precision_recall_curve
from scipy.stats import randint

# Random Forest 하이퍼파라미터 랜덤 샘플링 범위
rf_param_dist = {
    'n_estimators': randint(100, 1000),
    'max_depth': randint(10, 50),
    'max_features': ['sqrt', 'log2'],
    'min_samples_split': randint(2, 10),
    'min_samples_leaf': randint(1, 5),
    'class_weight': ['balanced', None]
}

# Gradient Boosting 하이퍼파라미터 랜덤 샘플링 범위
gb_param_dist = {
    'n_estimators': randint(100, 1000),
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'max_depth': randint(3, 10),
    'min_samples_split': randint(2, 10),
    'min_samples_leaf': randint(1, 5)
}

# 1. Random Forest RandomizedSearchCV
rf_model = RandomForestClassifier(random_state=42)
rf_random_search = RandomizedSearchCV(
    estimator=rf_model,
    param_distributions=rf_param_dist,
    n_iter=50,  # Number of iterations for random search
    cv=3,  # 3-fold cross-validation
    scoring='accuracy',
    random_state=42,
    n_jobs=-1
)
rf_random_search.fit(X_train, y_train)

rf_best_params = rf_random_search.best_params_
rf_best_model = rf_random_search.best_estimator_
print(f"Best Random Forest Parameters: {rf_best_params}")

# 2. Gradient Boosting RandomizedSearchCV
gb_model = GradientBoostingClassifier(random_state=42)
gb_random_search = RandomizedSearchCV(
    estimator=gb_model,
    param_distributions=gb_param_dist,
    n_iter=50,  # Number of iterations for random search
    cv=3,
    scoring='accuracy',
    random_state=42,
    n_jobs=-1
)
gb_random_search.fit(X_train, y_train)

gb_best_params = gb_random_search.best_params_
gb_best_model = gb_random_search.best_estimator_
print(f"Best Gradient Boosting Parameters: {gb_best_params}")

# 3. 최적 모델로 앙상블 구성
ensemble_model = VotingClassifier(
    estimators=[('Random Forest', rf_best_model), ('Gradient Boosting', gb_best_model)],
    voting='soft'
)

# 앙상블 모델 학습
ensemble_model.fit(X_train, y_train)

# 평가
y_pred = ensemble_model.predict(X_test)

# Accuracy와 Confusion Matrix
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
print(f"\nAccuracy after tuning: {accuracy}")
print(f"Confusion Matrix after tuning\n{conf_matrix}")

# 추가 평가 지표
fpr, tpr, _ = roc_curve(y_test, y_pred, pos_label=1)
roc_auc = auc(fpr, tpr)

precision = precision_score(y_test, y_pred, pos_label=1)
recall = recall_score(y_test, y_pred, pos_label=1)

precision_values, recall_values, _ = precision_recall_curve(y_test, y_pred, pos_label=1)
pr_auc = auc(recall_values, precision_values)

print("\nAdditional Metrics after tuning:")
print(f"True Positive Rate (TPR): {tpr[1] if len(tpr) > 1 else 0}")
print(f"False Positive Rate (FPR): {fpr[1] if len(fpr) > 1 else 0}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"ROC AUC: {roc_auc}")
print(f"Precision-Recall AUC: {pr_auc}")


Best Random Forest Parameters: {'class_weight': None, 'max_depth': 16, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 9, 'n_estimators': 134}
Best Gradient Boosting Parameters: {'learning_rate': 0.01, 'max_depth': 7, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 127}

Accuracy after tuning: 0.8670454545454546
Confusion Matrix after tuning
[[  28  572]
 [  13 3787]]

Additional Metrics after tuning:
True Positive Rate (TPR): 0.996578947368421
False Positive Rate (FPR): 0.9533333333333334
Precision: 0.8687772424868089
Recall: 0.996578947368421
ROC AUC: 0.5216228070175438
Precision-Recall AUC: 0.9341553676548877
