In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, recall_score, f1_score

# Relative location of the CSV file that the notebook analyses.
file_path = 'dataset/Wednesday-workingHours.pcap_ISCX.csv'
# Load the CSV into a DataFrame.
data = pd.read_csv(filepath_or_buffer=file_path)

In [2]:
# Shuffle the rows with a fixed seed and renumber the index from zero.
# sample() returns a new frame, so `data` itself is left unmodified.
df = data.sample(frac=1, random_state=42).reset_index(drop=True)

# Tidy the column names: trim surrounding whitespace, then replace every
# space or slash with an underscore.
trimmed_names = df.columns.str.strip()
df.columns = trimmed_names.str.replace('[ /]', '_', regex=True)

# Turn +/-inf into NaN, then fill every NaN with 0.
df = df.replace([np.inf, -np.inf], np.nan).fillna(0)

# Target: 0 where the label is exactly 'BENIGN', 1 for everything else.
y = df['Label']
y_binary = np.where(y == 'BENIGN', 0, 1)
# Features: every column except the label.
X = df.drop(columns='Label')

# Hold out 15% of the rows as a test set, stratified on the binary target.
split_parts = train_test_split(
    X,
    y_binary,
    test_size=0.15,
    random_state=42,
    stratify=y_binary,
)
X_train, X_test, y_train, y_test = split_parts

print(f"Train 데이터 크기: {X_train.shape}")
print(f"Test 데이터 크기: {X_test.shape}")
print("-" * 50)

Train 데이터 크기: (588797, 78)
Test 데이터 크기: (103906, 78)
--------------------------------------------------


In [4]:
print("## 1. 전체 Feature를 사용한 모델 학습 및 평가 ##")

# Hyperparameter candidates: 2 x 3 x 3 = 18 combinations.
# This dict is also read by later cells in the notebook.
param_grid = {
    'n_estimators': [50, 100],
    'max_depth': [10, 20, None],
    'min_samples_leaf': [1, 3, 5],
}

# Seeded random forest used as the template estimator for the search.
base_forest = RandomForestClassifier(random_state=42, n_jobs=-1)

# 3-fold cross-validated grid search, ranking candidates by F1.
grid_search = GridSearchCV(base_forest, param_grid, scoring='f1', cv=3)
grid_search.fit(X_train, y_train)

print("최적의 하이퍼파라미터:", grid_search.best_params_)
model_full = grid_search.best_estimator_

# Score the selected model on the held-out test split.
y_pred_full = model_full.predict(X_test)
accuracy_full, recall_full, f1_full = (
    metric(y_test, y_pred_full)
    for metric in (accuracy_score, recall_score, f1_score)
)

print(f"모델 정확도: {accuracy_full:.4f}")
print(f"재현율 (Recall): {recall_full:.4f}")
print(f"F1-Score: {f1_full:.4f}")
print("성능 리포트:")
report_full = classification_report(
    y_test,
    y_pred_full,
    target_names=['BENIGN (0)', 'ATTACK (1)'],
)
print(report_full)
print("-" * 50)

## 1. 전체 Feature를 사용한 모델 학습 및 평가 ##
최적의 하이퍼파라미터: {'max_depth': 20, 'min_samples_leaf': 1, 'n_estimators': 100}
모델 정확도: 0.9995
재현율 (Recall): 0.9996
F1-Score: 0.9994
성능 리포트:
              precision    recall  f1-score   support

  BENIGN (0)       1.00      1.00      1.00     66005
  ATTACK (1)       1.00      1.00      1.00     37901

    accuracy                           1.00    103906
   macro avg       1.00      1.00      1.00    103906
weighted avg       1.00      1.00      1.00    103906

--------------------------------------------------


In [5]:
print("## 2. Gini 중요도 기반 상위 10개 특성 추출 ##")

# Pair each feature column with the importance reported by the fitted forest,
# then order the table from most to least important.
importances = model_full.feature_importances_
feature_importances = pd.DataFrame(
    {'feature': X.columns, 'importance': importances}
).sort_values(by='importance', ascending=False)

# The ten highest-ranked rows supply both the name list and the printed table.
top_10_rows = feature_importances.head(10)
top_10_feature_names = top_10_rows['feature'].tolist()

print("상위 10개 특성:")
print(top_10_rows)
print("-" * 50)

## 2. Gini 중요도 기반 상위 10개 특성 추출 ##
상위 10개 특성:
                   feature  importance
54    Avg_Bwd_Segment_Size    0.073048
39       Max_Packet_Length    0.066022
10   Bwd_Packet_Length_Max    0.063546
41       Packet_Length_Std    0.055580
52     Average_Packet_Size    0.055403
0         Destination_Port    0.052953
37           Bwd_Packets_s    0.048549
13   Bwd_Packet_Length_Std    0.044506
40      Packet_Length_Mean    0.044022
42  Packet_Length_Variance    0.042012
--------------------------------------------------


In [None]:
print("## 3. 상위 10개 특성만 사용한 모델 학습 및 평가 ##")

# Keep only the ten selected columns, in ranked order, for both splits.
X_train_top10 = X_train.loc[:, top_10_feature_names]
X_test_top10 = X_test.loc[:, top_10_feature_names]

# Same search setup as the full-feature run: seeded forest, shared
# param_grid, 3-fold CV ranked by F1.
forest_top10 = RandomForestClassifier(random_state=42, n_jobs=-1)
grid_search_top10 = GridSearchCV(forest_top10, param_grid, scoring='f1', cv=3)
grid_search_top10.fit(X_train_top10, y_train)

print("최적의 하이퍼파라미터 (상위 10개 특성):", grid_search_top10.best_params_)
model_top10 = grid_search_top10.best_estimator_

# Score the selected model on the held-out test split.
y_pred_top10 = model_top10.predict(X_test_top10)
accuracy_top10, recall_top10, f1_top10 = (
    metric(y_test, y_pred_top10)
    for metric in (accuracy_score, recall_score, f1_score)
)

print(f"모델 정확도: {accuracy_top10:.4f}")
print(f"재현율 (Recall): {recall_top10:.4f}")
print(f"F1-Score: {f1_top10:.4f}")
print("성능 리포트:")
report_top10 = classification_report(
    y_test,
    y_pred_top10,
    target_names=['BENIGN (0)', 'ATTACK (1)'],
)
print(report_top10)
print("-" * 50)

## 3. 상위 10개 특성만 사용한 모델 학습 및 평가 ##


In [None]:
print("## 4. 상위 6개 특성만 사용한 모델 학습 및 평가 ##")

# The six best features are the first six entries of the ranked top-10 list.
top_6_feature_names = top_10_feature_names[:6]
X_train_top6 = X_train.loc[:, top_6_feature_names]
X_test_top6 = X_test.loc[:, top_6_feature_names]

# Same search setup as the other runs: seeded forest, shared param_grid,
# 3-fold CV ranked by F1.
forest_top6 = RandomForestClassifier(random_state=42, n_jobs=-1)
grid_search_top6 = GridSearchCV(forest_top6, param_grid, scoring='f1', cv=3)
grid_search_top6.fit(X_train_top6, y_train)

print("최적의 하이퍼파라미터 (상위 6개 특성):", grid_search_top6.best_params_)
model_top6 = grid_search_top6.best_estimator_

# Score the selected model on the held-out test split.
y_pred_top6 = model_top6.predict(X_test_top6)
accuracy_top6, recall_top6, f1_top6 = (
    metric(y_test, y_pred_top6)
    for metric in (accuracy_score, recall_score, f1_score)
)

print(f"모델 정확도: {accuracy_top6:.4f}")
print(f"재현율 (Recall): {recall_top6:.4f}")
print(f"F1-Score: {f1_top6:.4f}")
print("성능 리포트:")
report_top6 = classification_report(
    y_test,
    y_pred_top6,
    target_names=['BENIGN (0)', 'ATTACK (1)'],
)
print(report_top6)
print("-" * 50)

In [None]:
print("## 5. 최종 성능 비교 ##")

# One row per trained model:
# (section heading, tuned hyperparameters, accuracy, recall, F1).
comparison_rows = [
    ("[전체 특성 사용 모델]", grid_search.best_params_,
     accuracy_full, recall_full, f1_full),
    ("[상위 10개 특성 사용 모델]", grid_search_top10.best_params_,
     accuracy_top10, recall_top10, f1_top10),
    ("[상위 6개 특성 사용 모델]", grid_search_top6.best_params_,
     accuracy_top6, recall_top6, f1_top6),
]

# Print the same five-line summary for every model, in the order listed above.
for row_heading, row_params, row_accuracy, row_recall, row_f1 in comparison_rows:
    print(row_heading)
    print(f" - 하이퍼파라미터: {row_params}")
    print(f" - 정확도 (Accuracy): {row_accuracy:.4f}")
    print(f" - 재현율 (Recall): {row_recall:.4f}")
    print(f" - F1-Score: {row_f1:.4f}")

모델 정확도: 0.9967
재현율 (Recall): 0.9996
F1-Score: 0.9955
성능 리포트:
              precision    recall  f1-score   support

  BENIGN (0)       1.00      1.00      1.00     66005
  ATTACK (1)       0.99      1.00      1.00     37901

    accuracy                           1.00    103906
   macro avg       1.00      1.00      1.00    103906
weighted avg       1.00      1.00      1.00    103906

--------------------------------------------------
## 4. 최종 성능 비교 ##
[전체 특성 사용 모델]
 - 정확도 (Accuracy): 0.9995
 - 재현율 (Recall): 0.9996
 - F1-Score: 0.9994
[상위 10개 특성 사용 모델]
 - 정확도 (Accuracy): 0.9967
 - 재현율 (Recall): 0.9996
 - F1-Score: 0.9955
