In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report, accuracy_score, recall_score, f1_score
import tensorflow as tf
import scikeras
from scikeras.wrappers import KerasClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout

# Load the raw flow records for this notebook.
file_path = 'dataset/Wednesday-workingHours.pcap_ISCX.csv'
data = pd.read_csv(file_path)

# Shuffle the rows deterministically. sample() already returns a new frame,
# so `data` itself is left untouched.
df = data.sample(frac=1, random_state=42).reset_index(drop=True)

# Normalise column names: trim surrounding whitespace, then turn spaces and
# slashes into underscores.
cleaned_columns = df.columns.str.strip().str.replace('[ /]', '_', regex=True)
df.columns = cleaned_columns

# Map +/-inf to NaN and then fill every NaN with 0.
df = df.replace([np.inf, -np.inf], np.nan).fillna(0)

# Features are every column except the label.
y = df['Label']
X = df.drop(columns='Label')

# Binary target: 0 for rows labelled 'BENIGN', 1 for everything else.
y_binary = np.where(y != 'BENIGN', 1, 0)

# Stratified 85/15 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_binary,
    test_size=0.15,
    random_state=42,
    stratify=y_binary,
)

for split_name, split_frame in (("Train", X_train), ("Test", X_test)):
    print(f"{split_name} 데이터 크기: {split_frame.shape}")
print("-" * 50)


Train 데이터 크기: (588797, 78)
Test 데이터 크기: (103906, 78)
--------------------------------------------------


In [5]:
print("## LSTM 모델 학습 및 평가 ##")

# Feature scaling: the min-max scaler is fitted on the training split only and
# then applied to both splits.
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Reshape to the (samples, timesteps, features) layout the LSTM layer expects.
# Each sample is treated as a single timestep, so a length-1 axis is inserted
# in the middle.
X_train_lstm = X_train_scaled[:, np.newaxis, :]
X_test_lstm = X_test_scaled[:, np.newaxis, :]

def create_lstm_model(lstm_units=100, dense_units=50, dropout_rate=0.2, learning_rate=0.001):
    """Build and compile the LSTM network used for binary classification.

    The input shape is taken from the notebook-level ``X_train_lstm`` array
    (its second and third dimensions), so that array must exist before this
    function is called.

    Args:
        lstm_units: Number of units in the LSTM layer.
        dense_units: Number of units in the hidden Dense layer.
        dropout_rate: Rate applied by both Dropout layers.
        learning_rate: Learning rate handed to the Adam optimizer.

    Returns:
        The compiled ``Sequential`` model (binary cross-entropy loss,
        accuracy metric, single sigmoid output unit).
    """
    timesteps, n_features = X_train_lstm.shape[1], X_train_lstm.shape[2]

    network = Sequential()
    network.add(LSTM(lstm_units, activation='relu', input_shape=(timesteps, n_features)))
    network.add(Dropout(dropout_rate))
    network.add(Dense(dense_units, activation='relu'))
    network.add(Dropout(dropout_rate))
    network.add(Dense(1, activation='sigmoid'))

    network.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return network

# Wrap the Keras builder so scikit-learn can treat it as an estimator.
# scikeras takes the builder through ``model``; ``build_fn`` is the deprecated
# alias for the same argument, so ``model`` is used here instead.
lstm_model = KerasClassifier(model=create_lstm_model)

# Training hyperparameters to search: 3 epoch settings x 2 batch sizes = 6 candidates.
grid_params = {
    'epochs': [10, 25, 50],
    'batch_size': [32, 64]
}

# 3-fold cross-validation over the 6 candidates (18 fits), ranked by accuracy.
grid_search = GridSearchCV(
    estimator=lstm_model,
    param_grid=grid_params,
    cv=3,
    scoring='accuracy',
    verbose=1,
)
grid_search.fit(X_train_lstm, y_train)

print("최적의 파라미터:", grid_search.best_params_)
# best_estimator_ is the KerasClassifier wrapper selected by the search,
# not a bare Keras model.
lstm_best_model = grid_search.best_estimator_



## LSTM 모델 학습 및 평가 ##
Fitting 3 folds for each of 6 candidates, totalling 18 fits


  X, y = self._initialize(X, y)



Epoch 1/10


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


  X, y = self._initialize(X, y)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


  X, y = self._initialize(X, y)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10

KeyboardInterrupt: 

In [None]:

# Model evaluation
# lstm_best_model is the scikeras KerasClassifier wrapper, which has no
# evaluate() method; the fitted, compiled Keras network lives on `.model_`.
# It was compiled with metrics=['accuracy'], so evaluate() yields [loss, accuracy].
loss, accuracy = lstm_best_model.model_.evaluate(X_test_lstm, y_test, verbose=0)

# The wrapper's predict() returns class labels, so probabilities must come from
# predict_proba(); column 1 is taken as the score of class 1 (non-BENIGN).
y_pred_lstm_proba = lstm_best_model.predict_proba(X_test_lstm)[:, 1]
y_pred_lstm = (y_pred_lstm_proba > 0.5).astype(int)

print(f"LSTM 모델 정확도: {accuracy:.4f}")
print(f"LSTM 모델 재현율 (Recall): {recall_score(y_test, y_pred_lstm):.4f}")
print(f"LSTM 모델 F1-Score: {f1_score(y_test, y_pred_lstm):.4f}")
print("LSTM 모델 성능 리포트:")
print(classification_report(y_test, y_pred_lstm, target_names=['BENIGN (0)', 'ATTACK (1)']))
print("-" * 50)

TypeError: Cannot clone object '<keras.src.engine.sequential.Sequential object at 0x000002945FDB3F50>' (type <class 'keras.src.engine.sequential.Sequential'>): it does not seem to be a scikit-learn estimator as it does not implement a 'get_params' method.

In [None]:
## ROC curve and AUC
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

# Use the second predict_proba column as the score fed to the ROC curve
# (presumably the probability of class 1, i.e. non-BENIGN — confirm against
# the wrapper's class ordering).
y_pred_lstm_proba = lstm_best_model.predict_proba(X_test_lstm)[:, 1]
# False/true positive rates over all score thresholds, and the area under that curve.
fpr, tpr, thresholds = roc_curve(y_test, y_pred_lstm_proba)
roc_auc = auc(fpr, tpr)
plt.figure(figsize=(8, 6))
# NOTE(review): the curve carries a label, but no plt.legend() call is visible
# in this span — confirm one follows, otherwise the AUC label is never drawn.
plt.plot(fpr, tpr, color='red', label=f'LSTM (AUC = {roc_auc:.2f})')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
# Dashed grey diagonal from (0, 0) to (1, 1) as a visual reference line.
plt.plot([0, 1], [0, 1], color='grey', linestyle='--')
plt.title('ROC Curve for LSTM Model')