In [30]:
import ipaddress
import os
import time

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.utils import class_weight
from sklearn.metrics import classification_report, accuracy_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout

In [None]:
def convert_ip2int(ip):
    try:
        return int(ipaddress.ip_address(ip))
    except ValueError:
        return 0

def preprocess_sequence_data(df, time_window='1T', time_steps=10):
    df.columns = df.columns.str.replace('[ /]', '_', regex=True)
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    
    df.dropna(subset=['Timestamp'], inplace=True)
    df['Timestamp'] = pd.to_datetime(df['Timestamp'], errors='coerce')
    df['Source_IP'] = df['Source_IP'].astype(str).apply(convert_ip2int)
    df['Destination_IP'] = df['Destination_IP'].astype(str).apply(convert_ip2int)
    
    df.drop(columns=['Flow_ID'], inplace=True, errors='ignore')
    df.dropna(inplace=True)
    
    df.set_index('Timestamp', inplace=True)
    df['Label'] = np.where(df['Label'] == 'BENIGN', 0, 1)

    numeric_cols = df.select_dtypes(include=np.number).columns.tolist()
    agg_funcs = {col: 'mean' for col in numeric_cols if col not in ['Label']}
    agg_funcs.update({'Label': 'max'})
    special_aggs = {
        'Total_Fwd_Packets': 'sum', 'Total_Backward_Packets': 'sum',
        'Total_Length_of_Fwd_Packets': 'sum', 'Source_IP': 'nunique', 'Destination_IP': 'nunique'
    }
    for col, func in special_aggs.items():
        if col in df.columns:
            agg_funcs[col] = func
    agg_funcs = {k: v for k, v in agg_funcs.items() if k in df.columns}

    df_agg = df.resample(time_window).agg(agg_funcs)
    df_agg.fillna(0, inplace=True)
    
    X_agg = df_agg.drop('Label', axis=1).values
    y_agg = df_agg['Label'].values
    
    X_seq, y_seq = [], []
    if len(X_agg) > time_steps:
        for i in range(len(X_agg) - time_steps):
            X_seq.append(X_agg[i:(i + time_steps)])
            y_seq.append(y_agg[i + time_steps])

    return np.array(X_seq), np.array(y_seq)


In [32]:
data_path = 'dataset/'
all_files = [f for f in os.listdir(data_path) if f.endswith('.csv')]
train_files = [f for f in all_files if 'Wednesday' not in f]
test_files = [f for f in all_files if 'Wednesday' in f]

TIME_STEPS = 10
TIME_WINDOW = '1T'

X_train_list, y_train_list = [], []
for file_name in train_files:
    file_path = os.path.join(data_path, file_name)
    df = pd.read_csv(file_path, encoding='latin1', low_memory=False, on_bad_lines='skip')
    df.columns = df.columns.str.strip()

    X_temp, y_temp = preprocess_sequence_data(df, time_window=TIME_WINDOW, time_steps=TIME_STEPS)
    X_train_list.append(X_temp)
    y_train_list.append(y_temp)

X_test_list, y_test_list = [], []
for file_name in test_files:
    file_path = os.path.join(data_path, file_name)
    df = pd.read_csv(file_path, encoding='latin1', low_memory=False, on_bad_lines='skip')
    df.columns = df.columns.str.strip()

    X_temp, y_temp = preprocess_sequence_data(df, time_window=TIME_WINDOW, time_steps=TIME_STEPS)
    X_test_list.append(X_temp)
    y_test_list.append(y_temp)

X_train_seq = np.concatenate(X_train_list, axis=0)
y_train_seq = np.concatenate(y_train_list, axis=0)
X_test_seq = np.concatenate(X_test_list, axis=0)
y_test_seq = np.concatenate(y_test_list, axis=0)

scaler = StandardScaler()

nsamples, nsteps, nfeatures = X_train_seq.shape
X_train_2d = X_train_seq.reshape((nsamples * nsteps, nfeatures))
X_train_scaled_2d = scaler.fit_transform(X_train_2d)
X_train_scaled_seq = X_train_scaled_2d.reshape((nsamples, nsteps, nfeatures))

nsamples_test, nsteps_test, nfeatures_test = X_test_seq.shape
X_test_2d = X_test_seq.reshape((nsamples_test * nsteps_test, nfeatures_test))
X_test_scaled_2d = scaler.transform(X_test_2d)
X_test_scaled_seq = X_test_scaled_2d.reshape((nsamples_test, nsteps_test, nfeatures_test))

print(f"학습 데이터 형태: {X_train_seq.shape}, {y_train_seq.shape}")
print(f"테스트 데이터 형태: {X_test_seq.shape}, {y_test_seq.shape}")
print("-" * 50)

  df_agg = df.resample(time_window).agg(agg_funcs)
  df_agg = df.resample(time_window).agg(agg_funcs)
  df_agg = df.resample(time_window).agg(agg_funcs)
  df_agg = df.resample(time_window).agg(agg_funcs)
  df_agg = df.resample(time_window).agg(agg_funcs)
  df_agg = df.resample(time_window).agg(agg_funcs)
  df_agg = df.resample(time_window).agg(agg_funcs)
  df_agg = df.resample(time_window).agg(agg_funcs)


ValueError: setting an array element with a sequence.

In [None]:
class_weights = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train_seq),
    y=y_train_seq
)
class_weights_dict = dict(enumerate(class_weights))

print(f"클래스 가중치: {class_weights_dict}")

클래스 가중치: {0: np.float64(0.6132075471698113), 1: np.float64(2.7083333333333335)}


In [None]:
## LSTM
def create_lstm_model(input_shape, lstm_units=100, dense_units=50, dropout_rate=0.2, learning_rate=0.001):
    model = Sequential([
        LSTM(lstm_units, activation='tanh', input_shape=input_shape),
        Dropout(dropout_rate),
        Dense(dense_units, activation='relu'),
        Dropout(dropout_rate),
        Dense(1, activation='sigmoid')
    ])
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
    return model

print("LSTM 모델 학습 시작...")
LSTM_starttime = time.time()

input_shape = (X_train_scaled_seq.shape[1], X_train_scaled_seq.shape[2])

lstm_model = create_lstm_model(input_shape=input_shape)

history = lstm_model.fit(
    X_train_scaled_seq,
    y_train_seq,
    epochs=25,
    batch_size=32,
    verbose=1,
    class_weight=class_weights_dict
)
LSTM_endtime = time.time()
LSTM_time = LSTM_endtime - LSTM_starttime
print("LSTM 모델 학습 완료.")

LSTM 모델 학습 시작...
Epoch 1/25


  super().__init__(**kwargs)


[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 8ms/step - accuracy: 0.5911 - loss: 0.5870
Epoch 2/25
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.8574 - loss: 0.3400
Epoch 3/25
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.9035 - loss: 0.2191
Epoch 4/25
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.9118 - loss: 0.2000
Epoch 5/25
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.9444 - loss: 0.1619
Epoch 6/25
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.9446 - loss: 0.1446
Epoch 7/25
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.9591 - loss: 0.1004
Epoch 8/25
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.9507 - loss: 0.1202
Epoch 9/25
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m

In [None]:

y_pred_lstm_prob = lstm_model.predict(X_test_scaled_seq).flatten()
y_pred_lstm = (y_pred_lstm_prob > 0.5).astype(int)
accuracy_lstm = accuracy_score(y_test_seq, y_pred_lstm)
recall_lstm = recall_score(y_test_seq, y_pred_lstm)
f1_lstm = f1_score(y_test_seq, y_pred_lstm)
report = classification_report(y_test_seq, y_pred_lstm, target_names=['BENIGN (0)', 'ATTACK (1)'])

print("\nEvaluating model performance on the Test set...")
# Predict on the separate test set
print(f"\n--- LSTM Results---")
print(f"Accuracy: {accuracy_lstm:.4f}")
print(f"Recall:   {recall_lstm:.4f}")
print(f"F1-Score: {f1_lstm:.4f}")
print("\nClassification Report:")
print(report)

[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step

Evaluating model performance on the Test set...

--- LSTM Results---
Accuracy: 0.8408
Recall:   0.2907
F1-Score: 0.3067

Classification Report:
              precision    recall  f1-score   support

  BENIGN (0)       0.90      0.92      0.91       624
  ATTACK (1)       0.32      0.29      0.31        86

    accuracy                           0.84       710
   macro avg       0.61      0.60      0.61       710
weighted avg       0.83      0.84      0.84       710

