## Data Preprocessing

In [None]:
import pandas as pd

# Load the training data from CSV and preview the first rows.
df = pd.read_csv('data/train_data.csv')
df.head()

Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,80,84918108,10,9,60,0,6,6,6.0,0.0,...,20,23829.5,452.434999,24928,23552,10000000.0,4392.518052,10000000,10000000,BENIGN
1,41850,66,1,1,0,0,0,0,0.0,0.0,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
2,443,5705862,7,4,617,164,517,0,88.142857,189.988972,...,20,99556.0,0.0,99556,99556,5606303.0,0.0,5606303,5606303,BENIGN
3,80,98326768,5,7,391,11595,379,0,78.2,168.179071,...,20,11005.0,0.0,11005,11005,98300000.0,0.0,98300000,98300000,DoS Hulk
4,53,204,2,2,88,188,44,44,44.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN


In [22]:
import numpy as np

# Strip leading/trailing whitespace from every column name.
df.columns = df.columns.map(str.strip)

# Turn +/-inf into NaN first, then fill every NaN with 0, in a single chain
# that rebinds `df` to the cleaned frame.
df = df.replace([np.inf, -np.inf], np.nan).fillna(0)

## 결측치/이상치 탐색

In [23]:
# Structural overview of the training frame: shape, dtypes, summary
# statistics and per-column missing-value counts.
print("[+] df.shape")
print(df.shape)

print("\n[+] df.info()")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would append a stray "None" line).
df.info()

print("\n[+] df.describe()")
print(df.describe())

print("\n[+] df.isnull().sum()")
# The cleaning cell has already replaced inf/NaN with 0, so this acts as a
# sanity check: every column is expected to report zero missing values.
print(df.isnull().sum())

[+] df.shape
(2264594, 79)

[+] df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2264594 entries, 0 to 2264593
Data columns (total 79 columns):
 #   Column                       Dtype  
---  ------                       -----  
 0   Destination Port             int64  
 1   Flow Duration                int64  
 2   Total Fwd Packets            int64  
 3   Total Backward Packets       int64  
 4   Total Length of Fwd Packets  int64  
 5   Total Length of Bwd Packets  int64  
 6   Fwd Packet Length Max        int64  
 7   Fwd Packet Length Min        int64  
 8   Fwd Packet Length Mean       float64
 9   Fwd Packet Length Std        float64
 10  Bwd Packet Length Max        int64  
 11  Bwd Packet Length Min        int64  
 12  Bwd Packet Length Mean       float64
 13  Bwd Packet Length Std        float64
 14  Flow Bytes/s                 float64
 15  Flow Packets/s               float64
 16  Flow IAT Mean                float64
 17  Flow IAT Std                 float64
 18  

In [24]:
# Count the rows per traffic label (value_counts sorts by frequency, descending).
df['Label'].value_counts()

Label
BENIGN                        1818663
DoS Hulk                       184587
PortScan                       127219
DDoS                           102444
DoS GoldenEye                    8219
FTP-Patator                      6326
SSH-Patator                      4724
DoS slowloris                    4623
DoS Slowhttptest                 4427
Bot                              1576
Web Attack � Brute Force         1199
Web Attack � XSS                  533
Infiltration                       31
Web Attack � Sql Injection         14
Heartbleed                          9
Name: count, dtype: int64

## Feature Engineering

In [25]:
# Collapse the multi-class traffic label into a binary target:
# 0 = 'BENIGN', 1 = any other label (every attack category).
# A vectorised comparison replaces the per-row Python lambda, which is
# needlessly slow on a frame with millions of rows; the result is the same
# int64 column of 0/1 values.
df['Label_binary'] = df['Label'].ne('BENIGN').astype('int64')
df['Label_binary'].value_counts()

Label_binary
0    1818663
1     445931
Name: count, dtype: int64

## Data Splitting

In [26]:
import numpy as np
from sklearn.model_selection import train_test_split

# Target: the derived binary label.
y = df['Label_binary']
# Features: every column except the original label and the binary target.
X = df.drop(['Label', 'Label_binary'], axis=1)

# 80/20 split, stratified on the target so both parts keep the same class
# ratio; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)


## Model Selection & Training

In [27]:
from sklearn.ensemble import RandomForestClassifier

# Fit a random forest (100 trees, fixed seed, all CPU cores) on the training split.
# fit() returns the estimator itself, so chaining it into the assignment keeps
# the cell free of output.
# The hold-out prediction is intentionally not computed here: the evaluation
# cell calls clf.predict(X_test) itself, so doing it here as well would run
# the full prediction twice.
clf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1).fit(X_train, y_train)

## Model Evaluation

In [28]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix

# Predict labels for the hold-out split.
y_pred = clf.predict(X_test)

# Headline metrics, evaluated in a fixed order: accuracy, precision, recall, F1.
acc, prec, rec, f1 = (
    score(y_test, y_pred)
    for score in (accuracy_score, precision_score, recall_score, f1_score)
)
cm = confusion_matrix(y_test, y_pred)

print(
    f"Accuracy:  {acc:.4f}",
    f"Precision: {prec:.4f}",
    f"Recall:    {rec:.4f}",
    f"F1-score:  {f1:.4f}",
    sep="\n",
)
print("\nConfusion Matrix:\n", cm)

# More detailed report with per-class metrics.
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred, digits=4))

Accuracy:  0.9988
Precision: 0.9971
Recall:    0.9971
F1-score:  0.9971

Confusion Matrix:
 [[363471    262]
 [   262  88924]]

Classification Report:

              precision    recall  f1-score   support

           0     0.9993    0.9993    0.9993    363733
           1     0.9971    0.9971    0.9971     89186

    accuracy                         0.9988    452919
   macro avg     0.9982    0.9982    0.9982    452919
weighted avg     0.9988    0.9988    0.9988    452919



### Save model

In [35]:
import joblib

# Persist the fitted classifier to 'RandomForest.pkl' (path relative to the working directory).
joblib.dump(clf, 'RandomForest.pkl')

['RandomForest.pkl']

## Model Evaluation with Fresh Data

In [36]:
import pandas as pd

# Load the test data from CSV.
test_df = pd.read_csv('data/test_data.csv')

In [37]:
import numpy as np

# Strip leading/trailing whitespace from every column name.
test_df.columns = test_df.columns.map(str.strip)

# Same cleaning as for the training frame: +/-inf becomes NaN, then every NaN
# is filled with 0; `test_df` is rebound to the cleaned frame.
test_df = test_df.replace([np.inf, -np.inf], np.nan).fillna(0)

In [38]:
# Derive the binary target for the test file the same way as for training:
# 0 = 'BENIGN', 1 = any other label.
# A vectorised comparison is used instead of a per-row Python lambda; the
# result is the same int64 column of 0/1 values.
test_df['Label_binary'] = test_df['Label'].ne('BENIGN').astype('int64')
test_df['Label_binary'].value_counts()

Label_binary
0    454434
1    111715
Name: count, dtype: int64

In [39]:
# Split the test file into ground truth and model inputs.
# NOTE: this rebinds X_test / y_test, which previously held the hold-out part
# of the train/test split.
y_test = test_df['Label_binary']
X_test = test_df.drop(['Label', 'Label_binary'], axis='columns')

In [40]:
import joblib

# Load the trained model back from disk
clf = joblib.load('RandomForest.pkl')

# Run the model on the test features
y_pred = clf.predict(X_test)

In [41]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

# Score the predictions made on the separate test file.
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

# One print call per section; sep="\n" reproduces the line-per-metric layout.
print(
    f"Accuracy:  {acc:.4f}",
    f"Precision: {prec:.4f}",
    f"Recall:    {rec:.4f}",
    f"F1-score:  {f1:.4f}",
    sep="\n",
)
print("\nConfusion Matrix:\n", cm)
print(
    "\nClassification Report:\n",
    classification_report(y_test, y_pred, digits=4),
    sep="\n",
)

Accuracy:  0.9989
Precision: 0.9972
Recall:    0.9970
F1-score:  0.9971

Confusion Matrix:
 [[454123    311]
 [   331 111384]]

Classification Report:

              precision    recall  f1-score   support

           0     0.9993    0.9993    0.9993    454434
           1     0.9972    0.9970    0.9971    111715

    accuracy                         0.9989    566149
   macro avg     0.9982    0.9982    0.9982    566149
weighted avg     0.9989    0.9989    0.9989    566149



**현재 실험의 의미**
- 학습/튜닝에 사용하지 않은 완전히 새로운 데이터(test_data.csv)에서 평가한 결과임
- 데이터 분할도 랜덤하게 충분히 혼합된 상태에서 진행
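- 데이터 양은 충분함(학습 약 226만 건, 테스트 약 57만 건). 이진 라벨 기준으로는 정상:공격 비율이 약 80:20으로 불균형이 심하지 않으나, 개별 공격 유형(예: Heartbleed 9건, Infiltration 31건)은 매우 희소함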