# Data Preprocessing

## Import Library

In [99]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

## 데이터 로드

In [100]:
path = '../../shared-data/merged-traffic.csv'
filename = os.path.basename(path)
df = pd.read_csv(path)

## 극소수 클래스 제거 (전체 비중 0.1% 미만)

In [101]:
# 컬럼명 공백 제거
df.columns = df.columns.str.strip()
df['Label'].value_counts()

Label
BENIGN                        1454613
DoS Hulk                       231073
PortScan                       158930
DDoS                           128027
DoS GoldenEye                   10293
FTP-Patator                      7938
SSH-Patator                      5897
DoS slowloris                    5796
DoS Slowhttptest                 5499
Bot                              1966
Web Attack � Brute Force         1507
Web Attack � XSS                  652
Web Attack � Sql Injection         21
Heartbleed                         11
Name: count, dtype: int64

In [102]:
total_rows = len(df)
label_counts = df['Label'].value_counts()

threshold = total_rows * 0.001  # 기준: 전체의 0.1% 미만인 클래스
rare_labels = label_counts[label_counts < threshold].index.tolist()

print(f"[INFO] Rare classes to be removed (count < {threshold:.0f}):")
for label in rare_labels:
    print(f"- {label}: {label_counts[label]} rows")

df = df[~df['Label'].isin(rare_labels)].reset_index(drop=True)

[INFO] Rare classes to be removed (count < 2012):
- Bot: 1966 rows
- Web Attack � Brute Force: 1507 rows
- Web Attack � XSS: 652 rows
- Web Attack � Sql Injection: 21 rows
- Heartbleed: 11 rows


In [103]:
df['Label'].value_counts()

Label
BENIGN              1454613
DoS Hulk             231073
PortScan             158930
DDoS                 128027
DoS GoldenEye         10293
FTP-Patator            7938
SSH-Patator            5897
DoS slowloris          5796
DoS Slowhttptest       5499
Name: count, dtype: int64

## Binary 라벨링 (악성 트래픽: 1, 정상: 0)

In [104]:
df['Label_binary'] = df['Label'].apply(lambda x: 0 if x == 'BENIGN' else 1)
df['Label_binary'].value_counts()

Label_binary
0    1454613
1     553453
Name: count, dtype: int64

## 불필요한 열 제거
- 식별자(Identifiers) 또는 고유값
- 목표값과 직접적인 관련이 있는 열 (Leakage Feature)
- 상수 값 (Constant Features)
- 결측치가 너무 많은 열
- 중복 정보 or 높은 상관관계를 가지는 열
- 텍스트, 로그, 비정형 필드

### 상수값 데이터 제거

In [105]:
low_variance_cols = [col for col in df.columns if df[col].nunique() <= 1]
df = df.drop(columns=low_variance_cols)
print(f'removed_low_variance_cols: {low_variance_cols}')

removed_low_variance_cols: ['Bwd PSH Flags', 'Fwd URG Flags', 'Bwd URG Flags', 'CWE Flag Count', 'Fwd Avg Bytes/Bulk', 'Fwd Avg Packets/Bulk', 'Fwd Avg Bulk Rate', 'Bwd Avg Bytes/Bulk', 'Bwd Avg Packets/Bulk', 'Bwd Avg Bulk Rate']


### 결측 비율이 90% 이상인 데이터 제거

In [106]:
missing_cols = df.columns[df.isnull().mean() > 0.9]
df = df.drop(columns=missing_cols)
print(f'removed_missing_cols: {missing_cols}')

removed_missing_cols: Index([], dtype='object')


### 목표값과 직접적인 관련이 있는 데이터 제거

In [107]:
leakage_cols = ['Label']
df = df.drop(columns=leakage_cols)
print(f'removed_leakage_cols: {leakage_cols}')

removed_leakage_cols: ['Label']


### 높은 상관관계 데이터 제거 (상관계수 0.95 이상)

**상관계수 0.95 이상 쌍 확인**

In [108]:
# 수치형 데이터만 추출
df_numeric = df.select_dtypes(include=[np.number])

# 상관계수 행렬 계산 (절댓값 기준)
corr_matrix = df_numeric.corr().abs()

# 상삼각 행렬 추출 (중복 제거용)
upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# 상관계수 0.95 이상인 쌍 추출
high_corr_pairs = []

for col in upper_tri.columns:
    for row in upper_tri.index:
        corr_value = upper_tri.loc[row, col]
        if pd.notnull(corr_value) and corr_value >= 0.95:
            high_corr_pairs.append((row, col, round(corr_value, 4)))

# 내림차순 정렬
high_corr_pairs.sort(key=lambda x: x[2], reverse=True)

# 결과 출력
for a, b, corr in high_corr_pairs:
    print(f"{a:30} <--> {b:30} : {corr}")

Fwd PSH Flags                  <--> SYN Flag Count                 : 1.0
Fwd Packet Length Mean         <--> Avg Fwd Segment Size           : 1.0
Bwd Packet Length Mean         <--> Avg Bwd Segment Size           : 1.0
Fwd Header Length              <--> Fwd Header Length.1            : 1.0
Total Fwd Packets              <--> Subflow Fwd Packets            : 1.0
Total Length of Fwd Packets    <--> Subflow Fwd Bytes              : 1.0
Total Backward Packets         <--> Subflow Bwd Packets            : 1.0
Total Length of Bwd Packets    <--> Subflow Bwd Bytes              : 1.0
Total Fwd Packets              <--> Total Backward Packets         : 0.9991
Total Backward Packets         <--> Subflow Fwd Packets            : 0.9991
Total Fwd Packets              <--> Subflow Bwd Packets            : 0.9991
Subflow Fwd Packets            <--> Subflow Bwd Packets            : 0.9991
Flow Duration                  <--> Fwd IAT Total                  : 0.9988
Flow IAT Max                   <--> 

**제거할 feature 추출**

In [109]:
features_to_drop = set()
features_to_keep = set()
keywords = ['.1', 'Subflow', 'Avg', 'Segment', 'Bytes', 'Bulk']
already_dropped = set()

for col in upper_tri.columns:
    for row in upper_tri.index:
        corr_value = upper_tri.loc[row, col]
        if pd.notnull(corr_value) and corr_value >= 0.95:
            # 이미 제거된 컬럼은 무시
            if row in features_to_drop or col in features_to_drop:
                continue
            
            # 1. 파생/복제 이름 기준
            if any(kw in row for kw in keywords) and not any(kw in col for kw in keywords):
                drop_feature = row
                keep_feature = col
            elif any(kw in col for kw in keywords) and not any(kw in row for kw in keywords):
                drop_feature = col
                keep_feature = row
            
            # 2. 결측치가 많은 쪽 제거
            elif df[row].isnull().sum() > df[col].isnull().sum():
                drop_feature = row
                keep_feature = col
            elif df[col].isnull().sum() > df[row].isnull().sum():
                drop_feature = col
                keep_feature = row
            
            # 3. 고유값이 적은 쪽 제거
            elif df[row].nunique() < df[col].nunique():
                drop_feature = row
                keep_feature = col
            elif df[col].nunique() < df[row].nunique():
                drop_feature = col
                keep_feature = row
            
            # 4. 나머지는 col 제거(순서상 일관성 위해)
            else:
                drop_feature = col
                keep_feature = row
            features_to_drop.add(drop_feature)
            features_to_keep.add(keep_feature)

df = df.drop(columns=list(features_to_drop))
print(f'removed_high_correlated_cols: {features_to_drop}')

removed_high_correlated_cols: {'Fwd Packet Length Max', 'Total Backward Packets', 'Fwd Packets/s', 'Total Fwd Packets', 'Average Packet Size', 'Subflow Fwd Packets', 'Idle Min', 'Subflow Bwd Packets', 'Fwd IAT Total', 'Idle Mean', 'Fwd Header Length.1', 'Avg Bwd Segment Size', 'Idle Max', 'Subflow Fwd Bytes', 'Bwd Packet Length Max', 'SYN Flag Count', 'Subflow Bwd Bytes', 'Max Packet Length', 'Avg Fwd Segment Size', 'Fwd IAT Max', 'ECE Flag Count'}


### 남아있는 필드명 확인

In [110]:
columns_list = df.columns.tolist()
columns_list

['Destination Port',
 'Flow Duration',
 'Total Length of Fwd Packets',
 'Total Length of Bwd Packets',
 'Fwd Packet Length Min',
 'Fwd Packet Length Mean',
 'Fwd Packet Length Std',
 'Bwd Packet Length Min',
 'Bwd Packet Length Mean',
 'Bwd Packet Length Std',
 'Flow Bytes/s',
 'Flow Packets/s',
 'Flow IAT Mean',
 'Flow IAT Std',
 'Flow IAT Max',
 'Flow IAT Min',
 'Fwd IAT Mean',
 'Fwd IAT Std',
 'Fwd IAT Min',
 'Bwd IAT Total',
 'Bwd IAT Mean',
 'Bwd IAT Std',
 'Bwd IAT Max',
 'Bwd IAT Min',
 'Fwd PSH Flags',
 'Fwd Header Length',
 'Bwd Header Length',
 'Bwd Packets/s',
 'Min Packet Length',
 'Packet Length Mean',
 'Packet Length Std',
 'Packet Length Variance',
 'FIN Flag Count',
 'RST Flag Count',
 'PSH Flag Count',
 'ACK Flag Count',
 'URG Flag Count',
 'Down/Up Ratio',
 'Init_Win_bytes_forward',
 'Init_Win_bytes_backward',
 'act_data_pkt_fwd',
 'min_seg_size_forward',
 'Active Mean',
 'Active Std',
 'Active Max',
 'Active Min',
 'Idle Std',
 'Label_binary']

## 결측치 처리
- 수치형: 평균
- 범주형: 최빈값

In [111]:
df = df.replace([np.inf, -np.inf], np.nan)  # inf 값을 NaN으로 변환

for col in df.columns:
    if df[col].dtype == 'object':
        df[col].fillna(df[col].mode()[0], inplace=True)
    else:
        df[col].fillna(df[col].mean(), inplace=True)

print(df.isnull().sum().sum())  # 전체 NaN 개수
print(np.isinf(df.select_dtypes(include=[np.number])).sum().sum())  # 전체 inf 개수

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mean(), inplace=True)


0
0


## 이상치 처리

In [112]:
# 수치형 컬럼만 선택 (정규화/라벨 컬럼 등은 제외)
numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns
numerical_cols = [col for col in numerical_cols if col not in ['Label', 'Label_binary']]

# IQR 계산 및 클리핑
Q1 = df[numerical_cols].quantile(0.25)
Q3 = df[numerical_cols].quantile(0.75)
IQR = Q3 - Q1

# 각 feature별로 lower, upper 구해서 클리핑
for col in numerical_cols:
    lower = Q1[col] - 1.5 * IQR[col]
    upper = Q3[col] + 1.5 * IQR[col]
    df[col] = df[col].clip(lower=lower, upper=upper)

# 이상치 처리 결과 확인
df[numerical_cols].describe()

Unnamed: 0,Destination Port,Flow Duration,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Min,Bwd Packet Length Mean,Bwd Packet Length Std,...,Down/Up Ratio,Init_Win_bytes_forward,Init_Win_bytes_backward,act_data_pkt_fwd,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Std
count,2008066.0,2008066.0,2008066.0,2008066.0,2008066.0,2008066.0,2008066.0,2008066.0,2008066.0,2008066.0,...,2008066.0,2008066.0,2008066.0,2008066.0,2008066.0,2008066.0,2008066.0,2008066.0,2008066.0,2008066.0
mean,333.1657,3134571.0,173.4911,603.1749,15.46568,35.12839,25.60706,36.32248,157.5549,134.4154,...,0.6732279,4435.611,120.0213,1.4709,26.12816,0.0,0.0,0.0,0.0,0.0
std,386.5595,5187479.0,242.4486,876.9449,21.16086,33.25743,41.63901,54.70584,197.7911,227.6242,...,0.5538132,7438.205,183.9667,1.676465,6.697132,0.0,0.0,0.0,0.0,0.0
min,0.0,-13.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,-1.0,-1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0
25%,53.0,164.0,6.0,6.0,0.0,6.0,0.0,0.0,5.5,0.0,...,0.0,-1.0,-1.0,0.0,20.0,0.0,0.0,0.0,0.0,0.0
50%,80.0,37688.5,62.0,126.0,0.0,34.0,0.0,0.0,74.0,0.0,...,1.0,251.0,0.0,1.0,20.0,0.0,0.0,0.0,0.0,0.0
75%,443.0,5245993.0,293.0,861.0,35.0,50.0,40.36998,70.0,214.75,217.3724,...,1.0,8192.0,235.0,2.0,32.0,0.0,0.0,0.0,0.0,0.0
max,1028.0,13114740.0,723.5,2143.5,87.5,116.0,100.925,175.0,528.625,543.4309,...,2.5,20481.5,589.0,5.0,50.0,0.0,0.0,0.0,0.0,0.0


## 수치형 정규화
- 수치형 데이터를 0~1 범위로 스케일링

In [113]:
# 수치형 컬럼 추출 (라벨/정답 컬럼은 제외)
numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns
numerical_cols = [col for col in numerical_cols if col not in ['Label', 'Label_binary']]

# 정규화 객체 생성 및 fit_transform
scaler = MinMaxScaler()
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

# 정규화 결과 확인
df[numerical_cols].head()

Unnamed: 0,Destination Port,Flow Duration,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Min,Bwd Packet Length Mean,Bwd Packet Length Std,...,Down/Up Ratio,Init_Win_bytes_forward,Init_Win_bytes_backward,act_data_pkt_fwd,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Std
0,1.0,1.0,1.0,0.537439,0.0,1.0,1.0,0.411429,0.136202,0.0,...,0.0,0.018455,1.0,1.0,0.625,0.0,0.0,0.0,0.0,0.0
1,0.378405,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.597777,0.0,...,0.0,0.046674,1.0,1.0,0.625,0.0,0.0,0.0,0.0,0.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.007636,0.851417,0.0,0.32,0.241379,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.625,0.0,0.0,0.0,0.0,0.0
4,0.0,0.004176,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## 결과 저장

In [None]:
save_path = f'../data/preprocessed-{filename}'
df.to_csv(save_path, index=False)