### 1. Combine `mon_features.pkl` & `unmon_features.pkl` into `features_df`

In [None]:
import pandas as pd
import numpy as np
import pickle

# Input pickles holding the monitored / unmonitored feature tables.
MON_FILE_PATH = '/content/mon_features.pkl'
UNMON_FILE_PATH = '/content/unmon_features.pkl'

# Label columns stored next to the features; index 1 is the target used below.
LABEL_COLUMN = ['website_label', 'monitored_label']

# Feature columns selected as model input.
FEATURES_VER3 = [
    'total_transmission_time', 'std_inter_packet_time',
    'avg_outgoing_burst_size', 'avg_incoming_burst_size',
    'num_outgoing_packets', 'incoming_packet_ratio',
    'outgoing_packet_ratio', 'cumul_packets_10pct',
    'cumul_packets_30pct', 'outgoing_order_skew',
    'incoming_order_skew', 'cumul_max',
    'num_incoming_first_30', 'outgoing_first_30',
    'avg_incoming_order_first_30', 'avg_outgoing_order_first_30'
]

# NOTE(security): pd.read_pickle can execute arbitrary code while unpickling,
# so only load files that come from a trusted source.
mon_features_df = pd.read_pickle(MON_FILE_PATH)
unmon_features_df = pd.read_pickle(UNMON_FILE_PATH)

# Stack both tables row-wise and renumber the index from 0.
features_df = pd.concat([mon_features_df, unmon_features_df], ignore_index=True)

X = features_df[FEATURES_VER3]
y = features_df[LABEL_COLUMN[1]]

# Recode the target to {0, 1}: -1 becomes 0, 1 stays 1.
# (presumably -1 marks unmonitored samples -- TODO confirm against the
# feature-extraction step)
y = y.replace({-1: 0, 1: 1})

# `replace` passes unmapped values (and NaN) through untouched, so stop here
# instead of silently training on labels that are not binary.
unexpected_labels = y[~y.isin([0, 1])].unique()
if len(unexpected_labels) > 0:
    raise ValueError(
        f"Unexpected values in {LABEL_COLUMN[1]!r} after recoding to 0/1: "
        f"{list(unexpected_labels)}"
    )

print(X)
print(y)

       total_transmission_time  std_inter_packet_time  \
0                        10.14               0.041168   
1                        10.16               0.163930   
2                        11.11               0.066661   
3                        13.36               0.047809   
4                        10.64               0.038760   
...                        ...                    ...   
28995                    32.09               0.163669   
28996                    38.62               0.114350   
28997                    34.93               1.331199   
28998                    11.84               0.083521   
28999                     9.62               0.026874   

       avg_outgoing_burst_size  avg_incoming_burst_size  num_outgoing_packets  \
0                     1.551282                16.666667                 121.0   
1                     1.702128                 9.319149                  80.0   
2                     1.552632                16.315789                 

### 2. LightGBM

#### (1) Install

In [None]:
pip install lightgbm



In [None]:
import lightgbm as lgb
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from sklearn.model_selection import GridSearchCV, train_test_split

#### (2) Split dataset

In [None]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# ratio the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

#### (3) LightGBM 모델 정의

In [None]:
# Gradient-boosted tree classifier for the binary target `y`.
model = lgb.LGBMClassifier(
    objective="binary",
    boosting_type="gbdt",
    n_estimators=400,
    learning_rate=0.03,
    num_leaves=30,
    subsample=0.8,
    # LightGBM ignores `subsample` unless bagging is switched on with a
    # positive `subsample_freq`; 1 resamples the rows at every iteration.
    subsample_freq=1,
    colsample_bytree=0.8,
    random_state=42
)

#### (4) 모델 학습

In [None]:
# Train the classifier on the training split.
model.fit(X=X_train, y=y_train)

[LightGBM] [Info] Number of positive: 15200, number of negative: 8000
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009007 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3385
[LightGBM] [Info] Number of data points in the train set: 23200, number of used features: 16
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.655172 -> initscore=0.641854
[LightGBM] [Info] Start training from score 0.641854


#### (5) 예측

In [None]:
# Hard class predictions for the test split.
y_pred = model.predict(X_test)

# predict_proba returns one column per class; column 1 is the probability of
# the positive class.
test_class_proba = model.predict_proba(X_test)
y_proba = test_class_proba[:, 1]

#### (6) 평가

In [None]:
# Score the hard predictions against the held-out labels.
test_accuracy = accuracy_score(y_test, y_pred)
test_report = classification_report(y_test, y_pred)

print("Accuracy:", test_accuracy)
print("\nClassification Report:\n", test_report)

Accuracy: 0.978103448275862

Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.97      0.97      2000
           1       0.98      0.98      0.98      3800

    accuracy                           0.98      5800
   macro avg       0.98      0.98      0.98      5800
weighted avg       0.98      0.98      0.98      5800



#### (7) 하이퍼파라미터 튜닝

In [None]:
# Base estimator for the search. `subsample_freq=1` turns bagging on so the
# `subsample` values in the grid take effect; with LightGBM's default
# (`subsample_freq=0`) row subsampling is disabled and every `subsample`
# setting would train the same model.
model = lgb.LGBMClassifier(random_state=42, subsample_freq=1)

# Candidate values tried for each hyperparameter.
lgbm_param_grid = {
    'n_estimators': [300, 400],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.03],
    'num_leaves': [30, 50],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

# Exhaustive search over the grid, scored by accuracy with 3-fold
# cross-validation on the training split, using all available cores.
lgbm_grid = GridSearchCV(estimator=model, param_grid=lgbm_param_grid,
                         scoring='accuracy', cv=3, verbose=1, n_jobs=-1)

lgbm_grid.fit(X_train, y_train)

print("LightGBM Best Params:", lgbm_grid.best_params_)
print("LightGBM Best Accuracy:", lgbm_grid.best_score_)

# Check performance on the held-out test data; with GridSearchCV's default
# `refit=True`, predict() uses the best estimator refit on the training split.
y_pred_lgbm = lgbm_grid.predict(X_test)
print("LightGBM Test Accuracy:", accuracy_score(y_test, y_pred_lgbm))

Fitting 3 folds for each of 144 candidates, totalling 432 fits
[LightGBM] [Info] Number of positive: 15200, number of negative: 8000
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001108 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3385
[LightGBM] [Info] Number of data points in the train set: 23200, number of used features: 16
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.655172 -> initscore=0.641854
[LightGBM] [Info] Start training from score 0.641854
LightGBM Best Params: {'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 400, 'num_leaves': 30, 'subsample': 0.8}
LightGBM Best Accuracy: 0.9827155636228357
LightGBM Test Accuracy: 0.9810344827586207
