### 1. mon_features.pkl upload

In [None]:
import pandas as pd
import numpy as np
import pickle

# Path of the pickled feature table that this notebook reads.
FILE_PATH = '/content/mon_features.pkl'

# Label columns of interest; the first entry is the one used as the target below.
LABEL_COLUMN = [
    'website_label',
    'monitored_label',
]

# Feature columns fed to the model, one per line for easier diffing.
FEATURES_VER3 = [
    'total_transmission_time',
    'std_inter_packet_time',
    'avg_outgoing_burst_size',
    'avg_incoming_burst_size',
    'num_outgoing_packets',
    'incoming_packet_ratio',
    'outgoing_packet_ratio',
    'cumul_packets_10pct',
    'cumul_packets_30pct',
    'outgoing_order_skew',
    'incoming_order_skew',
    'cumul_max',
    'num_incoming_first_30',
    'outgoing_first_30',
    'avg_incoming_order_first_30',
    'avg_outgoing_order_first_30',
]

# NOTE(security): unpickling can execute arbitrary code stored in the file,
# so only load pickles that come from a trusted source.
features_df = pd.read_pickle(FILE_PATH)

# Feature matrix (selected columns only) and target vector (first label column).
X = features_df.loc[:, FEATURES_VER3]
y = features_df.loc[:, LABEL_COLUMN[0]]

print(X)
print(y)

       total_transmission_time  std_inter_packet_time  \
0                        10.14               0.041168   
1                        10.16               0.163930   
2                        11.11               0.066661   
3                        13.36               0.047809   
4                        10.64               0.038760   
...                        ...                    ...   
18995                    43.91               0.143962   
18996                    15.60               0.019465   
18997                    14.93               0.016411   
18998                    19.91               0.033281   
18999                    13.76               0.011074   

       avg_outgoing_burst_size  avg_incoming_burst_size  num_outgoing_packets  \
0                     1.551282                16.666667                 121.0   
1                     1.702128                 9.319149                  80.0   
2                     1.552632                16.315789                 

### 2. XGBOOST

#### (1) install

In [None]:
!pip install xgboost



In [None]:
import pandas as pd
import xgboost as xgb
import numpy as np
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, classification_report

#### (2) Split dataset

In [None]:
# Hold out 20% of the samples for testing. stratify=y keeps each class's share
# approximately the same in the train and test splits, so every label gets
# comparable support in the per-class evaluation. random_state fixes the
# shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

#### (3) XGBoost 다중분류 모델 정의

In [None]:
# Baseline multiclass XGBoost classifier with a fixed set of hyperparameters.
baseline_xgb_params = {
    # Learning task
    'objective': 'multi:softmax',
    'eval_metric': 'mlogloss',
    'num_class': len(np.unique(y)),  # number of distinct values in y
    # Ensemble size and tree shape
    'n_estimators': 300,
    'max_depth': 4,
    'learning_rate': 0.1,
    # Row / column subsampling per tree
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    # Fixed seed for reproducibility
    'random_state': 42,
}
model = xgb.XGBClassifier(**baseline_xgb_params)

#### (4) 모델 학습

In [None]:
# Train the baseline classifier on the training split.
model.fit(X=X_train, y=y_train)

#### (5) 예측

In [None]:
# Predicted labels for the held-out test split.
y_pred = model.predict(X=X_test)

#### (6) 평가

In [None]:
# Overall accuracy on the test split, followed by the per-class report.
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))
print("\nClassification Report:\n", classification_report(y_true=y_test, y_pred=y_pred))

Accuracy: 0.8692105263157894

Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.93      0.87        42
           1       0.92      0.81      0.86        42
           2       0.89      0.94      0.92        35
           3       0.93      0.90      0.91        29
           4       0.87      0.87      0.87        39
           5       0.98      0.93      0.95        45
           6       0.89      0.95      0.92        44
           7       0.76      0.81      0.78        36
           8       0.92      0.68      0.78        34
           9       0.72      0.84      0.78        31
          10       0.91      0.89      0.90        47
          11       0.91      0.91      0.91        35
          12       0.89      0.93      0.91        42
          13       0.76      0.80      0.78        40
          14       0.89      0.89      0.89        36
          15       0.81      0.86      0.83        35
          16       0.89    

#### (7) 하이퍼파라미터 튜닝

In [None]:
# Fresh, unfitted estimator for the search (this rebinds `model`); the grid
# below supplies the remaining hyperparameters.
model = xgb.XGBClassifier(
    objective='multi:softmax',
    num_class=len(np.unique(y)),
    eval_metric='mlogloss',
    random_state=42,
)

# Two candidate values for each of five hyperparameters -> 32 combinations.
xgb_param_grid = dict(
    n_estimators=[200, 300],
    max_depth=[4, 5],
    learning_rate=[0.01, 0.1],
    subsample=[0.8, 1.0],
    colsample_bytree=[0.8, 1.0],
)

# Exhaustive search over the grid, scored by accuracy with 3-fold
# cross-validation on the training split, using all available cores.
xgb_grid = GridSearchCV(
    estimator=model,
    param_grid=xgb_param_grid,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    verbose=1,
)
xgb_grid.fit(X_train, y_train)

# Best hyperparameter combination and its mean cross-validated accuracy.
print("XGBoost Best Params (Multiclass):", xgb_grid.best_params_)
print("XGBoost Best Accuracy (CV):", xgb_grid.best_score_)

# Score the search object's predictions on the held-out test split.
y_pred_xgb = xgb_grid.predict(X_test)
print("XGBoost Test Accuracy:", accuracy_score(y_test, y_pred_xgb))

Fitting 3 folds for each of 32 candidates, totalling 96 fits
XGBoost Best Params (Multiclass): {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 300, 'subsample': 1.0}
XGBoost Best Accuracy (CV): 0.8476975965151586
XGBoost Test Accuracy: 0.8707894736842106
