### 0. Import

In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

### 1. combine mon_features.pkl & unmon_features.pkl into features_df

In [2]:
MON_FILE_PATH = '/content/mon_features.pkl'
UNMON_FILE_PATH = '/content/unmon_features.pkl'

LABEL_COLUMN = ['website_label', 'monitored_label']

FEATURES_VER1 = [
    'total_transmission_time', 'cumul_packets_10pct',
    'cumul_packets_30pct', 'cumul_max',
    'num_incoming_first_30', 'outgoing_first_30',
    'avg_incoming_order_first_30', 'avg_outgoing_order_first_30'
]

mon_features_df = pd.read_pickle(MON_FILE_PATH)
unmon_features_df = pd.read_pickle(UNMON_FILE_PATH)

features_df = pd.concat([mon_features_df, unmon_features_df], ignore_index=True)

X = features_df[FEATURES_VER1]
y = features_df[LABEL_COLUMN[1]]

y = y.replace({-1: 0, 1: 1})

print(X)
print(y)

       total_transmission_time  cumul_packets_10pct  cumul_packets_30pct  \
0                        10.14                 23.0                 55.0   
1                        10.16                  5.0                 49.0   
2                        11.11                  8.0                 34.0   
3                        13.36                 15.0                 57.0   
4                        10.64                 22.0                 53.0   
...                        ...                  ...                  ...   
28995                    32.09                 24.0               1941.0   
28996                    38.62                  9.0               1656.0   
28997                    34.93                240.0                357.0   
28998                    11.84                  7.0                 28.0   
28999                     9.62                  4.0                  8.0   

       cumul_max  num_incoming_first_30  outgoing_first_30  \
0            0.0         

In [3]:
# 로그 변환
log_cols = []

for col in X.columns:
    # 숫자형이고 & 음수가 하나도 없는 경우만 로그 변환 리스트에 추가
    if pd.api.types.is_numeric_dtype(X[col]) and (X[col] < 0).sum() == 0:
        log_cols.append(col)

# 리스트에 있는 컬럼들 로그 변환
if log_cols:
    X[log_cols] = np.log1p(X[log_cols])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[log_cols] = np.log1p(X[log_cols])


### 2. model [K-NN]

#### default 학습

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [5]:
# Scaling 추가
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [6]:
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)

In [7]:
y_pred = knn.predict(X_test)
y_pred_proba = knn.predict_proba(X_test)[:, 1]

In [8]:
acc = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)
report = classification_report(y_test, y_pred)

print("=== K-NN 기본 성능 ===")
print("Accuracy:", acc)
print("ROC-AUC:", roc_auc)
print("\nClassification Report:\n", report)

=== K-NN 기본 성능 ===
Accuracy: 0.9391379310344827
ROC-AUC: 0.9741491447368421

Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.89      0.91      2000
           1       0.94      0.96      0.95      3800

    accuracy                           0.94      5800
   macro avg       0.94      0.93      0.93      5800
weighted avg       0.94      0.94      0.94      5800



#### 하이퍼 파라미터 튜닝 후 학습(GridSearchCV 이용)

In [9]:
param_grid = {
    'n_neighbors': [3, 5, 7, 9, 11, 15, 21],
    'weights': ['uniform', 'distance'],
    'metric': ['minkowski', 'manhattan', 'euclidean']
}

grid = GridSearchCV(knn, param_grid, cv=5, scoring='f1_macro')
grid.fit(X_train, y_train)

print(grid.best_params_)
print(grid.best_estimator_)

grid_pred = grid.predict(X_test)
grid_pred_proba = grid.predict_proba(X_test)[:, 1]

{'metric': 'manhattan', 'n_neighbors': 3, 'weights': 'distance'}
KNeighborsClassifier(metric='manhattan', n_neighbors=3, weights='distance')


In [10]:
acc = accuracy_score(y_test, grid_pred)
roc_auc = roc_auc_score(y_test, grid_pred_proba)
report = classification_report(y_test, grid_pred)

print("=== K-NN 튜닝 후 성능 ===")
print("Accuracy:", acc)
print("ROC-AUC:", roc_auc)
print("\nClassification Report:\n", report)

=== K-NN 튜닝 후 성능 ===
Accuracy: 0.9503448275862069
ROC-AUC: 0.9744363815789473

Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.92      0.93      2000
           1       0.96      0.97      0.96      3800

    accuracy                           0.95      5800
   macro avg       0.95      0.94      0.94      5800
weighted avg       0.95      0.95      0.95      5800

