### 0.Import

In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### 1. combine mon_features.pkl & unmon_features.pkl into features_df

In [3]:
import pandas as pd
import numpy as np
import pickle

MON_FILE_PATH = '/content/drive/MyDrive/기학/모델 학습/data/mon_features.pkl'
UNMON_FILE_PATH = '/content/drive/MyDrive/기학/모델 학습/data/unmon_features.pkl'

LABEL_COLUMN = ['website_label', 'monitored_label']

FEATURES_VER2 = [
    'num_outgoing_packets', 'incoming_packet_ratio',
    'outgoing_order_skew', 'outgoing_first_30',
    'avg_outgoing_order_first_30', 'avg_incoming_burst_size',
    'cumul_packets_10pct', 'cumul_packets_30pct',
    'cumul_max'
]

mon_features_df = pd.read_pickle(MON_FILE_PATH)
unmon_features_df = pd.read_pickle(UNMON_FILE_PATH)

features_df = pd.concat([mon_features_df, unmon_features_df], ignore_index=True)

X = features_df[FEATURES_VER2]
y = features_df[LABEL_COLUMN[0]]


# replace unmonitored label(-1) into 95
y = y.replace({-1: 95})

print(X)
print(y)

       num_outgoing_packets  incoming_packet_ratio  outgoing_order_skew  \
0                     121.0               0.914849            -0.257072   
1                      80.0               0.845560             0.153926   
2                     118.0               0.913108            -0.463423   
3                     122.0               0.915629            -0.391122   
4                     115.0               0.918208            -0.355596   
...                     ...                    ...                  ...   
28995                 413.0               0.910081            -0.272106   
28996                 447.0               0.912524            -0.550236   
28997                  59.0               0.836565            -0.028954   
28998                  96.0               0.811395            -0.703690   
28999                 322.0               0.967768             0.045687   

       outgoing_first_30  avg_outgoing_order_first_30  \
0               0.300000                  

In [4]:
# 로그 변환
log_cols = []

for col in X.columns:
    # 숫자형이고 & 음수가 하나도 없는 경우만 로그 변환 리스트에 추가
    if pd.api.types.is_numeric_dtype(X[col]) and (X[col] < 0).sum() == 0:
        log_cols.append(col)

# 리스트에 있는 컬럼들 로그 변환
if log_cols:
    X[log_cols] = np.log1p(X[log_cols])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[log_cols] = np.log1p(X[log_cols])


### 2. model [K-NN]

#### default 학습

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [6]:
# Scaling 추가
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [7]:
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)

In [8]:
y_pred = knn.predict(X_test)
y_pred_proba = knn.predict_proba(X_test)

In [9]:
acc = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba, multi_class="ovr")
report = classification_report(y_test, y_pred)

print("=== K-NN 기본 성능 ===")
print("Accuracy:", acc)
print("ROC-AUC:", roc_auc)
print("\nClassification Report:\n", report)

=== K-NN 기본 성능 ===
Accuracy: 0.7191379310344828
ROC-AUC: 0.9165308958390442

Classification Report:
               precision    recall  f1-score   support

           0       0.49      0.50      0.49        40
           1       0.56      0.80      0.66        40
           2       0.59      0.75      0.66        40
           3       0.67      0.80      0.73        40
           4       0.51      0.60      0.55        40
           5       0.63      0.60      0.62        40
           6       0.58      0.82      0.68        40
           7       0.55      0.53      0.54        40
           8       0.47      0.53      0.49        40
           9       0.40      0.53      0.46        40
          10       0.55      0.68      0.61        40
          11       0.56      0.62      0.59        40
          12       0.65      0.78      0.70        40
          13       0.32      0.30      0.31        40
          14       0.52      0.65      0.58        40
          15       0.54      0.65 

#### 하이퍼 파라미터 튜닝 후 학습(GridSearchCV 이용)

In [10]:
param_grid = {
    'n_neighbors': [3, 5, 7, 9, 11, 15, 21],
    'weights': ['uniform', 'distance'],
    'metric': ['minkowski', 'manhattan', 'euclidean']
}

grid = GridSearchCV(knn, param_grid, cv=5, scoring='f1_macro')
grid.fit(X_train, y_train)

print(grid.best_params_)
print(grid.best_estimator_)

grid_pred = grid.predict(X_test)
grid_pred_proba = grid.predict_proba(X_test)

{'metric': 'manhattan', 'n_neighbors': 3, 'weights': 'distance'}
KNeighborsClassifier(metric='manhattan', n_neighbors=3, weights='distance')


In [11]:
acc = accuracy_score(y_test, grid_pred)
roc_auc = roc_auc_score(y_test, grid_pred_proba, multi_class="ovr")
report = classification_report(y_test, grid_pred)

print("=== K-NN 튜닝 후 성능 ===")
print("Accuracy:", acc)
print("ROC-AUC:", roc_auc)
print("\nClassification Report:\n", report)

=== K-NN 튜닝 후 성능 ===
Accuracy: 0.7872413793103449
ROC-AUC: 0.9166618708405672

Classification Report:
               precision    recall  f1-score   support

           0       0.62      0.57      0.60        40
           1       0.72      0.85      0.78        40
           2       0.74      0.88      0.80        40
           3       0.74      0.88      0.80        40
           4       0.66      0.78      0.71        40
           5       0.74      0.72      0.73        40
           6       0.72      0.90      0.80        40
           7       0.65      0.55      0.59        40
           8       0.65      0.55      0.59        40
           9       0.60      0.75      0.67        40
          10       0.71      0.68      0.69        40
          11       0.59      0.65      0.62        40
          12       0.77      0.82      0.80        40
          13       0.46      0.45      0.46        40
          14       0.71      0.72      0.72        40
          15       0.67      0.7

#### default 학습

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [13]:
# Scaling 추가
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [14]:
clf = LogisticRegression(multi_class="multinomial")
clf.fit(X_train, y_train)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [15]:
y_pred = clf.predict(X_test)
y_pred_proba = clf.predict_proba(X_test)

In [16]:
acc = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba, multi_class="ovr")
report = classification_report(y_test, y_pred)

print("=== Logistic Regression 기본 성능 ===")
print("Accuracy:", acc)
print("ROC-AUC:", roc_auc)
print("\nClassification Report:\n", report)

=== Logistic Regression 기본 성능 ===
Accuracy: 0.476551724137931
ROC-AUC: 0.9245655707846003

Classification Report:
               precision    recall  f1-score   support

           0       0.08      0.03      0.04        40
           1       0.42      0.12      0.19        40
           2       0.00      0.00      0.00        40
           3       0.00      0.00      0.00        40
           4       0.00      0.00      0.00        40
           5       0.27      0.30      0.29        40
           6       0.32      0.75      0.44        40
           7       0.60      0.15      0.24        40
           8       0.50      0.33      0.39        40
           9       0.08      0.05      0.06        40
          10       0.67      0.30      0.41        40
          11       0.46      0.28      0.34        40
          12       0.66      0.68      0.67        40
          13       0.00      0.00      0.00        40
          14       0.00      0.00      0.00        40
          15       1

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
param_grid = {
    'C': [0.01, 0.1, 1, 10],
    'penalty': ['l2'],
    'solver': ['lbfgs', 'saga'],
    'max_iter': [300, 500]
}

grid = GridSearchCV(clf, param_grid, cv=5, scoring='f1_macro')
grid.fit(X_train, y_train)

print(grid.best_params_)
print(grid.best_estimator_)

grid_pred = grid.predict(X_test)
grid_pred_proba = grid.predict_proba(X_test)



In [None]:
acc = accuracy_score(y_test, grid_pred)
roc_auc = roc_auc_score(y_test, grid_pred_proba, multi_class="ovr")
report = classification_report(y_test, grid_pred)

print("=== Logistic Regression 튜닝 후 성능 ===")
print("Accuracy:", acc)
print("ROC-AUC:", roc_auc)
print("\nClassification Report:\n", report)