### 1. combine `mon_features.pkl` & `unmon_features.pkl` into `features_df`


In [1]:
import pandas as pd
import numpy as np
import pickle

MON_FILE_PATH = '../content/mon_features.pkl'
UNMON_FILE_PATH = '../content/unmon_features.pkl'

LABEL_COLUMN = ['website_label', 'monitored_label']

FEATURES_VER2 = [
    'num_outgoing_packets', 'incoming_packet_ratio',
    'outgoing_order_skew', 'outgoing_first_30',
    'avg_outgoing_order_first_30', 'avg_incoming_burst_size',
    'cumul_packets_10pct', 'cumul_packets_30pct',
    'cumul_max'
]

mon_features_df = pd.read_pickle(MON_FILE_PATH)
unmon_features_df = pd.read_pickle(UNMON_FILE_PATH)

features_df = pd.concat([mon_features_df, unmon_features_df], ignore_index=True)

X = features_df[FEATURES_VER2]
y = features_df[LABEL_COLUMN[0]]

print(X)
print(y)


       num_outgoing_packets  incoming_packet_ratio  outgoing_order_skew  \
0                     121.0               0.914849            -0.257072   
1                      80.0               0.845560             0.153926   
2                     118.0               0.913108            -0.463423   
3                     122.0               0.915629            -0.391122   
4                     115.0               0.918208            -0.355596   
...                     ...                    ...                  ...   
28995                 413.0               0.910081            -0.272106   
28996                 447.0               0.912524            -0.550236   
28997                  59.0               0.836565            -0.028954   
28998                  96.0               0.811395            -0.703690   
28999                 322.0               0.967768             0.045687   

       outgoing_first_30  avg_outgoing_order_first_30  \
0               0.300000                  

### 2. Random Forest


Split the dataset into training and testing sets


In [2]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)


Train and test Random Forest


In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

clf_rf = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced', n_jobs=-1)
clf_rf.fit(X_train, y_train)
y_pred_rf = clf_rf.predict(X_test)

print("============ Before Hyperparameter tuning ============")
print("RF Accuracy: {}".format(accuracy_score(y_test, y_pred_rf)))
print("Classification Report:\n", classification_report(y_test, y_pred_rf))


RF Accuracy: 0.8590344827586207
Classification Report:
               precision    recall  f1-score   support

          -1       0.84      0.99      0.91      2470
           0       0.94      0.71      0.81        42
           1       0.88      0.75      0.81        51
           2       1.00      0.72      0.84        47
           3       0.92      0.69      0.79        51
           4       0.90      0.84      0.87        43
           5       0.98      0.98      0.98        44
           6       0.86      0.96      0.91        53
           7       0.76      0.71      0.74        55
           8       0.79      0.66      0.72        47
           9       0.84      0.71      0.77        45
          10       0.87      0.75      0.80        44
          11       0.97      0.58      0.73        55
          12       0.90      0.85      0.87        52
          13       0.91      0.53      0.67        57
          14       0.81      0.79      0.80        58
          15       0.73  

### 3. Hyperparameter tuning by using Grid Search


In [4]:
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV

param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'max_features': ['sqrt', 'log2']
}

search = HalvingGridSearchCV(RandomForestClassifier(class_weight='balanced', random_state=42, n_jobs=-1), 
                             param_grid, cv=5, factor=3, n_jobs=-1)
search.fit(X_train, y_train)
y_pred_tuned = search.predict(X_test)

print("============ After Hyperparameter tuning ============")
print("Best Parameters:", search.best_params_)
print("RF Accuracy: {}".format(accuracy_score(y_test, y_pred_tuned)))
print("Classification Report:\n", classification_report(y_test, y_pred_tuned))


Best Parameters: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}
RF Accuracy: 0.876
Classification Report:
               precision    recall  f1-score   support

          -1       0.91      0.98      0.94      2470
           0       0.89      0.79      0.84        42
           1       0.87      0.80      0.84        51
           2       0.93      0.81      0.86        47
           3       0.93      0.75      0.83        51
           4       0.89      0.91      0.90        43
           5       0.95      0.95      0.95        44
           6       0.85      0.96      0.90        53
           7       0.75      0.80      0.77        55
           8       0.92      0.74      0.82        47
           9       0.83      0.76      0.79        45
          10       0.88      0.80      0.83        44
          11       0.91      0.71      0.80        55
          12       0.88      0.87      0.87        52
          13     