### 1. combine `mon_features.pkl` & `unmon_features.pkl` into `features_df`


In [3]:
import pandas as pd
import numpy as np
import pickle

MON_FILE_PATH = '../content/mon_features.pkl'
UNMON_FILE_PATH = '../content/unmon_features.pkl'

LABEL_COLUMN = ['website_label', 'monitored_label']

FEATURES_VER1 = [
    'total_transmission_time', 'cumul_packets_10pct',
    'cumul_packets_30pct', 'cumul_max',
    'num_incoming_first_30', 'outgoing_first_30',
    'avg_incoming_order_first_30', 'avg_outgoing_order_first_30'
]

mon_features_df = pd.read_pickle(MON_FILE_PATH)
unmon_features_df = pd.read_pickle(UNMON_FILE_PATH)

features_df = pd.concat([mon_features_df, unmon_features_df], ignore_index=True)

X = features_df[FEATURES_VER1]
y = features_df[LABEL_COLUMN[1]]

# Convert labels: -1 -> 0 (unmonitored), 1 -> 1 (monitored)
y = y.replace({-1: 0, 1: 1})

print(X)
print(y)


       total_transmission_time  cumul_packets_10pct  cumul_packets_30pct  \
0                        10.14                 23.0                 55.0   
1                        10.16                  5.0                 49.0   
2                        11.11                  8.0                 34.0   
3                        13.36                 15.0                 57.0   
4                        10.64                 22.0                 53.0   
...                        ...                  ...                  ...   
28995                    32.09                 24.0               1941.0   
28996                    38.62                  9.0               1656.0   
28997                    34.93                240.0                357.0   
28998                    11.84                  7.0                 28.0   
28999                     9.62                  4.0                  8.0   

       cumul_max  num_incoming_first_30  outgoing_first_30  \
0            0.0         

### 2. Random Forest


Split the dataset into training and testing sets


In [4]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)


Train and test Random Forest


In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

clf_rf = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced', n_jobs=-1)
clf_rf.fit(X_train, y_train)
y_pred_rf = clf_rf.predict(X_test)

print("============ Before Hyperparameter tuning ============")
print("RF Accuracy: {}".format(accuracy_score(y_test, y_pred_rf)))
print("Classification Report:\n", classification_report(y_test, y_pred_rf))


RF Accuracy: 0.9717241379310345
Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.95      0.96      2470
           1       0.98      0.98      0.98      4780

    accuracy                           0.97      7250
   macro avg       0.97      0.97      0.97      7250
weighted avg       0.97      0.97      0.97      7250



### 3. Hyperparameter tuning by using Grid Search


In [7]:
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV

param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'max_features': ['sqrt', 'log2']
}

search = HalvingGridSearchCV(RandomForestClassifier(class_weight='balanced', random_state=42, n_jobs=-1), 
                             param_grid, cv=5, factor=3, n_jobs=-1)
search.fit(X_train, y_train)
y_pred_tuned = search.predict(X_test)

print("============ After Hyperparameter tuning ============")
print("Best Parameters:", search.best_params_)
print("RF Accuracy: {}".format(accuracy_score(y_test, y_pred_tuned)))
print("Classification Report:\n", classification_report(y_test, y_pred_tuned))


Best Parameters: {'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}
RF Accuracy: 0.9710344827586207
Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.96      0.96      2470
           1       0.98      0.98      0.98      4780

    accuracy                           0.97      7250
   macro avg       0.97      0.97      0.97      7250
weighted avg       0.97      0.97      0.97      7250

