### 1. Load mon_features.pkl


In [2]:
import pandas as pd
import numpy as np
import pickle

# Path to the pickled feature table, relative to the notebook's working directory.
FILE_PATH = './content/mon_features.pkl'
# Label columns in the table; index 0 is the one used as the target below.
LABEL_COLUMN = ['website_label', 'monitored_label']

# Columns selected as model inputs.
FEATURES_VER3 = [
    'total_transmission_time',
    'std_inter_packet_time',
    'avg_outgoing_burst_size',
    'avg_incoming_burst_size',
    'num_outgoing_packets',
    'incoming_packet_ratio',
    'outgoing_packet_ratio',
    'cumul_packets_10pct',
    'cumul_packets_30pct',
    'outgoing_order_skew',
    'incoming_order_skew',
    'cumul_max',
    'num_incoming_first_30',
    'outgoing_first_30',
    'avg_incoming_order_first_30',
    'avg_outgoing_order_first_30',
]

# NOTE(review): unpickling can execute arbitrary code -- only load files from a trusted source.
features_df = pd.read_pickle(FILE_PATH)

# Feature matrix (selected columns only) and target vector (first label column).
X = features_df[FEATURES_VER3]
y = features_df[LABEL_COLUMN[0]]

# Show both objects, one after the other.
print(X, y, sep='\n')


       total_transmission_time  std_inter_packet_time  \
0                        10.14               0.041168   
1                        10.16               0.163930   
2                        11.11               0.066661   
3                        13.36               0.047809   
4                        10.64               0.038760   
...                        ...                    ...   
18995                    43.91               0.143962   
18996                    15.60               0.019465   
18997                    14.93               0.016411   
18998                    19.91               0.033281   
18999                    13.76               0.011074   

       avg_outgoing_burst_size  avg_incoming_burst_size  num_outgoing_packets  \
0                     1.551282                16.666667                 121.0   
1                     1.702128                 9.319149                  80.0   
2                     1.552632                16.315789                 

### 2. Random Forest


Split the dataset into training and testing sets


In [3]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,   # hold out a quarter of the rows for evaluation
    random_state=42,  # fixed seed so the split is repeatable
)


Train and test Random Forest


In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Baseline Random Forest: fixed seed, class-balanced weights, all available cores.
rf_baseline_params = {
    'n_estimators': 100,
    'random_state': 42,
    'class_weight': 'balanced',
    'n_jobs': -1,
}

# fit() returns the estimator itself, so construction and training can be chained.
clf_rf = RandomForestClassifier(**rf_baseline_params).fit(X_train, y_train)
y_pred_rf = clf_rf.predict(X_test)

# Score the held-out predictions before reporting them.
rf_accuracy = accuracy_score(y_test, y_pred_rf)
rf_report = classification_report(y_test, y_pred_rf)

print("============ Before Hyperparameter tuning ============")
print("RF Accuracy: {}".format(rf_accuracy))
print("Classification Report:\n", rf_report)


RF Accuracy: 0.8530526315789474
Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.82      0.82        51
           1       0.97      0.83      0.90        47
           2       0.90      0.96      0.93        48
           3       0.92      0.97      0.95        37
           4       0.97      0.80      0.88        44
           5       0.91      0.95      0.93        56
           6       0.86      0.98      0.92        58
           7       0.76      0.85      0.80        53
           8       0.90      0.76      0.82        49
           9       0.70      0.79      0.74        38
          10       0.92      0.80      0.85        55
          11       0.85      0.89      0.87        45
          12       0.91      0.91      0.91        55
          13       0.79      0.69      0.74        45
          14       0.79      0.81      0.80        52
          15       0.77      0.83      0.80        48
          16       0.86  

### 3. Hyperparameter tuning using Halving Grid Search


In [5]:
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV

# Candidate values explored for each Random Forest hyperparameter.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'max_features': ['sqrt', 'log2'],
}

# Successive halving evaluates candidates on random subsamples of the training
# rows in its early rounds (default resource is 'n_samples'). Seeding only the
# estimator is not enough: the search needs its own random_state, otherwise the
# subsamples -- and therefore best_params_ -- can change from run to run.
# NOTE(review): both the estimator and the search request n_jobs=-1; this may
# oversubscribe CPU cores -- consider n_jobs=1 on the estimator if runs are slow.
search = HalvingGridSearchCV(
    RandomForestClassifier(class_weight='balanced', random_state=42, n_jobs=-1),
    param_grid,
    cv=5,
    factor=3,
    random_state=42,
    n_jobs=-1,
)
search.fit(X_train, y_train)

# predict() uses the best candidate refit on the full training set.
y_pred_tuned = search.predict(X_test)

print("============ After Hyperparameter tuning ============")
print("Best Parameters:", search.best_params_)
print("RF Accuracy: {}".format(accuracy_score(y_test, y_pred_tuned)))
print("Classification Report:\n", classification_report(y_test, y_pred_tuned))


Best Parameters: {'max_depth': 20, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
RF Accuracy: 0.8526315789473684
Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.75      0.76        51
           1       0.95      0.83      0.89        47
           2       0.87      0.98      0.92        48
           3       0.95      0.95      0.95        37
           4       0.94      0.77      0.85        44
           5       0.91      0.93      0.92        56
           6       0.89      0.98      0.93        58
           7       0.82      0.85      0.83        53
           8       0.90      0.76      0.82        49
           9       0.70      0.74      0.72        38
          10       0.88      0.76      0.82        55
          11       0.87      0.87      0.87        45
          12       0.91      0.91      0.91        55
          13       0.85      0.73      0.79        45
      