## 1. Data Load

In [9]:
import pickle
import numpy as np
import pandas as pd
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# load data
print("Loading data...")
with open("mon_standard.pkl", 'rb') as fi:
    mon_data = pickle.load(fi)

with open("unmon_standard10_3000.pkl", 'rb') as fi:
    unmon_data = pickle.load(fi)

Loading data...


## 2. Data Preprocess

In [10]:
# process Monitored data
def process_monitored(data, total_urls=950, use_sublabel=False, url_per_site=10):
    X1, X2, y = [], [], []
    for i in range(total_urls):
        label = i if use_sublabel else i // url_per_site
        for sample in data[i]:
            size_seq = []
            time_seq = []
            for c in sample:
                dr = 1 if c > 0 else -1
                time_seq.append(abs(c))
                size_seq.append(dr * 512)
            X1.append(time_seq)
            X2.append(size_seq)
            y.append(label)
    return X1, X2, y

# process Unmonitored data
def process_unmonitored(data, total_urls=10000):
    X1, X2 = [], []
    for i in range(total_urls):
        size_seq = []
        time_seq = []
        for c in data[i]:
            dr = 1 if c > 0 else -1
            time_seq.append(abs(c))
            size_seq.append(dr * 512)
        X1.append(time_seq)
        X2.append(size_seq)
    y = [-1] * len(X1)  # label Unmonitored data as -1
    return X1, X2, y

# process Monitored data
X1_mon, X2_mon, y_mon = process_monitored(
    mon_data, total_urls=950, use_sublabel=False, url_per_site=10
)

# process Unmonitored data
X1_unmon, X2_unmon, y_unmon = process_unmonitored(
    unmon_data, total_urls=3000
)

# concatenate data
X1 = X1_mon + X1_unmon
X2 = X2_mon + X2_unmon
y = y_mon + y_unmon

## 3. Feature Extraction

In [11]:
# create feautures
def create_features(X1, X2):
    X = []
    for i in range(len(X1)):
        packet_size_direction = sum(X2[i])
        cumulative_packet_size = np.sum([abs(c) for c in X2[i]])
        burst_lengths = len([c for c in X2[i] if c != 0])

        num_incoming_packets = len([c for c in X2[i] if c > 0])
        ratio_incoming_packets = num_incoming_packets / len(X2[i]) if len(X2[i]) > 0 else 0
        num_outgoing_packets = len([c for c in X2[i] if c < 0])
        total_packet_count = len(X2[i])

        feature_vector = [
            packet_size_direction,
            np.mean(X1[i]) if X1[i] else 0,
            cumulative_packet_size,
            burst_lengths,
            num_incoming_packets,
            ratio_incoming_packets,
            num_outgoing_packets,
            total_packet_count
        ]
        X.append(feature_vector)
    return np.array(X)

X = create_features(X1, X2)
y = np.array(y)

## 4. Data split and downsampling

In [12]:
# split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# take Monitored and Unmonitored data from Train data
train_data = pd.DataFrame({
    'features': list(X_train),
    'label': y_train
})
train_monitored = train_data[train_data['label'] != -1]
train_unmonitored = train_data[train_data['label'] == -1]

# downsample Monitored data
train_monitored_downsampled = resample(
    train_monitored,
    replace=True,
    n_samples=len(train_unmonitored),
    random_state=42
)

# re-concatenate Train data
balanced_train_data = pd.concat([train_monitored_downsampled, train_unmonitored])
X_train_balanced = np.array(balanced_train_data['features'].tolist())
y_train_balanced = balanced_train_data['label'].values

# scale data 
scaler = StandardScaler()
X_train_balanced = scaler.fit_transform(X_train_balanced)
X_test = scaler.transform(X_test)

## 5. Train the Random Forest model

In [13]:
from sklearn.metrics import f1_score
# train Random Forest model
rf_model = RandomForestClassifier(
    n_estimators=300, max_depth=30, random_state=42, n_jobs=-1
)
rf_model.fit(X_train_balanced, y_train_balanced)

# prediction and evaluate
y_pred = rf_model.predict(X_test)

y_pred = rf_model.predict(X_test)
print("==== Random Forest ====")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred, average='weighted'))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


==== Random Forest ====
Accuracy: 0.3104545454545454
F1 Score: 0.28640803161082506
Confusion Matrix:
 [[522   1   0 ...   0   0   0]
 [ 31   2   0 ...   0   0   0]
 [ 19   0   4 ...   0   0   0]
 ...
 [ 30   0   0 ...   0   0   0]
 [ 21   0   0 ...   0   8   0]
 [ 36   0   0 ...   0   1   0]]
Classification Report:
               precision    recall  f1-score   support

          -1       0.18      0.87      0.30       600
           0       0.25      0.05      0.08        40
           1       0.33      0.10      0.15        40
           2       0.62      0.33      0.43        40
           3       0.73      0.28      0.40        40
           4       0.35      0.15      0.21        40
           5       0.36      0.12      0.19        40
           6       0.64      0.23      0.33        40
           7       0.28      0.17      0.22        40
           8       0.69      0.28      0.39        40
           9       0.43      0.07      0.13        40
          10       0.78      0.17

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [14]:
# evaluate data distribution
print("\nOriginal Training Data Distribution:")
print(f"Label -1 count: {sum(y_train == -1)}")
print(f"Other labels count: {sum(y_train != -1)}")

print("\nBalanced Training Data Distribution:")
print(f"Label -1 count: {sum(y_train_balanced == -1)}")
print(f"Other labels count: {sum(y_train_balanced != -1)}")

print("\nTest Data Distribution:")
print(f"Label -1 count: {sum(y_test == -1)}")
print(f"Other labels count: {sum(y_test != -1)}")



Original Training Data Distribution:
Label -1 count: 2400
Other labels count: 15200

Balanced Training Data Distribution:
Label -1 count: 2400
Other labels count: 2400

Test Data Distribution:
Label -1 count: 600
Other labels count: 3800


In [15]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
import xgboost as xgb

# hyper-parameter tuning for random forest model
rf = RandomForestClassifier(random_state=42)
param_grid_rf = {
    'n_estimators': [100, 300, 500],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

grid_search_rf = GridSearchCV(
    rf, param_grid_rf, cv=3, scoring='accuracy', n_jobs=-1, verbose=1
)
grid_search_rf.fit(X_train_balanced, y_train_balanced)

# Best parameters and performance
best_rf = grid_search_rf.best_estimator_
print("\nBest Random Forest Parameters:", grid_search_rf.best_params_)
y_pred_rf = best_rf.predict(X_test)
print("\nRandom Forest Performance:")
print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print("F1 Score:", f1_score(y_test, y_pred_rf, average='weighted'))
print("Classification Report:\n", classification_report(y_test, y_pred_rf))


# Other models' performance
models = {
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    #"XGBoost": xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42),
    "SVM": SVC(probability=True, random_state=42)
}

for model_name, model in models.items():
    print(f"\nTraining {model_name}...")
    model.fit(X_train_balanced, y_train_balanced)
    y_pred = model.predict(X_test)
    
    print(f"{model_name} Performance:")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("F1 Score:", f1_score(y_test, y_pred, average='weighted'))
    print("Classification Report:\n", classification_report(y_test, y_pred))

Fitting 3 folds for each of 81 candidates, totalling 243 fits





Best Random Forest Parameters: {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}

Random Forest Performance:
Accuracy: 0.30477272727272725
F1 Score: 0.2788512756528181
Classification Report:
               precision    recall  f1-score   support

          -1       0.18      0.88      0.30       600
           0       0.40      0.05      0.09        40
           1       0.27      0.07      0.12        40
           2       0.65      0.33      0.43        40
           3       0.73      0.28      0.40        40
           4       0.27      0.10      0.15        40
           5       0.31      0.10      0.15        40
           6       0.56      0.12      0.20        40
           7       0.30      0.17      0.22        40
           8       0.62      0.25      0.36        40
           9       0.50      0.07      0.13        40
          10       0.67      0.15      0.24        40
          11       0.62      0.12      0.21        40
          12  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Gradient Boosting Performance:
Accuracy: 0.21954545454545454
F1 Score: 0.1881001285458769
Classification Report:
               precision    recall  f1-score   support

          -1       0.16      0.74      0.26       600
           0       0.17      0.07      0.10        40
           1       0.10      0.03      0.04        40
           2       0.50      0.17      0.26        40
           3       0.55      0.28      0.37        40
           4       0.31      0.10      0.15        40
           5       0.00      0.00      0.00        40
           6       0.33      0.10      0.15        40
           7       0.06      0.05      0.05        40
           8       0.22      0.05      0.08        40
           9       0.10      0.03      0.04        40
          10       0.33      0.10      0.15        40
          11       0.50      0.12      0.20        40
          12       0.38      0.38      0.38        40
          13       0.00      0.00      0.00        40
          14       0.

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
