In [2]:
import pickle
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import pandas as pd

# Shared building blocks for the monitored / unmonitored loaders below.
_PACKET_SIZE = 512  # Magnitude written into the size sequence for every packet
_FEATURE_NAMES = (
    'num_incoming',             # Number of incoming packets
    'num_outgoing',             # Number of outgoing packets
    'total_packets',            # Total number of packets
    'average_packet_interval',  # Average interval between packets
    'std_packet_interval',      # Standard deviation of packet intervals
    'total_data_in',            # Total size of incoming data
    'total_data_out',           # Total size of outgoing data
)


def _parse_trace(trace):
    """Split one raw trace into (time_seq, size_seq, per-trace feature dict).

    Each element of `trace` is a signed number: its absolute value goes into
    the time sequence and its sign selects the direction. Strictly positive
    values are outgoing (+512); everything else, including 0, is incoming
    (-512).
    """
    time_seq = [abs(c) for c in trace]
    size_seq = [_PACKET_SIZE if c > 0 else -_PACKET_SIZE for c in trace]
    outgoing = sum(1 for s in size_seq if s > 0)
    incoming = len(size_seq) - outgoing

    # Interval statistics need at least two packets; otherwise report 0.
    if len(time_seq) > 1:
        intervals = np.diff(time_seq)  # computed once, reused for mean and std
        avg_interval, std_interval = np.mean(intervals), np.std(intervals)
    else:
        avg_interval, std_interval = 0, 0

    features = {
        'num_incoming': incoming,
        'num_outgoing': outgoing,
        'total_packets': len(time_seq),
        'average_packet_interval': avg_interval,
        'std_packet_interval': std_interval,
        # Every packet has the same magnitude, so totals follow from the counts.
        'total_data_in': incoming * _PACKET_SIZE,
        'total_data_out': outgoing * _PACKET_SIZE,
    }
    return time_seq, size_seq, features


def _pack_sequences(sequences):
    """Store a list of Python lists in a 1-D object array, one list per slot.

    `np.array(sequences, dtype=object)` silently becomes 2-D when every list
    has the same length; filling slot by slot keeps the result 1-D always.
    """
    packed = np.empty(len(sequences), dtype=object)
    for idx, seq in enumerate(sequences):
        packed[idx] = seq
    return packed


def _finalize_features(feature_lists, normalize):
    """Convert each feature list to a float array, optionally min-max scaled.

    Scaling is done per feature with a MinMaxScaler fitted on that column
    only. Empty columns are returned unscaled because the scaler rejects
    zero-sample input.
    """
    finalized = {}
    for name, values in feature_lists.items():
        column = np.asarray(values, dtype=float)
        if normalize and column.size:
            column = MinMaxScaler().fit_transform(column.reshape(-1, 1)).flatten()
        finalized[name] = column
    return finalized


def load_and_preprocess_mon(file_path, use_sublabel=False, url_per_site=10,
                            total_urls=950, normalize=True):
    """Load the monitored-traffic pickle and turn it into model inputs.

    Parameters
    ----------
    file_path : str
        Path of the pickle file. The unpickled object must support
        `data[i]` for every i in `range(total_urls)`, each entry being an
        iterable of traces.
    use_sublabel : bool, default False
        If True the label is the URL index itself; otherwise it is
        `url_index // url_per_site`.
    url_per_site : int, default 10
        Number of URLs per monitored site.
    total_urls : int, default 950
        Number of monitored URLs to read.
    normalize : bool, default True
        Min-max scale every additional feature to [0, 1] using this data
        set only. Pass False to get raw values, e.g. when several data sets
        are merged and must share one scale.

    Returns
    -------
    tuple
        `(X1, X2, y, additional_features)`: 1-D object arrays of time and
        size sequences, the label array, and a dict mapping feature name to
        a float array with one value per trace.
    """
    # NOTE: pickle.load can execute arbitrary code; only open trusted files.
    with open(file_path, 'rb') as fi:
        data = pickle.load(fi)

    X1, X2, y = [], [], []  # Time sequences, size sequences, and labels
    additional_features = {name: [] for name in _FEATURE_NAMES}

    for i in range(total_urls):
        label = i if use_sublabel else i // url_per_site
        for sample in data[i]:
            time_seq, size_seq, trace_features = _parse_trace(sample)
            X1.append(time_seq)
            X2.append(size_seq)
            y.append(label)
            for name in _FEATURE_NAMES:
                additional_features[name].append(trace_features[name])

    return (
        _pack_sequences(X1),
        _pack_sequences(X2),
        np.array(y),
        _finalize_features(additional_features, normalize)
    )


def load_and_preprocess_unmon(file_path, total_urls=3000, normalize=True):
    """Load the unmonitored-traffic pickle and turn it into model inputs.

    Parameters
    ----------
    file_path : str
        Path of the pickle file. The unpickled object must support
        `data[i]` for every i in `range(total_urls)`, each entry being one
        trace.
    total_urls : int, default 3000
        Number of unmonitored traces to read.
    normalize : bool, default True
        Min-max scale every additional feature to [0, 1] using this data
        set only. Pass False to get raw values.

    Returns
    -------
    tuple
        `(X1, X2, additional_features)`: 1-D object arrays of time and size
        sequences, and a dict mapping feature name to a float array with
        one value per trace.
    """
    # NOTE: pickle.load can execute arbitrary code; only open trusted files.
    with open(file_path, 'rb') as fi:
        data = pickle.load(fi)

    X1, X2 = [], []
    additional_features = {name: [] for name in _FEATURE_NAMES}

    for i in range(total_urls):
        time_seq, size_seq, trace_features = _parse_trace(data[i])
        X1.append(time_seq)
        X2.append(size_seq)
        for name in _FEATURE_NAMES:
            additional_features[name].append(trace_features[name])

    return (
        _pack_sequences(X1),
        _pack_sequences(X2),
        _finalize_features(additional_features, normalize)
    )

In [3]:
# Closed-world dataset: preprocess the monitored traces, persist them, preview them
mon_path = "./mon_standard.pkl" # Location of the raw monitored pickle
X1_mon, X2_mon, y_mon, features_mon = load_and_preprocess_mon(mon_path)

# Bundle everything into a single dictionary before pickling
mon_data = dict(
    X1=X1_mon,              # Time sequences
    X2=X2_mon,              # Size sequences
    y=y_mon,                # Labels
    features=features_mon,  # Additional extracted features
)

with open("./processed_mon_data_closed_world.pkl", "wb") as f:
    pickle.dump(mon_data, f)

print("Closed-world Monitored data saved as processed_mon_data_closed_world.pkl")

# Tabular view: sequences and label first, then one column per extracted feature.
# Each pair is (DataFrame column name, key in features_mon).
mon_columns = {'time_seq': X1_mon, 'size_seq': X2_mon, 'label': y_mon}
for column_name, feature_key in (
    ('num_incoming', 'num_incoming'),
    ('num_outgoing', 'num_outgoing'),
    ('total_packets', 'total_packets'),
    ('avg_packet_interval', 'average_packet_interval'),
    ('std_packet_interval', 'std_packet_interval'),
    ('total_data_in', 'total_data_in'),
    ('total_data_out', 'total_data_out'),
):
    mon_columns[column_name] = features_mon[feature_key]
df_mon = pd.DataFrame(mon_columns)

print("Closed-world Monitored Data Sample:")
print(df_mon)

Closed-world Monitored data saved as processed_mon_data_closed_world.pkl
Closed-world Monitored Data Sample:
                                                time_seq  \
0      [0.0, 0.14, 0.14, 0.31, 0.31, 0.51, 0.51, 0.51...   
1      [0.0, 0.13, 0.13, 0.31, 0.77, 1.11, 1.11, 1.11...   
2      [0.0, 0.11, 0.11, 0.23, 0.97, 1.11, 1.11, 1.11...   
3      [0.0, 0.27, 0.27, 0.6, 0.6, 0.88, 0.89, 0.89, ...   
4      [0.0, 0.11, 0.11, 0.36, 0.36, 0.6, 0.6, 0.6, 0...   
...                                                  ...   
18995  [0.0, 0.15, 0.15, 0.33, 0.91, 1.12, 1.13, 1.13...   
18996  [0.0, 0.16, 0.16, 0.35, 0.99, 1.26, 1.26, 1.26...   
18997  [0.0, 0.11, 0.11, 0.36, 0.36, 0.83, 0.83, 0.83...   
18998  [0.0, 0.17, 0.17, 0.32, 1.98, 2.56, 2.56, 2.56...   
18999  [0.0, 0.12, 0.12, 0.46, 0.46, 0.72, 0.73, 0.73...   

                                                size_seq  label  num_incoming  \
0      [-512, -512, 512, -512, 512, -512, 512, 512, -...      0      0.131810   
1      [

In [None]:
# Open-world binary dataset: monitored (+1) versus unmonitored (-1)
mon_path = "./mon_standard.pkl" # Location of the raw monitored pickle
unmon_path = "./unmon_standard10_3000.pkl" # Location of the raw unmonitored pickle

X1_mon, X2_mon, y_mon, features_mon = load_and_preprocess_mon(mon_path)
X1_unmon, X2_unmon, features_unmon = load_and_preprocess_unmon(unmon_path)

# One binary label per trace: +1 for every monitored trace, -1 for every unmonitored one
y_mon_binary = np.ones(len(y_mon))
y_unmon_binary = -np.ones(len(X1_unmon))

# Stack monitored rows first, unmonitored rows after them
X1 = np.concatenate([X1_mon, X1_unmon])
X2 = np.concatenate([X2_mon, X2_unmon])
y = np.concatenate([y_mon_binary, y_unmon_binary])

# Stack every additional feature in the same monitored-then-unmonitored order.
# NOTE(review): each loader fits its own MinMaxScaler, so a merged column joins
# two independently scaled ranges - confirm this is intended.
features = {}
for key in features_mon:
    features[key] = np.concatenate([features_mon[key], features_unmon[key]])

# Persist the merged data
binary_data = dict(X1=X1, X2=X2, y=y, features=features)

with open("./processed_binary_data_open_world.pkl", "wb") as f:
    pickle.dump(binary_data, f)

print("Open-world Binary data saved as processed_binary_data_open_world.pkl")


# Tabular view: sequences and label first, then one column per extracted feature.
# Each pair is (DataFrame column name, key in features).
binary_columns = {
    'time_seq': X1,
    'size_seq': X2,
    'label': y, # Monitored label (1) / Unmonitored label (-1)
}
for column_name, feature_key in (
    ('num_incoming', 'num_incoming'),
    ('num_outgoing', 'num_outgoing'),
    ('total_packets', 'total_packets'),
    ('avg_packet_interval', 'average_packet_interval'),
    ('std_packet_interval', 'std_packet_interval'),
    ('total_data_in', 'total_data_in'),
    ('total_data_out', 'total_data_out'),
):
    binary_columns[column_name] = features[feature_key]
df_binary = pd.DataFrame(binary_columns)

print("Open-world Binary Data Sample:")
print(df_binary)

Open-world Binary data saved as processed_binary_data_open_world.pkl
Open-world Binary Data Sample:
                                                time_seq  \
0      [0.0, 0.14, 0.14, 0.31, 0.31, 0.51, 0.51, 0.51...   
1      [0.0, 0.13, 0.13, 0.31, 0.77, 1.11, 1.11, 1.11...   
2      [0.0, 0.11, 0.11, 0.23, 0.97, 1.11, 1.11, 1.11...   
3      [0.0, 0.27, 0.27, 0.6, 0.6, 0.88, 0.89, 0.89, ...   
4      [0.0, 0.11, 0.11, 0.36, 0.36, 0.6, 0.6, 0.6, 0...   
...                                                  ...   
21995  [0.0, 0.1, 0.1, 0.22, 0.79, 0.95, 0.96, 1.09, ...   
21996  [0.0, 0.17, 0.17, 0.37, 1.73, 2.23, 2.23, 2.56...   
21997  [0.0, 0.11, 0.11, 0.23, 0.86, 1.18, 1.18, 1.5,...   
21998  [0.0, 0.17, 0.17, 0.35, 3.07, 3.28, 3.28, 3.71...   
21999  [0.0, 0.13, 0.13, 0.35, 0.98, 1.46, 1.46, 1.9,...   

                                                size_seq  label  num_incoming  \
0      [-512, -512, 512, -512, 512, -512, 512, 512, -...    1.0      0.131810   
1      [-512, -51

In [None]:
# Preparing and saving Open-world Multi-class data
mon_path = "./mon_standard.pkl" # Path to the monitored data file
unmon_path = "./unmon_standard10_3000.pkl" # Path to the unmonitored data file

X1_mon, X2_mon, y_mon, features_mon = load_and_preprocess_mon(mon_path)
X1_unmon, X2_unmon, features_unmon = load_and_preprocess_unmon(unmon_path)

# Monitored traces keep the class ids from the loader; unmonitored traces get -1.
# Build the -1 vector with y_mon's dtype: a plain np.ones() is float64 and would
# promote every class id to float when the two arrays are concatenated.
y_unmon_multi = -1 * np.ones(len(X1_unmon), dtype=y_mon.dtype)

# Merge monitored and unmonitored data (monitored rows first)
X1 = np.concatenate((X1_mon, X1_unmon))
X2 = np.concatenate((X2_mon, X2_unmon))
y = np.concatenate((y_mon, y_unmon_multi))

# Merge additional features in the same row order.
# NOTE(review): each loader fits its own MinMaxScaler, so a merged column joins
# two independently scaled ranges - confirm this is intended.
features = {key: np.concatenate((features_mon[key], features_unmon[key])) for key in features_mon}

# Save the merged data
multi_class_data = {
    'X1': X1,
    'X2': X2,
    'y': y,
    'features': features
}

with open("./processed_multiclass_data_open_world.pkl", "wb") as f:
    pickle.dump(multi_class_data, f)

print("Open-world Multi-class data saved as processed_multiclass_data_open_world.pkl")

# Create a DataFrame: sequences and label first, then one column per feature
df_multiclass = pd.DataFrame({
    'time_seq': X1,
    'size_seq': X2,
    'label': y, # Monitored class id / Unmonitored label (-1)
    'num_incoming': features['num_incoming'],
    'num_outgoing': features['num_outgoing'],
    'total_packets': features['total_packets'],
    'avg_packet_interval': features['average_packet_interval'],
    'std_packet_interval': features['std_packet_interval'],
    'total_data_in': features['total_data_in'],
    'total_data_out': features['total_data_out']
})

print("Open-world Multi-class Data Sample:")
print(df_multiclass)


Open-world Multi-class data saved as processed_multiclass_data_open_world.pkl
Open-world Multi-class Data Sample:
                                                time_seq  \
0      [0.0, 0.14, 0.14, 0.31, 0.31, 0.51, 0.51, 0.51...   
1      [0.0, 0.13, 0.13, 0.31, 0.77, 1.11, 1.11, 1.11...   
2      [0.0, 0.11, 0.11, 0.23, 0.97, 1.11, 1.11, 1.11...   
3      [0.0, 0.27, 0.27, 0.6, 0.6, 0.88, 0.89, 0.89, ...   
4      [0.0, 0.11, 0.11, 0.36, 0.36, 0.6, 0.6, 0.6, 0...   
...                                                  ...   
21995  [0.0, 0.1, 0.1, 0.22, 0.79, 0.95, 0.96, 1.09, ...   
21996  [0.0, 0.17, 0.17, 0.37, 1.73, 2.23, 2.23, 2.56...   
21997  [0.0, 0.11, 0.11, 0.23, 0.86, 1.18, 1.18, 1.5,...   
21998  [0.0, 0.17, 0.17, 0.35, 3.07, 3.28, 3.28, 3.71...   
21999  [0.0, 0.13, 0.13, 0.35, 0.98, 1.46, 1.46, 1.9,...   

                                                size_seq  label  num_incoming  \
0      [-512, -512, 512, -512, 512, -512, 512, 512, -...    0.0      0.131810   
1  