In [None]:
import pandas as pd
import numpy as np
import pickle

# Monitored Data

## Importing mon_standard.pkl

In [None]:
# --- Configuration -----------------------------------------------------------
USE_SUBLABEL = False  # True: every URL index is its own label; False: URLs are grouped by site
URL_PER_SITE = 10     # divisor that maps a URL index to its site label
TOTAL_URLS   = 950    # number of URL entries read from the monitored pickle

# --- Load the raw traces -------------------------------------------------------
# NOTE: pickle.load can execute arbitrary code; only open pickle files from a trusted source.
print("Loading datafile...")
with open("mon_standard.pkl", 'rb') as fi:
    data = pickle.load(fi)

# One entry per instance (trace) in each of the three parallel lists below.
X1 = []  # per-instance list of timestamps (absolute value of each raw entry)
X2 = []  # per-instance list of direction * 512 (+512 for a positive raw entry, -512 otherwise)
y = []   # per-instance label

# Each raw entry is read as: sign -> direction, magnitude -> timestamp.
# NOTE(review): the direction test is a strict `> 0`, so an entry of exactly 0
# (e.g. a first packet at t = 0.0) falls into the -512 branch - confirm that is intended.
for i in range(TOTAL_URLS):
    # With sub-labels disabled, integer division gives every URL of a site the same label.
    label = i if USE_SUBLABEL else i // URL_PER_SITE
    for sample in data[i]:
        X1.append([abs(c) for c in sample])
        X2.append([512 if c > 0 else -512 for c in sample])
        y.append(label)
size = len(y)

print(f'Total samples: {size}') # Output: 19000

Loading datafile...
Total samples: 19000


## Extracting Features

Here, we will be extracting features for our monitored data. We will be storing our data as a pandas dataframe.

In [5]:
# Start the monitored feature table with one row per instance, holding only its label;
# the feature columns are appended to this frame by the cells that follow.
df_mon = pd.DataFrame(data={'label': y})

**Feature 1: Number of incoming packets**  

This feature represents the number of incoming packets per instance. It is calculated by counting the number of packets that have a negative direction (incoming packets).

This measures the volume of data being received, which provides insights into how much incoming data is being received.

In [6]:
# 1 Number of incoming packets
# Incoming packets are the negative entries of X2, so count the negatives in each instance.
incoming_packet_counts = [
    sum(packet < 0 for packet in instance)
    for instance in X2
]

df_mon['incoming_packet_counts'] = incoming_packet_counts

**Feature 2: Number of outgoing packets**  

This feature represents the number of outgoing packets per instance. It is calculated by counting the number of packets that have a positive direction (outgoing packets).

This measures the volume of data that is being sent out, which provides insights into user actions.


In [7]:
# 2 Number of outgoing packets
# Outgoing packets are the positive entries of X2, so count the positives in each instance.
outgoinging_packet_counts = [
    sum(packet > 0 for packet in instance)
    for instance in X2
]

df_mon['outgoing_packet_counts'] = outgoinging_packet_counts

**Feature 3: Total number of packets**  

This feature represents the total number of packets per instance. 

This gives an overall view of the data flow, measuring how active a connection is.

In [8]:
# 3 Total number of packets
# The packet count of an instance is simply the length of its direction sequence.
total_packet_counts = [len(instance) for instance in X2]

df_mon['total_packet_counts'] = total_packet_counts

**Feature 4: Number of incoming packets as a fraction of the total number of packets**  

This feature represents the fraction of number of incoming packets out of the total number of packets per instance. 

This measures the proportion of incoming traffic relative to total traffic. It can provide insights into the usage. For instance, a higher fraction can indicate more passive consumption, while a lower fraction can suggest more user-initiated actions.

In [9]:
# 4 Number of incoming packets as a fraction of the total number of packets
# Element-wise division of the two count columns (one ratio per instance).
df_mon['incoming_packet_fraction'] = df_mon['incoming_packet_counts'].div(df_mon['total_packet_counts'])

**Feature 5: Number of outgoing packets as a fraction of the total number of packets**  

This feature represents the number of outgoing packets as a fraction of the total number of packets per instance. 

This feature complements feature 4 - Number of incoming packets as a fraction of the total number of packets

In [10]:
# 5 Number of outgoing packets as a fraction of the total number of packets
# Element-wise division of the two count columns (one ratio per instance).
df_mon['outgoing_packet_fraction'] = df_mon['outgoing_packet_counts'].div(df_mon['total_packet_counts'])

**Feature 6: Standard deviation of the outgoing packets ordering list**  
**Feature 7: Average of the outgoing packets ordering list**

We define ordering list as the list of positions (indices) of outgoing packets in the sequence of all packets. 

For example, suppose we have packets in the sequence [in, out, in, out, out, in, out, in], where out denotes an outgoing packet. The ordering list for outgoing packets would be [1, 3, 4, 6], in other words, the indices of the outgoing packets in the full packet list.

The standard deviation and the average are then computed from this ordering list.

The standard deviation of the outgoing packets ordering list measures the variability in the positions of outgoing packets within the sequence. It can provide insight into whether the network traffic consists of more consistent communication (low variability, implied by a low standard deviation) or bursty communication (high variability, implied by a high standard deviation). 

The average of the outgoing packets ordering list measures the average position in the sequence at which outgoing packets tend to occur. It can provide insight into the typical position of outgoing packets in the sequence and hence a general sense of timing. This can also point to different communication patterns.

In [11]:
# 6 Standard deviation of the outgoing packets ordering list
# 7 Average of the outgoing packets ordering list

std_outgoing_order, avg_outgoing_order = [], []

for trace in X2:
    # Ordering list: positions (0-based) of the outgoing (positive) packets in this instance.
    positions = [idx for idx, value in enumerate(trace) if value > 0]
    has_outgoing = len(positions) > 0

    # Instances without any outgoing packet get 0 for both statistics.
    std_outgoing_order.append(np.std(positions) if has_outgoing else 0)
    avg_outgoing_order.append(np.mean(positions) if has_outgoing else 0)

df_mon['std_outgoing_order'] = std_outgoing_order
df_mon['avg_outgoing_order'] = avg_outgoing_order

**Feature 8: Sum of all items in the alternative concentration feature list**

**Feature 9: Average of all items in the alternative concentration feature list**

We define alternative concentration feature list to be the list of time intervals between consecutive packets. This list will capture how clustered packets are, based on their time intervals, with shorter intervals indicating packets are sent close together, and longer intervals indicating more spread-out transmissions of packets. 

The concentration of packets can provide insights into periods of data transfer. For instance, higher concentration of packets in short time burst could indicate peak periods, while a more distributed pattern could indicate a more stable or background communication.

Feature 8 takes the sum of the alternative concentration feature list, while Feature 9 takes the average of the alternative concentration feature list. 

Feature 8 is the raw total concentration, which can provide insight into the absolute intensity of that instance. This allows us to capture how active or bursty the entire instance is in an absolute sense.

Feature 9, on the other hand, is the normalised concentration sum: we divide the sum by the total number of packets, creating a per-packet average concentration. The feature therefore reflects the average intensity per packet, capturing burst intensity relative to the instance length and making it easier to compare burstiness across instances of varying length.

In [12]:
# 8 Sum of all items in the alternative concentration feature list
# For each instance: sum of the gaps between consecutive timestamps.
# Instances with fewer than two timestamps have no gaps and get 0.
sum_concentration = [
    np.sum(np.diff(timestamps)) if len(timestamps) > 1 else 0
    for timestamps in X1
]

df_mon['sum_concentration'] = sum_concentration

In [13]:
# 9 Average of all items in the alternative concentration feature list
# The interval sum is divided by the packet count (not by the number of intervals),
# giving a per-packet average.
df_mon['avg_concentration'] = df_mon['sum_concentration'].div(df_mon['total_packet_counts'])

In [14]:
# Show the monitored feature table: the label column plus the nine feature columns added above.
df_mon

Unnamed: 0,label,incoming_packet_counts,outgoing_packet_counts,total_packet_counts,incoming_packet_fraction,outgoing_packet_fraction,std_outgoing_order,avg_outgoing_order,sum_concentration,avg_concentration
0,0,1300,121,1421,0.914849,0.085151,515.483953,773.322314,10.14,0.007136
1,0,438,80,518,0.845560,0.154440,139.231951,226.162500,10.16,0.019614
2,0,1240,118,1358,0.913108,0.086892,472.735508,786.110169,11.11,0.008181
3,0,1324,122,1446,0.915629,0.084371,513.916038,820.139344,13.36,0.009239
4,0,1291,115,1406,0.918208,0.081792,503.993490,789.608696,10.64,0.007568
...,...,...,...,...,...,...,...,...,...,...
18995,94,8815,619,9434,0.934386,0.065614,3053.116218,4844.586430,43.91,0.004654
18996,94,9404,552,9956,0.944556,0.055444,3010.091146,4541.974638,15.60,0.001567
18997,94,9373,579,9952,0.941821,0.058179,3102.381602,4766.072539,14.93,0.001500
18998,94,9236,690,9926,0.930486,0.069514,3116.574388,5278.146377,19.91,0.002006


# Unmonitored Data

## Importing unmon_standard10.pkl

In [15]:
TOTAL_URLS = 10000  # number of entries read from the unmonitored pickle (rebinds the earlier constant)

# Load the unmonitored traces from the local pickle file.
# NOTE: pickle.load can execute arbitrary code; only open pickle files from a trusted source.
with open("unmon_standard10.pkl", 'rb') as f:
    x = pickle.load(f)

size = len(x)
print(f'Total samples: {size}')

# These rebind X1 / X2, so from here on they hold the unmonitored instances.
X1 = []  # per-instance list of timestamps (absolute value of each raw entry)
X2 = []  # per-instance list of direction * 512 (+512 for a positive raw entry, -512 otherwise)

# Each raw entry is read as: sign -> direction, magnitude -> timestamp.
# The pickle carries no size information, so every packet gets the uniform magnitude 512.
# NOTE(review): the direction test is a strict `> 0`, so an entry of exactly 0
# falls into the -512 branch - confirm that is intended.
for i in range(TOTAL_URLS):
    trace = x[i]
    X1.append([abs(c) for c in trace])
    X2.append([512 if c > 0 else -512 for c in trace])

print(len(X1)) # Print the length of X1

Total samples: 10000
10000


## Extracting Features

Here, we repeat the steps used above for the monitored data.

In [16]:
# Start with an empty frame for the unmonitored data; the following cells add one
# feature column at a time, and the label column is inserted after the features.
df_unmon = pd.DataFrame()

In [17]:
# 1 Number of incoming packets
# Incoming packets are the negative entries of X2, so count the negatives in each instance.
incoming_packet_counts = [
    sum(packet < 0 for packet in instance)
    for instance in X2
]

df_unmon['incoming_packet_counts'] = incoming_packet_counts

In [18]:
# 2 Number of outgoing packets
# Outgoing packets are the positive entries of X2, so count the positives in each instance.
outgoinging_packet_counts = [
    sum(packet > 0 for packet in instance)
    for instance in X2
]

df_unmon['outgoing_packet_counts'] = outgoinging_packet_counts

In [19]:
# 3 Total number of packets
# The packet count of an instance is simply the length of its direction sequence.
total_packet_counts = [len(instance) for instance in X2]

df_unmon['total_packet_counts'] = total_packet_counts

In [20]:
# 4 Number of incoming packets as a fraction of the total number of packets
# Element-wise division of the two count columns (one ratio per instance).
df_unmon['incoming_packet_fraction'] = df_unmon['incoming_packet_counts'].div(df_unmon['total_packet_counts'])

In [21]:
# 5 Number of outgoing packets as a fraction of the total number of packets
# Element-wise division of the two count columns (one ratio per instance).
df_unmon['outgoing_packet_fraction'] = df_unmon['outgoing_packet_counts'].div(df_unmon['total_packet_counts'])

In [22]:
# 6 Standard deviation of the outgoing packets ordering list
# 7 Average of the outgoing packets ordering list

std_outgoing_order, avg_outgoing_order = [], []

for trace in X2:
    # Ordering list: positions (0-based) of the outgoing (positive) packets in this instance.
    positions = [idx for idx, value in enumerate(trace) if value > 0]
    has_outgoing = len(positions) > 0

    # Instances without any outgoing packet get 0 for both statistics.
    std_outgoing_order.append(np.std(positions) if has_outgoing else 0)
    avg_outgoing_order.append(np.mean(positions) if has_outgoing else 0)

df_unmon['std_outgoing_order'] = std_outgoing_order
df_unmon['avg_outgoing_order'] = avg_outgoing_order

In [23]:
# 8 Sum of all items in the alternative concentration feature list
# For each instance: sum of the gaps between consecutive timestamps.
# Instances with fewer than two timestamps have no gaps and get 0.
sum_concentration = [
    np.sum(np.diff(timestamps)) if len(timestamps) > 1 else 0
    for timestamps in X1
]

df_unmon['sum_concentration'] = sum_concentration

In [24]:
# 9 Average of all items in the alternative concentration feature list
# The interval sum is divided by the packet count (not by the number of intervals),
# giving a per-packet average.
df_unmon['avg_concentration'] = df_unmon['sum_concentration'].div(df_unmon['total_packet_counts'])

In [25]:
# Label every unmonitored instance with -1 and place the column first, so the
# column order matches the monitored frame (label, then the features).
# DataFrame.insert raises ValueError when the column already exists, so the guard
# makes re-running this cell a no-op instead of an error.
if 'label' not in df_unmon.columns:
    df_unmon.insert(0, 'label', -1)

In [26]:
# Show the unmonitored feature table: the label column (-1) followed by the nine feature columns.
df_unmon

Unnamed: 0,label,incoming_packet_counts,outgoing_packet_counts,total_packet_counts,incoming_packet_fraction,outgoing_packet_fraction,std_outgoing_order,avg_outgoing_order,sum_concentration,avg_concentration
0,-1,101,29,130,0.776923,0.223077,27.053784,36.758621,5.56,0.042769
1,-1,9189,738,9927,0.925657,0.074343,2885.461055,3845.246612,11.88,0.001197
2,-1,285,74,359,0.793872,0.206128,111.145044,155.905405,17.81,0.049610
3,-1,9407,541,9948,0.945617,0.054383,3120.454082,4667.994455,9.17,0.000922
4,-1,2215,293,2508,0.883174,0.116826,778.715376,1227.269625,53.59,0.021368
...,...,...,...,...,...,...,...,...,...,...
9995,-1,4180,413,4593,0.910081,0.089919,1173.380403,2549.414044,32.09,0.006987
9996,-1,4663,447,5110,0.912524,0.087476,1621.869237,3062.015660,38.62,0.007558
9997,-1,302,59,361,0.836565,0.163435,118.245320,179.101695,34.93,0.096759
9998,-1,413,96,509,0.811395,0.188605,166.667122,309.197917,11.84,0.023261


# Combining the Data

Here, we merge the two dataframes into a single dataframe. 

In [27]:
# Stack the monitored rows on top of the unmonitored rows; ignore_index=True
# discards both original indexes and renumbers the combined rows from 0.
df = pd.concat(objs=[df_mon, df_unmon], axis=0, ignore_index=True)

In [28]:
# Show the combined table (monitored rows first, then unmonitored rows labelled -1).
df

Unnamed: 0,label,incoming_packet_counts,outgoing_packet_counts,total_packet_counts,incoming_packet_fraction,outgoing_packet_fraction,std_outgoing_order,avg_outgoing_order,sum_concentration,avg_concentration
0,0,1300,121,1421,0.914849,0.085151,515.483953,773.322314,10.14,0.007136
1,0,438,80,518,0.845560,0.154440,139.231951,226.162500,10.16,0.019614
2,0,1240,118,1358,0.913108,0.086892,472.735508,786.110169,11.11,0.008181
3,0,1324,122,1446,0.915629,0.084371,513.916038,820.139344,13.36,0.009239
4,0,1291,115,1406,0.918208,0.081792,503.993490,789.608696,10.64,0.007568
...,...,...,...,...,...,...,...,...,...,...
28995,-1,4180,413,4593,0.910081,0.089919,1173.380403,2549.414044,32.09,0.006987
28996,-1,4663,447,5110,0.912524,0.087476,1621.869237,3062.015660,38.62,0.007558
28997,-1,302,59,361,0.836565,0.163435,118.245320,179.101695,34.93,0.096759
28998,-1,413,96,509,0.811395,0.188605,166.667122,309.197917,11.84,0.023261
