## 1. mon_standard.pkl > mon_features.pkl

### (1) mon_standard.pkl > array code

In [1]:
import pickle

USE_SUBLABEL = False  # True: label each URL separately; False: label by site
URL_PER_SITE = 10     # number of consecutive URL indices that share one site label
TOTAL_URLS = 950      # number of URL indices read from the pickle

# Read the pickled traces; data[i] holds the samples recorded for URL index i.
print("Loading datafile...")
with open("/content/mon_standard.pkl", 'rb') as fi:  # Path to mon_standard.pkl in Colab
    data = pickle.load(fi)

X1 = []  # per-sample timestamp sequences, e.g. [[0.0, 0.5, 3.4, ...], ...]
X2 = []  # per-sample signed sizes (+512 / -512), one entry per packet
y = []   # per-sample label (site index, or URL index when USE_SUBLABEL is set)

# Each raw value packs two things: its sign gives the direction and its
# magnitude the timestamp. The pickle carries no real size, so every packet
# is given the constant size 512.
for url_idx in range(TOTAL_URLS):
    # With USE_SUBLABEL off, URLs of the same site collapse to one label.
    label = url_idx if USE_SUBLABEL else url_idx // URL_PER_SITE
    for sample in data[url_idx]:
        X1.append([abs(c) for c in sample])
        # "c > 0" is False for 0.0, so a packet stamped exactly 0.0 is
        # stored as -512.
        X2.append([512 if c > 0 else -512 for c in sample])
        y.append(label)

size = len(y)
print(f'Total samples: {size}') # Output: 19000

Loading datafile...
Total samples: 19000


##### X1, X2, monitored label, y를 하나의 dataframe으로 변환

In [2]:
import pandas as pd
import numpy as np

# Every trace loaded in this section gets the monitored flag 1.
monitored_label = np.full(len(X1), 1)

# One row per trace: the two per-packet sequences plus both labels.
frame_columns = {
    'time': X1,
    'size': X2,
    'monitored_label': monitored_label,
    'website_label': y,
}
df = pd.DataFrame(frame_columns)

In [3]:
# Print the assembled DataFrame for a quick visual check.
print(df)

                                                    time  \
0      [0.0, 0.14, 0.14, 0.31, 0.31, 0.51, 0.51, 0.51...   
1      [0.0, 0.13, 0.13, 0.31, 0.77, 1.11, 1.11, 1.11...   
2      [0.0, 0.11, 0.11, 0.23, 0.97, 1.11, 1.11, 1.11...   
3      [0.0, 0.27, 0.27, 0.6, 0.6, 0.88, 0.89, 0.89, ...   
4      [0.0, 0.11, 0.11, 0.36, 0.36, 0.6, 0.6, 0.6, 0...   
...                                                  ...   
18995  [0.0, 0.15, 0.15, 0.33, 0.91, 1.12, 1.13, 1.13...   
18996  [0.0, 0.16, 0.16, 0.35, 0.99, 1.26, 1.26, 1.26...   
18997  [0.0, 0.11, 0.11, 0.36, 0.36, 0.83, 0.83, 0.83...   
18998  [0.0, 0.17, 0.17, 0.32, 1.98, 2.56, 2.56, 2.56...   
18999  [0.0, 0.12, 0.12, 0.46, 0.46, 0.72, 0.73, 0.73...   

                                                    size  monitored_label  \
0      [-512, -512, 512, -512, 512, -512, 512, 512, -...                1   
1      [-512, -512, 512, -512, 512, -512, 512, 512, -...                1   
2      [-512, -512, 512, -512, 512, -512, 512, 5

### (2) Feature Extraction

- total_transmission_time(총 전송 시간)
- std_inter_packet_time(패킷 간 시간 간격의 표준편차)
- avg_outgoing_burst_size(송신 버스트 평균 크기)
- num_outgoing_packets(송신 패킷의 총 개수)
- avg_incoming_burst_size(수신 버스트 평균 크기)
- cumul_packets_30pct(전체 시간의 30% 시점까지의 패킷 수)
- cumul_packets_10pct(전체 시간의 10% 시점까지의 패킷 수)
- incoming_order_skew(수신 패킷 순서 목록의 왜도)
- outgoing_first_30(세션 첫 30개 패킷 중 송신 비율)
- outgoing_order_skew(송신 패킷 순서 목록의 왜도)
- cumul_max(누적 패킷 시퀀스의 최대)
- bigram_OO (방향 N-gram 빈도 (N=2, Bigram))
- avg_incoming_order_first_30(처음 30 패킷의 수신 패킷 순서 목록의 평균)
- avg_outgoing_order_first_30(처음 30 패킷의 송신 패킷 순서 목록의 평균)
- num_incoming_first_30(처음 30 패킷의 수신 패킷 수)
- incoming_packet_ratio(수신 패킷 비율)
- outgoing_packet_ratio(송신 패킷 비율)

In [4]:
import numpy as np
from scipy import stats

def extract_all_features(df):
    """Compute the 17 per-trace traffic features for a single DataFrame row.

    Meant to be used as ``frame.apply(extract_all_features, axis=1)``.

    Args:
        df: One row (a ``pd.Series`` or any mapping) holding a ``'time'``
            sequence of packet timestamps and a ``'size'`` sequence of signed
            packet sizes (assumed to have the same length). The sign of each
            size is the packet direction: ``+1`` is counted as outgoing and
            ``-1`` as incoming.

    Returns:
        pd.Series: the features keyed by name, in a fixed order. A feature
        that is undefined for the given trace (empty trace, no packet of the
        required direction, fewer than two packets, ...) is reported as 0
        instead of raising.
    """
    time = np.array(df['time'])
    direction = np.sign(np.array(df['size']))
    total_packets = len(time)

    outgoing_mask = direction == 1
    incoming_mask = direction == -1

    def mean_or_zero(values):
        # np.mean of an empty array is NaN (with a warning); report 0 instead.
        return np.mean(values) if len(values) > 0 else 0.0

    # 1. total_transmission_time: timestamp of the last packet.
    total_transmission_time = time[-1] if total_packets > 0 else 0.0

    # 2. std_inter_packet_time: std-dev of the gaps between consecutive packets.
    inter_packet_times = np.diff(time)
    std_inter_packet_time = (
        np.std(inter_packet_times) if len(inter_packet_times) > 0 else 0
    )

    # 3 & 5. avg_outgoing_burst_size, avg_incoming_burst_size.
    # A burst is a maximal run of consecutive packets with the same direction.
    avg_outgoing_burst_size = 0.0
    avg_incoming_burst_size = 0.0
    if total_packets > 0:
        direction_changes = np.flatnonzero(np.diff(direction) != 0) + 1
        burst_starts = np.insert(direction_changes, 0, 0)
        burst_lengths = np.diff(np.append(burst_starts, total_packets))
        burst_directions = direction[burst_starts]
        avg_outgoing_burst_size = mean_or_zero(burst_lengths[burst_directions == 1])
        avg_incoming_burst_size = mean_or_zero(burst_lengths[burst_directions == -1])

    # 4. num_outgoing_packets, 16. incoming_packet_ratio, 17. outgoing_packet_ratio.
    num_outgoing_packets = np.sum(outgoing_mask)
    num_incoming_packets = np.sum(incoming_mask)
    if total_packets > 0:
        incoming_packet_ratio = num_incoming_packets / total_packets
        outgoing_packet_ratio = num_outgoing_packets / total_packets
    else:
        incoming_packet_ratio = 0.0
        outgoing_packet_ratio = 0.0

    # 6 & 7. cumul_packets_10pct, cumul_packets_30pct: number of packets whose
    # timestamp is within the first 10% / 30% of the largest timestamp.
    max_time = np.max(time) if total_packets > 0 else 0
    cumul_packets_10pct = np.count_nonzero(time <= 0.10 * max_time)
    cumul_packets_30pct = np.count_nonzero(time <= 0.30 * max_time)

    # 8. incoming_order_skew: skewness of the positions of incoming packets.
    incoming_order = np.flatnonzero(incoming_mask)
    incoming_order_skew = stats.skew(incoming_order) if len(incoming_order) > 1 else 0

    # 10. outgoing_order_skew: skewness of the positions of outgoing packets.
    outgoing_order = np.flatnonzero(outgoing_mask)
    outgoing_order_skew = stats.skew(outgoing_order) if len(outgoing_order) > 1 else 0

    # 11. cumul_max: maximum of the running sum of directions.
    cumul_max = np.max(np.cumsum(direction)) if total_packets > 0 else 0

    # 12. bigram_OO: share of adjacent packet pairs matching the pattern below.
    # NOTE(review): despite the "OO" (outgoing-outgoing) name, this counts
    # consecutive -1/-1 pairs, and -1 is treated as *incoming* everywhere else
    # in this function. The computation is deliberately left unchanged so that
    # feature files extracted at different times stay comparable; confirm the
    # intended definition before switching it to +1/+1.
    total_bigrams = total_packets - 1
    bigram_OO = 0.0
    if total_bigrams > 0:
        oo_count = np.count_nonzero(incoming_mask[:-1] & incoming_mask[1:])
        bigram_OO = oo_count / total_bigrams

    # --- Features over the first 30 packets (fewer if the trace is shorter) ---
    first_30_directions = direction[:30]
    total_first_30 = len(first_30_directions)
    incoming_positions_30 = np.flatnonzero(first_30_directions == -1)
    outgoing_positions_30 = np.flatnonzero(first_30_directions == 1)

    # 15. num_incoming_first_30: incoming packets among the first 30.
    num_incoming_first_30 = len(incoming_positions_30)

    # 9. outgoing_first_30: share of outgoing packets among the first 30.
    outgoing_first_30 = (
        len(outgoing_positions_30) / total_first_30 if total_first_30 > 0 else 0.0
    )

    # 13. avg_incoming_order_first_30: mean position of incoming packets.
    avg_incoming_order_first_30 = mean_or_zero(incoming_positions_30)

    # 14. avg_outgoing_order_first_30: mean position of outgoing packets.
    avg_outgoing_order_first_30 = mean_or_zero(outgoing_positions_30)

    return pd.Series({
        'total_transmission_time': total_transmission_time,
        'std_inter_packet_time': std_inter_packet_time,
        'avg_outgoing_burst_size': avg_outgoing_burst_size,
        'avg_incoming_burst_size': avg_incoming_burst_size,
        'num_outgoing_packets': num_outgoing_packets,
        'incoming_packet_ratio': incoming_packet_ratio,
        'outgoing_packet_ratio': outgoing_packet_ratio,
        'cumul_packets_10pct': cumul_packets_10pct,
        'cumul_packets_30pct': cumul_packets_30pct,
        'outgoing_order_skew': outgoing_order_skew,
        'incoming_order_skew': incoming_order_skew,
        'cumul_max': cumul_max,
        'bigram_OO': bigram_OO,
        'num_incoming_first_30': num_incoming_first_30,
        'outgoing_first_30': outgoing_first_30,
        'avg_incoming_order_first_30': avg_incoming_order_first_30,
        'avg_outgoing_order_first_30': avg_outgoing_order_first_30
    })

# axis=1: apply the function to each row of the DataFrame
features = df.apply(extract_all_features, axis=1)

print(features)

       total_transmission_time  std_inter_packet_time  \
0                        10.14               0.041168   
1                        10.16               0.163930   
2                        11.11               0.066661   
3                        13.36               0.047809   
4                        10.64               0.038760   
...                        ...                    ...   
18995                    43.91               0.143962   
18996                    15.60               0.019465   
18997                    14.93               0.016411   
18998                    19.91               0.033281   
18999                    13.76               0.011074   

       avg_outgoing_burst_size  avg_incoming_burst_size  num_outgoing_packets  \
0                     1.551282                16.666667                 121.0   
1                     1.702128                 9.319149                  80.0   
2                     1.552632                16.315789                 

##### correlation analysis를 위해 feature dataframe과 label을 하나의 파일로 저장

In [5]:
import pandas as pd
import numpy as np
import pickle

# Append both label columns to the right of the extracted features so the
# saved table carries features and labels together.
label_columns = [df['monitored_label'], df['website_label']]
mon_features = pd.concat([features, *label_columns], axis=1)
print(mon_features.head())

# Persist the combined table as a pickle.
mon_features.to_pickle('/content/mon_features.pkl')

   total_transmission_time  std_inter_packet_time  avg_outgoing_burst_size  \
0                    10.14               0.041168                 1.551282   
1                    10.16               0.163930                 1.702128   
2                    11.11               0.066661                 1.552632   
3                    13.36               0.047809                 1.525000   
4                    10.64               0.038760                 1.455696   

   avg_incoming_burst_size  num_outgoing_packets  incoming_packet_ratio  \
0                16.666667                 121.0               0.914849   
1                 9.319149                  80.0               0.845560   
2                16.315789                 118.0               0.913108   
3                16.550000                 122.0               0.915629   
4                16.341772                 115.0               0.918208   

   outgoing_packet_ratio  cumul_packets_10pct  cumul_packets_30pct  \
0         

## 2. unmon_standard10.pkl > unmon_features.pkl

### (1) unmon_standard10.pkl > array code

In [1]:
import pickle

TOTAL_URLS = 10000  # number of traces read from the pickle

# Read the pickled traces; x[i] is the raw value sequence of trace i.
print("Loading datafile...")
with open('/content/unmon_standard10.pkl', 'rb') as f:  # Path to unmon_standard10.pkl in Colab
    x = pickle.load(f)

size = len(x)
print(f'Total samples: {size}')

X1 = []  # per-trace timestamp sequences, e.g. [[0.0, 0.5, 3.4, ...], ...]
X2 = []  # per-trace signed sizes (+512 / -512), one entry per packet

# Each raw value packs two things: its sign gives the direction and its
# magnitude the timestamp. The pickle carries no real size, so every packet
# is given the constant size 512.
for trace_idx in range(TOTAL_URLS):
    trace = x[trace_idx]
    X1.append([abs(c) for c in trace])
    # "c > 0" is False for 0.0, so a packet stamped exactly 0.0 is stored
    # as -512.
    X2.append([512 if c > 0 else -512 for c in trace])

print(len(X1)) # Print the length of X1

Loading datafile...
Total samples: 10000
10000


##### X1, X2, monitored label(y1), website label(y2)를 하나의 dataframe으로 변환

In [2]:
import pandas as pd
import numpy as np

# Both label columns are filled with -1 for every trace in this section.
n_traces = len(X1)
y1 = np.full(n_traces, -1)  # becomes the 'monitored_label' column
y2 = np.full(n_traces, -1)  # becomes the 'website_label' column

# One row per trace: the two per-packet sequences plus both labels.
frame_columns = {
    'time': X1,
    'size': X2,
    'monitored_label': y1,
    'website_label': y2,
}
df = pd.DataFrame(frame_columns)
print(df)

                                                   time  \
0     [0.0, 0.13, 0.13, 0.36, 0.83, 1.1, 1.1, 1.35, ...   
1     [0.0, 0.21, 0.21, 0.39, 0.85, 1.36, 1.36, 1.58...   
2     [0.0, 0.23, 0.24, 0.61, 1.84, 2.28, 2.29, 2.69...   
3     [0.0, 0.14, 0.14, 0.31, 0.85, 1.37, 1.37, 1.73...   
4     [0.0, 0.12, 0.12, 0.26, 0.83, 2.56, 2.56, 3.05...   
...                                                 ...   
9995  [0.0, 0.17, 0.17, 0.35, 0.63, 0.97, 0.97, 1.75...   
9996  [0.0, 0.79, 0.8, 1.21, 1.54, 2.65, 2.65, 3.47,...   
9997  [0.0, 0.13, 0.13, 0.29, 0.35, 0.77, 0.77, 1.71...   
9998  [0.0, 0.14, 0.14, 0.28, 0.86, 1.15, 1.15, 1.38...   
9999  [0.0, 0.11, 0.11, 0.38, 1.48, 1.71, 1.71, 1.9,...   

                                                   size  monitored_label  \
0     [-512, -512, 512, -512, 512, -512, 512, -512, ...               -1   
1     [-512, -512, 512, -512, 512, -512, 512, -512, ...               -1   
2     [-512, -512, 512, -512, 512, -512, 512, -512, ...        

### (2) Feature Extraction

- total_transmission_time(총 전송 시간)
- std_inter_packet_time(패킷 간 시간 간격의 표준편차)
- avg_outgoing_burst_size(송신 버스트 평균 크기)
- num_outgoing_packets(송신 패킷의 총 개수)
- avg_incoming_burst_size(수신 버스트 평균 크기)
- cumul_packets_30pct(전체 시간의 30% 시점까지의 패킷 수)
- cumul_packets_10pct(전체 시간의 10% 시점까지의 패킷 수)
- incoming_order_skew(수신 패킷 순서 목록의 왜도)
- outgoing_first_30(세션 첫 30개 패킷 중 송신 비율)
- outgoing_order_skew(송신 패킷 순서 목록의 왜도)
- cumul_max(누적 패킷 시퀀스의 최대)
- bigram_OO (방향 N-gram 빈도 (N=2, Bigram))
- avg_incoming_order_first_30(처음 30 패킷의 수신 패킷 순서 목록의 평균)
- avg_outgoing_order_first_30(처음 30 패킷의 송신 패킷 순서 목록의 평균)
- num_incoming_first_30(처음 30 패킷의 수신 패킷 수)
- incoming_packet_ratio(수신 패킷 비율)
- outgoing_packet_ratio(송신 패킷 비율)

In [3]:
import numpy as np
from scipy import stats

def extract_all_features(df):
    """Compute the 17 per-trace traffic features for a single DataFrame row.

    Meant to be used as ``frame.apply(extract_all_features, axis=1)``.

    Args:
        df: One row (a ``pd.Series`` or any mapping) holding a ``'time'``
            sequence of packet timestamps and a ``'size'`` sequence of signed
            packet sizes (assumed to have the same length). The sign of each
            size is the packet direction: ``+1`` is counted as outgoing and
            ``-1`` as incoming.

    Returns:
        pd.Series: the features keyed by name, in a fixed order. A feature
        that is undefined for the given trace (empty trace, no packet of the
        required direction, fewer than two packets, ...) is reported as 0
        instead of raising.
    """
    time = np.array(df['time'])
    direction = np.sign(np.array(df['size']))
    total_packets = len(time)

    outgoing_mask = direction == 1
    incoming_mask = direction == -1

    def mean_or_zero(values):
        # np.mean of an empty array is NaN (with a warning); report 0 instead.
        return np.mean(values) if len(values) > 0 else 0.0

    # 1. total_transmission_time: timestamp of the last packet.
    total_transmission_time = time[-1] if total_packets > 0 else 0.0

    # 2. std_inter_packet_time: std-dev of the gaps between consecutive packets.
    inter_packet_times = np.diff(time)
    std_inter_packet_time = (
        np.std(inter_packet_times) if len(inter_packet_times) > 0 else 0
    )

    # 3 & 5. avg_outgoing_burst_size, avg_incoming_burst_size.
    # A burst is a maximal run of consecutive packets with the same direction.
    avg_outgoing_burst_size = 0.0
    avg_incoming_burst_size = 0.0
    if total_packets > 0:
        direction_changes = np.flatnonzero(np.diff(direction) != 0) + 1
        burst_starts = np.insert(direction_changes, 0, 0)
        burst_lengths = np.diff(np.append(burst_starts, total_packets))
        burst_directions = direction[burst_starts]
        avg_outgoing_burst_size = mean_or_zero(burst_lengths[burst_directions == 1])
        avg_incoming_burst_size = mean_or_zero(burst_lengths[burst_directions == -1])

    # 4. num_outgoing_packets, 16. incoming_packet_ratio, 17. outgoing_packet_ratio.
    num_outgoing_packets = np.sum(outgoing_mask)
    num_incoming_packets = np.sum(incoming_mask)
    if total_packets > 0:
        incoming_packet_ratio = num_incoming_packets / total_packets
        outgoing_packet_ratio = num_outgoing_packets / total_packets
    else:
        incoming_packet_ratio = 0.0
        outgoing_packet_ratio = 0.0

    # 6 & 7. cumul_packets_10pct, cumul_packets_30pct: number of packets whose
    # timestamp is within the first 10% / 30% of the largest timestamp.
    max_time = np.max(time) if total_packets > 0 else 0
    cumul_packets_10pct = np.count_nonzero(time <= 0.10 * max_time)
    cumul_packets_30pct = np.count_nonzero(time <= 0.30 * max_time)

    # 8. incoming_order_skew: skewness of the positions of incoming packets.
    incoming_order = np.flatnonzero(incoming_mask)
    incoming_order_skew = stats.skew(incoming_order) if len(incoming_order) > 1 else 0

    # 10. outgoing_order_skew: skewness of the positions of outgoing packets.
    outgoing_order = np.flatnonzero(outgoing_mask)
    outgoing_order_skew = stats.skew(outgoing_order) if len(outgoing_order) > 1 else 0

    # 11. cumul_max: maximum of the running sum of directions.
    cumul_max = np.max(np.cumsum(direction)) if total_packets > 0 else 0

    # 12. bigram_OO: share of adjacent packet pairs matching the pattern below.
    # NOTE(review): despite the "OO" (outgoing-outgoing) name, this counts
    # consecutive -1/-1 pairs, and -1 is treated as *incoming* everywhere else
    # in this function. The computation is deliberately left unchanged so that
    # feature files extracted at different times stay comparable; confirm the
    # intended definition before switching it to +1/+1.
    total_bigrams = total_packets - 1
    bigram_OO = 0.0
    if total_bigrams > 0:
        oo_count = np.count_nonzero(incoming_mask[:-1] & incoming_mask[1:])
        bigram_OO = oo_count / total_bigrams

    # --- Features over the first 30 packets (fewer if the trace is shorter) ---
    first_30_directions = direction[:30]
    total_first_30 = len(first_30_directions)
    incoming_positions_30 = np.flatnonzero(first_30_directions == -1)
    outgoing_positions_30 = np.flatnonzero(first_30_directions == 1)

    # 15. num_incoming_first_30: incoming packets among the first 30.
    num_incoming_first_30 = len(incoming_positions_30)

    # 9. outgoing_first_30: share of outgoing packets among the first 30.
    outgoing_first_30 = (
        len(outgoing_positions_30) / total_first_30 if total_first_30 > 0 else 0.0
    )

    # 13. avg_incoming_order_first_30: mean position of incoming packets.
    avg_incoming_order_first_30 = mean_or_zero(incoming_positions_30)

    # 14. avg_outgoing_order_first_30: mean position of outgoing packets.
    avg_outgoing_order_first_30 = mean_or_zero(outgoing_positions_30)

    return pd.Series({
        'total_transmission_time': total_transmission_time,
        'std_inter_packet_time': std_inter_packet_time,
        'avg_outgoing_burst_size': avg_outgoing_burst_size,
        'avg_incoming_burst_size': avg_incoming_burst_size,
        'num_outgoing_packets': num_outgoing_packets,
        'incoming_packet_ratio': incoming_packet_ratio,
        'outgoing_packet_ratio': outgoing_packet_ratio,
        'cumul_packets_10pct': cumul_packets_10pct,
        'cumul_packets_30pct': cumul_packets_30pct,
        'outgoing_order_skew': outgoing_order_skew,
        'incoming_order_skew': incoming_order_skew,
        'cumul_max': cumul_max,
        'bigram_OO': bigram_OO,
        'num_incoming_first_30': num_incoming_first_30,
        'outgoing_first_30': outgoing_first_30,
        'avg_incoming_order_first_30': avg_incoming_order_first_30,
        'avg_outgoing_order_first_30': avg_outgoing_order_first_30
    })

# axis=1: apply the function to each row of the DataFrame
features = df.apply(extract_all_features, axis=1)

print(features)

      total_transmission_time  std_inter_packet_time  avg_outgoing_burst_size  \
0                        5.56               0.105741                 1.705882   
1                       11.88               0.019357                 1.906977   
2                       17.81               0.249810                 1.947368   
3                        9.17               0.017914                 1.478142   
4                       53.59               0.263498                 1.723529   
...                       ...                    ...                      ...   
9995                    32.09               0.163669                 1.619608   
9996                    38.62               0.114350                 1.995536   
9997                    34.93               1.331199                 2.107143   
9998                    11.84               0.083521                 1.714286   
9999                     9.62               0.026874                 1.076923   

      avg_incoming_burst_si

##### correlation analysis를 위해 feature dataframe과 label을 하나의 파일로 저장

In [4]:
import pandas as pd
import numpy as np
import pickle

# Append both label columns to the right of the extracted features so the
# saved table carries features and labels together.
# NOTE(review): the variable is still called mon_features although this
# table is written to unmon_features.pkl; the name is kept as-is.
label_columns = [df['monitored_label'], df['website_label']]
mon_features = pd.concat([features, *label_columns], axis=1)
print(mon_features.head())

# Persist the combined table as a pickle.
mon_features.to_pickle('/content/unmon_features.pkl')

   total_transmission_time  std_inter_packet_time  avg_outgoing_burst_size  \
0                     5.56               0.105741                 1.705882   
1                    11.88               0.019357                 1.906977   
2                    17.81               0.249810                 1.947368   
3                     9.17               0.017914                 1.478142   
4                    53.59               0.263498                 1.723529   

   avg_incoming_burst_size  num_outgoing_packets  incoming_packet_ratio  \
0                 5.611111                  29.0               0.776923   
1                23.682990                 738.0               0.925657   
2                 7.500000                  74.0               0.793872   
3                25.702186                 541.0               0.945617   
4                13.029412                 293.0               0.883174   

   outgoing_packet_ratio  cumul_packets_10pct  cumul_packets_30pct  \
0         