## 0. 압축 풀기 (Extract ZIP archives)

In [20]:
import zipfile

# Source archive (mon.zip) and the directory it is unpacked into.
zip_path = "/content/mon.zip"
extract_path = "/content/mon_data"

# Unpack every member of the archive under extract_path; the context manager
# closes the archive handle afterwards.
with zipfile.ZipFile(zip_path, mode='r') as archive:
    archive.extractall(path=extract_path)

print("ZIP extraction complete")

ZIP extraction complete


In [4]:
import zipfile

# Source archive (unmon.zip) and the directory it is unpacked into.
# These rebind the same zip_path / extract_path names used for mon.zip.
zip_path = "/content/unmon.zip"
extract_path = "/content/unmon_data"

# Unpack every member of the archive under extract_path; the context manager
# closes the archive handle afterwards.
with zipfile.ZipFile(zip_path, mode='r') as archive:
    archive.extractall(path=extract_path)

print("ZIP extraction complete")

ZIP extraction complete


## 1. Data Loading

In [41]:
import os
import numpy as np

def parse_file(content):
    """Parse the text of one trace file into rows of ``[timestamp, size]``.

    Every non-blank line is expected to contain exactly three
    whitespace-separated numeric fields: ``timestamp direction size``.
    Lines with a different number of fields, or with a field that cannot be
    converted to ``float``, are skipped silently (best-effort parsing).

    Args:
        content: Full text of the file as a single string.

    Returns:
        A float ``numpy.ndarray`` of shape ``(n_rows, 2)`` whose columns are
        ``[timestamp, size]``. When no line parses the result has shape
        ``(0, 2)`` (so ``.size == 0`` still holds and the column axis is
        always present).
    """
    rows = []
    for line in content.strip().splitlines():
        if not line.strip():
            continue
        try:
            # Unpacking raises ValueError on a wrong field count, and float()
            # raises ValueError on a non-numeric field; both mean "skip".
            timestamp, _direction, size = map(float, line.split())
        except ValueError:
            continue
        # NOTE(review): the direction field is parsed but never applied; the
        # size column is stored exactly as written in the file. Presumably the
        # size is already signed in the input -- confirm against the dataset.
        rows.append([timestamp, size])
    # reshape(-1, 2) keeps the 2-column layout even when `rows` is empty.
    return np.array(rows, dtype=float).reshape(-1, 2)


def load_data(folder_path, top_k=2):
    """Load split ``.cell`` files from a folder and merge them per group.

    Only files whose name ends with ``.cell`` and contains ``split`` are
    read. The group ID is the part of the file name before ``_split``
    (e.g. ``0-61_split_1.cell`` -> ``0-61``, ``9999_split_4.cell`` -> ``9999``).
    A group ID containing ``-`` is treated as monitored and its class label is
    the integer before the first ``-``; any other group gets label ``-1``.

    For each group (visited in numeric order of its ``-``-separated parts) the
    ``top_k`` largest files by on-disk size are parsed with ``parse_file``,
    their rows are concatenated, sorted by timestamp, and the timestamps are
    shifted so the first row is at 0. Groups that yield no rows are dropped.

    Two summary lines (group count and the set of labels) are printed.

    Args:
        folder_path: Directory containing the split ``.cell`` files.
        top_k: Number of largest split files to merge per group.

    Returns:
        A tuple ``(instances, labels)``:
        ``instances`` is a 1-D object ``numpy.ndarray`` with one
        ``(n_rows, 2)`` float array per kept group; ``labels`` is an integer
        ``numpy.ndarray`` of the same length.

    Raises:
        ValueError: If a group ID has a ``-``-separated part that is not an
            integer.
        OSError: If the folder or one of the selected files cannot be read.
    """
    grouped_files = {}

    # Step 1. read split files and group by original ID
    for file in os.listdir(folder_path):
        if not file.endswith(".cell") or "split" not in file:
            continue

        file_path = os.path.join(folder_path, file)
        instance_id = file.replace(".cell", "")          # e.g. 0-61_split_1 or 9999_split_4
        group_id = instance_id.split("_split")[0]        # e.g. 0-61 or 9999

        # Monitored groups look like "<class>-<index>"; everything else is
        # treated as unmonitored and labelled -1.
        if "-" in group_id:
            class_label = int(group_id.split("-")[0])
        else:
            class_label = -1

        grouped_files.setdefault(group_id, []).append({
            "path": file_path,
            "class": class_label,
            "size": os.path.getsize(file_path)
        })

    instances = []
    labels = []

    # Step 2. select top-k by size and merge
    sorted_group_ids = sorted(grouped_files, key=lambda gid: [int(p) for p in gid.split('-')])
    for group_id in sorted_group_ids:
        # Largest files first. Ties on size are broken by path so the
        # selection does not depend on the arbitrary order of os.listdir().
        ranked = sorted(grouped_files[group_id], key=lambda item: (-item["size"], item["path"]))
        selected = ranked[:top_k]

        packets = []
        for item in selected:
            with open(item["path"], 'r') as f:
                instance = parse_file(f.read())
            if instance.size > 0:
                packets.extend(instance.tolist())

        if packets:
            # Interleave the rows of the selected files chronologically, then
            # rebase timestamps so each merged instance starts at 0.
            packets.sort(key=lambda row: row[0])
            start_time = packets[0][0]
            packets = [[row[0] - start_time, row[1]] for row in packets]

            instances.append(np.array(packets))
            labels.append(selected[0]["class"])

    print(f"Total merged groups loaded: {len(instances)}")
    print(f"Class distribution: {set(labels)}")

    # Fill a pre-allocated object array element by element. Calling
    # np.array(instances, dtype=object) would instead build an (N, n, 2)
    # array whenever every instance happens to have the same row count.
    instance_array = np.empty(len(instances), dtype=object)
    for idx, merged in enumerate(instances):
        instance_array[idx] = merged

    # dtype=int keeps the label array integral even when it is empty.
    return instance_array, np.array(labels, dtype=int)


## 2. Monitored data (mon)

In [42]:
# Directory (inside the extracted mon.zip tree) that is passed to load_data.
mon_folder_path = "/content/mon_data/scratch2/TrafficSliver/DeepCoAST/BigEnough/path5/mon/ts"

# Merge the 2 largest split files of each group into one instance.
X_raw, y = load_data(folder_path=mon_folder_path, top_k=2)

Total merged groups loaded: 19000
Class distribution: {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94}


In [43]:
# Sanity check on the arrays currently bound to X_raw / y:
# number of instances, then (unique labels, per-label counts).
instance_count = len(X_raw)
print("Total instances:", instance_count)
label_summary = np.unique(y, return_counts=True)
print("Labels distribution:", label_summary)

Total instances: 19000
Labels distribution: (array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
       68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
       85, 86, 87, 88, 89, 90, 91, 92, 93, 94]), array([200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200,
       200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200,
       200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200,
       200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200,
       200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200,
       200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200,
       200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200,
       200, 200, 200, 200])

In [44]:
import pickle

# Destination of the serialized (X_raw, y) tuple.
save_path = "/content/processed_mon_data.pkl"

# Write both arrays as a single pickled tuple; the file is closed on exit.
with open(save_path, mode="wb") as pickle_file:
    pickle.dump((X_raw, y), pickle_file)

print("Pickle 파일 저장 완료:", save_path)

Pickle 파일 저장 완료: /content/processed_mon_data.pkl


## 3. Unmonitored data (unmon)

In [48]:
# Directory (inside the extracted unmon.zip tree) that is passed to load_data.
unmon_folder_path = "/content/unmon_data/scratch2/TrafficSliver/DeepCoAST/BigEnough/path5/unmon/ts"

# Merge the 2 largest split files of each group into one instance.
# NOTE: this rebinds X_raw and y, replacing the arrays loaded from the mon folder.
X_raw, y = load_data(folder_path=unmon_folder_path, top_k=2)

Total merged groups loaded: 19000
Class distribution: {-1}


In [49]:
# Sanity check on the arrays currently bound to X_raw / y:
# number of instances, then (unique labels, per-label counts).
instance_count = len(X_raw)
print("Total instances:", instance_count)
label_summary = np.unique(y, return_counts=True)
print("Labels distribution:", label_summary)

Total instances: 19000
Labels distribution: (array([-1]), array([19000]))


In [50]:
import pickle

# Destination of the serialized (X_raw, y) tuple.
save_path = "/content/processed_unmon_data.pkl"

# Write both arrays as a single pickled tuple; the file is closed on exit.
with open(save_path, mode="wb") as pickle_file:
    pickle.dump((X_raw, y), pickle_file)

print("Pickle 파일 저장 완료:", save_path)

Pickle 파일 저장 완료: /content/processed_unmon_data.pkl
