## 1. 데이터 로드 

In [2]:
import os
import numpy as np

def parse_cell_file(content):
    """
    Parse the text of a .cell file into a two-column numeric array.

    Every non-blank line is expected to hold exactly three whitespace-separated
    numbers: ``timestamp direction size``. Direction and size are folded into a
    single signed value: ``+size`` when ``direction > 0`` and ``-size``
    otherwise (a direction of exactly 0 is therefore treated as negative).

    Lines that do not split into exactly three float-parsable tokens are
    reported on stdout and skipped.

    :param content: Raw content of a .cell file as a string
    :return: Float NumPy array of shape (n_rows, 2) with columns
        [timestamp, signed_size]; shape (0, 2) when no line could be parsed
    """
    rows = []
    # splitlines() copes with both Windows (CRLF) and UNIX (LF) line endings.
    for line in content.strip().splitlines():
        if not line.strip():
            continue
        try:
            # Raises ValueError for a non-numeric token and also for a token
            # count other than three (tuple-unpacking mismatch).
            timestamp, direction, size = map(float, line.split())
        except ValueError:
            print(f"Skipping invalid line: {line}")
            continue
        signed_size = size if direction > 0 else -size
        rows.append((timestamp, signed_size))

    if not rows:
        # Keep the two-column layout even when empty so that column slicing
        # such as arr[:, 0] stays valid; .size is still 0 for callers that
        # use it to detect "no data".
        return np.empty((0, 2), dtype=float)
    return np.array(rows, dtype=float)

def load_cell_files(mon_folder, unmon_folder):
    """
    Load .cell files from the 'mon' and 'unmon' folders and label them.

    Every file whose name ends in '.cell' is read and passed to
    ``parse_cell_file``; files that yield no data (``instance.size == 0``)
    are dropped. Monitored instances come first, then unmonitored ones, and
    within each folder files are processed in sorted name order.

    :param mon_folder: Path to the folder containing monitored .cell files
    :param unmon_folder: Path to the folder containing unmonitored .cell files
    :return: (X_raw, y) where X_raw is a 1-D object array holding one parsed
        array per file and y is an integer array of labels
        (1 for monitored, 0 for unmonitored)
    """
    instances = []
    labels = []

    for folder, label in ((mon_folder, 1), (unmon_folder, 0)):
        # os.listdir returns names in arbitrary order; sorting makes the
        # resulting dataset order reproducible across runs and machines.
        for file_name in sorted(os.listdir(folder)):
            if not file_name.endswith('.cell'):
                continue
            with open(os.path.join(folder, file_name), 'r') as f:
                instance = parse_cell_file(f.read())
            if instance.size > 0:  # Keep only files that produced data
                instances.append(instance)
                labels.append(label)

    # Fill a pre-allocated 1-D object array element by element.
    # np.array(instances, dtype=object) would instead build one 3-D object
    # array whenever all instances happen to share the same shape.
    X_raw = np.empty(len(instances), dtype=object)
    for i, instance in enumerate(instances):
        X_raw[i] = instance

    # Explicit integer dtype so an empty label list does not default to float.
    return X_raw, np.array(labels, dtype=int)

# Folders holding the monitored and unmonitored .cell files,
# given relative to the current working directory.
mon_folder_path = './mon'
unmon_folder_path = './unmon'

# Parse every .cell file in both folders into X_raw (one array per file)
# and y (1 = monitored, 0 = unmonitored).
X_raw, y = load_cell_files(mon_folder_path, unmon_folder_path)

# Labels are 0/1, so their sum is the number of monitored instances.
print(f"Loaded {len(X_raw)} instances.")
print(f"Labels distribution: Monitored={np.sum(y)}, Unmonitored={len(y) - np.sum(y)}")


Loaded 15160 instances.
Labels distribution: Monitored=7677, Unmonitored=7483


In [12]:
import numpy as np

def create_features(X_raw):
    """
    Build a fixed-length feature vector for every raw instance.

    Each instance in X_raw is an array with columns [timestamp, signed_size].
    An instance without any rows (including a 1-D empty array) yields a row
    of zeros instead of raising.

    Output columns, in order:
        0. sum of signed sizes
        1. mean timestamp (0 for an empty instance)
        2. sum of absolute sizes
        3. number of packets (rows)
        4. number of rows with signed_size > 0 ("incoming" in this file)
        5. column 4 divided by column 3 (0 for an empty instance)
        6. number of rows with signed_size < 0 ("outgoing" in this file)
        7. number of packets again (duplicate of column 3, kept so the
           output width stays at 9)
        8. mean gap between consecutive timestamps (0 with fewer than
           two rows)

    :param X_raw: Iterable of arrays, each representing a .cell file's content
    :return: Feature matrix as a NumPy array, one row of 9 values per instance
    """
    feature_matrix = []

    for instance in X_raw:
        # Coerce to a float array so object-dtype input behaves like plain
        # numeric data, and give an empty instance the two-column layout so
        # the column slicing below is valid.
        packets = np.asarray(instance, dtype=float)
        if packets.size == 0:
            packets = packets.reshape(0, 2)

        timestamps = packets[:, 0]
        signed_sizes = packets[:, 1]
        packet_count = len(signed_sizes)

        num_incoming_packets = np.sum(signed_sizes > 0)
        num_outgoing_packets = np.sum(signed_sizes < 0)
        ratio_incoming_packets = (
            num_incoming_packets / packet_count if packet_count > 0 else 0
        )

        # Averages are undefined without data; fall back to 0.
        mean_timestamp = np.mean(timestamps) if packet_count > 0 else 0
        # At least two timestamps are needed for one interval.
        mean_time_intervals = (
            np.mean(np.diff(timestamps)) if packet_count > 1 else 0
        )

        feature_matrix.append([
            np.sum(signed_sizes),
            mean_timestamp,
            np.sum(np.abs(signed_sizes)),
            packet_count,
            num_incoming_packets,
            ratio_incoming_packets,
            num_outgoing_packets,
            packet_count,
            mean_time_intervals,
        ])

    return np.array(feature_matrix)
X = create_features(X_raw)

# Print the shapes of the feature matrix and the label vector.
print(f"Feature matrix shape: {X.shape}")
print(f"Label vector shape: {y.shape}")

Feature matrix shape: (15160, 9)
Label vector shape: (15160,)
