<a href="https://colab.research.google.com/github/Fidelisaboke/robust-nids/blob/feat%2Fbaseline-model/02_sampling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Simplified Sampling for TII-SSRC-23

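This notebook draws a stratified reservoir sample from the TII-SSRC-23 dataset.
It builds a 200k-row subset in which rare traffic subtypes are better represented than in the raw data (the subset is *not* balanced by label) by:
1. Keeping ALL benign samples (1,301 rows in the run recorded below).
2. Sampling each malicious subtype in proportion to the square root of its row count, with a floor of 50 rows per subtype (capped at the rows actually available), so that rare attacks are still represented.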



## Installations and Imports

In [4]:
!pip install fastparquet
!pip install pyarrow

Collecting fastparquet
  Downloading fastparquet-2024.11.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.2 kB)
Downloading fastparquet-2024.11.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fastparquet
Successfully installed fastparquet-2024.11.0


In [5]:
# Imports
import os
import random
import numpy as np
import pandas as pd
import pyarrow.dataset as ds
from tqdm.auto import tqdm
from collections import defaultdict
import math

## 1. Configuration

### Mount Google Drive if needed

In [6]:
# Mount Google Drive when running on Colab; skip gracefully elsewhere.
# The dataset and output paths in the configuration cell live under
# /content/drive, so on Colab the mount must happen before any data is read.
try:
    from google.colab import drive
except ImportError:
    # google.colab is only importable inside the Colab runtime.
    print("google.colab not available; skipping Google Drive mount.")
else:
    drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
# --- Locations ------------------------------------------------------------
# Every Drive-hosted file sits under one dataset folder; the intermediate
# Parquet parts are written to a directory relative to the working directory.
_DATASET_ROOT = '/content/drive/MyDrive/Datasets/TII-SSRC-23'
CSV_PATH = f'{_DATASET_ROOT}/data.csv'
PARQUET_DIR = 'parquet_files'
OUT_PATH_CSV = f'{_DATASET_ROOT}/sampled_200k_simple.csv'
OUT_PATH_PARQUET = f'{_DATASET_ROOT}/sampled_200k_simple.parquet'

# --- Sampling parameters --------------------------------------------------
N_TOTAL = 200_000        # target number of rows in the final sample
CHUNKSIZE = 200_000      # rows per CSV chunk / Parquet scan batch
MIN_PER_SUBTYPE = 50     # floor applied to each malicious subtype's quota
ALPHA = 0.5              # weighting exponent; 0.5 gives square-root weighting
RANDOM_SEED = 42

# --- Column names ---------------------------------------------------------
TARGET_COL, TYPE_COL, SUBTYPE_COL = "Label", "Traffic Type", "Traffic Subtype"

# Columns excluded from the sampled output; they are treated as
# non-essential for intrusion detection.
COLUMNS_TO_REMOVE = ['Flow ID', 'Src IP', 'Dst IP', 'Timestamp']

# Seed both random number generators used by the notebook.
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

### CSV to Parquet

In [10]:
# Convert the source CSV into a directory of Parquet part files, one file per
# CHUNKSIZE-row chunk, so the later passes can scan it with pyarrow.
os.makedirs(PARQUET_DIR, exist_ok=True)

# Remove part files left over from a previous run. The later passes read every
# Parquet file in PARQUET_DIR, so stale parts (e.g. from a run with a different
# CHUNKSIZE) would otherwise be counted and sampled as duplicate rows.
for stale_name in os.listdir(PARQUET_DIR):
    if stale_name.startswith("part_") and stale_name.endswith(".parquet"):
        os.remove(os.path.join(PARQUET_DIR, stale_name))

csv_chunks = pd.read_csv(CSV_PATH, chunksize=CHUNKSIZE)
for i, chunk in enumerate(tqdm(csv_chunks, desc="CSV -> Parquet", unit="chunk")):
    fname = os.path.join(PARQUET_DIR, f"part_{i:05d}.parquet")
    chunk.to_parquet(fname, index=False, engine="fastparquet")

0it [00:00, ?it/s]

## 2. Helpers

In [11]:
# =========================================
# 2. Helper Functions
# =========================================

def get_feature_columns(parquet_dir, remove_cols):
    """Return the sorted column names to keep from the Parquet dataset.

    Reads the schema of the dataset at ``parquet_dir`` and drops every name
    listed in ``remove_cols``. Label columns are not dropped here because the
    sampling pass still needs them for filtering.

    Args:
        parquet_dir: Directory holding the Parquet files.
        remove_cols: Iterable of column names to exclude.

    Returns:
        Sorted list of the remaining column names. If anything goes wrong
        while reading the schema, the error is printed and an empty list is
        returned instead.
    """
    try:
        schema_names = ds.dataset(parquet_dir, format="parquet").schema.names
        excluded = set(remove_cols)
        # De-duplicate through a set, then sort for a stable column order.
        return sorted({name for name in schema_names if name not in excluded})
    except Exception as e:
        print(f"Error reading schema: {e}")
        return []

def parquet_generator(parquet_dir, chunksize, columns=None):
    """Stream the Parquet dataset as a sequence of pandas DataFrames.

    Args:
        parquet_dir: Directory holding the Parquet files.
        chunksize: Passed to the pyarrow scanner as ``batch_size``.
        columns: Optional list of column names to load; ``None`` is passed
            through to the scanner unchanged.

    Yields:
        One DataFrame per record batch produced by the scanner.
    """
    scanner = ds.dataset(parquet_dir, format="parquet").scanner(
        batch_size=chunksize, columns=columns
    )
    yield from (batch.to_pandas() for batch in scanner.to_batches())

## 3. Pass 1: Count Subtypes

In [12]:
# =========================================
# 3. Pass 1: Count Subtypes
# =========================================
# First full scan of the Parquet data: tally how many rows each traffic
# subtype has. The allocation step uses these counts.
print("--- Pass 1: Counting Subtypes ---")

subtype_counts = defaultdict(int)
total_rows = 0

# Only the subtype column is loaded; nothing else is needed for counting.
subtype_batches = parquet_generator(PARQUET_DIR, CHUNKSIZE, columns=[SUBTYPE_COL])
for chunk in tqdm(subtype_batches):
    total_rows += len(chunk)
    per_chunk = chunk[SUBTYPE_COL].value_counts()
    for name, n_rows in per_chunk.items():
        subtype_counts[name] += n_rows

print(f"\nTotal rows scanned: {total_rows:,}")
print("Subtype counts:")
# Most frequent subtype first.
ranked_counts = sorted(subtype_counts.items(), key=lambda pair: pair[1], reverse=True)
for s, c in ranked_counts:
    print(f"  {s:<25}: {c:,}")

--- Pass 1: Counting Subtypes ---


0it [00:00, ?it/s]


Total rows scanned: 8,656,767
Subtype counts:
  DoS RST                  : 1,072,504
  Information Gathering    : 1,038,363
  DoS ACK                  : 936,307
  DoS PSH                  : 909,507
  DoS URG                  : 906,190
  DoS CWR                  : 872,523
  DoS ECN                  : 871,150
  DoS SYN                  : 856,764
  DoS FIN                  : 725,600
  DoS UDP                  : 257,994
  DoS HTTP                 : 82,351
  Mirai DDoS DNS           : 55,196
  Bruteforce DNS           : 22,179
  Mirai DDoS SYN           : 14,210
  Mirai DDoS HTTP          : 8,923
  Mirai Scan Bruteforce    : 8,731
  Bruteforce Telnet        : 4,913
  Bruteforce SSH           : 3,967
  Mirai DDoS ACK           : 3,779
  Bruteforce FTP           : 3,485
  Bruteforce HTTP          : 628
  Video HTTP               : 376
  Video RTP                : 349
  Text                     : 209
  Audio                    : 190
  Video UDP                : 145
  Mirai DDoS UDP           

## 4. Allocation Strategy

In [13]:
# =========================================
# 4. Allocation Strategy
# =========================================
# Decide how many rows to sample from each subtype:
#   * every benign subtype keeps all of its rows;
#   * the remaining budget is split across malicious subtypes in proportion
#     to count**ALPHA, floored at MIN_PER_SUBTYPE and capped at the number of
#     rows that actually exist for that subtype;
#   * a final round-robin adjustment nudges the total towards N_TOTAL.
print("\n--- Calculating Allocations ---")

# Define Benign Subtypes (to keep 100%)
BENIGN_SUBTYPES = [
    "Audio", "Background", "Video HTTP", "Video RTP", "Video UDP", "Text"
]

# 4a. Allocate ALL Benign
allocations = {}
benign_total = 0
for subtype in BENIGN_SUBTYPES:
    # Membership test first so a missing subtype is not added to the counts.
    if subtype in subtype_counts:
        count = subtype_counts[subtype]
        allocations[subtype] = count
        benign_total += count

print(f"Benign samples allocated: {benign_total:,}")

# 4b. Allocate Malicious (Square-Root Weighting)
malicious_budget = N_TOTAL - benign_total
malicious_subtypes = [s for s in subtype_counts if s not in BENIGN_SUBTYPES]

# Calculate weights: weight = count^ALPHA (e.g., sqrt(count))
weights = {s: subtype_counts[s]**ALPHA for s in malicious_subtypes}
total_weight = sum(weights.values())

for s in malicious_subtypes:
    # Proportional allocation based on weight
    quota = (weights[s] / total_weight) * malicious_budget
    # Floor at MIN_PER_SUBTYPE, then cap at the rows available; the cap wins
    # when a subtype has fewer than MIN_PER_SUBTYPE rows.
    quota = max(MIN_PER_SUBTYPE, int(round(quota)))
    quota = min(quota, subtype_counts[s])
    allocations[s] = quota

# 4c. Adjust to exactly match N_TOTAL
current_total = sum(allocations.values())
delta = N_TOTAL - current_total

# Simple adjustment: add/remove one row at a time, cycling through the
# malicious subtypes starting from the largest allocation.
sorted_malicious = sorted(malicious_subtypes, key=lambda s: allocations[s], reverse=True)
idx = 0
# Consecutive subtypes visited without any adjustment. Once this reaches the
# number of malicious subtypes, a whole cycle made no progress (all at cap or
# all at the floor), so continuing would loop forever. This condition also
# skips the loop entirely when there are no malicious subtypes.
stalled = 0
while delta != 0 and stalled < len(sorted_malicious):
    subtype = sorted_malicious[idx % len(sorted_malicious)]
    if delta > 0 and allocations[subtype] < subtype_counts[subtype]:
        allocations[subtype] += 1
        delta -= 1
        stalled = 0
    elif delta < 0 and allocations[subtype] > MIN_PER_SUBTYPE:
        allocations[subtype] -= 1
        delta += 1
        stalled = 0
    else:
        stalled += 1
    idx += 1

if delta != 0:
    # The constraints make N_TOTAL unreachable; report instead of hanging.
    print(f"WARNING: could not reach N_TOTAL={N_TOTAL:,}; remaining difference: {delta:+,}")

print(f"Total allocated: {sum(allocations.values()):,}")
print("Final Allocations:", allocations)


--- Calculating Allocations ---
Benign samples allocated: 1,301
Total allocated: 200,000


## 5. Pass 2: Reservoir Sampling

In [14]:
# =========================================
# 5. Pass 2: Reservoir Sampling
# =========================================
# Second full scan: keep a fixed-size uniform random sample per subtype,
# sized by the quotas in `allocations`.
print("\n--- Pass 2: Sampling ---")

# Columns to load: everything in the schema except COLUMNS_TO_REMOVE
# (so Src/Dst Port are kept).
feature_cols = get_feature_columns(PARQUET_DIR, COLUMNS_TO_REMOVE)
print(f"Features selected: {len(feature_cols)}")

# Per allocated subtype: a reservoir (list of row dicts) and a running count
# of how many rows of that subtype have been seen so far.
reservoirs = {name: [] for name in allocations}
seen_counts = dict.fromkeys(allocations, 0)

for chunk in tqdm(parquet_generator(PARQUET_DIR, CHUNKSIZE, columns=feature_cols)):
    # Shuffle within the chunk so the input order is randomized slightly.
    chunk = chunk.sample(frac=1, random_state=RANDOM_SEED).reset_index(drop=True)

    # Grouping lets the quota and reservoir be looked up once per subtype;
    # each row is still visited individually below.
    for subtype, group_df in chunk.groupby(SUBTYPE_COL):
        if subtype not in allocations:
            continue

        quota = allocations[subtype]
        bucket = reservoirs[subtype]

        for row in group_df.to_dict('records'):
            seen_counts[subtype] += 1

            # Fill phase: take rows unconditionally until the quota is met.
            if len(bucket) < quota:
                bucket.append(row)
                continue

            # Replacement phase: draw a slot uniformly from all rows seen so
            # far; the new row is kept only if the slot is inside the reservoir.
            j = random.randint(0, seen_counts[subtype] - 1)
            if j < quota:
                bucket[j] = row


--- Pass 2: Sampling ---
Features selected: 82


0it [00:00, ?it/s]

## 6. Finish and Save

In [15]:
# =========================================
# 6. Finalize & Save
# =========================================
print("\n--- Finalizing and Saving ---")

# Flatten the per-subtype reservoirs into a single list of row dicts.
final_samples = [
    row
    for subtype_samples in reservoirs.values()
    for row in subtype_samples
]

# Build the frame and shuffle it so rows are not grouped by subtype.
df_sampled = (
    pd.DataFrame(final_samples)
    .sample(frac=1, random_state=RANDOM_SEED)
    .reset_index(drop=True)
)

print(f"Final Sample Shape: {df_sampled.shape}")
print("Sample Label Distribution:")
print(df_sampled[TARGET_COL].value_counts())

# Write the sample to disk as CSV.
df_sampled.to_csv(OUT_PATH_CSV, index=False)
print(f"Saved CSV to: {OUT_PATH_CSV}")

# Optional Parquet export (disabled):
# df_sampled.to_parquet(OUT_PATH_PARQUET, index=False)
# print(f"Saved Parquet to: {OUT_PATH_PARQUET}")

print("\nDone.")


--- Finalizing and Saving ---
Final Sample Shape: (200000, 82)
Sample Label Distribution:
Label
Malicious    198699
Benign         1301
Name: count, dtype: int64
Saved CSV to: /content/drive/MyDrive/Datasets/TII-SSRC-23/sampled_200k_simple.csv

Done.
