In [None]:
# 1) Mount Drive (run once)
# Mounts Google Drive at /content/drive; the dataset paths configured later in
# this notebook live under /content/drive/MyDrive, so this cell must run first.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Final Two-Pass Chunked Pipeline: Drop noise, Timestamp, constant cols, downsample benign, save reduced CSV
import pandas as pd
import numpy as np
import os
from tqdm import tqdm

# ---------- USER CONFIG ----------
# Input CSV; it is streamed twice (pass 1 and pass 2) in chunks of `chunk_size` rows.
file_path = "/content/drive/MyDrive/AI_IDS_Graduation_Project/Dataset/All-Processed-Dataset-for-ML-Algorithms.csv"
# Destination of the reduced, cleaned CSV; an existing file at this path is
# deleted before pass 2 starts appending to it.
output_file_csv = "/content/drive/MyDrive/AI_IDS_Graduation_Project/Dataset/All-Processed-Reduced-Cleaned.csv"
chunk_size = 1_000_000             # rows read per chunk; adjust based on available RAM
target_ratio = 4.2               # target benign:attacks ratio (benign rows to keep per attack row)
random_seed = 42                 # seed used when down-sampling benign rows
# ---------------------------------

# Ensure output directory exists
os.makedirs(os.path.dirname(output_file_csv), exist_ok=True)

# -------------------------
# Pass 1: Count labels & detect constant columns
# -------------------------
# Streams the input once to (a) tally how many rows carry each label and
# (b) find the columns that hold one single value across the WHOLE file.
print("Pass 1: counting labels and detecting constant columns...")


def _same_constant(a, b):
    """Return True when two per-chunk constants represent the same value.

    Each chunk is parsed independently, so the same CSV text can surface with
    a different Python type from one chunk to the next (for example ``0`` in
    one chunk and ``"0"`` in another). Values of different types are therefore
    compared numerically when possible and as stripped text otherwise.
    Two missing values are considered equal.
    """
    if pd.isna(a) or pd.isna(b):
        return bool(pd.isna(a) and pd.isna(b))
    if type(a) is type(b):
        return bool(a == b)
    try:
        return float(a) == float(b)
    except (TypeError, ValueError):
        return str(a).strip() == str(b).strip()


label_counts = {}      # label -> number of rows seen across all chunks
constant_cols = None   # columns still constant so far; None until the first non-empty chunk
constant_values = {}   # column -> the single value it has held in every chunk so far

reader = pd.read_csv(file_path, chunksize=chunk_size, low_memory=False)
for chunk in tqdm(reader, desc="Scanning chunks"):
    # normalize Label column: strip whitespace
    chunk['Label'] = chunk['Label'].astype(str).str.strip()

    # drop noisy header rows
    chunk = chunk.loc[chunk['Label'] != 'Label']

    # drop Timestamp for constant column detection
    if 'Timestamp' in chunk.columns:
        chunk = chunk.drop(columns=['Timestamp'])

    # A chunk with no data rows says nothing about labels or column values.
    if chunk.empty:
        continue

    # update label counts
    for label, count in chunk['Label'].value_counts().items():
        label_counts[label] = label_counts.get(label, 0) + int(count)

    # detect constant columns
    # 'Label' is never a candidate: pass 2 needs it even if every row shares
    # one label. After the first chunk only the surviving candidates can
    # still be constant, so only those are re-checked.
    if constant_cols is None:
        candidates = [c for c in chunk.columns if c != 'Label']
    else:
        candidates = [c for c in chunk.columns if c in constant_cols]

    nunique = chunk[candidates].nunique(dropna=False)
    chunk_constant = {c for c in candidates if nunique[c] <= 1}
    chunk_values = {c: chunk[c].iloc[0] for c in chunk_constant}

    if constant_cols is None:
        constant_cols = chunk_constant
        constant_values = chunk_values
    else:
        # Being constant inside every chunk is not enough: the value must
        # also be the same one seen in all previous chunks.
        constant_cols = {
            c for c in chunk_constant
            if _same_constant(constant_values[c], chunk_values[c])
        }

if constant_cols is None:
    # No data rows at all: report "no constant columns" instead of failing below.
    constant_cols = set()

print("Label counts (sample):")
for k, v in sorted(label_counts.items(), key=lambda x: -x[1])[:20]:
    print(f"  {k}: {v:,}")

print(f"\nConstant columns detected ({len(constant_cols)}): {constant_cols}")

# Derive the benign down-sampling fraction from the pass-1 label tallies.
# Every label other than 'Benign' (and a stray 'Label' header value) is
# treated as an attack.
total_benign = label_counts.get('Benign', 0)
total_attacks = sum(
    count
    for label, count in label_counts.items()
    if label != 'Benign' and label != 'Label'
)

print(f"\nTotal benign: {total_benign:,}")
print(f"Total attacks: {total_attacks:,}")

# Without benign rows the fraction below would divide by zero.
if not total_benign:
    raise ValueError("No 'Benign' rows found; check Label column spelling/casing.")

# Benign rows wanted for the configured ratio, then the share of all benign
# rows that represents, capped at "keep everything".
target_benign = int(target_ratio * total_attacks)
sample_frac = target_benign / total_benign
if sample_frac > 1.0:
    sample_frac = 1.0

print(f"Target benign rows: {target_benign:,}")
print(f"Sampling fraction for benign rows: {sample_frac:.4f} ({sample_frac*100:.2f}%)")

# -------------------------
# Pass 2: Clean, downsample, and write reduced CSV
# -------------------------
# Streams the input a second time; for each chunk it removes stray header
# rows, drops Timestamp and the constant columns, keeps every attack row plus
# a random `sample_frac` share of the benign rows, and appends the result to
# `output_file_csv`.
if os.path.exists(output_file_csv):
    # The loop below appends, so a leftover file would be duplicated into.
    print("Removing existing output_file_csv to avoid duplication.")
    os.remove(output_file_csv)

write_header = True  # only the first chunk written carries the CSV header
kept_counts = {
    'benign_kept': 0,
    'attacks_kept': 0,
    'dropped_label_rows': 0,
    'total_processed': 0
}

# Columns removed from every chunk, computed once. 'Label' is never dropped
# (even if it was flagged constant) because the benign/attack split needs it.
cols_to_drop = ['Timestamp'] + [
    c for c in (constant_cols or ()) if c not in ('Label', 'Timestamp')
]

reader = pd.read_csv(file_path, chunksize=chunk_size, low_memory=False)
# One generator shared by all chunks: reproducible for a given seed, yet each
# chunk draws a different random subset. Re-seeding every chunk with the same
# integer would reuse the identical random stream chunk after chunk.
rng = np.random.default_rng(random_seed)

for chunk in tqdm(reader, desc="Pass 2: processing chunks"):
    chunk['Label'] = chunk['Label'].astype(str).str.strip()

    # drop noisy header rows
    mask_bad_label = chunk['Label'] == 'Label'
    if mask_bad_label.any():
        kept_counts['dropped_label_rows'] += int(mask_bad_label.sum())
        chunk = chunk.loc[~mask_bad_label]

    # drop Timestamp and constant columns (absent columns are ignored)
    chunk = chunk.drop(columns=cols_to_drop, errors="ignore")

    # split benign vs attacks
    mask_benign = chunk['Label'] == 'Benign'
    benign_df = chunk.loc[mask_benign]
    attack_df = chunk.loc[~mask_benign]

    # sample benign rows
    if sample_frac >= 1.0:
        benign_sampled = benign_df
    else:
        # Passing a Generator as random_state needs pandas >= 1.4.
        benign_sampled = benign_df.sample(frac=sample_frac, random_state=rng)

    # update counters ('total_processed' counts rows left after header-row removal)
    kept_counts['benign_kept'] += len(benign_sampled)
    kept_counts['attacks_kept'] += len(attack_df)
    kept_counts['total_processed'] += len(chunk)

    # concat attacks + sampled benign and write to CSV
    to_write = pd.concat([attack_df, benign_sampled], ignore_index=True)
    to_write.to_csv(output_file_csv, mode='a', header=write_header, index=False)
    write_header = False

# -------------------------
# Final summary
# -------------------------
# Where the reduced file was written.
print("\n✅ Done writing reduced & cleaned CSV:", output_file_csv, sep="\n")

# Row bookkeeping from pass 2, one line per figure. The ratio's denominator is
# clamped to 1 so a run that kept no attack rows does not divide by zero.
print(
    "\n📊 Final summary:",
    f"  Benign kept:   {kept_counts['benign_kept']:,}",
    f"  Attacks kept:  {kept_counts['attacks_kept']:,}",
    f"  Dropped Label rows: {kept_counts['dropped_label_rows']:,}",
    f"  Total processed: {kept_counts['total_processed']:,}",
    f"  Final benign:attack ratio ≈ {kept_counts['benign_kept'] / max(1, kept_counts['attacks_kept']):.2f} : 1",
    f"  Constant columns dropped: {constant_cols}",
    sep="\n",
)