In [None]:
# Colab-only: open the browser upload widget and keep the call's result in `uploaded`
# so later cells can refer to it.
from google.colab import files
uploaded = files.upload()


Saving archive (11).zip to archive (11).zip


In [None]:
import os
import zipfile

# Archive name used when no upload result is available in this kernel.
DEFAULT_ARCHIVE = "archive (11).zip"

# Prefer the names reported by files.upload() (a dict keyed by uploaded file name) so the
# cell keeps working when the archive is uploaded under a different name.
# NOTE(review): if Colab renamed the file on save to avoid a collision, the key may not
# match the saved name; names that do not exist on disk are therefore ignored.
archive_names = [
    name
    for name in globals().get("uploaded", {})
    if name.lower().endswith(".zip") and os.path.exists(name)
] or [DEFAULT_ARCHIVE]

# Extract every selected archive into the current working directory.
for archive_name in archive_names:
    with zipfile.ZipFile(archive_name, "r") as zip_ref:
        zip_ref.extractall()


In [None]:
# Create the model file directory
!mkdir -p /content/local_model_files

# Create the test data directory
!mkdir -p /content/test_logs

In [None]:
# Move every file ending with .csv that sits directly in /content/ into the
# /content/test_logs/ directory (the directory read later via TEST_LOGS_DIR).
# NOTE: the shell expands the glob; when no .csv file is left in /content/ (for example
# on a second run of this cell) `mv` reports an error instead of moving anything.
!mv /content/*.csv /content/test_logs/

In [None]:
# Move the model configuration file into the local model directory.
!mv /content/config.json /content/local_model_files/

# Move the PyTorch weights file.
# NOTE(review): the benchmark cell in this notebook loads weights from the Hugging Face
# Hub (HUB_MODEL_ID) and never reads /content/local_model_files - confirm whether this
# local copy is still needed.
!mv /content/pytorch_model.bin /content/local_model_files/

# Move the ONNX model file (not referenced by the benchmark cell; kept for later use).
!mv /content/ONNX-model-Network-Logs-Classification.onnx /content/local_model_files/

In [None]:
# Load model directly
# Fetches the "bert-base-uncased" tokenizer and the sequence-classification model from the
# Hugging Face Hub into the module-level names `tokenizer` and `model`.
# NOTE(review): run_benchmark() loads both again into its own local variables, so these
# two globals are not read by the benchmark itself.
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained("yashika0998/IoT-23-BERT-Network-Logs-Classification")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/717 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

In [None]:
import torch
import os
import time
import json
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from tqdm import tqdm
import argparse

# ============================================================
# 1. CONFIGURATION
# ============================================================

# Directory scanned for input CSV files.
TEST_LOGS_DIR = "/content/test_logs"

# Number of log sentences sent through the model per forward pass.
BATCH_SIZE = 256

# Per-sample predictions (JSON lines) and the summary metrics report.
OUTPUT_FILE = "/content/classification_output.jsonl"
METRICS_REPORT_FILE = "/content/benchmark_report.json"

# A sample is reported as MALICIOUS when its class-1 probability reaches this value.
CLASSIFICATION_THRESHOLD = 0.85
HUB_MODEL_ID = "yashika0998/IoT-23-BERT-Network-Logs-Classification"

# Device preference: CUDA first, then Apple MPS, otherwise CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
elif torch.backends.mps.is_available():
    DEVICE = torch.device("mps")
else:
    DEVICE = torch.device("cpu")

# Precision: float32 unless the device selects a reduced-precision type below.
DTYPE = torch.float32
if DEVICE.type == "mps":
    DTYPE = torch.bfloat16
elif DEVICE.type == "cuda":
    capability = torch.cuda.get_device_capability()
    # float16 only on GPUs whose major compute capability is at least 7.
    if capability[0] >= 7:
        DTYPE = torch.float16

print(f"DEVICE = {DEVICE}, dtype = {DTYPE}")

# ============================================================
# 2. PREPROCESSING
# ============================================================

# Columns a CSV file must contain (after header whitespace is stripped) to be processed.
# 'URG Flag Count' is required here although format_row_to_sentence does not read it.
FEATURE_COLUMNS = [
    'Destination Port',
    'Flow Duration',
    'Total Fwd Packets',
    'Total Backward Packets',
    'Total Length of Fwd Packets',
    'Total Length of Bwd Packets',
    'Flow Bytes/s',
    'Flow Packets/s',
    'Fwd PSH Flags',
    'FIN Flag Count',
    'SYN Flag Count',
    'ACK Flag Count',
    'URG Flag Count',
    'Init_Win_bytes_forward',
    'Init_Win_bytes_backward',
]

def format_row_to_sentence(row):
    """Render one flow record as the dot-separated sentence passed to the tokenizer.

    Args:
        row: Record indexable by column name (for example a pandas row or a dict)
            that provides the flow columns read below.

    Returns:
        str: The clauses joined with ". " and terminated by a final ".".
    """
    # (column, token) pairs: the token is emitted when the column value is positive.
    # The order here fixes the order of the flag tokens in the sentence.
    flag_tokens = (
        ('SYN Flag Count', "SYN_flag_set"),
        ('ACK Flag Count', "ACK_flag_set"),
        ('FIN Flag Count', "FIN_flag_set"),
        ('Fwd PSH Flags', "PSH_flag_set"),
    )

    clauses = [
        f"dest_port is {row['Destination Port']}",
        "protocol is tcp",  # constant clause; not derived from the row
        f"flow_duration is {row['Flow Duration']}",
        f"fwd_pkts is {row['Total Fwd Packets']}",
        f"bwd_pkts is {row['Total Backward Packets']}",
        f"fwd_bytes is {row['Total Length of Fwd Packets']}",
        f"bwd_bytes is {row['Total Length of Bwd Packets']}",
        # Rates are rendered without decimals.
        f"flow_rate_bytes is {row['Flow Bytes/s']:.0f}",
        f"flow_rate_pkts is {row['Flow Packets/s']:.0f}",
    ]

    for column, token in flag_tokens:
        if row[column] > 0:
            clauses.append(token)

    clauses.append(f"fwd_win_bytes is {row['Init_Win_bytes_forward']}")
    clauses.append(f"bwd_win_bytes is {row['Init_Win_bytes_backward']}")

    return ". ".join(clauses) + "."

def load_and_preprocess_data():
    """Read every usable CSV in TEST_LOGS_DIR and turn each row into a sentence.

    A file is used only if, after stripping whitespace from its header names, it
    contains every column in FEATURE_COLUMNS plus the "Label" column; other files
    are skipped with a message.

    Returns:
        tuple: ``(text_logs, labels, duration)`` where ``text_logs`` is a list of
        sentences built by ``format_row_to_sentence``, ``labels`` is the matching
        list of "Label" values, and ``duration`` is the elapsed time in seconds.
        Returns ``([], [], 0)`` when the directory is missing or no usable file
        is found.
    """
    print(f"\nLoading CSV files from {TEST_LOGS_DIR}")
    start = time.time()

    try:
        # os.listdir returns entries in arbitrary order; sort so that row order (and
        # the flow ids the benchmark derives from it) is the same on every run.
        csv_files = sorted(f for f in os.listdir(TEST_LOGS_DIR) if f.endswith(".csv"))
    except FileNotFoundError:
        print(f"ERROR: Directory not found at {TEST_LOGS_DIR}")
        return [], [], 0

    if not csv_files:
        print("ERROR: No CSV files found.")
        return [], [], 0

    # "Label" is read below, so it must be validated together with the features;
    # otherwise a file without it would raise KeyError after everything was loaded.
    required_columns = list(FEATURE_COLUMNS)
    if "Label" not in required_columns:
        required_columns.append("Label")

    frames = []
    for f in tqdm(csv_files, desc="Reading Files"):
        df = pd.read_csv(os.path.join(TEST_LOGS_DIR, f), low_memory=False)
        # Header names may carry surrounding whitespace; normalise before checking.
        df.columns = df.columns.str.strip()
        if not all(col in df.columns for col in required_columns):
            tqdm.write(f"Skipping {f} (missing columns)")
            continue
        # Keep only the columns that are actually read, to limit memory use.
        frames.append(df[required_columns])

    if not frames:
        print("ERROR: No valid CSV files.")
        return [], [], 0

    df = pd.concat(frames, ignore_index=True)

    # Row-wise apply: one Python call per row, which dominates the load time on
    # large inputs.
    text_logs = df.apply(format_row_to_sentence, axis=1).tolist()
    labels = df["Label"].tolist()

    duration = time.time() - start
    print(f"Loaded {len(df)} rows in {duration:.2f} sec")

    return text_logs, labels, duration

# ============================================================
# 3. BENCHMARK PIPELINE
# ============================================================

def write_report(data):
    """Write ``data`` to METRICS_REPORT_FILE as JSON indented by 4 spaces.

    Args:
        data: JSON-serializable object holding the benchmark metrics.

    Prints the destination path once the file has been written.
    """
    with open(METRICS_REPORT_FILE, "w") as report_handle:
        report_handle.write(json.dumps(data, indent=4))
    print(f"Saved report to {METRICS_REPORT_FILE}")

def _timed_forward(model, inputs, autocast_enabled):
    """Run one no-grad forward pass and return ``(outputs, elapsed_seconds)``.

    On "cuda" and "mps" devices the pass is timed with device events followed by a
    device synchronize; on other devices a monotonic host clock is used.
    """
    on_accelerator = DEVICE.type in ("cuda", "mps")

    if on_accelerator:
        backend = torch.cuda if DEVICE.type == "cuda" else torch.mps
        start_ev = backend.Event(enable_timing=True)
        end_ev = backend.Event(enable_timing=True)
        start_ev.record()
    else:
        # perf_counter is monotonic, unlike time.time(), so the measured interval
        # cannot be distorted by system clock adjustments.
        start_time = time.perf_counter()

    with torch.no_grad():
        with torch.autocast(device_type=DEVICE.type, dtype=DTYPE, enabled=autocast_enabled):
            outputs = model(**inputs)

    if on_accelerator:
        end_ev.record()
        # Wait for the device to finish so both events have completed.
        backend.synchronize()
        # elapsed_time is divided by 1000 to convert milliseconds to seconds.
        elapsed = start_ev.elapsed_time(end_ev) / 1000
    else:
        elapsed = time.perf_counter() - start_time

    return outputs, elapsed


def run_benchmark():
    """Classify every preprocessed log and record throughput/latency metrics.

    Steps:
      1. Load sentences and labels via ``load_and_preprocess_data``; return early
         if nothing was loaded.
      2. Load the "bert-base-uncased" tokenizer and the HUB_MODEL_ID model, move the
         model to DEVICE (and to DTYPE when it is not float32).
      3. Run batched inference, writing one JSON line per sample to OUTPUT_FILE.
      4. Write the summary metrics with ``write_report``.

    Only the forward pass is included in the measured inference time; tokenization,
    host-to-device copies and file writes are excluded.

    Returns:
        None. Results are written to OUTPUT_FILE and METRICS_REPORT_FILE.
    """
    logs, labels, preprocess_time = load_and_preprocess_data()
    if len(logs) == 0:
        return

    print("\nLoading tokenizer…")
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

    print("Loading model from HuggingFace Hub…")
    try:
        model = AutoModelForSequenceClassification.from_pretrained(HUB_MODEL_ID)
    except Exception as e:
        print(f"ERROR loading model: {e}")
        return

    # NOTE(review): assumes class index 1 is the malicious class; the probability of
    # index 1 is what gets thresholded below - confirm against the model card.
    model.config.id2label = {0: "BENIGN", 1: "MALICIOUS"}
    model = model.to(DEVICE)

    if DTYPE != torch.float32:
        model = model.to(DTYPE)

    model.eval()

    total_logs = 0
    total_time = 0

    print(f"\nRunning inference on {len(logs)} samples…")

    with open(OUTPUT_FILE, "w") as out:
        pbar = tqdm(range(0, len(logs), BATCH_SIZE))

        autocast_enabled = (DTYPE != torch.float32)

        for i in pbar:
            batch = logs[i:i+BATCH_SIZE]
            gt = labels[i:i+BATCH_SIZE]

            inputs = tokenizer(
                batch,
                padding=True,
                truncation=True,
                return_tensors="pt",
                max_length=512
            )
            inputs = {k: v.to(DEVICE) for k, v in inputs.items()}

            outputs, elapsed = _timed_forward(model, inputs, autocast_enabled)

            total_logs += len(batch)
            total_time += elapsed

            # Probability of class index 1 for each sample in the batch.
            probs = torch.softmax(outputs.logits, dim=1).cpu().float().numpy()[:, 1]
            preds = (probs >= CLASSIFICATION_THRESHOLD).astype(int)

            for j in range(len(batch)):
                out.write(json.dumps({
                    "flow_id": i + j,  # position of the sample in the loaded data
                    "log_input": batch[j],
                    "prediction": "MALICIOUS" if preds[j] == 1 else "BENIGN",
                    "confidence": float(probs[j]),
                    "ground_truth": gt[j]
                }) + "\n")

            # Skip the rate display while no time has been accumulated, to avoid
            # dividing by zero.
            if total_time > 0:
                pbar.set_postfix_str(f"{total_logs/total_time:.0f} logs/s")

    # total_logs is positive here because `logs` is non-empty; only the elapsed time
    # can be zero, in which case throughput is reported as null.
    throughput = total_logs / total_time if total_time > 0 else None
    latency = (total_time / total_logs) * 1000

    report = {
        "DEVICE": str(DEVICE),
        "DTYPE": str(DTYPE),
        "TOTAL_LOGS": total_logs,
        "PREPROCESS_TIME": preprocess_time,
        "INFERENCE_TIME": total_time,
        "THROUGHPUT": throughput,
        "LATENCY_MS": latency
    }

    write_report(report)
    print("\nDone.")

# ============================================================
# ENTRY POINT
# ============================================================

# Run the whole pipeline (load, classify, report) when this cell executes.
run_benchmark()


DEVICE = cuda, dtype = torch.float16

Loading CSV files from /content/test_logs


Reading Files: 100%|██████████| 8/8 [00:28<00:00,  3.57s/it]


Loaded 2830743 rows in 114.96 sec

Loading tokenizer…
Loading model from HuggingFace Hub…

Running inference on 2830743 samples…


100%|██████████| 11058/11058 [15:47<00:00, 11.66it/s, 6031 logs/s]


Saved report to /content/benchmark_report.json

Done.


In [None]:
from google.colab import files

# Download the per-sample predictions written by run_benchmark(). Using the configured
# OUTPUT_FILE path keeps this cell in sync with the benchmark configuration and does
# not depend on the current working directory.
files.download(OUTPUT_FILE)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>