# IE105 – Full preprocessing pipeline (author-aligned, train/val/test)
Pipeline tổng hợp 4 nguồn dữ liệu (CSV XSS + PortSwigger JSON cheat-sheet + JS library source code từ Materialize & Bootstrap), tiền xử lý, khử trùng lặp, cân bằng theo paper (tuỳ chọn), và chia train/val/test.

**Ghi chú**: set `ROOT` trỏ đến thư mục `Dataset` của bạn trên máy local.

In [45]:

from pathlib import Path
import re, json, hashlib
import numpy as np
import pandas as pd

# =========================
# CONFIG (LOCAL)
# =========================
def _first_existing(base: Path, candidates):
    """Return ``base / name`` for the first candidate name that exists, else None."""
    for name in candidates:
        probe = base / name
        if probe.exists():
            return probe
    return None


# TODO: point ROOT at the Dataset folder on your machine
ROOT = Path(r"./Dataset").resolve()

# All generated artifacts are written below this directory.
OUT = ROOT / "_outputs_train_val_test"
OUT.mkdir(parents=True, exist_ok=True)

RANDOM_SEED = 42

# ---- Source paths (auto-detect common folder names) ----
CSV_PATH = ROOT / "XSS_dataset.csv"
MATERIALIZE_DIR = _first_existing(ROOT, ("materialize_js", "materialize"))
BOOTSTRAP_DIR = _first_existing(ROOT, ("bootstrap_js", "boostrap_js", "bootstrap"))
JSON_DIR = _first_existing(
    ROOT, ("json", "xss-cheatsheet-data/json", "xss-cheatsheet-data\\json")
)

print("ROOT:", ROOT)
print("CSV:", CSV_PATH, CSV_PATH.exists())
print("JSON_DIR:", JSON_DIR, (None if JSON_DIR is None else JSON_DIR.exists()))
print("MATERIALIZE_DIR:", MATERIALIZE_DIR)
print("BOOTSTRAP_DIR:", BOOTSTRAP_DIR)

# ---- Normalization ----
LOWERCASE = True

# ---- Filtering thresholds ----
MIN_XSS_CHARS, MAX_XSS_CHARS = 6, 2000

# JS snippet thresholds (chosen so that as many benign samples as in the
# paper can be produced); these apply per snippet, not per file.
MIN_BENIGN_CHARS, MAX_BENIGN_CHARS = 20, 350

# File-level filters
DROP_MINIFIED = True
EXCLUDE_BOOTSTRAP_TESTS = True

# ---- Split ----
# Paper: 80/20 train/test
TEST_SIZE = 0.20
# If a validation set is needed, it is carved out of train; the fraction
# below is relative to the train split (10% of train).
MAKE_VAL = True
VAL_FRAC_OF_TRAIN = 0.10

# ---- Optional: match paper final size exactly (sampling) ----
MATCH_PAPER_COUNTS = True
TARGET_BENIGN, TARGET_MAL = (12038, 7321) if MATCH_PAPER_COUNTS else (None, None)

# ---- Diagnostics ----
SHOW_EXAMPLES_PER_LABEL = 3


ROOT: D:\UIT Document\UIT subjects\IE105 - Nhập môn đảm bảo và an toàn thông tin\LLM-for-XSS\Dataset
CSV: D:\UIT Document\UIT subjects\IE105 - Nhập môn đảm bảo và an toàn thông tin\LLM-for-XSS\Dataset\XSS_dataset.csv True
JSON_DIR: D:\UIT Document\UIT subjects\IE105 - Nhập môn đảm bảo và an toàn thông tin\LLM-for-XSS\Dataset\xss-cheatsheet-data\json True
MATERIALIZE_DIR: D:\UIT Document\UIT subjects\IE105 - Nhập môn đảm bảo và an toàn thông tin\LLM-for-XSS\Dataset\materialize
BOOTSTRAP_DIR: D:\UIT Document\UIT subjects\IE105 - Nhập môn đảm bảo và an toàn thông tin\LLM-for-XSS\Dataset\bootstrap


In [46]:

# =========================
# HELPERS
# =========================
def preprocess_payload(x: str) -> str:
    """Normalize a raw payload into one whitespace-collapsed line.

    Values flagged by ``pd.isna`` become ``""``; anything else is passed
    through ``str()``. The text is lowercased when the module-level
    ``LOWERCASE`` flag is truthy, then every run of whitespace (including
    CR, LF and tab) is collapsed to a single space and the ends are trimmed.
    """
    if pd.isna(x):
        return ""
    text = str(x)
    if LOWERCASE:
        text = text.lower()
    # str.split() without a separator already treats \r, \n and \t as
    # whitespace, so no explicit replacement pass is needed.
    return " ".join(text.split())

def sha1_text(s: str) -> str:
    """Return the hexadecimal SHA-1 digest of *s*.

    The string is encoded as UTF-8; code points that cannot be encoded
    (e.g. lone surrogates) are skipped rather than raising.
    """
    hasher = hashlib.sha1()
    hasher.update(s.encode("utf-8", errors="ignore"))
    return hasher.hexdigest()

def read_text_file(path: Path) -> str:
    """Read *path* as UTF-8 text, dropping bytes that do not decode."""
    with path.open("r", encoding="utf-8", errors="ignore") as handle:
        return handle.read()

def is_minified_file(path: Path) -> bool:
    """Heuristically decide whether *path* is a minified file.

    Rule 1: the file name ends with ``.min.js`` or contains ``.min.``.
    Rule 2: otherwise the file is read, and it counts as minified when its
    longest line exceeds 2000 characters and its mean line length exceeds
    200 characters. A file that cannot be read, or that has no lines, is
    reported as not minified.
    """
    file_name = path.name
    if ".min." in file_name or file_name.endswith(".min.js"):
        return True

    try:
        content = read_text_file(path)
    except Exception:
        # Best effort: an unreadable file is treated as "not minified".
        return False

    line_lengths = [len(row) for row in content.splitlines()]
    if not line_lengths:
        return False

    longest = max(line_lengths)
    mean_length = sum(line_lengths) / len(line_lengths)
    return longest > 2000 and mean_length > 200

def strip_js_comments(text: str) -> str:
    """Blank out JavaScript comments using a simple regex heuristic.

    Every ``/* ... */`` span (which may cross lines) is replaced by one
    space, then every ``//`` comment that begins a line (optionally after
    whitespace) is replaced by one space. A ``//`` that follows code on the
    same line is left untouched.
    """
    block_comment = re.compile(r"/\*.*?\*/", re.DOTALL)
    leading_line_comment = re.compile(r"(?m)^\s*//.*?$")
    return leading_line_comment.sub(" ", block_comment.sub(" ", text))

def js_file_filter(path: Path, text: str) -> bool:
    """Return True when the JS file at *path* should be kept.

    A file is rejected when ``DROP_MINIFIED`` is on and the file looks
    minified, or when ``EXCLUDE_BOOTSTRAP_TESTS`` is on and any component of
    the path is exactly ``"tests"``. The *text* argument is accepted but not
    inspected.
    """
    looks_minified = DROP_MINIFIED and is_minified_file(path)
    under_tests = EXCLUDE_BOOTSTRAP_TESTS and ("tests" in path.parts)
    return not (looks_minified or under_tests)

# XSS "signal" regex. It is used in two directions: a candidate malicious
# string must match it to be kept, and a JS snippet that matches it is
# excluded from the benign set.
# Alternatives: <script>/<img>/<svg> tags, "javascript:" and "data:" schemes,
# inline "on...=" handlers, "src=", document.cookie / window.location access,
# and calls to alert/prompt/confirm/eval/settimeout/setinterval.
# NOTE(review): "on\w+\s*=" has no leading word boundary, so it also matches
# inside longer identifiers (e.g. "content=") - confirm this is intended.
XSS_SIGNAL_RE = re.compile(
    r"(<\s*script\b|javascript\s*:|on\w+\s*=|document\s*\.\s*cookie|"
    r"window\s*\.\s*location|alert\s*\(|prompt\s*\(|confirm\s*\(|"
    r"eval\s*\(|settimeout\s*\(|setinterval\s*\(|<\s*img\b|<\s*svg\b|"
    r"src\s*=|data\s*:)",
    re.IGNORECASE,
)

def extract_strings(obj):
    """Collect every string found inside a nested dict/list structure.

    Dict values and list items are visited depth-first in their stored
    order; dict keys are not collected. Values of any other type are
    ignored. Returns a flat list (empty when nothing is found).
    """
    def _walk(node):
        if isinstance(node, str):
            yield node
        elif isinstance(node, dict):
            for child in node.values():
                yield from _walk(child)
        elif isinstance(node, list):
            for child in node:
                yield from _walk(child)

    return list(_walk(obj))

def dedup_and_resolve(df: pd.DataFrame) -> pd.DataFrame:
    """Deduplicate rows by payload hash, resolving label conflicts.

    A ``hash`` column (``sha1_text`` of ``payload``) is added to a copy of
    *df*. When one hash occurs with more than one label, its ``label == 0``
    rows are dropped so the malicious label wins. Finally only the first row
    per hash is kept and the index is renumbered from 0.

    An empty frame is returned as-is (same object, no ``hash`` column).
    """
    if df.empty:
        return df

    work = df.copy()
    work["hash"] = work["payload"].apply(sha1_text)

    labels_per_hash = work.groupby("hash")["label"].nunique()
    ambiguous = set(labels_per_hash.index[labels_per_hash > 1])
    if ambiguous:
        benign_in_conflict = work["hash"].isin(ambiguous) & (work["label"] == 0)
        work = work.loc[~benign_in_conflict].copy()

    # Keeping the first row per hash yields the same rows, in the same
    # order, as deduplicating on (hash, label) first and on hash afterwards.
    return work.drop_duplicates(subset=["hash"]).reset_index(drop=True)

def sample_df(df, n, seed=RANDOM_SEED):
    """Return a copy of *df* reduced to *n* randomly chosen rows.

    When *n* is None or not smaller than ``len(df)``, the whole frame is
    copied unchanged. Otherwise *n* distinct row positions are drawn without
    replacement from a NumPy generator seeded with *seed*, and the rows are
    returned in the drawn order.
    """
    if n is not None and n < len(df):
        picks = np.random.default_rng(seed).choice(len(df), size=n, replace=False)
        return df.iloc[picks].copy()
    return df.copy()

def length_stats(series: pd.Series) -> dict:
    """Summarize the character lengths of a string Series.

    Returns a dict with the keys ``n`` (row count, int) followed by
    ``mean``, ``median``, ``p95`` (0.95 quantile) and ``max`` of the
    per-row string length, each as a float.
    """
    char_counts = series.str.len()
    summary = {"n": int(len(series))}
    for key, value in (
        ("mean", char_counts.mean()),
        ("median", char_counts.median()),
        ("p95", char_counts.quantile(0.95)),
        ("max", char_counts.max()),
    ):
        summary[key] = float(value)
    return summary


## 1) Load & clean `XSS_dataset.csv` (malicious + benign)

In [47]:

# Load the CSV source, normalize its payloads and deduplicate it.
# An explicit check replaces `assert`, which is removed when Python runs
# with -O and would let a missing file surface later as a less clear error.
if not CSV_PATH.exists():
    raise FileNotFoundError(f"Không tìm thấy {CSV_PATH}")

df_csv = pd.read_csv(CSV_PATH)

# The paper uses the Sentence/Label columns; adjust the mapping here if your
# dataset names them differently.
df_csv = df_csv.rename(columns={"Sentence":"payload", "Label":"label"})
if "payload" not in df_csv.columns or "label" not in df_csv.columns:
    raise ValueError(f"CSV columns hiện có: {df_csv.columns.tolist()}")

# Drop the index column left behind by an earlier to_csv(), if present.
if "Unnamed: 0" in df_csv.columns:
    df_csv = df_csv.drop(columns=["Unnamed: 0"])

df_csv["payload"] = df_csv["payload"].apply(preprocess_payload)
df_csv["label"]   = df_csv["label"].astype(int)

# Deduplicate after normalization so payloads differing only in case or
# whitespace collapse into one row.
df_csv = dedup_and_resolve(df_csv)
df_csv["source"] = "kaggle_csv"
df_csv["path"] = str(CSV_PATH)

print("CSV clean:", df_csv.shape)
print("CSV label counts:", df_csv["label"].value_counts().to_dict())


CSV clean: (10849, 5)
CSV label counts: {1: 7317, 0: 3532}


## 2) Load PortSwigger JSON cheat-sheet -> malicious payloads

In [48]:
def extract_strings(obj):
    """Return all strings nested inside *obj*, in depth-first order.

    Dict values and list items are traversed in their stored order; dict
    keys and non-string leaves are skipped.
    Note: a helper with this name is also defined in the HELPERS cell.
    """
    found = []
    pending = [obj]
    while pending:
        node = pending.pop()
        if isinstance(node, str):
            found.append(node)
        elif isinstance(node, dict):
            # Push in reverse so the first value is popped (visited) first.
            pending.extend(reversed(list(node.values())))
        elif isinstance(node, list):
            pending.extend(reversed(node))
    return found

# Collect malicious payload rows from every JSON file under JSON_DIR.
mal_rows = []

if JSON_DIR is not None:
    json_files = sorted(JSON_DIR.rglob("*.json"))
    print("JSON files found:", len(json_files))
    for jf in json_files:
        data = json.loads(jf.read_text(encoding="utf-8", errors="ignore"))
        source_tag = f"portswigger_json:{jf.name}"
        for raw in extract_strings(data):
            cleaned = preprocess_payload(raw)
            # Keep only non-empty strings within the length bounds that
            # carry at least one XSS signal.
            if (
                cleaned
                and MIN_XSS_CHARS <= len(cleaned) <= MAX_XSS_CHARS
                and XSS_SIGNAL_RE.search(cleaned)
            ):
                mal_rows.append({"payload": cleaned, "label": 1, "source": source_tag, "path": str(jf)})
else:
    print("JSON_DIR not found; skipping JSON source.")

df_json_xss = pd.DataFrame(mal_rows)
if not df_json_xss.empty:
    # Deduplicate on the payload hash, keeping the first occurrence.
    df_json_xss = (
        df_json_xss.assign(hash=df_json_xss["payload"].map(sha1_text))
        .drop_duplicates(subset=["hash"])
        .reset_index(drop=True)
    )

print("JSON XSS kept:", df_json_xss.shape)

JSON files found: 21
JSON XSS kept: (1276, 5)


## 3) Load JS library source code -> benign snippets
**Cải tiến**: chuyển từ *file-level* sang *snippet-level* để tạo đủ benign samples (paper ~12k).

In [49]:

# Hint that a candidate looks like real JS: a keyword or an arrow function.
# "=>" sits outside the \b...\b group on purpose: there is no word boundary
# between whitespace/punctuation and "=", so `\b=>\b` would miss "(a) => b".
_JS_KEYWORD_RE = re.compile(r"\b(?:function|const|let|var|return|class|new)\b|=>")
# Fallback hint: a call such as "name(" or a member access such as "a.b".
_JS_CALL_OR_MEMBER_RE = re.compile(r"[a-zA-Z_]\w*\s*\(|\w+\s*\.\s*\w+")


def js_to_snippets(text: str, window_lines: int = 3):
    """Convert the text of a JS file into many short benign snippets.

    Comments are stripped first. Candidates are (1) every non-empty
    ``;``-separated fragment of each non-empty line, followed by (2) every
    run of *window_lines* consecutive non-empty lines joined by spaces.
    Each candidate is normalized with ``preprocess_payload`` and kept only if
    its length lies within ``MIN_BENIGN_CHARS..MAX_BENIGN_CHARS``, it does
    not match ``XSS_SIGNAL_RE``, and it looks like code (a JS keyword, an
    arrow ``=>``, a call, or a member access).

    Args:
        text: Raw content of one JS file.
        window_lines: Number of consecutive lines per context window.

    Returns:
        The kept snippets as a list, statement candidates first. Duplicates
        are not removed here.

    Raises:
        ValueError: If *window_lines* is smaller than 1.
    """
    if window_lines < 1:
        raise ValueError("window_lines must be >= 1")

    stripped = (line.strip() for line in strip_js_comments(text).splitlines())
    raw_lines = [line for line in stripped if line]

    # statement-level candidates
    stmts = [
        fragment
        for line in raw_lines
        for fragment in (piece.strip() for piece in line.split(";"))
        if fragment
    ]

    # line-window candidates (context); empty when the file is too short
    windows = [
        " ".join(raw_lines[start:start + window_lines])
        for start in range(len(raw_lines) - window_lines + 1)
    ]

    snippets = []
    for candidate in stmts + windows:
        snippet = preprocess_payload(candidate)
        if not MIN_BENIGN_CHARS <= len(snippet) <= MAX_BENIGN_CHARS:
            continue
        if XSS_SIGNAL_RE.search(snippet):
            continue
        if not (_JS_KEYWORD_RE.search(snippet) or _JS_CALL_OR_MEMBER_RE.search(snippet)):
            continue
        snippets.append(snippet)

    return snippets

def _benign_rows_from(js_files, source_tag):
    """Build one benign row per snippet from the files that pass the filter."""
    rows = []
    for js_path in js_files:
        code = read_text_file(js_path)
        if js_file_filter(js_path, code):
            rows.extend(
                {"payload": snip, "label": 0, "source": source_tag, "path": str(js_path)}
                for snip in js_to_snippets(code)
            )
    return rows


# Materialize is processed first, then Bootstrap, so row order is stable.
benign_rows = []

if MATERIALIZE_DIR is None:
    print("Materialize dir not found; skipping.")
else:
    mat_files = sorted(MATERIALIZE_DIR.rglob("*.js"))
    print("Materialize .js files found:", len(mat_files))
    benign_rows.extend(_benign_rows_from(mat_files, "materialize_js"))

if BOOTSTRAP_DIR is None:
    print("Bootstrap dir not found; skipping.")
else:
    bs_files = sorted(BOOTSTRAP_DIR.rglob("*.js"))
    print("Bootstrap .js files found:", len(bs_files))
    benign_rows.extend(_benign_rows_from(bs_files, "bootstrap_js"))

df_js_benign = pd.DataFrame(benign_rows)
if len(df_js_benign):
    df_js_benign = dedup_and_resolve(df_js_benign)

print("JS benign kept:", df_js_benign.shape)
print("JS benign breakdown:", df_js_benign["source"].value_counts().to_dict() if len(df_js_benign) else {})


Materialize .js files found: 59
Bootstrap .js files found: 128
JS benign kept: (27802, 5)
JS benign breakdown: {'materialize_js': 17147, 'bootstrap_js': 10655}


## 4) Aggregate -> final dataset, optional match paper counts

In [50]:

# The CSV source is always included; the optional sources only when non-empty.
parts = [df_csv, *(extra for extra in (df_json_xss, df_js_benign) if len(extra))]

# Cross-source dedup: a payload present with both labels ends up malicious.
df_all = dedup_and_resolve(pd.concat(parts, ignore_index=True))

print("Combined (dedup/resolved):", df_all.shape)
print("Label counts:", df_all["label"].value_counts().to_dict())
print("Source counts (top 10):", df_all["source"].value_counts().head(10).to_dict())

benign_pool = df_all.loc[df_all["label"] == 0].copy()
mal_pool = df_all.loc[df_all["label"] == 1].copy()

print("\nBenign pool:", benign_pool.shape, "Malicious pool:", mal_pool.shape)

if MATCH_PAPER_COUNTS:
    # Warn when a pool is too small, then down-sample each pool to at most
    # its target size.
    if TARGET_BENIGN > len(benign_pool):
        print(f"[WARN] Benign pool chưa đủ để match paper: {len(benign_pool)} < {TARGET_BENIGN}.")
    if TARGET_MAL > len(mal_pool):
        print(f"[WARN] Malicious pool chưa đủ để match paper: {len(mal_pool)} < {TARGET_MAL}.")
    benign_pool = sample_df(benign_pool, min(TARGET_BENIGN, len(benign_pool)))
    mal_pool = sample_df(mal_pool, min(TARGET_MAL, len(mal_pool)))

# Concatenate both classes and shuffle deterministically.
df_final = (
    pd.concat([benign_pool, mal_pool], ignore_index=True)
    .sample(frac=1, random_state=RANDOM_SEED)
    .reset_index(drop=True)
)

print("\nFinal dataset:", df_final.shape)
print("Final label counts:", df_final["label"].value_counts().to_dict())


Combined (dedup/resolved): (39155, 5)
Label counts: {0: 31334, 1: 7821}
Source counts (top 10): {'materialize_js': 17147, 'kaggle_csv': 10849, 'bootstrap_js': 10655, 'portswigger_json:events.json': 147, 'portswigger_json:restricted_characters.json': 78, 'portswigger_json:classic.json': 49, 'portswigger_json:vuejs.json': 43, 'portswigger_json:prototype-pollution.json': 43, 'portswigger_json:angularjs.json': 21, 'portswigger_json:protocols.json': 18}

Benign pool: (31334, 5) Malicious pool: (7821, 5)

Final dataset: (19359, 5)
Final label counts: {0: 12038, 1: 7321}


## 5) Split train/val/test (stratified)
Paper chỉ nêu tỉ lệ train/test là 80/20. Nếu bật `MAKE_VAL`, ta tách thêm `VAL_FRAC_OF_TRAIN` (mặc định 10%) từ tập train để tuning, nên tỉ lệ thực tế xấp xỉ 72/8/20 (train/val/test).

In [51]:

from sklearn.model_selection import train_test_split

def _as_frame(payloads, labels):
    """Pack aligned payload/label Series into a fresh two-column frame."""
    return pd.DataFrame({"payload": payloads.values, "label": labels.values})


X = df_final["payload"].astype(str)
y = df_final["label"].astype(int)

# Stratified train/test split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_SEED, stratify=y
)
train_df = _as_frame(X_train, y_train)
test_df = _as_frame(X_test, y_test)

if MAKE_VAL:
    # Validation rows are taken out of train; the test split is untouched.
    X_tr, X_val, y_tr, y_val = train_test_split(
        train_df["payload"], train_df["label"],
        test_size=VAL_FRAC_OF_TRAIN, random_state=RANDOM_SEED, stratify=train_df["label"]
    )
    train_df = _as_frame(X_tr, y_tr)
    val_df = _as_frame(X_val, y_val)

print("Train:", train_df.shape, train_df["label"].value_counts().to_dict())
if MAKE_VAL:
    print("Val:", val_df.shape, val_df["label"].value_counts().to_dict())
print("Test:", test_df.shape, test_df["label"].value_counts().to_dict())

# Persist the splits first, then the pools and the full dataset.
_exports = [("train.csv", train_df)]
if MAKE_VAL:
    _exports.append(("val.csv", val_df))
_exports += [
    ("test.csv", test_df),
    ("benign_pool.csv", benign_pool),
    ("malicious_pool.csv", mal_pool),
    ("final_dataset.csv", df_final),
]
for _file_name, _frame in _exports:
    _frame.to_csv(OUT / _file_name, index=False)

print("\nSaved to:", OUT)


Train: (13938, 2) {0: 8667, 1: 5271}
Val: (1549, 2) {0: 963, 1: 586}
Test: (3872, 2) {0: 2408, 1: 1464}

Saved to: D:\UIT Document\UIT subjects\IE105 - Nhập môn đảm bảo và an toàn thông tin\LLM-for-XSS\Dataset\_outputs_train_val_test


## 6) BoW (CountVectorizer) như paper

In [52]:

from sklearn.feature_extraction.text import CountVectorizer
from scipy import sparse

vectorizer = CountVectorizer(token_pattern=r"(?u)\b\w+\b")

# The vocabulary is fitted on train only; val and test are only transformed.
Xtr = vectorizer.fit_transform(train_df["payload"])
if MAKE_VAL:
    Xva = vectorizer.transform(val_df["payload"])
Xte = vectorizer.transform(test_df["payload"])

# One entry per split, in report order:
# (shape label, npz file name, stats label, BoW matrix, source frame)
_bow_splits = [("  train:", "X_train_bow.npz", "Length stats train:", Xtr, train_df)]
if MAKE_VAL:
    _bow_splits.append(("  val  :", "X_val_bow.npz", "Length stats val:", Xva, val_df))
_bow_splits.append(("  test :", "X_test_bow.npz", "Length stats test:", Xte, test_df))

print("BoW shapes:")
for _shape_label, _, _, _matrix, _ in _bow_splits:
    print(_shape_label, _matrix.shape)

vocab = vectorizer.get_feature_names_out()
vocab_path = OUT / "bow_vocabulary.txt"
vocab_path.write_text("\n".join(vocab), encoding="utf-8")
print("Saved vocab:", vocab_path)

for _, _npz_name, _, _matrix, _ in _bow_splits:
    sparse.save_npz(OUT / _npz_name, _matrix)

for _, _, _stats_label, _, _frame in _bow_splits:
    print(_stats_label, length_stats(_frame["payload"]))


BoW shapes:
  train: (13938, 8508)
  val  : (1549, 8508)
  test : (3872, 8508)
Saved vocab: D:\UIT Document\UIT subjects\IE105 - Nhập môn đảm bảo và an toàn thông tin\LLM-for-XSS\Dataset\_outputs_train_val_test\bow_vocabulary.txt
Length stats train: {'n': 13938, 'mean': 85.51499497775865, 'median': 61.0, 'p95': 182.0, 'max': 3478.0}
Length stats val: {'n': 1549, 'mean': 83.48482892188508, 'median': 59.0, 'p95': 178.5999999999999, 'max': 3070.0}
Length stats test: {'n': 3872, 'mean': 84.79648760330579, 'median': 60.0, 'p95': 180.0, 'max': 3681.0}


## 7) Quick sanity-check examples

In [53]:

# Print the first few train payloads of each class, clipped to 200 characters.
for lbl in (0, 1):
    class_payloads = train_df.loc[train_df["label"] == lbl, "payload"]
    samples = class_payloads.head(SHOW_EXAMPLES_PER_LABEL).tolist()
    print(f"\nExamples label={lbl} (train):")
    for rank, payload in enumerate(samples, start=1):
        clipped = payload[:200]
        if len(payload) > 200:
            clipped += "..."
        print(rank, clipped)



Examples label=0 (train):
1 <p>wendell wallach introduced the concept of <a href="/wiki/artificial_moral_agents" class="mw-redirect" title="artificial moral agents">artificial moral agents </a> (ama) in his book <i>moral machine...
2 parents.push(ancestor) ancestor = ancestor.parentnode.closest(selector) }
3 } _selectmenuitem({ key,

Examples label=1 (train):
1 <picture oncut="alert(1)" contenteditable>test</picture>
2 <dir ondblclick="alert(1)">test</dir>
3 <object id=x onfocusin=alert(1) type=text/html>
