In [1]:
# Notebook bootstrap: quieten TensorFlow's C++ logging and pin the global RNGs.
import os
import random

import numpy as np

# TF_CPP_MIN_LOG_LEVEL is read by TensorFlow's native logger.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"

# Seed the stdlib and the legacy NumPy global generators with the same value.
random.seed(42)
np.random.seed(42)

In [2]:
# ==== Install ====
import sys, subprocess, importlib.util
def pip_install(pkgs):
    """Install the given pip requirement strings into the running interpreter's environment.

    Runs ``<sys.executable> -m pip install -q <pkgs...>``.

    Args:
        pkgs: iterable of pip requirement strings.

    Raises:
        subprocess.CalledProcessError: if pip exits with a non-zero status.
    """
    subprocess.run([sys.executable, "-m", "pip", "install", "-q"] + list(pkgs), check=True)

# pip distribution names whose importable module name is different.
# find_spec() needs the *module* name; looking up "scikit-learn" or
# "sentence-transformers" always returns None, which used to trigger a pip run
# on every execution of this cell even when both packages were installed.
_IMPORT_NAME = {
    "scikit-learn": "sklearn",
    "sentence-transformers": "sentence_transformers",
}

# Collect the pip names of every requirement that cannot currently be imported.
need = []
for p in ["numpy","pandas","scikit-learn","joblib","tqdm","requests","matplotlib",
          "sentence-transformers","transformers","torch","ijson"]:
    if importlib.util.find_spec(_IMPORT_NAME.get(p, p)) is None:
        need.append(p)
if need: pip_install(need)

# ==== Imports ====
import os, json, re, time, zipfile
from pathlib import Path
from typing import Dict, List, Tuple, Callable, Optional

import numpy as np
import pandas as pd
import requests, joblib
from tqdm import tqdm

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, HashingVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

import torch
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModel

# ==== Paths & knobs ====
# --- Input locations -------------------------------------------------------
USE_GOOGLE_DRIVE = True
if USE_GOOGLE_DRIVE:
    BASE = Path("/content/drive/MyDrive/CTC_by_source")
else:
    BASE = Path("/content")
RED_PATH = BASE / "CTC_Reddit_10k.json"
STK_PATH = BASE / "CTC_Stackexchange_10k.json"
ARX_PATH = BASE / "CTC_arXiv_10k.json"

# --- Working directories (created on first run) ----------------------------
WORKDIR = Path("ctc_bench")
WORKDIR.mkdir(parents=True, exist_ok=True)
CACHE = WORKDIR / "cache"
CACHE.mkdir(parents=True, exist_ok=True)
DATA = WORKDIR / "data"
DATA.mkdir(exist_ok=True, parents=True)

# --- Split / batching knobs ------------------------------------------------
TEST_SIZE = 0.20
# Fraction of the remaining 80% used for validation: 0.8 * 0.125 = 10% of the original data.
VAL_SIZE_WITHIN_TRAIN = 0.125
RANDOM_STATE = 42
BATCH_SIZE_TXT = 512

# --- Authors' repository (dictionary + validation folders) -----------------
CTC_REPO_ZIP = "https://codeload.github.com/epelofske-student/CTC/zip/refs/heads/main"
DICT_REPO_PATH = "English_word_dictionary.txt"
VAL_DIR_CYB = "validation_data_cybersecurity"
VAL_DIR_NON = "validation_data_non_cybersecurity"

In [3]:
def stream_download(url: str, out_path: Path, desc: Optional[str] = None):
    """Download ``url`` to ``out_path`` in 1 MiB chunks with a tqdm progress bar.

    A non-empty file already present at ``out_path`` is treated as a finished
    download and returned untouched. A new download is streamed into a sibling
    ``<name>.part`` file and only renamed to ``out_path`` once the whole body has
    been written, so an interrupted transfer cannot leave a truncated file that a
    later call would mistake for a complete one.

    Args:
        url: address to fetch.
        out_path: destination file.
        desc: progress-bar label; defaults to ``out_path.name``.

    Returns:
        ``out_path``.

    Raises:
        Whatever ``requests`` raises for connection problems, timeouts or HTTP
        error statuses (via ``raise_for_status``); the partial file is removed first.
    """
    if out_path.exists() and out_path.stat().st_size > 0:
        return out_path
    part_path = out_path.with_name(out_path.name + ".part")
    try:
        # Without a timeout a stalled connection would block the notebook forever.
        with requests.get(url, stream=True, timeout=60) as r:
            r.raise_for_status()
            # A missing Content-Length becomes None so tqdm shows an open-ended bar, not "0 of 0".
            total = int(r.headers.get("content-length", 0)) or None
            with open(part_path, "wb") as f, tqdm(total=total, unit="B", unit_scale=True, desc=desc or out_path.name) as pbar:
                for chunk in r.iter_content(1024*1024):
                    if chunk:
                        f.write(chunk); pbar.update(len(chunk))
        part_path.replace(out_path)
    except BaseException:
        # Also covers KeyboardInterrupt (a stopped cell): never keep a half-written file.
        if part_path.exists():
            part_path.unlink()
        raise
    return out_path

# Get authors' repo as ZIP
# Local locations of the downloaded archive and of the folder it unpacks to.
repo_root = DATA / "CTC-main"
repo_zip = DATA / "CTC-main.zip"
if not repo_root.exists():
    # Only hit the network / unpack when the extracted folder is not there yet.
    stream_download(CTC_REPO_ZIP, repo_zip, desc="CTC-main.zip")
    with zipfile.ZipFile(repo_zip, "r") as archive:
        archive.extractall(DATA)
# The word list that defines the fixed TF-IDF vocabulary must be present.
DICT_PATH = repo_root / DICT_REPO_PATH
assert DICT_PATH.exists(), "Dictionary not found in repo"

# Cleaning
# Patterns used by clean_text(); each match is replaced by a single space.
CLEAN_HTML_RE = re.compile(r"<[^>]+>")
URL_RE = re.compile(r"http[s]?://\S+|www\.\S+")
CODE_RE = re.compile(r"`{1,3}.*?`{1,3}", re.DOTALL)
NON_ASCII_RE = re.compile(r"[^\x00-\x7F]+")
WHITESPACE_RE = re.compile(r"\s+")

def clean_text(s: str) -> str:
    """Normalise one raw document to plain ASCII text.

    URLs, backtick-delimited code spans, ``<...>`` tags and runs of non-ASCII
    characters are each replaced by a space (in that order); whitespace runs are
    then collapsed and the result is stripped. Anything that is not a ``str``
    yields the empty string.
    """
    if not isinstance(s, str):
        return ""
    for pattern in (URL_RE, CODE_RE, CLEAN_HTML_RE, NON_ASCII_RE, WHITESPACE_RE):
        s = pattern.sub(" ", s)
    return s.strip()

def load_ctc_json(path: Path):
    """Read a JSON array of records and return ``(texts, labels)``.

    Each record must provide a ``"text"`` and a ``"label"`` key. Texts are passed
    through ``clean_text``; labels are converted with ``int``.
    """
    with path.open(encoding="utf-8") as fh:
        records = json.load(fh)
    texts = [clean_text(rec["text"]) for rec in records]
    labels = [int(rec["label"]) for rec in records]
    return texts, labels

CTC-main.zip: 49.3MB [00:02, 16.9MB/s]


In [4]:
# Fixed dictionary TF-IDF (paper baseline)
def make_tfidf_dict():
    """Build the fixed-vocabulary word TF-IDF vectorizer (paper baseline).

    The vocabulary is the sorted set of non-blank, stripped lines of ``DICT_PATH``.
    """
    words = set()
    for line in DICT_PATH.read_text("utf-8").splitlines():
        entry = line.strip()
        if entry:
            words.add(entry)
    return TfidfVectorizer(
        vocabulary=sorted(words),
        lowercase=True,
        dtype=np.float32,
        token_pattern=r"(?u)\b\w+\b",
        norm="l2",
        use_idf=True,
        smooth_idf=True,
        sublinear_tf=True,
    )

# Free vocab TF-IDF
def make_tfidf_free():
    """Build a word TF-IDF vectorizer with a learned vocabulary (uni+bigrams, ``min_df=2``)."""
    options = dict(min_df=2, ngram_range=(1,2), lowercase=True, dtype=np.float32)
    return TfidfVectorizer(**options)

# Char n-gram TF-IDF
def make_tfidf_char():
    """Build a character-level TF-IDF vectorizer over 3- to 5-grams."""
    options = dict(analyzer="char", ngram_range=(3,5), lowercase=True, dtype=np.float32)
    return TfidfVectorizer(**options)

# CountVectorizer
def make_count():
    """Build a raw-count bag-of-words vectorizer (uni+bigrams, ``min_df=2``, int32 counts)."""
    options = dict(min_df=2, ngram_range=(1,2), lowercase=True, dtype=np.int32)
    return CountVectorizer(**options)

# HashingVectorizer (stateless)
def make_hashing():
    """Build a stateless hashing vectorizer with 2**18 non-negative, l2-normalised features."""
    return HashingVectorizer(
        n_features=2**18,
        alternate_sign=False,
        norm="l2",
        lowercase=True,
    )

# LSA  (TF-IDF -> SVD(256) -> l2-normalize)
def make_lsa():
    """Build the LSA pipeline: word TF-IDF -> 256-component truncated SVD -> row normalisation."""
    steps = (
        TfidfVectorizer(min_df=2, ngram_range=(1,2), lowercase=True, dtype=np.float32),
        TruncatedSVD(n_components=256, random_state=RANDOM_STATE),
        Normalizer(copy=False),
    )
    return make_pipeline(*steps)

# Sentence-Transformers
# Registry key -> model identifier handed to SentenceTransformer.
_SMODELS = {
    "sbert_all-MiniLM-L6-v2": "sentence-transformers/all-MiniLM-L6-v2",
    "e5-small-v2":            "intfloat/e5-small-v2",
    "bge-small-en-v1.5":      "BAAI/bge-small-en-v1.5",
}

def embed_with_sbert(model_name: str, texts: List[str], batch_size: int = 128, device: str = None) -> np.ndarray:
    """Encode ``texts`` with the sentence-transformer registered under ``model_name``.

    Args:
        model_name: a key of ``_SMODELS`` (unknown keys raise ``KeyError``).
        texts: documents to embed.
        batch_size: batch size forwarded to ``encode``.
        device: device string; when falsy, "cuda" is used if available, else "cpu".

    Returns:
        float32 array of normalised embeddings, as produced by ``encode``.
    """
    model_id = _SMODELS[model_name]
    if not device:
        device = "cuda" if torch.cuda.is_available() else "cpu"
    encoder = SentenceTransformer(model_id, device=device)
    vectors = encoder.encode(
        texts,
        batch_size=batch_size,
        show_progress_bar=True,
        convert_to_numpy=True,
        normalize_embeddings=True,
    )
    return vectors.astype("float32")

# Vanilla Transformers CLS (e.g., distilroberta-base)
def embed_with_transformer_cls(hf_name: str, texts: List[str], batch_size: int = 64) -> np.ndarray:
    """Embed ``texts`` with the first-token (CLS) hidden state of a Hugging Face model.

    Each batch is tokenised with padding and truncation to 256 tokens, run through
    the model without gradients, and the position-0 vector of ``last_hidden_state``
    is l2-normalised.

    Args:
        hf_name: model identifier passed to ``AutoTokenizer`` / ``AutoModel``.
        texts: documents to embed; any sequence of strings is accepted.
        batch_size: number of documents per forward pass.

    Returns:
        float32 array with one row per text. For empty input a ``(0, hidden_size)``
        array is returned (previously ``np.vstack([])`` raised ``ValueError``).
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    tok = AutoTokenizer.from_pretrained(hf_name)
    mdl = AutoModel.from_pretrained(hf_name).to(device)
    mdl.eval()
    if len(texts) == 0:
        # Nothing to encode: keep the column count consistent with non-empty calls when the config exposes it.
        return np.empty((0, getattr(mdl.config, "hidden_size", 0)), dtype="float32")
    outs = []
    for i in tqdm(range(0, len(texts), batch_size), desc=f"Encoding {hf_name} CLS"):
        # The tokenizer expects a list of str; slices of tuples/arrays are converted.
        batch = list(texts[i:i+batch_size])
        enc = tok(batch, padding=True, truncation=True, max_length=256, return_tensors="pt").to(device)
        with torch.no_grad():
            out = mdl(**enc).last_hidden_state[:,0,:]  # CLS token
            out = torch.nn.functional.normalize(out, p=2, dim=1)
        outs.append(out.detach().cpu().numpy().astype("float32"))
    return np.vstack(outs)

# Registry of 10 methods
def _constant_factory(value: str) -> Callable[[], str]:
    """Return a zero-argument callable that always yields ``value``."""
    def factory() -> str:
        return value
    return factory

# Registry of the 10 benchmarked methods: name -> (kind, factory).
# For "sparse"/"dense" the factory builds a fresh vectorizer or pipeline; for the
# embedding kinds it returns the model key / model name to encode with.
METHODS = {
    # Classic 6
    "tfidf_dict_word":   ("sparse", make_tfidf_dict),
    "tfidf_free_word":   ("sparse", make_tfidf_free),
    "tfidf_char_3_5":    ("sparse", make_tfidf_char),
    "count_word":        ("sparse", make_count),
    "hashing_word":      ("sparse", make_hashing),
    "lsa_tfidf_svd256":  ("dense",  make_lsa),
    # Neural / LLM 4
    "sbert_all-MiniLM-L6-v2": ("dense_embed", _constant_factory("sbert_all-MiniLM-L6-v2")),
    "e5-small-v2":            ("dense_embed", _constant_factory("e5-small-v2")),
    "bge-small-en-v1.5":      ("dense_embed", _constant_factory("bge-small-en-v1.5")),
    "distilroberta-base_CLS": ("dense_hf_cls", _constant_factory("distilroberta-base")),
}

In [6]:
def _texts_fingerprint(texts) -> str:
    """Return a SHA-1 digest of ``texts`` (order-sensitive) used to tag cached embeddings."""
    import hashlib
    digest = hashlib.sha1()
    for text in texts:
        digest.update(text.encode("utf-8", errors="replace"))
        digest.update(b"\x00")  # separator so ["ab", "c"] and ["a", "bc"] differ
    return digest.hexdigest()


def _load_or_embed(cache_path: Path, texts, encode):
    """Return embeddings for ``texts``, reusing ``cache_path`` only if it was built from the same texts.

    A sidecar ``.sha1`` file stores the fingerprint of the texts behind the cached
    matrix. A missing or different fingerprint (or a row-count mismatch) means the
    cache is stale and it is recomputed and overwritten.
    """
    stamp_path = cache_path.with_suffix(".sha1")
    stamp = _texts_fingerprint(texts)
    if cache_path.exists() and stamp_path.exists() and stamp_path.read_text().strip() == stamp:
        cached = np.load(cache_path)
        if cached.shape[0] == len(texts):
            return cached
    fresh = encode(texts)
    np.save(cache_path, fresh)
    stamp_path.write_text(stamp)
    return fresh


def _fit_and_score(clf, X_train, y_train, X_test, y_test, timings):
    """Fit ``clf``, predict the test set, record both durations in ``timings`` and score."""
    t_clf0 = time.time()
    clf.fit(X_train, y_train)
    timings["fit_clf_s"] = time.time() - t_clf0

    t_inf0 = time.time()
    yhat = clf.predict(X_test)
    timings["infer_s"] = time.time() - t_inf0

    acc = accuracy_score(y_test, yhat)
    report = classification_report(y_test, yhat, output_dict=True)
    return acc, report, timings


def train_eval_method(X_train_text, y_train, X_val_text, y_val, X_test_text, y_test, method_name: str):
    """Featurise the three splits with one registered method, train a linear classifier, score the test split.

    Args:
        X_train_text, X_val_text, X_test_text: lists of documents.
        y_train, y_val, y_test: matching labels (``y_val`` is accepted but not used for scoring).
        method_name: key of ``METHODS``.

    Returns:
        ``(accuracy, report, timings)`` where ``report`` is the dict form of
        ``classification_report`` on the test split and ``timings`` maps stage names to
        seconds. Vectorizer methods record ``fit_vectorizer_s`` and ``transform_s``,
        embedding methods record ``embed_s``; all record ``fit_clf_s`` and ``infer_s``.

    Raises:
        KeyError: unknown ``method_name``.
        ValueError: the registry entry has an unknown kind.
    """
    mtype, factory = METHODS[method_name]
    timings = {}

    if mtype in ("sparse","dense"):
        vectorizer_or_pipe = factory()
        t_fit0 = time.time()
        X_train = vectorizer_or_pipe.fit_transform(X_train_text)
        timings["fit_vectorizer_s"] = time.time() - t_fit0

        # The validation split is transformed (and timed) alongside the test split, as before.
        t_tr0 = time.time()
        X_val   = vectorizer_or_pipe.transform(X_val_text)
        X_test  = vectorizer_or_pipe.transform(X_test_text)
        timings["transform_s"] = time.time() - t_tr0

        if mtype == "sparse":
            # Seeded so repeated runs of the benchmark give the same model.
            clf = LinearSVC(random_state=RANDOM_STATE)
        else:  # dense
            clf = LogisticRegression(max_iter=1000, n_jobs=-1, solver="lbfgs")

    elif mtype in ("dense_embed", "dense_hf_cls"):
        model_key = factory()
        if mtype == "dense_embed":
            def encode(texts):
                return embed_with_sbert(model_key, texts, batch_size=128)
        else:
            def encode(texts):
                return embed_with_transformer_cls(model_key, texts, batch_size=64)

        # Embeddings are cached per split; the cache is tied to the exact texts so that
        # changing the data or the split can no longer silently reuse old vectors.
        t_emb0 = time.time()
        X_train = _load_or_embed(CACHE / f"{method_name}_train.npy", X_train_text, encode)
        X_val   = _load_or_embed(CACHE / f"{method_name}_val.npy",   X_val_text,   encode)
        X_test  = _load_or_embed(CACHE / f"{method_name}_test.npy",  X_test_text,  encode)
        timings["embed_s"] = time.time() - t_emb0

        clf = LogisticRegression(max_iter=1000, n_jobs=-1, solver="lbfgs")

    else:
        raise ValueError(mtype)

    return _fit_and_score(clf, X_train, y_train, X_test, y_test, timings)

In [7]:
def run_ctc_pipeline(
    reddit_path="/content/drive/MyDrive/CTC_by_source/CTC_Reddit_10k.json",
    stack_path="/content/drive/MyDrive/CTC_by_source/CTC_Stackexchange_10k.json",
    arxiv_path="/content/drive/MyDrive/CTC_by_source/CTC_arXiv_10k.json",
    combined_path="/content/drive/MyDrive/CTC_by_source/CTC_by_source_30k.json",
    epochs_dnn_per_source=8,
    batch_size=512,
    random_state=42,
    test_size=0.20,
    val_size_within_train=0.125,
):
    """
    One-call runner for the original CTC reproduction:
      - Loads the per-source 10k JSONs + the 30k combined JSON
      - Downloads the authors' repo to get the dictionary and validation folders
      - Builds the dictionary TF-IDF (fitted on the combined training split)
      - Trains 5 classic models + up to 2 DNN checkpoints per source (21 models by default)
      - Evaluates the majority-vote ensemble on the held-out combined test split
        and on the authors' validation folders
      - Saves vectorizer + models in ./ctc_models

    Args:
        reddit_path, stack_path, arxiv_path: per-source JSON arrays of
            ``{"text": ..., "label": ...}`` records.
        combined_path: JSON array used for the combined train/val/test split.
        epochs_dnn_per_source: DNN training epochs per source; with 0 no DNN joins the ensemble.
        batch_size: DNN mini-batch size.
        random_state: seed for every split and every seedable sklearn model.
        test_size: test fraction, used for the combined AND the per-source splits.
        val_size_within_train: validation fraction of the remaining training data,
            used for the combined AND the per-source splits.

    Returns:
        dict with ``test_accuracy``, ``test_report`` (dict form of the classification
        report), ``val_cyb_accuracy``, ``val_cyb_fn_rate``, ``val_non_accuracy``,
        ``val_non_fp_rate``, ``n_val_cyb``, ``n_val_non``, ``classics``, ``dnns`` and
        ``index_path``. (Previously nothing was returned although callers assign the result.)

    Raises:
        FileNotFoundError: an input JSON is missing.
        ValueError: one of the authors' validation folders yields no readable text.
    """
    import os, json, zipfile, requests, joblib, numpy as np
    from pathlib import Path
    from tqdm import tqdm
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import accuracy_score, classification_report
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.linear_model import LogisticRegression
    from sklearn.svm import LinearSVC
    from sklearn.neural_network import MLPClassifier
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.tree import DecisionTreeClassifier
    import tensorflow as tf
    from tensorflow import keras
    from tensorflow.keras import layers
    import re

    # ---------- small helpers ----------
    WORKDIR = Path("ctc_repro"); WORKDIR.mkdir(exist_ok=True, parents=True)
    MODELDIR = Path("ctc_models"); MODELDIR.mkdir(exist_ok=True, parents=True)
    DATA = WORKDIR / "data"; DATA.mkdir(exist_ok=True, parents=True)

    def stream_download(url: str, out_path: Path, desc: str = None):
        """Download ``url`` to ``out_path``; a non-empty existing file is reused as-is."""
        if out_path.exists() and out_path.stat().st_size > 0:
            return out_path
        # Stream into "<name>.part" and rename at the end so an interrupted download
        # never leaves a truncated file that the check above would accept.
        part_path = out_path.with_name(out_path.name + ".part")
        try:
            with requests.get(url, stream=True, timeout=60) as r:
                r.raise_for_status()
                total = int(r.headers.get("content-length", 0)) or None
                with open(part_path, "wb") as f, tqdm(total=total, unit="B", unit_scale=True, desc=desc or out_path.name) as pbar:
                    for chunk in r.iter_content(1024*1024):
                        if chunk:
                            f.write(chunk); pbar.update(len(chunk))
            part_path.replace(out_path)
        except BaseException:
            if part_path.exists():
                part_path.unlink()
            raise
        return out_path

    # cleaning
    CLEAN_HTML_RE = re.compile(r"<[^>]+>")
    URL_RE = re.compile(r"http[s]?://\S+|www\.\S+")
    CODE_RE = re.compile(r"`{1,3}.*?`{1,3}", re.DOTALL)
    NON_ASCII_RE = re.compile(r"[^\x00-\x7F]+")
    WHITESPACE_RE = re.compile(r"\s+")
    def clean_text(s: str) -> str:
        """Strip URLs, backtick code, tags and non-ASCII runs; collapse whitespace."""
        if not isinstance(s, str): return ""
        s = URL_RE.sub(" ", s)
        s = CODE_RE.sub(" ", s)
        s = CLEAN_HTML_RE.sub(" ", s)
        s = NON_ASCII_RE.sub(" ", s)
        s = WHITESPACE_RE.sub(" ", s).strip()
        return s

    def load_json_arr(path: Path):
        """Read a JSON array of records into (cleaned texts, int labels)."""
        data = json.loads(path.read_text(encoding="utf-8"))
        X = [clean_text(d["text"]) for d in data]
        y = [int(d["label"]) for d in data]
        return X, y

    def make_vectorizer_from_dictionary(dict_path: Path) -> TfidfVectorizer:
        """TF-IDF vectorizer whose vocabulary is the sorted set of non-blank dictionary lines."""
        vocab = sorted({w.strip() for w in dict_path.read_text("utf-8").splitlines() if w.strip()})
        return TfidfVectorizer(
            vocabulary=vocab,
            lowercase=True,
            dtype=np.float32,
            token_pattern=r"(?u)\b\w+\b",
            max_df=1.0,
            min_df=1,
            norm="l2",
            use_idf=True,
            smooth_idf=True,
            sublinear_tf=True,
        )

    def build_classic_models():
        """Fresh, unfitted instances of the five classic models, all seeded with ``random_state``."""
        return {
            "DecisionTree": DecisionTreeClassifier(max_depth=100, random_state=random_state),
            "RandomForest": RandomForestClassifier(n_estimators=200, n_jobs=-1, random_state=random_state),
            # saga shuffles the data, so it needs a seed to be reproducible.
            "Logistic":     LogisticRegression(max_iter=500, n_jobs=-1, solver="saga", penalty="l2", random_state=random_state),
            "LinearSVC":    LinearSVC(random_state=random_state),
            "MLP":          MLPClassifier(hidden_layer_sizes=(256,), activation="relu", max_iter=25, random_state=random_state),
        }

    def build_dnn(input_dim: int):
        """Two-hidden-layer softmax classifier over ``input_dim`` dense features."""
        model = keras.Sequential([
            layers.Input(shape=(input_dim,)),
            layers.Dense(512, activation="relu"),
            layers.Dropout(0.3),
            layers.Dense(256, activation="relu"),
            layers.Dropout(0.2),
            layers.Dense(2, activation="softmax"),
        ])
        model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
        return model

    def pred_binary(model, X):
        """0/1 predictions from ``predict``, falling back to ``decision_function`` then ``predict_proba``."""
        if hasattr(model, "predict"):
            try:
                return model.predict(X).astype(int)
            except Exception:
                pass
        try:
            df = model.decision_function(X)
            return (df > 0).astype(int)
        except Exception:
            if hasattr(model, "predict_proba"):
                proba = model.predict_proba(X)
                return np.argmax(proba, axis=1).astype(int)
            raise

    def pred_binary_dnn(dnn, X):
        """0/1 predictions of a Keras model; ``X`` is densified first."""
        P = dnn.predict(X.toarray(), verbose=0)
        return np.argmax(P, axis=1).astype(int)

    def majority_vote(preds_bin: list) -> np.ndarray:
        """Label 1 wherever at least half of the stacked 0/1 predictions are 1."""
        stacked = np.vstack(preds_bin)
        votes = stacked.sum(axis=0)
        return (votes >= (stacked.shape[0]/2.0)).astype(int)

    def ctc_predict(vec, models15: dict, dnns6: dict, X_text: list) -> np.ndarray:
        """Vectorise ``X_text`` and return the majority vote over all classic and DNN models."""
        X = vec.transform(X_text)
        preds = [pred_binary(m, X) for m in models15.values()]
        preds += [pred_binary_dnn(d, X) for d in dnns6.values()]
        return majority_vote(preds)

    # ---------- fetch authors' repo for dictionary + validation ----------
    CTC_REPO_ZIP = "https://codeload.github.com/epelofske-student/CTC/zip/refs/heads/main"
    DICT_REPO_PATH = "English_word_dictionary.txt"
    VAL_DIR_CYB = "validation_data_cybersecurity"
    VAL_DIR_NON = "validation_data_non_cybersecurity"

    repo_zip = DATA / "CTC-main.zip"
    repo_root = DATA / "CTC-main"
    if not repo_root.exists():
        stream_download(CTC_REPO_ZIP, repo_zip, "CTC-main.zip")
        with zipfile.ZipFile(repo_zip, "r") as z:
            z.extractall(DATA)

    DICT_PATH = repo_root / DICT_REPO_PATH
    VAL_CYB = repo_root / VAL_DIR_CYB
    VAL_NON = repo_root / VAL_DIR_NON
    assert DICT_PATH.exists(), "Dictionary not found in repo"
    assert VAL_CYB.exists() and VAL_NON.exists(), "Validation folders missing from repo"

    # ---------- load your data ----------
    Rp, Sp, Ap, Cp = Path(reddit_path), Path(stack_path), Path(arxiv_path), Path(combined_path)
    for p in [Rp, Sp, Ap, Cp]:
        if not Path(p).exists():
            raise FileNotFoundError(f"Missing file: {p}")

    Xr, yr = load_json_arr(Rp)
    Xs, ys = load_json_arr(Sp)
    Xa, ya = load_json_arr(Ap)
    Xall, yall = load_json_arr(Cp)

    # combined split (train/val/test)
    X_train_text, X_test_text, y_train, y_test = train_test_split(
        Xall, yall, test_size=test_size, random_state=random_state, stratify=yall
    )
    X_train_text, X_val_text, y_train, y_val = train_test_split(
        X_train_text, y_train, test_size=val_size_within_train, random_state=random_state, stratify=y_train
    )

    # The ensemble is scored on the combined TEST split but trained on the per-source
    # files. NOTE(review): presumably the combined file overlaps the per-source files;
    # if so, test documents would be seen during training. Any per-source text that
    # also sits in the combined TEST split is therefore dropped before training
    # (a no-op when the files do not overlap).
    held_out_texts = set(X_test_text)

    def drop_held_out(X_text: list, y: list):
        """Remove (text, label) pairs whose text belongs to the combined TEST split."""
        kept = [(t, lab) for t, lab in zip(X_text, y) if t not in held_out_texts]
        return [t for t, _ in kept], [lab for _, lab in kept], len(X_text) - len(kept)

    # ---------- vectorize with authors' dictionary ----------
    vec = make_vectorizer_from_dictionary(DICT_PATH)
    X_train = vec.fit_transform(X_train_text)
    X_val   = vec.transform(X_val_text)
    X_test  = vec.transform(X_test_text)
    joblib.dump(vec, MODELDIR / "tfidf_vectorizer.joblib")
    print("TF-IDF:", X_train.shape, X_val.shape, X_test.shape)

    # ---------- per-source training (5 classics + 2 DNN) ----------
    def train_source_bundle(source_name: str, X_text: list, y: list):
        """Train and save the classic models and DNN checkpoints for one source.

        Returns ``(classics, dnns, (X_test_matrix, y_test))`` with model dicts keyed
        by ``"<source>_<model>"``.
        """
        # Honour the caller's split fractions (they used to be hard-coded to 0.20 / 0.125 here).
        Xtr_txt, Xte_txt, ytr, yte = train_test_split(X_text, y, test_size=test_size, random_state=random_state, stratify=y)
        Xtr_txt, Xva_txt, ytr, yva = train_test_split(Xtr_txt, ytr, test_size=val_size_within_train, random_state=random_state, stratify=ytr)
        Xtr = vec.transform(Xtr_txt); Xva = vec.transform(Xva_txt); Xte = vec.transform(Xte_txt)

        classics = build_classic_models()
        trained = {}
        for name, model in classics.items():
            print(f"[{source_name}] Training {name} ...")
            # Only estimators that expose the parameter get balanced class weights;
            # a genuine set_params failure is no longer silently swallowed.
            if "class_weight" in model.get_params():
                model.set_params(class_weight="balanced")
            model.fit(Xtr, ytr)
            va_acc = accuracy_score(yva, model.predict(Xva))
            print(f"[{source_name}] {name} val acc: {va_acc:.4f}")
            joblib.dump(model, MODELDIR / f"{source_name}_{name}.joblib")
            trained[f"{source_name}_{name}"] = model

        print(f"[{source_name}] Training DNN (checkpoints near ~0.95 and ~0.99 val acc) ...")
        dnn = build_dnn(Xtr.shape[1])
        Xtr_d = Xtr.toarray(); Xva_d = Xva.toarray()
        best = { "DNN_t95": (None, 1e9), "DNN_t99": (None, 1e9) }  # (model, gap)

        targets = { "DNN_t95": 0.95, "DNN_t99": 0.99 }
        for ep in range(1, epochs_dnn_per_source+1):
            dnn.fit(Xtr_d, np.array(ytr), epochs=1, batch_size=batch_size, verbose=0)
            _, va_acc = dnn.evaluate(Xva_d, np.array(yva), verbose=0)
            print(f"[{source_name}] DNN epoch {ep}: val acc={va_acc:.4f}")
            for tag, targ in targets.items():
                # Keep, per target, the epoch whose validation accuracy is closest to it.
                gap = abs(va_acc - targ)
                if gap < best[tag][1]:
                    path = MODELDIR / f"{source_name}_{tag}.keras"
                    dnn.save(path)
                    # Reload from disk to get a frozen copy independent of further training.
                    best[tag] = (keras.models.load_model(path), gap)

        # With zero epochs no checkpoint exists; leave such entries out instead of voting with None.
        dnns = { f"{source_name}_{tag}": ckpt for tag, (ckpt, _) in best.items() if ckpt is not None }
        return trained, dnns, (Xte, yte)

    sources = {
        "Reddit": (Xr, yr),
        "Stackexchange": (Xs, ys),
        "arXiv": (Xa, ya),
    }

    all_classic, all_dnns, per_source_tests = {}, {}, {}
    for sname, (Xt, yt) in sources.items():
        Xt, yt, n_dropped = drop_held_out(Xt, yt)
        if n_dropped:
            print(f"[{sname}] Excluded {n_dropped} texts that belong to the combined TEST split")
        mcls, mdnns, (Xte_src, yte_src) = train_source_bundle(sname, Xt, yt)
        all_classic.update(mcls)
        all_dnns.update(mdnns)
        per_source_tests[sname] = (Xte_src, yte_src)

    print("Classic models:", len(all_classic), "| DNNs:", len(all_dnns))

    # ---------- evaluate on your held-out combined TEST ----------
    print("\n=== Combined TEST split ===")
    class_names = ["non-cybersecurity","cybersecurity"]
    yhat = ctc_predict(vec, all_classic, all_dnns, X_test_text)
    acc = accuracy_score(y_test, yhat)
    print(f"CTC ({len(all_classic) + len(all_dnns)}-model) TEST accuracy: {acc:.4f}")
    print(classification_report(y_test, yhat, target_names=class_names))
    test_report = classification_report(y_test, yhat, target_names=class_names, output_dict=True)

    # ---------- evaluate on authors' validation folders ----------
    def read_text_dir(dir_path: Path, max_files=None):
        """Return the cleaned text of every file under ``dir_path`` (sorted; unreadable files are reported and skipped)."""
        files = sorted([p for p in dir_path.rglob("*") if p.is_file()])
        if max_files is not None: files = files[:max_files]
        texts = []
        for p in files:
            try:
                texts.append(clean_text(p.read_text("utf-8", errors="ignore")))
            except OSError as exc:
                print(f"Skipping unreadable file {p}: {exc}")
        return texts

    print("\n=== Authors' original validation folders ===")
    X_cyb = read_text_dir(VAL_CYB)
    X_non = read_text_dir(VAL_NON)
    if not X_cyb or not X_non:
        raise ValueError(
            f"No readable validation texts (cybersecurity: {len(X_cyb)}, non-cybersecurity: {len(X_non)})"
        )

    yhat_cyb = ctc_predict(vec, all_classic, all_dnns, X_cyb)
    yhat_non = ctc_predict(vec, all_classic, all_dnns, X_non)

    acc_cyb = (yhat_cyb == 1).mean()
    acc_non = (yhat_non == 0).mean()
    fp_rate = (yhat_non == 1).mean()
    fn_rate = (yhat_cyb == 0).mean()

    print(f"Cybersecurity Val:      acc={acc_cyb:.4f}, FN={fn_rate:.4f}, N={len(X_cyb)}")
    print(f"Non-cybersecurity Val:  acc={acc_non:.4f}, FP={fp_rate:.4f}, N={len(X_non)}")

    # ---------- save index ----------
    index = {
        "vectorizer": str((MODELDIR / "tfidf_vectorizer.joblib").resolve()),
        "classics": sorted(list(all_classic.keys())),
        "dnns": sorted(list(all_dnns.keys()))
    }
    (MODELDIR / "ensemble_index.json").write_text(json.dumps(index, indent=2), encoding="utf-8")
    print("\nSaved ensemble index at:", MODELDIR / "ensemble_index.json")
    print("✅ Done.")

    return {
        "test_accuracy": float(acc),
        "test_report": test_report,
        "val_cyb_accuracy": float(acc_cyb),
        "val_cyb_fn_rate": float(fn_rate),
        "val_non_accuracy": float(acc_non),
        "val_non_fp_rate": float(fp_rate),
        "n_val_cyb": len(X_cyb),
        "n_val_non": len(X_non),
        "classics": index["classics"],
        "dnns": index["dnns"],
        "index_path": str(MODELDIR / "ensemble_index.json"),
    }

In [None]:
# ===== Mount + Locate + Run =====
from google.colab import drive
drive.mount("/content/drive", force_remount=True)

import os, glob, json
from pathlib import Path
from collections import Counter

# 1) Try the common location first
candidates = [
    "/content/drive/MyDrive/CTC_by_source",                 # standard My Drive
]
# 2) Only when that is absent, search recursively across all mounted Drive roots.
#    Walking the whole Drive is slow, so it is skipped when the standard folder
#    exists; results are sorted so the chosen folder is deterministic.
if not os.path.isdir(candidates[0]):
    candidates += sorted(glob.glob("/content/drive/**/CTC_by_source", recursive=True))

# Deduplicate while preserving order, keeping existing directories only
seen = set(); found = []
for p in candidates:
    if p not in seen and os.path.isdir(p):
        seen.add(p); found.append(p)

if not found:
    # Helpful listings so you can see where things are
    print("⚠️ Could not find CTC_by_source automatically.")
    print("Contents of /content/drive:", os.listdir("/content/drive"))
    if os.path.isdir("/content/drive/MyDrive"):
        print("Contents of /content/drive/MyDrive:", os.listdir("/content/drive/MyDrive"))
    raise FileNotFoundError("CTC_by_source folder not found. If it’s in a Shared Drive, check under /content/drive/Shareddrives/<DriveName>/CTC_by_source")

BASE_DIR = Path(found[0])
print("✅ Using folder:", BASE_DIR)

# --- Locate files inside the folder
def first_match(base: Path, patterns):
    """Return one file in ``base`` matching ``patterns``, or ``None`` if nothing matches.

    Patterns are tried in order and the first one with any match wins. Within that
    pattern the lexicographically smallest path is returned: ``Path.glob`` yields
    entries in filesystem order, so taking its first element could pick a different
    file from run to run when several files match.

    Args:
        base: directory to search (non-recursive unless a pattern says otherwise).
        patterns: iterable of glob patterns, most specific first.
    """
    for pat in patterns:
        matches = sorted(base.glob(pat))
        if matches:
            return matches[0]
    return None

# Glob patterns per input file, most specific first.
_FILE_PATTERNS = {
    "reddit":   ["CTC_Reddit_10k.json","*Reddit*10k*.json","*reddit*10k*.json"],
    "stack":    ["CTC_Stackexchange_10k.json","*Stack*10k*.json","*stack*10k*.json"],
    "arxiv":    ["CTC_arXiv_10k.json","*arXiv*10k*.json","*arxiv*10k*.json"],
    "combined": ["CTC_by_source_30k.json","*by_source*30k*.json","*combined*30k*.json"],
}
paths = {key: first_match(BASE_DIR, pats) for key, pats in _FILE_PATTERNS.items()}

print("\nFound files:")
for k, v in paths.items():
    print(f"  {k:9s} -> {v}")

# Stop early, naming every input that could not be located.
missing = []
for k, v in paths.items():
    if v is None or not Path(v).exists():
        missing.append(k)
if missing:
    raise FileNotFoundError(f"Missing required file(s): {missing}. Check names and that they’re in {BASE_DIR}")

# --- Tiny sanity check before running
def sniff(p: Path, n=3):
    """Print a quick summary of a JSON dataset file.

    Shows the file name, the number of records, the count of each integer label
    (records without a ``"label"`` key are not counted) and the first ``n`` texts,
    each cut to 120 characters with a trailing ellipsis when longer.
    """
    with open(Path(p), encoding="utf-8") as fh:
        records = json.load(fh)

    label_counts = Counter()
    for rec in records:
        if "label" in rec:
            label_counts[int(rec["label"])] += 1
    print(f"\n{p.name}: {len(records)} items | labels {dict(label_counts)}")

    for rec in records[:n]:
        text = rec.get("text","")
        preview = text[:120]
        if len(text) > 120:
            preview += "…"
        print("  -", preview)

# Print a short preview of every input file before the long run starts.
for k in ("reddit", "stack", "arxiv", "combined"):
    sniff(Path(paths[k]))

# --- Run the CTC pipeline with the detected paths
# run_ctc_pipeline() must already be defined by an earlier cell of this notebook.
_pipeline_kwargs = dict(
    reddit_path=str(paths["reddit"]),
    stack_path=str(paths["stack"]),
    arxiv_path=str(paths["arxiv"]),
    combined_path=str(paths["combined"]),
    epochs_dnn_per_source=6,   # tweak as you wish
    batch_size=512,
    random_state=42,
    test_size=0.20,
    val_size_within_train=0.125,
)
res = run_ctc_pipeline(**_pipeline_kwargs)

Mounted at /content/drive
✅ Using folder: /content/drive/MyDrive/CTC_by_source

Found files:
  reddit    -> /content/drive/MyDrive/CTC_by_source/CTC_Reddit_10k.json
  stack     -> /content/drive/MyDrive/CTC_by_source/CTC_Stackexchange_10k.json
  arxiv     -> /content/drive/MyDrive/CTC_by_source/CTC_arXiv_10k.json
  combined  -> /content/drive/MyDrive/CTC_by_source/CTC_by_source_30k.json

CTC_Reddit_10k.json: 20000 items | labels {0: 10000, 1: 10000}
  - Look both ways before crossing... ok, at least look one way... no, not that way.... 
  - Help: I need a cipher with a "false bottom". Hi! This is for a game, geocaching, so it doesn't need to be "super secure"…
  - Computer Science (i.e., the science of computation; not programming) Discord Server. There are very few Discord servers …

CTC_Stackexchange_10k.json: 20000 items | labels {1: 10000, 0: 10000}
  - What is the most successful virus/rootkit?. <p>"Successful" is rated by infection rate. </p>

<p>Which virus/rootkit/mal…
  - How 

In [None]:
# ===============================
# CTC "FAST" Runner (no DNNs)
# ===============================
# - Trains 5 classic models per source (15 total), majority vote ensemble
# - Optional per-source subsample to avoid long runtime
# - Uses the original dictionary TF-IDF
# - Saves models and prints metrics

from google.colab import drive
drive.mount("/content/drive", force_remount=True)

import os, json, zipfile, requests, joblib, re, time, random
import numpy as np
from pathlib import Path
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

# ----- CONFIG -----
# --- Input files -----------------------------------------------------------
BASE_DIR = Path("/content/drive/MyDrive/CTC_by_source")
REDDIT = BASE_DIR / "CTC_Reddit_10k.json"
STACK  = BASE_DIR / "CTC_Stackexchange_10k.json"
ARXIV  = BASE_DIR / "CTC_arXiv_10k.json"
COMBO  = BASE_DIR / "CTC_by_source_30k.json"

# --- Speed knobs -----------------------------------------------------------
# Maximum number of records kept per source file; None keeps everything.
LIMIT_PER_SOURCE = 4000
TEST_SIZE = 0.20
VAL_SIZE_WITHIN_TRAIN = 0.125
RANDOM_STATE = 42

# --- Output / scratch directories (created on first run) -------------------
WORKDIR = Path("ctc_fast")
WORKDIR.mkdir(exist_ok=True, parents=True)
MODELDIR = Path("ctc_models_fast")
MODELDIR.mkdir(exist_ok=True, parents=True)
DATA = WORKDIR / "data"
DATA.mkdir(exist_ok=True, parents=True)

# --- Authors' repository (dictionary + validation folders) -----------------
CTC_REPO_ZIP = "https://codeload.github.com/epelofske-student/CTC/zip/refs/heads/main"
DICT_REPO_PATH = "English_word_dictionary.txt"
VAL_DIR_CYB = "validation_data_cybersecurity"
VAL_DIR_NON = "validation_data_non_cybersecurity"

def stream_download(url: str, out_path: Path, desc: str = None):
    """Download ``url`` to ``out_path`` in 1 MiB chunks with a tqdm progress bar.

    A non-empty file already present at ``out_path`` is treated as a finished
    download and returned untouched. A new download is streamed into a sibling
    ``<name>.part`` file and only renamed to ``out_path`` once the whole body has
    been written, so an interrupted transfer cannot leave a truncated file that a
    later call would mistake for a complete one.

    Args:
        url: address to fetch.
        out_path: destination file.
        desc: progress-bar label; defaults to ``out_path.name``.

    Returns:
        ``out_path``.

    Raises:
        Whatever ``requests`` raises for connection problems, timeouts or HTTP
        error statuses (via ``raise_for_status``); the partial file is removed first.
    """
    if out_path.exists() and out_path.stat().st_size > 0:
        return out_path
    part_path = out_path.with_name(out_path.name + ".part")
    try:
        # Without a timeout a stalled connection would block the notebook forever.
        with requests.get(url, stream=True, timeout=60) as r:
            r.raise_for_status()
            # A missing Content-Length becomes None so tqdm shows an open-ended bar, not "0 of 0".
            total = int(r.headers.get("content-length", 0)) or None
            with open(part_path, "wb") as f, tqdm(total=total, unit="B", unit_scale=True, desc=desc or out_path.name) as pbar:
                for chunk in r.iter_content(1024*1024):
                    if chunk:
                        f.write(chunk); pbar.update(len(chunk))
        part_path.replace(out_path)
    except BaseException:
        # Also covers KeyboardInterrupt (a stopped cell): never keep a half-written file.
        if part_path.exists():
            part_path.unlink()
        raise
    return out_path

# Cleaning similar to earlier cells
# Patterns used by clean_text(); each match is replaced by a single space.
CLEAN_HTML_RE = re.compile(r"<[^>]+>")
URL_RE = re.compile(r"http[s]?://\S+|www\.\S+")
CODE_RE = re.compile(r"`{1,3}.*?`{1,3}", re.DOTALL)
NON_ASCII_RE = re.compile(r"[^\x00-\x7F]+")
WHITESPACE_RE = re.compile(r"\s+")

def clean_text(s: str) -> str:
    """Normalise one raw document to plain ASCII text.

    URLs, backtick-delimited code spans, ``<...>`` tags and runs of non-ASCII
    characters are each replaced by a space (in that order); whitespace runs are
    then collapsed and the result is stripped. Anything that is not a ``str``
    yields the empty string.
    """
    if not isinstance(s, str):
        return ""
    for pattern in (URL_RE, CODE_RE, CLEAN_HTML_RE, NON_ASCII_RE, WHITESPACE_RE):
        s = pattern.sub(" ", s)
    return s.strip()

def load_json_arr(path: Path, limit=None, seed=RANDOM_STATE):
    """Read a JSON array of records and return ``(texts, labels)``, optionally subsampled.

    Args:
        path: JSON file holding a list of records with ``"text"`` and ``"label"`` keys.
        limit: when given and smaller than the number of records, the records are
            shuffled with ``random.Random(seed)`` and only the first ``limit`` are kept.
        seed: seed for that shuffle.

    Returns:
        Texts passed through ``clean_text`` and labels converted with ``int``.
    """
    with path.open(encoding="utf-8") as fh:
        records = json.load(fh)

    needs_subsample = limit is not None and len(records) > limit
    if needs_subsample:
        random.Random(seed).shuffle(records)
        records = records[:limit]

    texts = [clean_text(rec["text"]) for rec in records]
    labels = [int(rec["label"]) for rec in records]
    return texts, labels

# 1) Make sure the authors' repository (which holds the dictionary) is unpacked locally
repo_root = DATA / "CTC-main"
repo_zip = DATA / "CTC-main.zip"
if not repo_root.exists():
    # Only hit the network / unpack when the extracted folder is not there yet.
    stream_download(CTC_REPO_ZIP, repo_zip, "CTC-main.zip")
    with zipfile.ZipFile(repo_zip, "r") as archive:
        archive.extractall(DATA)
# The word list that defines the fixed TF-IDF vocabulary must be present.
DICT_PATH = repo_root / DICT_REPO_PATH
assert DICT_PATH.exists(), "Dictionary not found in repo"

# 2) Load the three per-source files and the combined file (each optionally subsampled for speed)
print("Loading data…")
Xr, yr = load_json_arr(REDDIT, limit=LIMIT_PER_SOURCE)
Xs, ys = load_json_arr(STACK,  limit=LIMIT_PER_SOURCE)
Xa, ya = load_json_arr(ARXIV,  limit=LIMIT_PER_SOURCE)

# The combined file gets three times the per-source budget (or no cap at all).
if LIMIT_PER_SOURCE is None:
    _combined_limit = None
else:
    _combined_limit = 3*LIMIT_PER_SOURCE
Xall, yall = load_json_arr(COMBO, limit=_combined_limit)

# 3) Dictionary TF-IDF: the vocabulary is the sorted set of non-blank dictionary lines
vocab = sorted({w.strip() for w in DICT_PATH.read_text("utf-8").splitlines() if w.strip()})
vec = TfidfVectorizer(
    vocabulary=vocab,
    lowercase=True,
    dtype=np.float32,
    token_pattern=r"(?u)\b\w+\b",
    norm="l2",
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
)

# Stratified split of the combined data: test first, then validation out of the remainder.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    Xall,
    yall,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    stratify=yall,
)
X_train_text, X_val_text, y_train, y_val = train_test_split(
    X_train_text,
    y_train,
    test_size=VAL_SIZE_WITHIN_TRAIN,
    random_state=RANDOM_STATE,
    stratify=y_train,
)

# IDF weights are fitted on the training texts only; val/test are transformed with them.
X_train = vec.fit_transform(X_train_text)
X_val   = vec.transform(X_val_text)
X_test  = vec.transform(X_test_text)
joblib.dump(vec, MODELDIR / "tfidf_vectorizer.joblib")
print("TF-IDF shapes:", X_train.shape, X_val.shape, X_test.shape)

# 4) Classic models per source
def build_classic_models():
    """Create fresh, unfitted instances of the five classic classifiers.

    Every estimator that accepts ``random_state`` is seeded with ``RANDOM_STATE`` so
    repeated runs train the same models. That includes the ``saga`` logistic
    regression and ``LinearSVC``, whose solvers shuffle the data internally.

    Returns:
        dict mapping a short model name (``"DecisionTree"``, ``"RandomForest"``,
        ``"Logistic"``, ``"LinearSVC"``, ``"MLP"``) to a new estimator instance.
    """
    return {
        "DecisionTree": DecisionTreeClassifier(max_depth=100, random_state=RANDOM_STATE),
        "RandomForest": RandomForestClassifier(n_estimators=200, n_jobs=-1, random_state=RANDOM_STATE),
        "Logistic":     LogisticRegression(max_iter=500, n_jobs=-1, solver="saga", penalty="l2",
                                           random_state=RANDOM_STATE),
        "LinearSVC":    LinearSVC(random_state=RANDOM_STATE),
        "MLP":          MLPClassifier(hidden_layer_sizes=(256,), activation="relu", max_iter=25, random_state=RANDOM_STATE),
    }

def train_source_classics(source_name: str, X_text, y,
                          test_size: float = 0.20, val_size: float = 0.125):
    """Train every classic model on one source and save each fitted model.

    The texts are split into train / validation / test (stratified, seeded with
    ``RANDOM_STATE``), vectorized with the module-level fitted ``vec``, and each
    model from ``build_classic_models()`` is fitted on the train part. Models that
    expose a ``class_weight`` parameter are trained with ``class_weight="balanced"``.

    Args:
        source_name: Label used in log lines, saved file names and result keys.
        X_text: Cleaned texts of this source.
        y: Integer labels aligned with ``X_text``.
        test_size: Fraction of the source held out as the test split.
        val_size: Fraction of the remaining train part used as validation.

    Returns:
        ``(trained, (Xte, yte))``: ``trained`` maps ``"{source_name}_{model}"`` to the
        fitted estimator; ``Xte`` / ``yte`` are the vectorized test split and labels.
    """
    Xtr_txt, Xte_txt, ytr, yte = train_test_split(
        X_text, y, test_size=test_size, random_state=RANDOM_STATE, stratify=y)
    Xtr_txt, Xva_txt, ytr, yva = train_test_split(
        Xtr_txt, ytr, test_size=val_size, random_state=RANDOM_STATE, stratify=ytr)
    Xtr = vec.transform(Xtr_txt); Xva = vec.transform(Xva_txt); Xte = vec.transform(Xte_txt)

    trained = {}
    for name, model in build_classic_models().items():
        print(f"[{source_name}] Training {name} …")
        # Only estimators that declare the parameter get class balancing; checking the
        # declared parameters makes a swallow-all try/except unnecessary.
        if "class_weight" in model.get_params(deep=False):
            model.set_params(class_weight="balanced")
        model.fit(Xtr, ytr)
        va_acc = accuracy_score(yva, model.predict(Xva))
        print(f"[{source_name}] {name} val acc: {va_acc:.4f}")
        joblib.dump(model, MODELDIR / f"{source_name}_{name}.joblib")
        trained[f"{source_name}_{name}"] = model
    return trained, (Xte, yte)

def pred_binary(model, X):
    """Return integer class predictions for ``X`` using whatever API ``model`` offers.

    The methods are tried in this order:
      1. ``model.predict(X)`` cast to int;
      2. ``model.decision_function(X) > 0`` cast to int;
      3. ``argmax`` over ``model.predict_proba(X)`` columns.

    A failure in one step falls through to the next. If the decision-function step
    fails and the model has no ``predict_proba``, that failure is re-raised.
    """
    if hasattr(model, "predict"):
        try:
            labels = model.predict(X)
            return labels.astype(int)
        except Exception:
            # Best effort: fall through to the score-based fallbacks below.
            pass

    try:
        return (model.decision_function(X) > 0).astype(int)
    except Exception:
        if not hasattr(model, "predict_proba"):
            raise
        class_probs = model.predict_proba(X)
        return np.argmax(class_probs, axis=1).astype(int)

def majority_vote(preds_bin: list) -> np.ndarray:
    """Combine per-model 0/1 predictions into one prediction per sample.

    Args:
        preds_bin: One prediction array per model, all of the same length.

    Returns:
        Integer array with 1 where at least half of the models voted 1
        (an exact tie counts as 1) and 0 elsewhere.
    """
    ballots = np.vstack(preds_bin)
    n_voters = ballots.shape[0]
    positive_votes = ballots.sum(axis=0)
    threshold = n_voters / 2.0
    return np.greater_equal(positive_votes, threshold).astype(int)

def ctc_predict(vec, models15: dict, X_text: list) -> np.ndarray:
    """Predict with the ensemble: vectorize once, poll every model, take the majority.

    Args:
        vec: Fitted vectorizer exposing ``transform``.
        models15: Mapping of model name to fitted model; only the values are used.
        X_text: Texts to classify.

    Returns:
        The ``majority_vote`` of the models' ``pred_binary`` outputs.
    """
    features = vec.transform(X_text)
    ballots = []
    for member in models15.values():
        ballots.append(pred_binary(member, features))
    return majority_vote(ballots)

# Per-source (texts, labels) pairs; the keys become the prefix of every model name.
sources = {
    "Reddit": (Xr, yr),
    "Stackexchange": (Xs, ys),
    "arXiv": (Xa, ya),
}

# all_models collects every fitted model keyed "{source}_{model}";
# per_source_tests keeps each source's vectorized test split and labels.
all_models = {}
per_source_tests = {}
for sname, (Xt, yt) in sources.items():
    mcls, (Xte_src, yte_src) = train_source_classics(sname, Xt, yt)
    all_models.update(mcls)
    per_source_tests[sname] = (Xte_src, yte_src)

print("\nClassic models trained:", len(all_models))  # expected 15 = 3 sources x 5 classic models

# 5) Evaluate ensemble on your combined TEST split
# NOTE(review): the models were trained on splits of the per-source files. If COMBO
# contains the same documents, part of the combined test split may have been seen
# during training — confirm before reporting this number.
print("\n=== Combined TEST split (FAST) ===")
yhat = ctc_predict(vec, all_models, X_test_text)
acc = accuracy_score(y_test, yhat)
print(f"CTC (15 classic) TEST accuracy: {acc:.4f}")
print(classification_report(y_test, yhat, target_names=["non-cybersecurity","cybersecurity"]))

print("\n✅ FAST run complete. Models saved in:", MODELDIR.resolve())

In [None]:
# ============================================
# Per-Source DNN Trainer (resume-safe)
# ============================================

# --- Config you can tweak ---
BASE_DIR = "/content/drive/MyDrive/CTC_by_source"     # where your 10k/source JSONs live
MODEL_DIR = "ctc_models"                               # where vectorizer & classic models live
OUT_DIR   = "ctc_dnn_checkpoints"                      # where DNN checkpoints will be saved

EPOCHS_PER_CALL = 4        # how many epochs to run in THIS invocation (safe to call multiple times)
BATCH_SIZE = 512           # mini-batch size passed to Keras fit()
# Seed for the data subsampling and the train/val/test splits.
# NOTE(review): no TensorFlow/Keras seed is set in the visible code, so weight
# initialisation and batch shuffling are presumably not covered by this seed.
RANDOM_STATE = 42
# Tag -> target validation accuracy. For each tag the trainer keeps the checkpoint
# whose validation accuracy is closest to the target.
TARGETS = {"DNN_t95": 0.95, "DNN_t99": 0.99}  # we keep checkpoints closest to these val accuracies

# Optional cap on the number of records loaded per source (e.g. for small VRAM);
# set to None to use all data from the 10k file
LIMIT_PER_SOURCE = None    # e.g., 6000 to speed up quick runs

# -------------------------------------------
# Imports
import os, json, re, zipfile, requests, joblib, glob
import numpy as np
from pathlib import Path
from tqdm import tqdm
from collections import Counter

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Colab niceties: mount Google Drive (BASE_DIR points into it) and tune TensorFlow.
from google.colab import drive
drive.mount("/content/drive", force_remount=False)
# NOTE(review): TensorFlow is already imported above this line; the log-level
# variable is normally read when the library loads, so it presumably only takes
# effect if it was also set before the import (e.g. in an earlier cell) — confirm.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
# Ask TensorFlow to grow GPU memory on demand. This is best effort: a failure is
# reported instead of being silently swallowed, and training then proceeds with
# TensorFlow's default allocation behaviour.
try:
    for g in tf.config.list_physical_devices("GPU"):
        tf.config.experimental.set_memory_growth(g, True)
except Exception as err:
    print(f"⚠️ Could not enable GPU memory growth ({type(err).__name__}: {err}); "
          f"continuing with TensorFlow defaults.")

# -------------------------------------------
# Helpers

# Patterns used by clean_text; every match is replaced by a single space.
CLEAN_HTML_RE = re.compile(r"<[^>]+>")                 # anything between "<" and the next ">"
URL_RE = re.compile(r"http[s]?://\S+|www\.\S+")        # URL up to the next whitespace
CODE_RE = re.compile(r"`{1,3}.*?`{1,3}", re.DOTALL)    # backtick-delimited spans, may cross lines
NON_ASCII_RE = re.compile(r"[^\x00-\x7F]+")            # runs of non-ASCII characters
WHITESPACE_RE = re.compile(r"\s+")                     # runs of whitespace

def clean_text(s: str) -> str:
    """Normalise one raw document into plain ASCII text with single spaces.

    URLs, backtick-delimited code, HTML-like tags and non-ASCII runs are replaced
    by a space, in that order. Whitespace runs are then collapsed and the result
    is stripped. Because URLs are removed first and extend to the next
    whitespace, any characters glued to a URL are removed with it.

    Args:
        s: Raw text. Any non-``str`` value yields ``""``.

    Returns:
        The cleaned string.
    """
    if not isinstance(s, str):
        return ""
    cleaned = s
    for pattern in (URL_RE, CODE_RE, CLEAN_HTML_RE, NON_ASCII_RE):
        cleaned = pattern.sub(" ", cleaned)
    return WHITESPACE_RE.sub(" ", cleaned).strip()

def load_ctc_json(p: Path, limit=None, seed=RANDOM_STATE):
    """Load one source file and return cleaned texts with integer labels.

    Args:
        p: JSON file whose top-level value is a list of ``{"text", "label"}`` records.
        limit: Optional cap on the number of records. A falsy value (``None`` or 0)
            keeps everything. When the file holds more than ``limit`` records, a
            seeded NumPy permutation selects which ``limit`` records are kept.
        seed: Seed for ``np.random.default_rng`` used by the subsampling.

    Returns:
        ``(X, y)``: ``clean_text``-processed texts and ``int`` labels, aligned.
    """
    with p.open(encoding="utf-8") as fh:
        records = json.load(fh)

    if limit and len(records) > limit:
        order = np.random.default_rng(seed).permutation(len(records))
        records = [records[i] for i in order[:limit]]

    texts = [clean_text(rec["text"]) for rec in records]
    labels = [int(rec["label"]) for rec in records]
    return texts, labels

def build_dnn(input_dim: int):
    """Build and compile the feed-forward classifier used for every source.

    Architecture: input of width ``input_dim``, then Dense(512, relu) + Dropout(0.30),
    Dense(256, relu) + Dropout(0.20), and a 2-unit softmax output. Compiled with the
    Adam optimizer, sparse categorical cross-entropy loss and an accuracy metric.

    Args:
        input_dim: Number of input features per sample.

    Returns:
        The compiled ``keras.Sequential`` model.
    """
    stack = [layers.Input(shape=(input_dim,))]
    # Hidden blocks as (units, dropout rate), created in the same order as listed.
    for units, drop_rate in ((512, 0.30), (256, 0.20)):
        stack.append(layers.Dense(units, activation="relu"))
        stack.append(layers.Dropout(drop_rate))
    stack.append(layers.Dense(2, activation="softmax"))

    net = keras.Sequential(stack)
    net.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
    return net

def latest_epoch_ckpt(ckpt_dir: Path):
    """Return the per-epoch checkpoint with the highest epoch number, or ``None``.

    Checkpoints are the files matching ``epoch_*.keras`` in ``ckpt_dir``. They are
    ranked by the integer after the last underscore of the file stem, not by file
    name, so ``epoch_1000`` correctly beats ``epoch_999``. Files whose suffix is
    not an integer (e.g. ``epoch_best.keras``) are ignored, since callers parse the
    epoch number from the returned name.

    Args:
        ckpt_dir: Directory holding the checkpoints.

    Returns:
        ``Path`` of the newest numbered checkpoint, or ``None`` when there is none.
    """
    numbered = []
    for path in ckpt_dir.glob("epoch_*.keras"):
        try:
            epoch = int(path.stem.split("_")[-1])
        except ValueError:
            continue  # not a numbered epoch checkpoint
        # The name is a deterministic tie-breaker (e.g. epoch_1 vs epoch_001).
        numbered.append((epoch, path.name, path))
    if not numbered:
        return None
    return max(numbered)[2]

def save_meta(meta_path: Path, meta: dict):
    """Write ``meta`` to ``meta_path`` as indented UTF-8 JSON.

    The JSON is first written to a sibling ``<name>.tmp`` file and then renamed over
    ``meta_path``. An interruption during the write therefore leaves the previous
    meta file in place instead of a truncated one.

    Args:
        meta_path: Destination file.
        meta: JSON-serialisable dictionary.
    """
    payload = json.dumps(meta, indent=2)
    tmp_path = meta_path.with_name(meta_path.name + ".tmp")
    tmp_path.write_text(payload, encoding="utf-8")
    tmp_path.replace(meta_path)

def load_meta(meta_path: Path):
    """Load the training meta file, falling back to defaults when it is unusable.

    A missing file yields the default meta. A file that cannot be read, is not
    valid JSON, or does not have the expected dictionary structure is reported
    with a printed warning and also yields the default meta. A valid file is
    returned with any missing default keys filled in.

    Args:
        meta_path: Location of ``meta.json``.

    Returns:
        dict with ``"best"`` (tag -> ``{"acc", "gap", "path"}``) and
        ``"epochs_trained"``.
    """
    defaults = {
        "best": { "DNN_t95": {"acc": 0.0, "gap": 1e9, "path": None},
                  "DNN_t99": {"acc": 0.0, "gap": 1e9, "path": None}},
        "epochs_trained": 0
    }
    if not meta_path.exists():
        return defaults

    try:
        loaded = json.loads(meta_path.read_text(encoding="utf-8"))
    except (OSError, ValueError) as err:
        # JSON and text-decoding errors are both ValueError subclasses.
        print(f"⚠️ Ignoring unreadable meta file {meta_path} ({type(err).__name__}: {err}); using defaults.")
        return defaults

    if not isinstance(loaded, dict) or not isinstance(loaded.get("best", {}), dict):
        print(f"⚠️ Ignoring malformed meta file {meta_path}; using defaults.")
        return defaults

    # Backfill whatever an older or partial file lacks, keeping stored values.
    best = loaded.setdefault("best", {})
    for tag, blank in defaults["best"].items():
        best.setdefault(tag, blank)
    loaded.setdefault("epochs_trained", defaults["epochs_trained"])
    return loaded

def get_source_paths(source_name: str) -> Path:
    """Resolve the JSON data file of a source inside ``BASE_DIR``.

    Args:
        source_name: One of ``"Reddit"``, ``"Stackexchange"`` or ``"arXiv"``.

    Returns:
        Path of the existing data file for that source.

    Raises:
        ValueError: ``source_name`` is not one of the known sources.
        FileNotFoundError: The expected file does not exist under ``BASE_DIR``.
    """
    base = Path(BASE_DIR)
    filenames = {
        "Reddit":       "CTC_Reddit_10k.json",
        "Stackexchange":"CTC_Stackexchange_10k.json",
        "arXiv":        "CTC_arXiv_10k.json",
    }
    if source_name not in filenames:
        raise ValueError("source_name must be one of: 'Reddit', 'Stackexchange', 'arXiv'")

    candidate = base / filenames[source_name]
    if candidate.exists():
        return candidate
    raise FileNotFoundError(f"Not found: {candidate}")

def load_vectorizer_or_fail():
    """Load the fitted TF-IDF vectorizer stored under ``MODEL_DIR``.

    Returns:
        The object stored in ``MODEL_DIR/tfidf_vectorizer.joblib``.

    Raises:
        FileNotFoundError: The vectorizer file does not exist yet.
    """
    vec_path = Path(MODEL_DIR) / "tfidf_vectorizer.joblib"
    if vec_path.exists():
        # joblib files are pickle-based: only load artifacts this notebook produced.
        return joblib.load(vec_path)
    raise FileNotFoundError(f"Can't find vectorizer at {vec_path}. "
                            f"Run the FAST runner or the full pipeline once to create it.")

# -------------------------------------------
# Main callable

def train_dnn_source(source_name: str,
                     epochs_this_call: int = EPOCHS_PER_CALL,
                     batch_size: int = BATCH_SIZE,
                     limit_per_source = LIMIT_PER_SOURCE):
    """
    Train or resume the DNN for a single source.

    The source is split 80/20, then 12.5% of the train part becomes validation
    (≈10% of the original). The 20% test part is not used by this function. Texts
    are vectorized with the saved TF-IDF vectorizer and densified. Training resumes
    from the newest per-epoch checkpoint when one exists, otherwise a fresh model is
    built.

    Args:
        source_name: 'Reddit', 'Stackexchange' or 'arXiv'.
        epochs_this_call: Number of additional epochs to run in this invocation.
        batch_size: Mini-batch size for fitting.
        limit_per_source: Optional cap on the number of records loaded.

    Saves:
      - per-epoch checkpoints: {OUT_DIR}/{source}/epoch_{N:03d}.keras
      - best-by-target:        {OUT_DIR}/{source}/{source}_{tag}.keras for each tag in TARGETS
      - meta.json with best accuracies & epochs_trained (rewritten after every epoch)

    Raises:
        ValueError / FileNotFoundError: from the source-path and vectorizer lookups.
    """
    print(f"\n=== DNN training for source: {source_name} ===")
    src_path = get_source_paths(source_name)
    vec = load_vectorizer_or_fail()

    # Load data for this source
    X, y = load_ctc_json(src_path, limit=limit_per_source)
    Xtr_txt, _Xte_txt, ytr, _yte = train_test_split(X, y, test_size=0.20, random_state=RANDOM_STATE, stratify=y)
    Xtr_txt, Xva_txt, ytr, yva = train_test_split(Xtr_txt, ytr, test_size=0.125, random_state=RANDOM_STATE, stratify=ytr)

    # Vectorize and densify (DNN needs dense); convert labels once, not once per epoch
    Xtr = vec.transform(Xtr_txt).toarray()
    Xva = vec.transform(Xva_txt).toarray()
    ytr_arr = np.array(ytr)
    yva_arr = np.array(yva)

    # Prepare dirs
    out_root = Path(OUT_DIR); out_root.mkdir(parents=True, exist_ok=True)
    ckpt_dir = out_root / source_name; ckpt_dir.mkdir(parents=True, exist_ok=True)
    meta_path = ckpt_dir / "meta.json"
    meta = load_meta(meta_path)

    # A stored meta file may predate a change to TARGETS: make sure every current
    # tag has an entry so the comparison below cannot fail on a missing key.
    best = meta.setdefault("best", {})
    for tag in TARGETS:
        best.setdefault(tag, {"acc": 0.0, "gap": 1e9, "path": None})

    # Create or resume model
    latest = latest_epoch_ckpt(ckpt_dir)
    if latest:
        print(f"Resuming from latest checkpoint: {latest.name}")
        dnn = keras.models.load_model(latest)
        # The epoch number is encoded in the file name: epoch_<N>.keras
        start_epoch = int(latest.stem.split("_")[-1]) + 1
    else:
        print("No prior checkpoint — starting fresh.")
        dnn = build_dnn(Xtr.shape[1])
        start_epoch = 1

    # Train a few epochs this call; one fit() call per epoch so that every epoch
    # can be evaluated and checkpointed individually.
    for ep in range(start_epoch, start_epoch + epochs_this_call):
        dnn.fit(Xtr, ytr_arr, epochs=1, batch_size=batch_size, verbose=1)
        _va_loss, va_acc = dnn.evaluate(Xva, yva_arr, verbose=0)
        print(f"[{source_name}] epoch {ep} — val acc: {va_acc:.4f}")

        # Save epoch checkpoint
        ep_path = ckpt_dir / f"epoch_{ep:03d}.keras"
        dnn.save(ep_path)

        # Update best-by-target checkpoints: keep the snapshot whose validation
        # accuracy is closest to each target.
        for tag, targ in TARGETS.items():
            gap = abs(float(va_acc) - targ)
            if gap < best[tag]["gap"]:
                best_path = ckpt_dir / f"{source_name}_{tag}.keras"
                dnn.save(best_path)
                best[tag] = {"acc": float(va_acc), "gap": float(gap), "path": str(best_path)}
                print(f"  ↳ Updated {tag} ({targ:.2f}) → acc={va_acc:.4f}, saved: {best_path.name}")

        # Persist progress after every epoch so an interrupted call can be resumed.
        meta["epochs_trained"] = ep
        save_meta(meta_path, meta)

    print("\n✅ Done for", source_name)
    print("Best snapshots so far:")
    for tag, info in meta["best"].items():
        print(f"  {tag}: acc≈{info['acc']:.4f}, path={info['path']}")
    print(f"All checkpoints in: {ckpt_dir.resolve()}")