### Paths & configuration

In [2]:
# Paths & config
from pathlib import Path

CANDIDATES = [
    Path(r"D:\Adelaide\2ndTrimester2 2025\Using Machine Learning Tools_4536_COMP_SCI_X_0001\A3"),
    Path.cwd()
]
A3_DIR = next((p for p in CANDIDATES if (p / "train_ai.npy").exists()), Path.cwd())

DATA_DIR   = A3_DIR
SUBMIT_DIR = A3_DIR / "submissions"
SUBMIT_DIR.mkdir(parents=True, exist_ok=True)

# Memory / speed knobs
BATCH_TRAIN = 24   # streaming training for batch size (lower if RAM is tight)
BATCH_TEST  = 256  # streaming test prediction for batch size

print("DATA_DIR:", DATA_DIR)
print("Files:", [p.name for p in DATA_DIR.iterdir() if p.is_file()][:10], "...")


DATA_DIR: D:\Adelaide\2ndTrimester2 2025\Using Machine Learning Tools_4536_COMP_SCI_X_0001\A3
Files: ['achala.ipynb', 'ALFIE_pre_processing.ipynb', 'best_meta.json', 'best_name.txt', 'best_sgd_stream.joblib', 'best_sklearn.joblib', 'kaggle_language_detection_advanced.ipynb', 'sample.csv', 'Sanjida_Amrin_a1934493.ipynb', 'test_features.jsonl'] ...


### Helper functions

In [4]:
# Helper functions: JSONL resolving, pooling, metrics, etc.
import json, gc, numpy as np
from pathlib import Path
from sklearn.metrics import f1_score, roc_auc_score

def resolve_jsonl(p: Path) -> Path:
    """
    Handling the cases where a *folder* is named 'validation.jsonl' or 'test_features.jsonl'.
    And returns a file path to the actual .jsonl file.
    """
    p = Path(p)
    if p.is_file():
        return p
    if p.is_dir():
        cands = list(p.glob("*.jsonl")) or list(p.rglob("*.jsonl"))
        if not cands:
            raise FileNotFoundError(f"No .jsonl found in {p}")
        # Prefer expected filenames if present
        for expected in ("validation.jsonl", "test_features.jsonl"):
            for c in cands:
                if c.name.lower() == expected:
                    print(f"Resolved {p} -> {c}")
                    return c
        print(f"Resolved {p} -> {cands[0]}")
        return cands[0]
    raise FileNotFoundError(p)

def read_jsonl(path: Path):
    items = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            if line.strip():
                items.append(json.loads(line))
    return items

def pool_stats_batch(x: np.ndarray) -> np.ndarray:
    """
    Input:  x of shape (B, L, 768)
    Output: (B, 3072) by concatenating [mean, std, max, min] along the time axis L.
    """
    m  = x.mean(axis=1)
    s  = x.std(axis=1)
    mx = x.max(axis=1)
    mn = x.min(axis=1)
    return np.concatenate([m, s, mx, mn], axis=1).astype(np.float32)

def pool_stats_flex(sample) -> np.ndarray:
    """
    Flexible pooling for any (L,768) or (768,) or k*768 shaped data -> (3072,) features
    by concatenating [mean, std, max, min] across the sequence axis.
    """
    x = np.asarray(sample, dtype=np.float32)
    if x.ndim == 1:
        if x.size == 768:
            m = x; s = np.zeros_like(x); mx = x; mn = x
            return np.concatenate([m, s, mx, mn]).astype(np.float32)
        if x.size % 768 == 0:
            x = x.reshape(-1, 768)
        else:
            vec = (np.pad(x, (0, 768 - x.size)) if x.size < 768 else x[:768])
            m = vec; s = np.zeros_like(vec); mx = vec; mn = vec
            return np.concatenate([m, s, mx, mn]).astype(np.float32)
    if x.ndim == 2:
        if x.shape[1] == 768:
            pass
        elif x.shape[0] == 768:
            x = x.T
        elif (x.size % 768) == 0:
            x = x.reshape(-1, 768)
        else:
            x = x[:, :768] if x.shape[1] > 768 else np.pad(x, ((0,0),(0,768-x.shape[1])), 'constant')
    else:
        if (x.size % 768) == 0:
            x = x.reshape(-1, 768)
        else:
            vec = (np.pad(x.ravel(), (0, 768 - x.size)) if x.size < 768 else x.ravel()[:768])
            m = vec; s = np.zeros_like(vec); mx = vec; mn = vec
            return np.concatenate([m, s, mx, mn]).astype(np.float32)
    m, s, mx, mn = x.mean(0), x.std(0), x.max(0), x.min(0)
    return np.concatenate([m, s, mx, mn]).astype(np.float32)

def n_rows_in_npy(npy_path: Path) -> int:
    """Return number of items in a .npy array (supports memmap)."""
    arr = np.load(npy_path, mmap_mode="r")
    n = arr.shape[0] if hasattr(arr, "shape") else len(arr)
    del arr
    return int(n)

def expit_stable(z):
    """Numerically stable sigmoid that avoids overflow for large |z|."""
    z = np.asarray(z, dtype=np.float64)
    out = np.empty_like(z)
    pos = z >= 0
    out[pos]  = 1.0 / (1.0 + np.exp(-z[pos]))
    ez = np.exp(z[~pos])
    out[~pos] = ez / (1.0 + ez)
    return out.astype(np.float32)

def optimal_threshold(y_true, proba):
    """
    Return threshold with best F1 on y_true, plus F1 and AUC.
    """
    ts = np.linspace(0.05, 0.95, 19)
    best_t, best_f1 = 0.5, -1
    for t in ts:
        f1 = f1_score(y_true, (proba >= t).astype(int))
        if f1 > best_f1:
            best_f1, best_t = f1, t
    auc = roc_auc_score(y_true, proba)
    return best_t, best_f1, auc


### Training, Validating & Saving (streaming SGD)

In [5]:
# Streaming StandardScaler + SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDClassifier
import joblib

ai_path = DATA_DIR / "train_ai.npy"
hu_path = DATA_DIR / "train_human.npy"

# For balanced learning via sample_weight class weights
n_pos = n_rows_in_npy(ai_path)
n_neg = n_rows_in_npy(hu_path)
total = n_pos + n_neg
w1 = total / (2.0 * n_pos)  # weight for class 1 (AI)
w0 = total / (2.0 * n_neg)  # weight for class 0 (Human)
print(f"class weights -> w0={w0:.4f}, w1={w1:.4f}  (n0={n_neg}, n1={n_pos})")

# In streaming mode fit StandardScaler 
scaler = StandardScaler()
for path in [ai_path, hu_path]:
    arr = np.load(path, mmap_mode="r")
    if arr.ndim == 3:  # (N, 100, 768)
        n = arr.shape[0]
        for start in range(0, n, BATCH_TRAIN):
            end = min(start + BATCH_TRAIN, n)
            Xb = pool_stats_batch(np.asarray(arr[start:end]))
            scaler.partial_fit(Xb)
            if start % (BATCH_TRAIN*40) == 0:
                print(f"[scaler] {path.name}: {end}/{n}")
    else:  # object array
        for i in range(len(arr)):
            Xb = pool_stats_flex(arr[i])[None, :]
            scaler.partial_fit(Xb)
            if i % 1000 == 0 and i > 0:
                print(f"[scaler] {path.name}: {i}/{len(arr)}")
    del arr; gc.collect()

# Training SGDClassifier (logistic) with partial_fit + per-sample weights
clf = SGDClassifier(loss="log_loss", penalty="l2", alpha=1e-4,
                    max_iter=1, tol=None, random_state=42)
first = True
classes = np.array([0, 1], dtype=int)

for path, label in [(ai_path, 1), (hu_path, 0)]:
    arr = np.load(path, mmap_mode="r")
    if arr.ndim == 3:
        n = arr.shape[0]
        for start in range(0, n, BATCH_TRAIN):
            end = min(start + BATCH_TRAIN, n)
            Xb = pool_stats_batch(np.asarray(arr[start:end]))
            Xb = scaler.transform(Xb).astype(np.float32, copy=False)
            yb = np.full(len(Xb), label, dtype=int)
            sw = np.where(yb == 1, w1, w0).astype(np.float32)  # per-sample weights
            if first:
                clf.partial_fit(Xb, yb, classes=classes, sample_weight=sw); first = False
            else:
                clf.partial_fit(Xb, yb, sample_weight=sw)
            if start % (BATCH_TRAIN*40) == 0:
                print(f"[sgd] {path.name}: {end}/{n}")
    else:
        for i in range(len(arr)):
            Xb = pool_stats_flex(arr[i])[None, :]
            Xb = scaler.transform(Xb).astype(np.float32, copy=False)
            yb = np.array([label], dtype=int)
            sw = np.array([w1 if label == 1 else w0], dtype=np.float32)
            if first:
                clf.partial_fit(Xb, yb, classes=classes, sample_weight=sw); first = False
            else:
                clf.partial_fit(Xb, yb, sample_weight=sw)
            if i % 1000 == 0 and i > 0:
                print(f"[sgd] {path.name}: {i}/{len(arr)}")
    del arr; gc.collect()

# Validation (As a tiny set: treating metrics cautiously)
VAL_PATH = resolve_jsonl(DATA_DIR / "validation.jsonl")
vitems = read_jsonl(VAL_PATH)
y_val  = np.array([it["label"] for it in vitems], dtype=int)
X_val  = np.vstack([pool_stats_flex(it["features"]) for it in vitems]).astype(np.float32)
X_val  = scaler.transform(X_val).astype(np.float32, copy=False)

scores_val = clf.decision_function(X_val)
proba_val  = expit_stable(scores_val)
thr, f1, auc = optimal_threshold(y_val, proba_val)
print(f"Validation: AUC={auc:.4f}  F1@best={f1:.4f}  thr={thr:.2f}")

# Saveing bundle + meta (model + scaler bundled for reuse)
import json as _json
bundle_path = DATA_DIR / "best_sgd_stream.joblib"
joblib.dump({"scaler": scaler, "model": clf}, bundle_path)
(DATA_DIR / "best_name.txt").write_text("sgd_stream")
(DATA_DIR / "best_meta.json").write_text(_json.dumps({"name": "sgd_stream", "thr": float(thr)}, indent=2))
print("Saved:", bundle_path)


class weights -> w0=1.0000, w1=1.0000  (n0=8161, n1=8161)
[scaler] train_ai.npy: 24/8161
[scaler] train_ai.npy: 984/8161
[scaler] train_ai.npy: 1944/8161
[scaler] train_ai.npy: 2904/8161
[scaler] train_ai.npy: 3864/8161
[scaler] train_ai.npy: 4824/8161
[scaler] train_ai.npy: 5784/8161
[scaler] train_ai.npy: 6744/8161
[scaler] train_ai.npy: 7704/8161
[scaler] train_human.npy: 24/8161
[scaler] train_human.npy: 984/8161
[scaler] train_human.npy: 1944/8161
[scaler] train_human.npy: 2904/8161
[scaler] train_human.npy: 3864/8161
[scaler] train_human.npy: 4824/8161
[scaler] train_human.npy: 5784/8161
[scaler] train_human.npy: 6744/8161
[scaler] train_human.npy: 7704/8161
[sgd] train_ai.npy: 24/8161
[sgd] train_ai.npy: 984/8161
[sgd] train_ai.npy: 1944/8161
[sgd] train_ai.npy: 2904/8161
[sgd] train_ai.npy: 3864/8161
[sgd] train_ai.npy: 4824/8161
[sgd] train_ai.npy: 5784/8161
[sgd] train_ai.npy: 6744/8161
[sgd] train_ai.npy: 7704/8161
[sgd] train_human.npy: 24/8161
[sgd] train_human.npy: 984/81

### Generating submissions (raw sigmoid + Platt calibrated)

In [6]:
# Raw sigmoid + Platt-calibrated using validation
import joblib
from sklearn.linear_model import LogisticRegression
import pandas as pd

# Loading bundle
bundle = joblib.load(DATA_DIR / "best_sgd_stream.joblib")
scaler = bundle["scaler"]; clf = bundle["model"]

# Building validation for Platt calibration
VAL_PATH = resolve_jsonl(DATA_DIR / "validation.jsonl")
vitems   = read_jsonl(VAL_PATH)
y_val    = np.array([it["label"] for it in vitems], dtype=int)
X_val    = np.vstack([pool_stats_flex(it["features"]) for it in vitems]).astype(np.float32)
X_val    = scaler.transform(X_val).astype(np.float32, copy=False)
val_scores = clf.decision_function(X_val).reshape(-1, 1)

# Fitting calibrator
calib = LogisticRegression(max_iter=200, solver="lbfgs")
calib.fit(val_scores, y_val)

# Streaming test once, compute raw scores
TEST_PATH = resolve_jsonl(DATA_DIR / "test_features.jsonl")
tids, scores, buf = [], [], []
with open(TEST_PATH, "r", encoding="utf-8") as f:
    for line in f:
        if not line.strip(): continue
        obj = json.loads(line)
        tids.append(obj["id"])
        buf.append(obj["features"])
        if len(buf) == BATCH_TEST:
            Xb = np.vstack([pool_stats_flex(b) for b in buf]).astype(np.float32)
            Xb = scaler.transform(Xb).astype(np.float32, copy=False)
            scores.append(clf.decision_function(Xb))
            buf = []
    if buf:
        Xb = np.vstack([pool_stats_flex(b) for b in buf]).astype(np.float32)
        Xb = scaler.transform(Xb).astype(np.float32, copy=False)
        scores.append(clf.decision_function(Xb))

scores = np.concatenate(scores).reshape(-1, 1)

# Two sets of probabilities
proba_raw   = expit_stable(scores.ravel())         # raw sigmoid
proba_platt = calib.predict_proba(scores)[:, 1]    # calibrated

# Write both Kaggle files
sub_raw   = pd.DataFrame({"id": tids, "probability": proba_raw})
sub_platt = pd.DataFrame({"id": tids, "probability": proba_platt})

out_raw   = SUBMIT_DIR / "submission_expit.csv"
out_platt = SUBMIT_DIR / "submission_platt.csv"
sub_raw.to_csv(out_raw, index=False)
sub_platt.to_csv(out_platt, index=False)
print("Wrote:", out_raw)
print("Wrote:", out_platt)

# Quick sanity-check
print("RAW   probs ->", "min:", float(proba_raw.min()),  "max:", float(proba_raw.max()),
      "mean:", float(proba_raw.mean()),  "std:", float(proba_raw.std()))
print("PLATT probs ->", "min:", float(proba_platt.min()),"max:", float(proba_platt.max()),
      "mean:", float(proba_platt.mean()),"std:", float(proba_platt.std()))

display(sub_raw.head(), sub_platt.head())


Wrote: D:\Adelaide\2ndTrimester2 2025\Using Machine Learning Tools_4536_COMP_SCI_X_0001\A3\submissions\submission_expit.csv
Wrote: D:\Adelaide\2ndTrimester2 2025\Using Machine Learning Tools_4536_COMP_SCI_X_0001\A3\submissions\submission_platt.csv
RAW   probs -> min: 0.0 max: 1.0 mean: 0.2611111104488373 std: 0.43924033641815186
PLATT probs -> min: 0.013428269701594017 max: 0.9990216546697579 mean: 0.39289330784319737 std: 0.30235123121346336


Unnamed: 0,id,probability
0,15,0.0
1,16,0.0
2,17,0.0
3,18,0.0
4,19,0.0


Unnamed: 0,id,probability
0,15,0.203387
1,16,0.466815
2,17,0.059594
3,18,0.234636
4,19,0.191696
