In [None]:
# Print the path of the Python interpreter backing this kernel, so the reader can
# confirm which environment the notebook is executing in.
import sys
print(sys.executable)

In [None]:
# Import

import fasttext
from pathlib import Path

In [None]:
# --> Get model params

def get_supervised_params_from_model(m):
    """Collect a trained model's hyperparameters into a plain dict for reuse.

    Every value except ``loss`` is read from the args object returned by
    ``m.f.getArgs()``; ``loss`` is read from the model wrapper itself (``m.loss``).

    Args:
        m: model object exposing ``f.getArgs()`` and a ``loss`` attribute.

    Returns:
        dict mapping hyperparameter name -> value, with ``"loss"`` as the last key.
    """
    args = m.f.getArgs()
    arg_fields = (
        "lr", "dim", "ws", "epoch", "minCount", "minn", "maxn",
        "neg", "wordNgrams", "bucket", "lrUpdateRate", "t",
    )
    params = {field: getattr(args, field) for field in arg_fields}
    # `loss` deliberately comes from the wrapper, not from the args object.
    params["loss"] = m.loss
    return params

In [None]:
### Helpers
import re
from collections import Counter

def to_ft(lbl: str) -> str:
    """Return `lbl` carrying the ``__label__`` prefix, adding it only when missing."""
    prefix = "__label__"
    if lbl.startswith(prefix):
        return lbl
    return prefix + lbl

# Prefixed labels that gate_pred may return as-is when the model is confident enough.
_ACTIONABLE_LANGS = ("ar_ma", "ar_msa", "ar_ma_latin", "en", "fr", "es", "it")
ACTIONABLE = set(map(to_ft, _ACTIONABLE_LANGS))

# Catch-all labels used when abstaining: one for text containing Arabic-script
# characters, one for everything else (see fallback_label).
FALLBACK_AR, FALLBACK_LG = to_ft("other_ar"), to_ft("other_lg")

# Matches one code point in U+0600-06FF, U+0750-077F or U+08A0-08FF
# (the Unicode Arabic, Arabic Supplement and Arabic Extended-A blocks).
_ARABIC_RE = re.compile(r"[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF]")

def fallback_label(text: str) -> str:
    """Choose the abstention label for `text`.

    Returns FALLBACK_AR when `text` contains at least one character matched by
    _ARABIC_RE, otherwise FALLBACK_LG.
    """
    if _ARABIC_RE.search(text) is not None:
        return FALLBACK_AR
    return FALLBACK_LG

def gate_pred(text: str, top1_lbl: str, top1_p: float, top2_p: float | None,
              tau: float = 0.90, delta: float = 0.00) -> str:
    """Apply the abstention gate to a top-1 prediction and return the final label.

    Args:
        text: the input text; only used to pick a fallback label when abstaining.
        top1_lbl: the model's highest-scoring label.
        top1_p: probability of `top1_lbl`.
        top2_p: probability of the runner-up label, or None when there is none.
        tau: minimum `top1_p` required to keep an actionable label.
        delta: minimum (top1_p - top2_p) margin required to keep an actionable label.

    Returns:
        `top1_lbl` when it is already a fallback label, or when it is actionable and
        passes both the `tau` and `delta` checks; otherwise ``fallback_label(text)``.
    """
    # A fallback predicted by the model itself is passed through untouched.
    if top1_lbl in (FALLBACK_AR, FALLBACK_LG):
        return top1_lbl

    # Labels outside the known set are never trusted.
    if top1_lbl not in ACTIONABLE:
        return fallback_label(text)

    # Actionable label: abstain when it is under-confident or too close to the runner-up.
    if top1_p < tau or (top2_p is not None and (top1_p - top2_p) < delta):
        return fallback_label(text)

    return top1_lbl

### Helpers
def evaluate_selective(model, path, tau=0.90, delta=0.00):
    """Evaluate `model` on a labelled file after applying the abstention gate.

    Every example is scored with ``model.predict(text, k=2)``. The top-1 label is then
    passed through ``gate_pred``, which either keeps it or replaces it with a fallback
    label. All counters below are computed on that final (gated) prediction.

    If the model returns no prediction at all for an example, the example is treated
    as an abstention (``fallback_label(text)``) instead of raising ``IndexError``, and
    it contributes nothing to the confidence diagnostics.

    NOTE(review): ``read_ft_file`` is not defined in the visible part of this notebook.
    It is used here as returning ``(y_true, texts)``, two parallel sequences — confirm
    it is defined or imported before this cell runs on a fresh kernel.

    Args:
        model: object exposing ``predict(text, k)`` returning ``(labels, probs)``.
        path: forwarded to ``read_ft_file``.
        tau: minimum top-1 probability for an actionable label to be accepted.
        delta: minimum top-1/top-2 probability margin for acceptance.

    Returns:
        dict with keys:
          - ``coverage_A``: fraction of examples whose final prediction is in ACTIONABLE.
          - ``trust_precision_A``: among accepted predictions, the fraction that is correct.
          - ``macro_precision_A``: mean per-label precision over ACTIONABLE labels that
            occur in the true labels.
          - ``a2a_error_rate``: fraction of all examples where the true and the final
            label are both in ACTIONABLE but differ.
          - ``per_label``: ``{label: {"precision", "recall", "f1", "support"}}`` for
            every label occurring in the true labels.
          - ``top_conf``: the 15 most frequent ``((true, pred), count)`` confusions.
          - ``diag``: average top-1 probability and margin, split by accepted/abstained
            (``None`` when a bucket is empty).
    """
    y_true, texts = read_ft_file(path)

    labels = sorted(set(y_true))

    tp = Counter()
    fp = Counter()
    fn = Counter()
    conf = Counter()

    # selective metrics counters
    pred_A = 0
    correct_pred_A = 0
    a2a_wrong = 0

    # confidence diagnostics
    p_top1_accept = []
    p_top1_abstain = []
    margin_accept = []
    margin_abstain = []

    for yt, text in zip(y_true, texts):
        # k=2 so that both the top-1 probability and the margin to the runner-up are available
        pred_labels, pred_probs = model.predict(text, k=2)

        if len(pred_labels) == 0:
            # The model produced no label at all: nothing to gate, so abstain.
            top1_p = None
            margin = None
            yp = fallback_label(text)
        else:
            top1_lbl = pred_labels[0]
            top1_p = float(pred_probs[0])
            top2_p = float(pred_probs[1]) if len(pred_probs) > 1 else None
            margin = (top1_p - top2_p) if top2_p is not None else None
            yp = gate_pred(text, top1_lbl, top1_p, top2_p, tau=tau, delta=delta)

        accepted = (yp in ACTIONABLE)
        if accepted:
            pred_A += 1
            p_bucket, margin_bucket = p_top1_accept, margin_accept
        else:
            p_bucket, margin_bucket = p_top1_abstain, margin_abstain
        if top1_p is not None:
            p_bucket.append(top1_p)
        if margin is not None:
            margin_bucket.append(margin)

        # Update confusion/per-label stats using the FINAL prediction (yp)
        if yp == yt:
            tp[yt] += 1
            if accepted:
                correct_pred_A += 1
        else:
            fp[yp] += 1
            fn[yt] += 1
            conf[(yt, yp)] += 1

            # actionable-to-actionable mistake: the error type this gate is meant to minimise
            if (yt in ACTIONABLE) and (yp in ACTIONABLE):
                a2a_wrong += 1

    N = len(y_true)
    coverage_A = pred_A / N if N else 0.0
    trust_precision_A = (correct_pred_A / pred_A) if pred_A else 0.0
    a2a_error_rate = (a2a_wrong / N) if N else 0.0

    # per-label stats, restricted to labels that occur in the ground truth
    per_label = {}
    for lbl in labels:
        TPi = tp[lbl]
        FPi = fp[lbl]
        FNi = fn[lbl]
        prec = TPi / (TPi + FPi) if (TPi + FPi) else 0.0
        rec  = TPi / (TPi + FNi) if (TPi + FNi) else 0.0
        f1   = (2 * prec * rec / (prec + rec)) if (prec + rec) else 0.0
        per_label[lbl] = {"precision": prec, "recall": rec, "f1": f1, "support": TPi + FNi}

    # Macro-precision over the ACTIONABLE labels present in the ground truth
    actionable_present = [lbl for lbl in labels if lbl in ACTIONABLE]
    macro_precision_A = (
        sum(per_label[l]["precision"] for l in actionable_present) / len(actionable_present)
    ) if actionable_present else 0.0

    top_conf = conf.most_common(15)

    # diagnostics
    def avg(x):
        return sum(x) / len(x) if x else None

    diag = {
        "avg_top1_p_accept": avg(p_top1_accept),
        "avg_top1_p_abstain": avg(p_top1_abstain),
        "avg_margin_accept": avg(margin_accept),
        "avg_margin_abstain": avg(margin_abstain),
    }

    return {
        "coverage_A": coverage_A,
        "trust_precision_A": trust_precision_A,
        "macro_precision_A": macro_precision_A,
        "a2a_error_rate": a2a_error_rate,
        "per_label": per_label,
        "top_conf": top_conf,
        "diag": diag,
    }

def evaluate_per_label(model, path):
    """Forced-choice evaluation: score the model's raw top-1 label on every example.

    If the model returns no prediction for an example, the predicted label is taken
    to be ``None`` (counted as an error against the true label) instead of raising
    ``IndexError``.

    NOTE(review): ``read_ft_file`` is not defined in the visible part of this notebook.
    It is used here as returning ``(y_true, texts)``, two parallel sequences — confirm
    it is defined or imported before this cell runs on a fresh kernel.

    Args:
        model: object exposing ``predict(text, k)`` returning ``(labels, probs)``.
        path: forwarded to ``read_ft_file``.

    Returns:
        ``(accuracy, macro_f1, per_label, top_conf)`` where
          - ``accuracy`` is the fraction of examples predicted correctly,
          - ``macro_f1`` is the unweighted mean F1 over labels occurring in the true labels,
          - ``per_label`` maps each such label to
            ``{"precision", "recall", "f1", "support"}``,
          - ``top_conf`` lists the 15 most frequent ``((true, pred), count)`` confusions.
    """
    y_true, texts = read_ft_file(path)

    labels = sorted(set(y_true))
    tp = Counter()
    fp = Counter()
    fn = Counter()
    conf = Counter()

    for yt, text in zip(y_true, texts):
        pred_labels = model.predict(text, k=1)[0]
        # An empty result means the model produced no label for this text.
        yp = pred_labels[0] if len(pred_labels) else None
        if yp == yt:
            tp[yt] += 1
        else:
            fp[yp] += 1
            fn[yt] += 1
            conf[(yt, yp)] += 1

    per_label = {}
    f1_sum = 0.0
    for lbl in labels:
        TPi = tp[lbl]
        FPi = fp[lbl]
        FNi = fn[lbl]
        prec = TPi / (TPi + FPi) if (TPi + FPi) else 0.0
        rec  = TPi / (TPi + FNi) if (TPi + FNi) else 0.0
        f1   = (2 * prec * rec / (prec + rec)) if (prec + rec) else 0.0
        per_label[lbl] = {"precision": prec, "recall": rec, "f1": f1, "support": TPi + FNi}
        f1_sum += f1

    macro_f1 = f1_sum / len(labels) if labels else 0.0
    accuracy = sum(tp.values()) / len(y_true) if y_true else 0.0

    # Top confusions
    top_conf = conf.most_common(15)

    return accuracy, macro_f1, per_label, top_conf

# Training of the Baseline Model

In [None]:
# Baseline: a supervised fastText classifier trained with fixed, hand-picked hyperparameters.

train_path, val_path = "data/train.txt", "data/validation.txt"

# Hyperparameters are kept in one dict so they can be read at a glance.
baseline_hparams = dict(
    epoch=200,
    lr=0.007,
    dim=100,
    wordNgrams=2,
    minn=2,
    maxn=5,
    loss="softmax",
)
model_baseline = fasttext.train_supervised(input=train_path, **baseline_hparams)

# Make sure the output directory exists before writing the model file.
# NOTE(review): `.ftz` is conventionally the extension of quantized fastText models,
# and no quantize() call is visible here — confirm the extension is intended.
baseline_dir = Path("models/from_scratch_with_augData")
baseline_dir.mkdir(parents=True, exist_ok=True)
model_baseline.save_model("models/from_scratch_with_augData/langid_baseline_200ep_0.007lr.ftz")

In [None]:
# Forced-choice (no abstention) metrics for the baseline model on the validation set.
n, precision, recall = model_baseline.test(val_path)
print("N:", n, "P@1:", precision, "R@1:", recall)

acc, macro_f1, per_label, top_conf = evaluate_per_label(model_baseline, val_path)
print(f"Validation Accuracy (micro): {acc:.4f}")
print(f"Validation Macro-F1       : {macro_f1:.4f}")

# Selective metrics. The thresholds live in variables so that the printed header always
# reports the values actually evaluated (it used to say tau=0.98 while running at 0.90).
selective_tau, selective_delta = 0.90, 0.00
res = evaluate_selective(model_baseline, val_path, tau=selective_tau, delta=selective_delta)
print(f"\nSelective metrics @ tau={selective_tau:.2f}:")
print("TrustPrecision_A  :", round(res["trust_precision_A"], 4))
print("A2A error rate    :", round(res["a2a_error_rate"], 6))
print("Coverage_A        :", round(res["coverage_A"], 4))
print("MacroPrecision_A  :", round(res["macro_precision_A"], 4))
print("Diagnostics       :", res["diag"])

# Show the 10 worst labels by forced-choice F1
worst = sorted(per_label.items(), key=lambda x: x[1]["f1"])[:10]
print("\nWorst labels (by F1):")
for lbl, m in worst:
    print(lbl, "F1=", round(m["f1"],4), "P=", round(m["precision"],4), "R=", round(m["recall"],4), "support=", m["support"])

print("\nTop confusions (true -> pred):")
for (yt, yp), c in top_conf:
    print(yt, "->", yp, ":", c)

# Training Using Autotune on F1

In [None]:
# Train with fastText autotune, optimising F1 on the validation file.

train_path = "data/train.txt"
val_path   = "data/validation.txt"

model_autonune_f1 = fasttext.train_supervised(
    input=train_path,
    autotuneValidationFile=val_path,
    autotuneMetric="f1",
    autotuneDuration=1800,  # search budget in seconds (30 minutes)
    loss="softmax",
    minn=2,
    maxn=5,
    verbose=2
)

# Create the directory the model is actually written to. The previous version created
# "models/from_scratch" instead, so saving only worked if another cell had already
# created "models/from_scratch_with_augData".
# NOTE(review): `.ftz` is conventionally the extension of quantized fastText models,
# and no quantize() call is visible here — confirm the extension is intended.
autotune_f1_path = "models/from_scratch_with_augData/langid_autonune_f1_1800.ftz"
Path(autotune_f1_path).parent.mkdir(parents=True, exist_ok=True)
model_autonune_f1.save_model(autotune_f1_path)

In [None]:
# Forced-choice (no abstention) metrics for the F1-autotuned model on the validation set.
n, precision, recall = model_autonune_f1.test(val_path)
print("N:", n, "P@1:", precision, "R@1:", recall)

acc, macro_f1, per_label, top_conf = evaluate_per_label(model_autonune_f1, val_path)
print(f"Validation Accuracy (micro): {acc:.4f}")
print(f"Validation Macro-F1       : {macro_f1:.4f}")

# Selective metrics. The thresholds live in variables so that the printed header always
# reports the values actually evaluated (it used to say tau=0.98 while running at 0.90).
selective_tau, selective_delta = 0.90, 0.00
res = evaluate_selective(model_autonune_f1, val_path, tau=selective_tau, delta=selective_delta)
print(f"\nSelective metrics @ tau={selective_tau:.2f}:")
print("TrustPrecision_A  :", round(res["trust_precision_A"], 4))
print("A2A error rate    :", round(res["a2a_error_rate"], 6))
print("Coverage_A        :", round(res["coverage_A"], 4))
print("MacroPrecision_A  :", round(res["macro_precision_A"], 4))
print("Diagnostics       :", res["diag"])

# Show the 10 worst labels by forced-choice F1
worst = sorted(per_label.items(), key=lambda x: x[1]["f1"])[:10]
print("\nWorst labels (by F1):")
for lbl, m in worst:
    print(lbl, "F1=", round(m["f1"],4), "P=", round(m["precision"],4), "R=", round(m["recall"],4), "support=", m["support"])

print("\nTop confusions (true -> pred):")
for (yt, yp), c in top_conf:
    print(yt, "->", yp, ":", c)

# Training Using Autotune on Precision

In [None]:
# Train with fastText autotune, optimising the "precisionAtRecall:30" metric
# on the validation file.

train_path = "data/train.txt"
val_path   = "data/validation.txt"

model_autonune_precision = fasttext.train_supervised(
    input=train_path,
    autotuneValidationFile=val_path,
    autotuneMetric="precisionAtRecall:30",
    autotuneDuration=1800,  # search budget in seconds (30 minutes)
    loss="softmax",
    minn=2,
    maxn=5,
    verbose=2
)

# Create the directory the model is actually written to. The previous version created
# "models/from_scratch" instead, so saving only worked if another cell had already
# created "models/from_scratch_with_augData".
# NOTE(review): `.ftz` is conventionally the extension of quantized fastText models,
# and no quantize() call is visible here — confirm the extension is intended.
autotune_precision_path = "models/from_scratch_with_augData/langid_autonune_precision_1800.ftz"
Path(autotune_precision_path).parent.mkdir(parents=True, exist_ok=True)
model_autonune_precision.save_model(autotune_precision_path)

In [None]:
# Forced-choice (no abstention) metrics for the precision-autotuned model on the validation set.
n, precision, recall = model_autonune_precision.test(val_path)
print("N:", n, "P@1:", precision, "R@1:", recall)

acc, macro_f1, per_label, top_conf = evaluate_per_label(model_autonune_precision, val_path)
print(f"Validation Accuracy (micro): {acc:.4f}")
print(f"Validation Macro-F1       : {macro_f1:.4f}")

# Selective metrics. The thresholds live in variables so that the printed header always
# reports the values actually evaluated (it used to say tau=0.98 while running at 0.90).
selective_tau, selective_delta = 0.90, 0.00
res = evaluate_selective(model_autonune_precision, val_path, tau=selective_tau, delta=selective_delta)
print(f"\nSelective metrics @ tau={selective_tau:.2f}:")
print("TrustPrecision_A  :", round(res["trust_precision_A"], 4))
print("A2A error rate    :", round(res["a2a_error_rate"], 6))
print("Coverage_A        :", round(res["coverage_A"], 4))
print("MacroPrecision_A  :", round(res["macro_precision_A"], 4))
print("Diagnostics       :", res["diag"])

# Show the 10 worst labels by forced-choice F1
worst = sorted(per_label.items(), key=lambda x: x[1]["f1"])[:10]
print("\nWorst labels (by F1):")
for lbl, m in worst:
    print(lbl, "F1=", round(m["f1"],4), "P=", round(m["precision"],4), "R=", round(m["recall"],4), "support=", m["support"])

print("\nTop confusions (true -> pred):")
for (yt, yp), c in top_conf:
    print(yt, "->", yp, ":", c)