In [None]:
import sys
# Print the path of the Python interpreter this kernel runs on, to confirm
# which environment the notebook is using.
print(sys.executable)

In [None]:
# Import

import fasttext
from pathlib import Path
from collections import Counter

In [None]:
# --> Get model params

def get_supervised_params_from_model(m):
    """Extract the hyperparameters stored in a model so they can be reused.

    Args:
        m: object exposing ``m.f.getArgs()``; the returned args object must
            carry one attribute per key listed in ``names`` below, plus
            ``loss``.

    Returns:
        dict mapping hyperparameter name to value, in a fixed key order.
        ``loss`` is returned as its string name (e.g. ``"softmax"``) when the
        args object reports it as an enum member, and unchanged otherwise.
    """
    a = m.f.getArgs()
    names = (
        "lr", "dim", "ws", "epoch", "minCount", "minn", "maxn",
        "neg", "wordNgrams", "bucket", "lrUpdateRate", "t",
    )
    params = {name: getattr(a, name) for name in names}
    # The training entry points take the loss as a string, while the args
    # object exposes an enum member; convert it so the dict can be fed back
    # into a training call. Plain strings have no `.name` and pass through.
    params["loss"] = getattr(a.loss, "name", a.loss)
    return params

In [None]:
# --> Macro comparison (per-label metrics)

from collections import defaultdict, Counter

def read_ft_file(path):
    """Load a label-prefixed text file into two parallel lists.

    Every line is stripped and split on its first run of whitespace: the
    leading token is kept as the label and the remainder as the text. Blank
    lines and lines that contain a single token are skipped.

    Args:
        path: file to read (opened as UTF-8).

    Returns:
        ``(y_true, texts)`` — labels and texts, in file order.
    """
    y_true = []
    texts = []
    with open(path, "r", encoding="utf-8") as handle:
        for raw in handle:
            fields = raw.strip().split(maxsplit=1)
            # maxsplit=1 yields at most two fields; fewer means no text part.
            if len(fields) != 2:
                continue
            label, body = fields
            y_true.append(label)
            texts.append(body.replace("\n", " "))
    return y_true, texts

def evaluate_per_label(model, path):
    """Score top-1 predictions of `model` against the labelled file at `path`.

    Args:
        model: object whose ``predict(text, k=1)`` result is indexed as
            ``[0][0]`` to obtain the predicted label.
        path: labelled file, parsed with ``read_ft_file``.

    Returns:
        ``(accuracy, macro_f1, per_label, top_conf)`` where ``per_label`` maps
        each label seen in the file to its precision, recall, F1 and support
        (number of true examples), ``macro_f1`` is the unweighted mean F1 over
        those labels, and ``top_conf`` lists the 15 most frequent
        ``((true, predicted), count)`` confusions.
    """
    y_true, texts = read_ft_file(path)

    hits = Counter()
    false_pos = Counter()
    false_neg = Counter()
    confusions = Counter()

    for gold, sample in zip(y_true, texts):
        guess = model.predict(sample, k=1)[0][0]
        if guess == gold:
            hits[gold] += 1
            continue
        # A miss counts against both the predicted and the true label.
        false_pos[guess] += 1
        false_neg[gold] += 1
        confusions[(gold, guess)] += 1

    # Only labels present in the reference file take part in the macro mean.
    labels = sorted(set(y_true))
    per_label = {}
    f1_total = 0.0
    for lbl in labels:
        n_tp = hits[lbl]
        n_predicted = n_tp + false_pos[lbl]
        n_actual = n_tp + false_neg[lbl]

        prec = n_tp / n_predicted if n_predicted else 0.0
        rec = n_tp / n_actual if n_actual else 0.0
        pr_sum = prec + rec
        f1 = (2 * prec * rec / pr_sum) if pr_sum else 0.0

        per_label[lbl] = {"precision": prec, "recall": rec, "f1": f1, "support": n_actual}
        f1_total += f1

    macro_f1 = f1_total / len(labels) if labels else 0.0
    accuracy = sum(hits.values()) / len(y_true) if y_true else 0.0

    return accuracy, macro_f1, per_label, confusions.most_common(15)

In [None]:
################# Choosing between the full vocab and the partial vocab

In [None]:
########## Full

In [None]:
# Variables

# Source fastText model and destination .vec file for the full-vocabulary export.
model_origin_path = "models/with_initialization/lid.176.bin"
out_path = Path("models/with_initialization/vectors/lid176_full.vec")
# Make sure the destination directory exists before anything is written to it.
out_path.parent.mkdir(parents=True, exist_ok=True)

In [None]:
# Export every word of the source model together with its vector.
# File layout: a "<n_words> <dim>" header line, then one
# "<word> <v1> ... <vdim>" line per word.
model_origin = fasttext.load_model(model_origin_path)

dim = model_origin.get_dimension()
words = model_origin.get_words()

with out_path.open("w", encoding="utf-8") as f:
    f.write(f"{len(words)} {dim}\n")
    for w in words:
        components = " ".join(str(x) for x in model_origin.get_word_vector(w))
        f.write(f"{w} {components}\n")

print("Saved:", out_path)
print("dim:", dim, "n_words_exported:", len(words))

In [None]:
####### Partial

In [None]:
# Variables

# Same source model as the full export; the output here is the partial .vec file.
model_origin_path = "models/with_initialization/lid.176.bin"
out_path = Path("models/with_initialization/vectors/lid176_partial.vec")
# Make sure the destination directory exists before anything is written to it.
out_path.parent.mkdir(parents=True, exist_ok=True)

# Files whose tokens decide which words get exported.
# NOTE(review): validation and test tokens are part of the selection — confirm this is intended.
splits = ["data/train.txt", "data/validation.txt", "data/test.txt"]

In [None]:
def iter_tokens_from_fasttext_file(path):
    """Yield the whitespace-separated tokens of the text part of each line.

    The first token of every line (the label) is dropped. Blank lines and
    lines with nothing after the first token contribute no tokens.

    Args:
        path: file to read (opened as UTF-8).
    """
    with open(path, "r", encoding="utf-8") as handle:
        for raw in handle:
            fields = raw.strip().split(maxsplit=1)
            # Two fields means "<label> <text>"; anything shorter has no text.
            if len(fields) == 2:
                yield from fields[1].split()

# Export only the words that occur both in the dataset splits and in the
# source model's vocabulary, using the same file layout as the full export.
m = fasttext.load_model(model_origin_path)
dim = m.get_dimension()

meta_vocab = set(m.get_words())

# Distinct tokens across all the files listed in `splits`.
dataset_tokens = set()
for sp in splits:
    dataset_tokens.update(iter_tokens_from_fasttext_file(sp))

# Sorted so that the output file is deterministic.
selected = sorted(dataset_tokens & meta_vocab)

with out_path.open("w", encoding="utf-8") as f:
    f.write(f"{len(selected)} {dim}\n")
    for w in selected:
        components = " ".join(str(x) for x in m.get_word_vector(w))
        f.write(f"{w} {components}\n")

print("Saved:", out_path)
print("dim:", dim)
print("dataset_tokens_total:", len(dataset_tokens))
print("in_meta_vocab:", len(selected))
# max(1, ...) avoids a division by zero when no token was collected.
print("coverage:", (len(selected) / max(1, len(dataset_tokens))))

In [None]:
################## Comparing both full vec model and partial vec model

In [None]:
# Variables

# Training and validation files used by the comparison run.
train_path = "data/train.txt"
val_path   = "data/validation.txt"

# The two candidate pretrained-vector files written by the export cells above.
vec_full = "models/with_initialization/vectors/lid176_full.vec"
vec_partial = "models/with_initialization/vectors/lid176_partial.vec"

In [None]:
# Process 

def read_vec_dim(vec_path: str) -> int:
    """Return the vector dimension declared in the header of a .vec file.

    The first line of the file is expected to read ``<n_words> <dim>``.

    Args:
        vec_path: path of the .vec file (opened as UTF-8).

    Returns:
        The second field of the header line, as an int.

    Raises:
        ValueError: if the header line has fewer than two fields (for example
            an empty file) or its second field is not an integer.
    """
    with open(vec_path, "r", encoding="utf-8") as f:
        header = f.readline().split()
    # Fail with a message naming the file instead of a bare IndexError.
    if len(header) < 2:
        raise ValueError(
            f"{vec_path}: expected a '<n_words> <dim>' header line, got {header!r}"
        )
    return int(header[1])

# NOTE: this redefines read_ft_file from the evaluation-helpers cell earlier
# in the notebook with the same behavior; the later definition wins.
def read_ft_file(path):
    """Load a label-prefixed text file into two parallel lists.

    Every line is stripped and split on its first run of whitespace: the
    leading token is kept as the label and the remainder as the text. Blank
    lines and lines that contain a single token are skipped.

    Args:
        path: file to read (opened as UTF-8).

    Returns:
        ``(y_true, texts)`` — labels and texts, in file order.
    """
    y_true = []
    texts = []
    with open(path, "r", encoding="utf-8") as handle:
        for raw in handle:
            fields = raw.strip().split(maxsplit=1)
            # maxsplit=1 yields at most two fields; fewer means no text part.
            if len(fields) != 2:
                continue
            label, body = fields
            y_true.append(label)
            texts.append(body.replace("\n", " "))
    return y_true, texts

def eval_macro(model, path):
    """Score top-1 predictions of `model` against the labelled file at `path`.

    Thin alias of ``evaluate_per_label`` (defined in the evaluation-helpers
    cell earlier in the notebook). The two functions used to carry the same
    metric code twice; delegating keeps a single implementation.

    Args:
        model: object whose ``predict(text, k=1)`` result yields the
            predicted label.
        path: labelled file to evaluate on.

    Returns:
        ``(accuracy, macro_f1, per_label, top_conf)``, exactly as returned by
        ``evaluate_per_label``.
    """
    return evaluate_per_label(model, path)

def train_pretrained(vec_path: str, out_name: str, *, epoch: int = 500, lr: float = 0.3,
                     train_file=None, val_file=None,
                     out_dir="models/with_initialization"):
    """Train a supervised model initialised from pretrained vectors and report on it.

    The model is saved as ``<out_dir>/<out_name>.bin``, evaluated on the
    validation file, and a summary is printed.

    Args:
        vec_path: .vec file passed as ``pretrainedVectors``; its header also
            provides the ``dim`` used for training.
        out_name: base name of the saved model (without extension).
        epoch: number of training epochs.
        lr: learning rate.
        train_file: training file; defaults to the notebook-level ``train_path``.
        val_file: validation file; defaults to the notebook-level ``val_path``.
        out_dir: directory the model is written to (created if missing).

    Returns:
        dict with keys ``name``, ``vec``, ``dim``, ``micro_p1``, ``macro_f1``,
        ``per_label`` and ``top_conf``.
    """
    # Fall back to the notebook globals so existing two-argument calls keep
    # behaving exactly as before.
    train_file = train_path if train_file is None else train_file
    val_file = val_path if val_file is None else val_file

    dim = read_vec_dim(vec_path)

    model = fasttext.train_supervised(
        input=train_file,
        epoch=epoch,
        lr=lr,
        dim=dim,                 # must match vec dim
        wordNgrams=2,
        minn=2,
        maxn=5,
        loss="softmax",
        pretrainedVectors=vec_path,
        seed=42,
        verbose=2
    )

    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    bin_path = out_dir / f"{out_name}.bin"
    model.save_model(str(bin_path))

    # micro (fastText)
    n, p1, r1 = model.test(val_file)

    # macro + per-label
    acc, macro_f1, per_label, top_conf = eval_macro(model, val_file)

    print("\n====", out_name, "====")
    print("vec:", vec_path)
    print("dim:", dim)
    print("saved:", bin_path)
    print("VAL micro: N=", n, "P@1=", p1, "R@1=", r1)
    print("VAL macro: acc=", round(acc, 6), "macro_f1=", round(macro_f1, 6))

    # Labels singled out for the report; any that are absent from the
    # validation file are silently skipped.
    for lbl in ["__label__other_ar", "__label__ar_ma", "__label__ar_msa"]:
        if lbl in per_label:
            stats = per_label[lbl]
            print(lbl, "F1=", round(stats["f1"],4), "P=", round(stats["precision"],4), "R=", round(stats["recall"],4), "support=", stats["support"])

    print("\nTop confusions (true -> pred):")
    for (yt, yp), c in top_conf:
        print(yt, "->", yp, ":", c)

    return {
        "name": out_name,
        "vec": vec_path,
        "dim": dim,
        "micro_p1": p1,
        "macro_f1": macro_f1,
        "per_label": per_label,
        "top_conf": top_conf,
    }

# Only the partial-vocabulary run is executed here; the full-vocabulary run
# below is left disabled.
# res_full = train_pretrained(vec_full, "langid_baseline_full_500ep")
res_data = train_pretrained(vec_partial, "langid_baseline_partial_500ep")

In [None]:
############ Autotune of Partial on F1

In [None]:
# Variable

# Inputs of the F1 autotune run: training file, validation file, and the
# partial pretrained-vector file.
train_path = "data/train.txt"
val_path   = "data/validation.txt"
vec_path   = "models/with_initialization/vectors/lid176_partial.vec" 

In [None]:
# training
# Autotune run that optimises F1 on the validation file, initialised from the
# partial pretrained vectors.
autotune_f1_args = {
    "input": train_path,
    "autotuneValidationFile": val_path,
    "autotuneMetric": "f1",
    # Search budget in seconds. Original author's note, kept verbatim:
    # "600 < 1200 Vs 1800 in term of performance"
    # NOTE(review): spell out which budget performed best.
    "autotuneDuration": 1200,
    "pretrainedVectors": vec_path,
    "dim": 16,  # presumably the dimension of the vectors in vec_path — TODO confirm
    "loss": "softmax",
    "minn": 2,
    "maxn": 5,
    "verbose": 2,
    "seed": 42,
}
model = fasttext.train_supervised(**autotune_f1_args)

out_dir = Path("models/with_initialization")
out_dir.mkdir(parents=True, exist_ok=True)

model_file = out_dir / "langid_autotune_f1_partial_1200.bin"
model.save_model(str(model_file))

In [None]:
# Display the hyperparameters stored in the model trained in the previous cell.
get_supervised_params_from_model(model)

In [None]:
# the micro score

# The result of model.test() on the validation file is unpacked as
# (sample count, precision, recall) and printed as N / P@1 / R@1.
n, precision, recall = model.test(val_path)
print("N:", n, "P@1:", precision, "R@1:", recall)

In [None]:
# --> Macro / per-label comparison

acc, macro_f1, per_label, top_conf = evaluate_per_label(model, "data/validation.txt")

print(f"Validation Accuracy (micro): {acc:.4f}")
print(f"Validation Macro-F1       : {macro_f1:.4f}")

# Ten labels with the lowest F1 (most important for imbalance)
by_f1 = sorted(per_label.items(), key=lambda item: item[1]["f1"])
worst = by_f1[:10]
print("\nWorst labels (by F1):")
for lbl, stats in worst:
    print(lbl, "F1=", round(stats["f1"],4), "P=", round(stats["precision"],4), "R=", round(stats["recall"],4), "support=", stats["support"])

print("\nTop confusions (true -> pred):")
for pair, c in top_conf:
    yt, yp = pair
    print(yt, "->", yp, ":", c)

In [None]:
########### Training using Autotune on precision

In [None]:
# Variable

# Inputs of the precision autotune run: training file, validation file, and
# the partial pretrained-vector file.
train_path = "data/train.txt"
val_path   = "data/validation.txt"
vec_path   = "models/with_initialization/vectors/lid176_partial.vec" 

In [None]:
# training
# Autotune run that optimises the "precisionAtRecall:30" metric on the
# validation file, initialised from the partial pretrained vectors.
autotune_precision_args = {
    "input": train_path,
    "autotuneValidationFile": val_path,
    "autotuneMetric": "precisionAtRecall:30",
    "autotuneDuration": 1800,  # search budget in seconds
    "pretrainedVectors": vec_path,
    "dim": 16,  # presumably the dimension of the vectors in vec_path — TODO confirm
    "loss": "softmax",
    "minn": 2,
    "maxn": 5,
    "verbose": 2,
    "seed": 42,
}
model = fasttext.train_supervised(**autotune_precision_args)

out_dir = Path("models/with_initialization")
out_dir.mkdir(parents=True, exist_ok=True)

model_file = out_dir / "langid_autotune_precision_partial_1800.bin"
model.save_model(str(model_file))

In [None]:
# the micro score

# The result of model.test() on the validation file is unpacked as
# (sample count, precision, recall) and printed as N / P@1 / R@1.
n, precision, recall = model.test(val_path)
print("N:", n, "P@1:", precision, "R@1:", recall)

In [None]:
# --> Macro / per-label comparison

acc, macro_f1, per_label, top_conf = evaluate_per_label(model, "data/validation.txt")

print(f"Validation Accuracy (micro): {acc:.4f}")
print(f"Validation Macro-F1       : {macro_f1:.4f}")

# Ten labels with the lowest F1 (most important for imbalance)
by_f1 = sorted(per_label.items(), key=lambda item: item[1]["f1"])
worst = by_f1[:10]
print("\nWorst labels (by F1):")
for lbl, stats in worst:
    print(lbl, "F1=", round(stats["f1"],4), "P=", round(stats["precision"],4), "R=", round(stats["recall"],4), "support=", stats["support"])

print("\nTop confusions (true -> pred):")
for pair, c in top_conf:
    yt, yp = pair
    print(yt, "->", yp, ":", c)