In [None]:
# Print the path of the Python interpreter backing this kernel, so it is clear
# which environment the notebook is running in.
import sys
print(sys.executable)

In [None]:
# Import

import fasttext
from pathlib import Path

In [None]:
# --> Get model params

def get_supervised_params_from_model(m):
    """Extract the hyperparameters of a fastText model so they can be reused.

    Parameters
    ----------
    m : object
        A fastText model wrapper exposing ``m.f.getArgs()``.

    Returns
    -------
    dict
        Maps each hyperparameter name (``lr``, ``dim``, ``ws``, ``epoch``,
        ``minCount``, ``minn``, ``maxn``, ``neg``, ``wordNgrams``, ``bucket``,
        ``lrUpdateRate``, ``t``, ``loss``) to the value stored on the model.
    """
    a = m.f.getArgs()
    # `loss` used to be read unconditionally from the wrapper (`m.loss`), while
    # everything else comes from the args object. A wrapper without that
    # attribute (e.g. a model that was not produced by training in this
    # session) would raise AttributeError, so fall back to the args object.
    loss = m.loss if hasattr(m, "loss") else a.loss
    return {
        "lr": a.lr,
        "dim": a.dim,
        "ws": a.ws,
        "epoch": a.epoch,
        "minCount": a.minCount,
        "minn": a.minn,
        "maxn": a.maxn,
        "neg": a.neg,
        "wordNgrams": a.wordNgrams,
        "bucket": a.bucket,
        "lrUpdateRate": a.lrUpdateRate,
        "t": a.t,
        "loss": loss,
    }

In [None]:
# --> the macro comparison

from collections import defaultdict, Counter

def read_ft_file(path):
    """Parse a labelled text file into parallel lists of labels and texts.

    Each line is stripped and split once on whitespace: the first token is
    taken as the label and the remainder as the text. Blank lines and lines
    that contain only a single token are ignored.

    Parameters
    ----------
    path : str or path-like
        UTF-8 encoded file to read.

    Returns
    -------
    tuple[list[str], list[str]]
        ``(labels, texts)``, aligned by position.
    """
    gold, docs = [], []
    with open(path, "r", encoding="utf-8") as handle:
        for raw in handle:
            fields = raw.strip().split(maxsplit=1)
            # With maxsplit=1 there are at most two fields; fewer means the
            # line was empty or had a label without any text.
            if len(fields) != 2:
                continue
            label, body = fields
            gold.append(label)
            docs.append(body.replace("\n", " "))
    return gold, docs

def evaluate_per_label(model, path):
    """Score the top-1 predictions of ``model`` on a labelled file, per label.

    The file is parsed with ``read_ft_file``. Every text is passed to
    ``model.predict(text, k=1)`` and the first returned label is compared
    with the gold label.

    Parameters
    ----------
    model : object
        Anything exposing ``predict(text, k=1)`` whose result indexes as
        ``[0][0]`` to the predicted label.
    path : str or path-like
        Labelled file to evaluate on.

    Returns
    -------
    tuple
        ``(accuracy, macro_f1, per_label, top_conf)`` where

        * ``accuracy`` is correct predictions / number of examples
          (0.0 when there are no examples),
        * ``macro_f1`` is the unweighted mean F1 over the labels that occur
          in the gold data (0.0 when there are none),
        * ``per_label`` maps each gold label to a dict with ``precision``,
          ``recall``, ``f1`` and ``support``,
        * ``top_conf`` lists up to 15 of the most frequent
          ``((true, predicted), count)`` errors.
    """
    gold_labels, documents = read_ft_file(path)

    correct = Counter()      # true positives, keyed by label
    spurious = Counter()     # false positives, keyed by the predicted label
    missed = Counter()       # false negatives, keyed by the gold label
    confusions = Counter()   # (gold, predicted) pairs, errors only

    for gold, document in zip(gold_labels, documents):
        predicted = model.predict(document, k=1)[0][0]
        if predicted == gold:
            correct[gold] += 1
            continue
        spurious[predicted] += 1
        missed[gold] += 1
        confusions[(gold, predicted)] += 1

    def _ratio(numerator, denominator):
        # Guarded division: a zero denominator yields 0.0 instead of raising.
        return numerator / denominator if denominator else 0.0

    known_labels = sorted(set(gold_labels))
    per_label = {}
    f1_total = 0.0
    for label in known_labels:
        hits = correct[label]
        support = hits + missed[label]
        precision = _ratio(hits, hits + spurious[label])
        recall = _ratio(hits, support)
        f1 = _ratio(2 * precision * recall, precision + recall)
        per_label[label] = {
            "precision": precision,
            "recall": recall,
            "f1": f1,
            "support": support,
        }
        f1_total += f1

    macro_f1 = _ratio(f1_total, len(known_labels))
    accuracy = _ratio(sum(correct.values()), len(gold_labels))

    # Keep only the 15 most frequent (true, predicted) error pairs.
    return accuracy, macro_f1, per_label, confusions.most_common(15)

# Training of the Baseline Model

In [None]:
# Train the baseline model with hand-picked hyperparameters

train_path = "data/train.txt"
val_path   = "data/validation.txt"

# Create the output directory before training so that a bad or unwritable
# location fails immediately instead of after the 150-epoch run.
Path("models/from_scratch").mkdir(parents=True, exist_ok=True)

model_baseline = fasttext.train_supervised(
    input=train_path,
    epoch=150,
    lr=0.3,
    dim=100,
    wordNgrams=2,
    minn=2,
    maxn=5,
    loss="softmax",
)

# NOTE(review): ".ftz" is conventionally used for quantized fastText models and
# no quantize step is visible in this cell -- confirm the extension is intended.
model_baseline.save_model("models/from_scratch/langid_baseline_150ep.ftz")

In [None]:
# Micro score: fastText's built-in evaluation on the validation file

n, precision, recall = model_baseline.test(val_path)
print("N:", n, "P@1:", precision, "R@1:", recall)
print()

# --> Macro / per-label comparison

# Use val_path (not a second copy of the literal path) so both evaluations in
# this cell are guaranteed to read the same file.
acc, macro_f1, per_label, top_conf = evaluate_per_label(model_baseline, val_path)

print(f"Validation Accuracy (micro): {acc:.4f}")
print(f"Validation Macro-F1       : {macro_f1:.4f}")

# Show worst 10 labels by F1 (most important for imbalance)
worst = sorted(per_label.items(), key=lambda item: item[1]["f1"])[:10]
print("\nWorst labels (by F1):")
for lbl, scores in worst:
    print(
        lbl,
        "F1=", round(scores["f1"], 4),
        "P=", round(scores["precision"], 4),
        "R=", round(scores["recall"], 4),
        "support=", scores["support"],
    )

print("\nTop confusions (true -> pred):")
for (yt, yp), c in top_conf:
    print(yt, "->", yp, ":", c)

# Training Using Autotune on F1

In [None]:
# Train using fastText autotune, optimizing F1 on the validation file
# NOTE(review): "autonune" in the variable and file names below is a typo for
# "autotune"; it is kept because other cells reference these names.

train_path = "data/train.txt"
val_path   = "data/validation.txt"

# Create the output directory before the search starts so that a bad or
# unwritable location fails immediately instead of after the whole autotune
# budget has been spent.
Path("models/from_scratch").mkdir(parents=True, exist_ok=True)

# NOTE(review): hyperparameters are tuned on val_path, and the following cells
# report scores on that same file, so those scores are likely optimistic --
# consider a separate held-out split for the final numbers.
model_autonune_f1 = fasttext.train_supervised(
    input=train_path,
    autotuneValidationFile=val_path,
    autotuneMetric="f1",
    autotuneDuration=1200,  # search budget; fastText documents this in seconds
    loss="softmax",
    minn=2,
    maxn=5,
    verbose=2,
)

model_autonune_f1.save_model("models/from_scratch/langid_autonune_f1_1200.ftz")

In [None]:
# Display the hyperparameters stored on the F1-autotuned model
get_supervised_params_from_model(model_autonune_f1)

In [None]:
# Micro score: fastText's built-in evaluation on the validation file

n, precision, recall = model_autonune_f1.test(val_path)
print("N:", n, "P@1:", precision, "R@1:", recall)
print()

# --> Macro / per-label comparison

# Use val_path (not a second copy of the literal path) so both evaluations in
# this cell are guaranteed to read the same file.
acc, macro_f1, per_label, top_conf = evaluate_per_label(model_autonune_f1, val_path)

print(f"Validation Accuracy (micro): {acc:.4f}")
print(f"Validation Macro-F1       : {macro_f1:.4f}")

# Show worst 10 labels by F1 (most important for imbalance)
worst = sorted(per_label.items(), key=lambda item: item[1]["f1"])[:10]
print("\nWorst labels (by F1):")
for lbl, scores in worst:
    print(
        lbl,
        "F1=", round(scores["f1"], 4),
        "P=", round(scores["precision"], 4),
        "R=", round(scores["recall"], 4),
        "support=", scores["support"],
    )

print("\nTop confusions (true -> pred):")
for (yt, yp), c in top_conf:
    print(yt, "->", yp, ":", c)

# Training using Autotune on Precision

In [None]:
# Train using fastText autotune, optimizing precision at a fixed recall level
# NOTE(review): "autonune" in the variable and file names below is a typo for
# "autotune"; it is kept because other cells reference these names.

train_path = "data/train.txt"
val_path   = "data/validation.txt"

# Create the output directory before the search starts so that a bad or
# unwritable location fails immediately instead of after the whole autotune
# budget has been spent.
Path("models/from_scratch").mkdir(parents=True, exist_ok=True)

# NOTE(review): hyperparameters are tuned on val_path, and the following cells
# report scores on that same file, so those scores are likely optimistic --
# consider a separate held-out split for the final numbers.
model_autonune_precision = fasttext.train_supervised(
    input=train_path,
    autotuneValidationFile=val_path,
    autotuneMetric="precisionAtRecall:30",
    autotuneDuration=600,  # search budget; fastText documents this in seconds
    loss="softmax",
    minn=2,
    maxn=5,
    verbose=2,
)

model_autonune_precision.save_model("models/from_scratch/langid_autonune_precision.ftz")

In [None]:
# Display the hyperparameters stored on the precision-autotuned model
get_supervised_params_from_model(model_autonune_precision)

In [None]:
# fastText's built-in evaluation on the validation file; the three returned
# values are printed as example count, P@1 and R@1.
n, precision, recall = model_autonune_precision.test(val_path)
print("N:", n, "P@1:", precision, "R@1:", recall)

In [None]:
# --> Macro / per-label comparison

# Use val_path (already used for the built-in test on this model) instead of
# repeating the literal path, so both evaluations read the same file.
acc, macro_f1, per_label, top_conf = evaluate_per_label(model_autonune_precision, val_path)

print(f"Validation Accuracy (micro): {acc:.4f}")
print(f"Validation Macro-F1       : {macro_f1:.4f}")

# Show worst 10 labels by F1 (most important for imbalance)
worst = sorted(per_label.items(), key=lambda item: item[1]["f1"])[:10]
print("\nWorst labels (by F1):")
for lbl, scores in worst:
    print(
        lbl,
        "F1=", round(scores["f1"], 4),
        "P=", round(scores["precision"], 4),
        "R=", round(scores["recall"], 4),
        "support=", scores["support"],
    )

print("\nTop confusions (true -> pred):")
for (yt, yp), c in top_conf:
    print(yt, "->", yp, ":", c)