In [1]:
import os
import parse
import fasttext
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn.metrics import accuracy_score, f1_score

from typing import Union, List, Tuple
def get_labels_text(path: Union[str, Path]) -> Tuple[List[str], List[str]]:
    """Reads fasttext formatted file and extracts labels and text

    Args:
        path (Union[str, Path]): file, each line being a document. 
                                Line should start with __label__XX for label XX

    Returns:
        Tuple[List[str], List[str]]: Labels, texts
    """    
    labels, texts = list(), list()
    with open(str(path),"r") as f:
        pattern = "__label__{language} {text}"
        p = parse.compile(pattern)
        for line in f.readlines():
            rez = p.parse(line)
            labels.append(rez["language"])
            texts.append(rez["text"].replace("\n", " "))
    return labels, texts



In [2]:

from sklearn.metrics import accuracy_score, f1_score, classification_report

train = str(
    Path("data/interim/Twitter_train.fasttext")
)
dev = str(
    Path("data/interim/Twitter_dev.fasttext")
)

model = fasttext.train_supervised(input=train, autotuneValidationFile=dev, autotuneDuration=600,
                                    maxn=10
                                    )
test = str(
    Path("data/interim/Twitter_test.fasttext")
)
y_true, texts = get_labels_text(test)
y_pred =  [i[0].replace("__label__", "") for i in model.predict(texts)[0]]
accuracy = accuracy_score(y_true, y_pred)
print(f"Accuracy: {accuracy:0.3}")
f1 = f1_score(y_true, y_pred, labels=list(set(y_true)), average="macro")
print(f"macro F1 score: {f1:0.3}")

print(classification_report(
    y_true, y_pred, labels = "hr bs sr".split(), 
    ))


Progress: 100.0% Trials:   14 Best score:  0.705357 ETA:   0h 0m 0s
Training again with best arguments
Read 2M words
Number of words:  333991
Number of labels: 3
Progress: 100.0% words/sec/thread:  168418 lr:  0.000000 avg.loss:  1.102525 ETA:   0h 0m 0s


Accuracy: 0.705
macro F1 score: 0.276
              precision    recall  f1-score   support

          hr       0.00      0.00      0.00        18
          bs       0.00      0.00      0.00        15
          sr       0.71      1.00      0.83        79

    accuracy                           0.71       112
   macro avg       0.24      0.33      0.28       112
weighted avg       0.50      0.71      0.58       112



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [3]:
train = str(
    Path("data/interim/SETimes_train.fasttext")
)
dev = str(
    Path("data/interim/SETimes_dev.fasttext")
)

model = fasttext.train_supervised(input=train, autotuneValidationFile=dev, autotuneDuration=600,
                                    maxn=10
                                    )
test = str(
    Path("data/interim/SETimes_test.fasttext")
)

y_true, texts = get_labels_text(test)
y_pred =  [i[0].replace("__label__", "") for i in model.predict(texts)[0]]
accuracy = accuracy_score(y_true, y_pred)
print(f"Accuracy: {accuracy:0.3}")
f1 = f1_score(y_true, y_pred, labels=list(set(y_true)), average="macro")
print(f"macro F1 score: {f1:0.3}")

print(classification_report(
    y_true, y_pred, labels = "hr bs sr".split(), 
    ))

Progress: 100.0% Trials:    7 Best score:  0.993769 ETA:   0h 0m 0s
Training again with best arguments
Read 6M words
Number of words:  294151
Number of labels: 3
Progress: 100.0% words/sec/thread:   89813 lr:  0.000000 avg.loss:  0.487643 ETA:   0h 0m 0s


Accuracy: 0.989
macro F1 score: 0.989
              precision    recall  f1-score   support

          hr       1.00      0.98      0.99       313
          bs       0.97      0.99      0.98       312
          sr       1.00      0.99      0.99       296

    accuracy                           0.99       921
   macro avg       0.99      0.99      0.99       921
weighted avg       0.99      0.99      0.99       921



In [9]:
results = {
    "train on SETimes, test on SETimes": {
        "train": str(Path("data/interim/SETimes_train.fasttext")),
        "test": str(Path("data/interim/SETimes_test.fasttext"))
    },
    "train on SETimes, test on Twitter": {
        "train": str(Path("data/interim/SETimes_train.fasttext")),
        "test": str(Path("data/interim/Twitter_test.fasttext"))
    },
    "train on Twitter, test on SETimes":{
        "train": str(Path("data/interim/Twitter_train.fasttext")),
        "test": str(Path("data/interim/SETimes_test.fasttext"))
    },
    "train on Twitter, test on Twitter": {
        "train": str(Path("data/interim/Twitter_train.fasttext")),
        "test": str(Path("data/interim/Twitter_test.fasttext"))
    }
}
for setup in results:
    train = results[setup]["train"]
    test = results[setup]["test"]
    dev = results[setup]["train"].replace("train", "dev")
    results[setup]["runs"] = []
    for i in range(5):
        model = fasttext.train_supervised(input=train, 
                                          autotuneValidationFile=dev,
                                          autotuneDuration=600,
                                          maxn=10
                                    )
        y_true, texts = get_labels_text(test)
        y_pred =  [i[0].replace("__label__", "") for i in model.predict(texts)[0]]
        accuracy = accuracy_score(y_true, y_pred)
        f1 = f1_score(y_true, y_pred, labels = "hr bs sr".split(),  average="macro")
        results[setup]["runs"].append({
            "accuracy": accuracy,
            "macroF1": f1,
            "y_true": y_true,
            "y_pred": y_pred
        })
        with open("002_fasttext_results.json", "w") as f:
            import json
            json.dump(results, f)


Progress: 100.0% Trials:    7 Best score:  0.993769 ETA:   0h 0m 0s
Training again with best arguments
Read 6M words
Number of words:  294151
Number of labels: 3
Progress: 100.0% words/sec/thread:   91448 lr:  0.000000 avg.loss:  0.482010 ETA:   0h 0m 0s
Progress: 100.0% Trials:    7 Best score:  0.993769 ETA:   0h 0m 0s
Training again with best arguments
Read 6M words
Number of words:  294151
Number of labels: 3
Progress: 100.0% words/sec/thread:   85493 lr:  0.000000 avg.loss:  0.482269 ETA:   0h 0m 0s
Progress: 100.0% Trials:    7 Best score:  0.992731 ETA:   0h 0m 0s
Training again with best arguments
Read 6M words
Number of words:  294151
Number of labels: 3
Progress:   5.5% words/sec/thread:   86487 lr:  0.319706 avg.loss:  1.102535 ETA:   0h 2m17s