In [29]:
import pandas as pd
import numpy as np

train_fname = "../data/lgbt-en.train.tsv"
test_fname = "../data/lgbt-en.test.tsv"

def read_file(fname: str) -> pd.DataFrame:
    """Read a tab-separated dataset file and keep only its text and label columns.

    The file is read without a header row; its three tab-separated columns are
    named ``text``, ``labels`` and ``role`` on load, and ``role`` is dropped.
    This is the ``text``/``labels`` layout intended for simpletransformers.

    Args:
        fname: Path to the TSV file.

    Returns:
        A DataFrame with the columns ``text`` and ``labels``. Labels are returned
        exactly as they appear in the file (no numeric encoding is applied).
    """
    df = pd.read_table(fname, sep="\t", header=None, names=["text", "labels", "role"])
    return df.drop(columns=["role"])


# Load both splits; read_file returns frames with the text and labels columns
# (the role column is dropped).
train = read_file(train_fname)
test = read_file(test_fname)

# fastText

In [37]:
def prepare_for_fasttext(fname):
    """Convert a TSV dataset into fastText's supervised format and write it to disk.

    The input is read as three header-less, tab-separated columns (``text``,
    ``labels``, ``role``). Labels are collapsed to two classes: rows labelled
    exactly ``"Acceptable speech"`` become ``"Acceptable"``, every other row
    becomes ``"Offensive"``. One line per row, ``__label__<class> <text>``, is
    written to ``fname + "fasttext"``.

    Args:
        fname: Path to the input TSV file.

    Returns:
        The loaded DataFrame with the binarised ``labels`` column plus two added
        columns: ``fasttextlabel`` (``"__label__<class> "``, note the trailing
        space) and ``fasttext_all`` (label prefix followed by the text).
    """
    df = pd.read_table(fname, sep="\t", header=None, names=["text", "labels", "role"])

    # Anything that is not exactly "Acceptable speech" counts as offensive.
    offensive_ids = df["labels"] != "Acceptable speech"
    # Build the column in a single assignment. The chained form
    # `df.labels[mask] = ...` raises SettingWithCopyWarning and does not modify
    # the frame at all when pandas Copy-on-Write is enabled.
    df["labels"] = offensive_ids.map({True: "Offensive", False: "Acceptable"})

    df["fasttextlabel"] = "__label__" + df["labels"] + " "
    df["fasttext_all"] = df["fasttextlabel"] + df["text"]

    out_fname = fname + "fasttext"
    # Explicit UTF-8: the texts are not ASCII-only and the platform default
    # encoding is not guaranteed to be UTF-8, which fastText expects.
    with open(out_fname, "w", encoding="utf-8") as f:
        f.writelines(line + "\n" for line in df["fasttext_all"])
    print(f"Wrote data from {fname} to {out_fname}")

    return df
    

## English

In [66]:
# Point the file-name globals at the English splits; the following cells read them.
train_fname = "../data/lgbt-en.train.tsv"
test_fname = "../data/lgbt-en.test.tsv"
# Write the "<fname>fasttext" files next to the TSVs. The returned frames are not
# needed here; assigning the last one to `_` keeps it from being shown as cell output.
prepare_for_fasttext(train_fname)
_ = prepare_for_fasttext(test_fname)

Wrote data from ../data/lgbt-en.train.tsv to ../data/lgbt-en.train.tsvfasttext
Wrote data from ../data/lgbt-en.test.tsv to ../data/lgbt-en.test.tsvfasttext


In [67]:
%%time
import fasttext
# Train a supervised classifier on the training file written by prepare_for_fasttext.
model = fasttext.train_supervised(input=train_fname+"fasttext", epoch=1000, lr=0.05)
# Evaluate on the test file; the returned tuple is displayed as the cell output
# (per the fastText docs: number of examples, precision@1, recall@1).
model.test(test_fname+"fasttext")

CPU times: user 29.8 s, sys: 252 ms, total: 30 s
Wall time: 4.43 s


(1017, 0.7404129793510325, 0.7404129793510325)

In [68]:
# Re-create the labelled test frame (this also rewrites the "<fname>fasttext" file).
test = prepare_for_fasttext(test_fname)

# model.predict returns (labels, probabilities); keep the labels and flatten
# them to one predicted label per text.
y_pred = (
    np.array(model.predict(list(test["text"].values))[0])
    .reshape(-1)
    .tolist()
)

from sklearn.metrics import accuracy_score, f1_score

# Gold labels: drop the trailing space of "__label__<class> " so they are
# comparable with the predicted label strings.
y_true = [tagged.split(" ")[0] for tagged in test["fasttextlabel"]]

accuracy = accuracy_score(y_true, y_pred)
print("Accuracy: ", accuracy)
f1 = f1_score(y_true, y_pred, average="macro", labels=list(set(y_true)))
print("F1 score: ", f1)

Wrote data from ../data/lgbt-en.test.tsv to ../data/lgbt-en.test.tsvfasttext
Accuracy:  0.7404129793510325
F1 score:  0.6303162486368593


## Slovenian

In [70]:
# Point the file-name globals at the Slovenian splits; the following cells read them.
train_fname = "../data/lgbt-sl.train.tsv"
test_fname = "../data/lgbt-sl.test.tsv"
# Write the "<fname>fasttext" files next to the TSVs. The returned frames are not
# needed here; assigning the last one to `_` keeps it from being shown as cell output.
prepare_for_fasttext(train_fname)
_ = prepare_for_fasttext(test_fname)

Wrote data from ../data/lgbt-sl.train.tsv to ../data/lgbt-sl.train.tsvfasttext
Wrote data from ../data/lgbt-sl.test.tsv to ../data/lgbt-sl.test.tsvfasttext


In [71]:
%%time
import fasttext
# Train a supervised classifier on the training file written by prepare_for_fasttext.
# This rebinds `model`, replacing the classifier trained in the previous section.
model = fasttext.train_supervised(input=train_fname+"fasttext", epoch=1000, lr=0.05)
# Evaluate on the test file; the returned tuple is displayed as the cell output
# (per the fastText docs: number of examples, precision@1, recall@1).
model.test(test_fname+"fasttext")

CPU times: user 19.8 s, sys: 212 ms, total: 20 s
Wall time: 3.02 s


(900, 0.62, 0.62)

In [72]:
# Re-create the labelled test frame (this also rewrites the "<fname>fasttext" file).
test = prepare_for_fasttext(test_fname)

# model.predict returns (labels, probabilities); keep the labels and flatten
# them to one predicted label per text.
y_pred = (
    np.array(model.predict(list(test["text"].values))[0])
    .reshape(-1)
    .tolist()
)

from sklearn.metrics import accuracy_score, f1_score

# Gold labels: drop the trailing space of "__label__<class> " so they are
# comparable with the predicted label strings.
y_true = [tagged.split(" ")[0] for tagged in test["fasttextlabel"]]

accuracy = accuracy_score(y_true, y_pred)
print("Accuracy: ", accuracy)
f1 = f1_score(y_true, y_pred, average="macro", labels=list(set(y_true)))
print("F1 score: ", f1)

Wrote data from ../data/lgbt-sl.test.tsv to ../data/lgbt-sl.test.tsvfasttext
Accuracy:  0.62
F1 score:  0.6199080271275519


## Croatian

In [73]:
# Point the file-name globals at the Croatian splits; the following cells read them.
train_fname = "../data/lgbt-hr.train.tsv"
test_fname = "../data/lgbt-hr.test.tsv"
# Write the "<fname>fasttext" files next to the TSVs. The returned frames are not
# needed here; assigning the last one to `_` keeps it from being shown as cell output.
prepare_for_fasttext(train_fname)
_ = prepare_for_fasttext(test_fname)

Wrote data from ../data/lgbt-hr.train.tsv to ../data/lgbt-hr.train.tsvfasttext
Wrote data from ../data/lgbt-hr.test.tsv to ../data/lgbt-hr.test.tsvfasttext


In [74]:
%%time
import fasttext
# Train a supervised classifier on the training file written by prepare_for_fasttext.
# This rebinds `model`, replacing the classifier trained in the previous section.
model = fasttext.train_supervised(input=train_fname+"fasttext", epoch=1000, lr=0.05)
# Evaluate on the test file; the returned tuple is displayed as the cell output
# (per the fastText docs: number of examples, precision@1, recall@1).
model.test(test_fname+"fasttext")

CPU times: user 25.2 s, sys: 232 ms, total: 25.4 s
Wall time: 3.83 s


(1142, 0.7224168126094571, 0.7224168126094571)

In [75]:
# Re-create the labelled test frame (this also rewrites the "<fname>fasttext" file).
test = prepare_for_fasttext(test_fname)

# model.predict returns (labels, probabilities); keep the labels and flatten
# them to one predicted label per text.
y_pred = (
    np.array(model.predict(list(test["text"].values))[0])
    .reshape(-1)
    .tolist()
)

from sklearn.metrics import accuracy_score, f1_score

# Gold labels: drop the trailing space of "__label__<class> " so they are
# comparable with the predicted label strings.
y_true = [tagged.split(" ")[0] for tagged in test["fasttextlabel"]]

accuracy = accuracy_score(y_true, y_pred)
print("Accuracy: ", accuracy)
f1 = f1_score(y_true, y_pred, average="macro", labels=list(set(y_true)))
print("F1 score: ", f1)

Wrote data from ../data/lgbt-hr.test.tsv to ../data/lgbt-hr.test.tsvfasttext
Accuracy:  0.7224168126094571
F1 score:  0.6945387840582412
