# Simpletransformers

In [6]:
import pandas as pd
import numpy as np
import torch


# Language codes of the per-language datasets evaluated in this cell.
languages = "en sl hr".split()
# Per-language metric stores, keyed by language code and filled by the loop below.
accuracies, f1_scores = {}, {}


def read_file(fname: str) -> pd.DataFrame:
    """Load a tab-separated dataset and encode its string labels as integers.

    The file is read without a header row as three columns named ``text``,
    ``labels`` and ``role``. Every label string is replaced by its position in
    a fixed list of six class names, and the ``role`` column is dropped.

    Args:
        fname: Path of the TSV file to read.

    Returns:
        DataFrame with the columns ``text`` and ``labels`` (integer class ids).

    Raises:
        KeyError: If a label in the file is not one of the six known classes.
    """
    # The position in this tuple is the integer id assigned to the class.
    class_names = (
        "Background offensive",
        "Acceptable speech",
        "Other offensive",
        "Background violence",
        "Inappropriate",
        "Other violence",
    )
    class_to_id = {name: idx for idx, name in enumerate(class_names)}

    frame = pd.read_table(fname, sep="\t", header=None, names=["text", "labels", "role"])
    frame["labels"] = frame["labels"].apply(lambda label: class_to_id[label])
    return frame.drop(columns=["role"])

# Imports are placed ahead of the loop so they run once rather than once per language.
from simpletransformers.classification import ClassificationModel
from sklearn.metrics import accuracy_score, f1_score

# Training options shared by every language; "overwrite_output_dir" lets each
# run reuse the same output directory.
model_args = {
    "num_train_epochs": 5,
    "learning_rate": 1e-5,
    "overwrite_output_dir": True,
    "train_batch_size": 40,
}

# Fine-tune and evaluate one classifier per language, recording accuracy and
# macro-averaged F1 on that language's test split.
for lang in languages:
    train_fname = f"../data/lgbt-{lang}.train.tsv"
    test_fname = f"../data/lgbt-{lang}.test.tsv"

    train = read_file(train_fname)
    test = read_file(test_fname)

    # The model is re-created from the "roberta-base" checkpoint for every
    # language, with six output classes, and is run on CPU (use_cuda=False).
    model = ClassificationModel(
        "roberta", "roberta-base", use_cuda=False,
        args=model_args,
        num_labels=6,
    )
    model.train_model(train)

    y_true = test["labels"]
    # Only the first element of predict()'s return value is used as the labels.
    y_pred = model.predict(list(test["text"].values))[0]

    accuracies[lang] = accuracy_score(y_true, y_pred)
    f1_scores[lang] = f1_score(y_true, y_pred, average="macro")

# Print the results as a Markdown table with three significant digits.
print("""
|  language | accuracy  |  f1 |
|---|---|---|""")
for lang in languages:
    print(f"|{lang}| {accuracies[lang]:0.3} | {f1_scores[lang]:0.3} |")

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.de

HBox(children=(FloatProgress(value=0.0, max=4819.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=5.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0 of 5', max=121.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 1 of 5', max=121.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 2 of 5', max=121.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 3 of 5', max=121.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 4 of 5', max=121.0, style=ProgressStyle(des…





HBox(children=(FloatProgress(value=0.0, max=1017.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=128.0), HTML(value='')))




Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.de

HBox(children=(FloatProgress(value=0.0, max=2844.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=5.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0 of 5', max=72.0, style=ProgressStyle(desc…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 1 of 5', max=72.0, style=ProgressStyle(desc…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 2 of 5', max=72.0, style=ProgressStyle(desc…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 3 of 5', max=72.0, style=ProgressStyle(desc…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 4 of 5', max=72.0, style=ProgressStyle(desc…





HBox(children=(FloatProgress(value=0.0, max=900.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=113.0), HTML(value='')))




Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.de

HBox(children=(FloatProgress(value=0.0, max=4495.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=5.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0 of 5', max=113.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 1 of 5', max=113.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 2 of 5', max=113.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 3 of 5', max=113.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 4 of 5', max=113.0, style=ProgressStyle(des…





HBox(children=(FloatProgress(value=0.0, max=1142.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=143.0), HTML(value='')))



|  language | accuracy  |  f1 |
|---|---|---|
|en| 0.814 | 0.33 |
|sl| 0.429 | 0.153 |
|hr| 0.645 | 0.29 |


# Fasttext

In [23]:
def prepare_for_fasttext(fname):
    """Convert a TSV dataset into a fastText-style labelled text file.

    ``fname`` is read as a header-less, tab-separated file with the columns
    ``text``, ``labels`` and ``role``. One line per row is written to
    ``fname + "fasttext"`` in the form ``__label__<label> <text>``, where
    spaces inside the label are replaced by underscores.

    Args:
        fname: Path of the TSV file, as a ``str`` (the output path is built by
            string concatenation).

    Returns:
        The DataFrame read from ``fname`` with two added columns:
        ``fasttextlabel`` (the prefixed label followed by one space) and
        ``fasttext_all`` (the complete output line without its newline).
    """
    df = pd.read_table(fname, sep="\t", header=None, names=["text", "labels", "role"])

    df["fasttextlabel"] = df.labels.apply(lambda s: "__label__" + s.replace(" ", "_") + " ")
    df["fasttext_all"] = df.fasttextlabel + df.text

    out_fname = fname + "fasttext"
    # Write explicitly as UTF-8: without it the platform default encoding is
    # used, which is not guaranteed to represent non-ASCII characters.
    with open(out_fname, "w", encoding="utf-8") as f:
        f.writelines(line + "\n" for line in df.fasttext_all)
    print(f"Wrote data from {fname} to {out_fname}")

    return df

# Imports are placed ahead of the loop so they run once rather than once per language.
import fasttext
from sklearn.metrics import accuracy_score, f1_score

languages = ["en", "sl", "hr"]
accuracies = dict()
f1_scores = dict()

# Train and evaluate one fastText classifier per language, recording accuracy
# and macro-averaged F1 on that language's test split.
for lang in languages:
    train_fname = f"../data/lgbt-{lang}.train.tsv"
    test_fname = f"../data/lgbt-{lang}.test.tsv"

    # Each split is converted exactly once; the frame returned for the test
    # split is kept for evaluation instead of converting the file a second time.
    test = prepare_for_fasttext(test_fname)
    prepare_for_fasttext(train_fname)

    model = fasttext.train_supervised(input=train_fname+"fasttext", epoch=1000, lr=0.05)

    # Only the first element of predict()'s return value is used; it is
    # flattened into a plain list of label strings.
    y_pred = np.array(model.predict(list(test.text.values))[0]).reshape(-1).tolist()

    # "fasttextlabel" ends with a space; splitting on it leaves the bare
    # "__label__..." token to compare against the predictions.
    y_true = test["fasttextlabel"].apply(lambda s: s.split(" ")[0]).values.tolist()

    accuracies[lang] = accuracy_score(y_true, y_pred)
    f1_scores[lang] = f1_score(y_true, y_pred, average="macro")


# Print the results as a Markdown table.
print("""
|  language | accuracy  |  f1 |
|---|---|---|""")
for lang in languages:
    print(f"|{lang}| {accuracies[lang]} | {f1_scores[lang]} |")

Wrote data from ../data/lgbt-en.test.tsv to ../data/lgbt-en.test.tsvfasttext
Wrote data from ../data/lgbt-en.train.tsv to ../data/lgbt-en.train.tsvfasttext
Wrote data from ../data/lgbt-en.test.tsv to ../data/lgbt-en.test.tsvfasttext
Wrote data from ../data/lgbt-sl.test.tsv to ../data/lgbt-sl.test.tsvfasttext
Wrote data from ../data/lgbt-sl.train.tsv to ../data/lgbt-sl.train.tsvfasttext
Wrote data from ../data/lgbt-sl.test.tsv to ../data/lgbt-sl.test.tsvfasttext
Wrote data from ../data/lgbt-hr.test.tsv to ../data/lgbt-hr.test.tsvfasttext
Wrote data from ../data/lgbt-hr.train.tsv to ../data/lgbt-hr.train.tsvfasttext
Wrote data from ../data/lgbt-hr.test.tsv to ../data/lgbt-hr.test.tsvfasttext

|  language | accuracy  |  f1 |
|---|---|---|
|en| 0.7050147492625368 | 0.22781099892926118 |
|sl| 0.47 | 0.23323056764230118 |
|hr| 0.6050788091068301 | 0.3491833127771309 |


# sklearn

In [31]:
# Language codes of the per-language datasets evaluated in this cell.
languages = "en sl hr".split()
# Per-language metric stores, keyed by language code and filled by the loop below.
accuracies, f1_scores = {}, {}

def read_file(fname: str) -> pd.DataFrame:
    """Load a tab-separated dataset, keeping the labels as raw strings.

    The file is read without a header row as three columns named ``text``,
    ``labels`` and ``role``; the ``role`` column is then dropped.

    NOTE: this re-definition replaces the integer-encoding ``read_file``
    defined earlier in the notebook once this cell has been executed.

    Args:
        fname: Path of the TSV file to read.

    Returns:
        DataFrame with the columns ``text`` and ``labels``.
    """
    column_names = ["text", "labels", "role"]
    raw = pd.read_table(fname, sep="\t", header=None, names=column_names)
    return raw.drop(columns=["role"])

# Imports are placed ahead of the loop so they run once rather than once per language.
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import accuracy_score, f1_score
from sklearn.svm import SVC

# Train and evaluate one TF-IDF + SVC classifier per language, recording
# accuracy and macro-averaged F1 on that language's test split.
for lang in languages:
    train_fname = f"../data/lgbt-{lang}.train.tsv"
    test_fname = f"../data/lgbt-{lang}.test.tsv"
    train = read_file(train_fname)
    test = read_file(test_fname)

    # 1- to 3-gram counts; the vocabulary is fitted on the training text only.
    count_vect = CountVectorizer(ngram_range=(1,3))
    X_train_counts = count_vect.fit_transform(train.text.values)

    # TF-IDF weighting fitted on the training counts. (The previous extra
    # term-frequency-only transform was never used and has been removed.)
    tfidf_transformer = TfidfTransformer()
    X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

    clf = SVC().fit(X=X_train_tfidf, y=train.labels)

    # The test text goes through the already-fitted vectorizer and transformer.
    docs_new = test.text.values.tolist()
    X_new_counts = count_vect.transform(docs_new)
    X_new_tfidf = tfidf_transformer.transform(X_new_counts)

    predicted = clf.predict(X_new_tfidf)

    y_true = test["labels"]

    accuracies[lang] = accuracy_score(y_true, predicted)
    f1_scores[lang] = f1_score(y_true, predicted, average="macro")


# Print the results as a Markdown table.
print("""
|  language | accuracy  |  f1 |
|---|---|---|""")
for lang in languages:
    print(f"|{lang}| {accuracies[lang]} | {f1_scores[lang]} |")


|  language | accuracy  |  f1 |
|---|---|---|
|en| 0.7394296951819076 | 0.1711102220747723 |
|sl| 0.43555555555555553 | 0.10962285904460795 |
|hr| 0.6120840630472855 | 0.28993712690907253 |


In [3]:
# Distinct label values in `train`. This cell does not define `train`; it must
# already exist in the kernel namespace from a previously executed cell.
train["labels"].unique()

array(['Background offensive', 'Acceptable speech', 'Other offensive',
       'Background violence', 'Inappropriate', 'Other violence'],
      dtype=object)