In [1]:
import pandas as pd
import numpy as np

train_fname = "../data/lgbt-en.train.tsv"
test_fname = "../data/lgbt-en.test.tsv"

def read_file(fname: str) -> pd.DataFrame:
    """Reads a filename and formats it properly for simpletransformers"""
    df = pd.read_table(fname, sep="\t", header=None, names="text,labels,role".split(","))
    offensive_ids = df.labels != "Acceptable speech"

    df.labels[offensive_ids] = 1
    df.labels[~offensive_ids] = 0
    
    df["labels"] = df.labels.astype(np.int8)
    df = df.drop(columns=["role"])
    return df


train = read_file(train_fname)
test = read_file(test_fname)

In [17]:
from simpletransformers.classification import ClassificationModel

model_args = {
    "num_train_epochs": 5,
    "learning_rate": 1e-5,
    "overwrite_output_dir": True,
    "train_batch_size": 40
}

model = ClassificationModel(
    "roberta", "roberta-base", use_cuda=True,
    args=model_args
    
)

model.overwrite_output_dir = True
model.train_model(train, )

print(model.eval_model(test))
from sklearn.metrics import accuracy_score, f1_score
y_true = test["labels"]
y_pred = model.predict(list(test["text"].values))[0]

accuracy = accuracy_score(y_true, y_pred)
print("Accuracy: ", accuracy)
f1 = f1_score(y_true, y_pred)
print("F1 score: ", f1)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.out_p

HBox(children=(FloatProgress(value=0.0, max=4819.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=5.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0 of 5', max=121.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 1 of 5', max=121.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 2 of 5', max=121.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 3 of 5', max=121.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 4 of 5', max=121.0, style=ProgressStyle(des…





HBox(children=(FloatProgress(value=0.0, max=1017.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Evaluation', max=128.0, style=ProgressStyle(descr…


({'mcc': 0.581372370408602, 'tp': 181, 'tn': 672, 'fp': 68, 'fn': 96, 'auroc': 0.903768660357108, 'auprc': 0.8075992112897901, 'eval_loss': 0.3702339269220829}, array([[ 1.77050781, -1.75585938],
       [ 2.47460938, -2.47851562],
       [ 0.47949219, -0.50244141],
       ...,
       [ 2.48046875, -2.0703125 ],
       [-0.59765625,  0.74609375],
       [ 2.27148438, -1.90625   ]]), [])


HBox(children=(FloatProgress(value=0.0, max=1017.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=128.0), HTML(value='')))


Accuracy:  0.8387413962635202
F1 score:  0.688212927756654


## Distilbert

In [29]:
from simpletransformers.classification import ClassificationModel

model_args = {
    
    "learning_rate": 1e-3,
    "batch_size": 100,
    "warmup": 600,
    "max_seq_length": 128,
    "num_train_epochs": 5,

    "overwrite_output_dir": True,
    "train_batch_size": 300
}

model = ClassificationModel(
    "distilbert", "distilbert-base-uncased-finetuned-sst-2-english", use_cuda=True,
    args=model_args
    
)

model.overwrite_output_dir = True
model.train_model(train, )


print(model.eval_model(test))
from sklearn.metrics import accuracy_score, f1_score
y_true = test["labels"]
y_pred = model.predict(list(test["text"].values))[0]

accuracy = accuracy_score(y_true, y_pred)
print("Accuracy: ", accuracy)
f1 = f1_score(y_true, y_pred)
print("F1 score: ", f1)

HBox(children=(FloatProgress(value=0.0, max=4819.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=5.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0 of 5', max=17.0, style=ProgressStyle(desc…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 1 of 5', max=17.0, style=ProgressStyle(desc…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 2 of 5', max=17.0, style=ProgressStyle(desc…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 3 of 5', max=17.0, style=ProgressStyle(desc…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 4 of 5', max=17.0, style=ProgressStyle(desc…





HBox(children=(FloatProgress(value=0.0, max=1017.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Evaluation', max=128.0, style=ProgressStyle(descr…


({'mcc': 0.0, 'tp': 0, 'tn': 740, 'fp': 0, 'fn': 277, 'auroc': 0.5124109669236023, 'auprc': 0.2812159264402502, 'eval_loss': 0.5848753452301025}, array([[ 0.45898438, -0.42358398],
       [ 0.45898438, -0.42358398],
       [ 0.45898438, -0.42358398],
       ...,
       [ 0.45898438, -0.42358398],
       [ 0.45874023, -0.42382812],
       [ 0.45874023, -0.42382812]]), [])


HBox(children=(FloatProgress(value=0.0, max=1017.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=128.0), HTML(value='')))


Accuracy:  0.727630285152409
F1 score:  0.0


#  Hugging face

In [35]:

import torch
from transformers import AdamW, AutoTokenizer, AutoModelForSequenceClassification

# Same as before
checkpoint = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
sequences = list(train.text.values)
batch = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")

# This is new
batch["labels"] = torch.tensor(list(train.labels.values))

optimizer = AdamW(model.parameters())
loss = model(**batch).loss
loss.backward()
optimizer.step()

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

RuntimeError: [enforce fail at CPUAllocator.cpp:71] . DefaultCPUAllocator: can't allocate memory: you tried to allocate 60637052928 bytes. Error code 12 (Cannot allocate memory)

# Comparing results with slovenian and croatian data

In [36]:
train_fname = "../data/lgbt-sl.train.tsv"
test_fname = "../data/lgbt-sl.test.tsv"

train = read_file(train_fname)
test = read_file(test_fname)

In [37]:

model_args = {
    "num_train_epochs": 5,
    "learning_rate": 1e-5,
    "overwrite_output_dir": True,
    "train_batch_size": 40
}

model = ClassificationModel(
    "roberta", "EMBEDDIA/sloberta", use_cuda=True,
    args=model_args
    
)

model.overwrite_output_dir = True
model.train_model(train, )

print(model.eval_model(test))
from sklearn.metrics import accuracy_score, f1_score
y_true = test["labels"]
y_pred = model.predict(list(test["text"].values))[0]

accuracy = accuracy_score(y_true, y_pred)
print("Accuracy: ", accuracy)
f1 = f1_score(y_true, y_pred)
print("F1 score: ", f1)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=520.0, style=ProgressStyle(description_…

You are using a model of type camembert to instantiate a model of type roberta. This is not supported for all configurations of models and can yield errors.





HBox(children=(FloatProgress(value=0.0, description='Downloading', max=442838427.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at EMBEDDIA/sloberta were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at EMBEDDIA/sloberta and are newly initialized: ['roberta.pooler.dense.weight', 'classifier.dense.weight', 'clas

OSError: Can't load tokenizer for 'EMBEDDIA/sloberta'. Make sure that:

- 'EMBEDDIA/sloberta' is a correct model identifier listed on 'https://huggingface.co/models'

- or 'EMBEDDIA/sloberta' is the correct path to a directory containing relevant tokenizer files



# Fasttext

In [2]:
def prepare_for_fasttext(fname):
    df = pd.read_table(fname, sep="\t", header=None, names="text,labels,role".split(","))
    offensive_ids = df.labels != "Acceptable speech"

    df.labels[offensive_ids] = "Offensive"
    df.labels[~offensive_ids] = "Acceptable"


    df["fasttextlabel"] = df.labels.apply(lambda s: "__label__"+s+" ")
    df["fasttext_all"] = df.fasttextlabel + df.text
    with open(fname+"fasttext", "w") as f:
        for line in list(df.fasttext_all.values):
            f.write(line+"\n")
    print(f"Wrote data from {fname} to {fname+'fasttext'}")
    

In [6]:
train_fname = "../data/lgbt-en.train.tsv"
test_fname = "../data/lgbt-en.test.tsv"
prepare_for_fasttext(train_fname)
prepare_for_fasttext(test_fname)

Wrote data from ../data/lgbt-en.train.tsv to ../data/lgbt-en.train.tsvfasttext
Wrote data from ../data/lgbt-en.test.tsv to ../data/lgbt-en.test.tsvfasttext


In [9]:
%%time
import fasttext
model = fasttext.train_supervised(input="../data/lgbt-en.train.tsvfasttext", epoch=1000, lr=0.05)
model.test("../data/lgbt-en.test.tsvfasttext")

CPU times: user 30.3 s, sys: 280 ms, total: 30.6 s
Wall time: 4.54 s


(1017, 0.7453294001966568, 0.7453294001966568)

In [31]:
model.predict("Ti si rdečelasec")

(('__label__Offensive',), array([0.52610397]))

In [25]:
!head -n 10 ../data/lgbt-sl.train.tsvfasttext

__label__Acceptable No, pa smo tu - primer očeta in matere, ki sta tako slabo skrbela za dojenčka, da je umrl, pa še dva ostala otroka so zaradi podhranjenosti zadržali v UKC!! http://wap.tuskabina.si/sl/kronika/30633
__label__Offensive Tristan, ste žaljivi in druge podcenjujete! Naj vam nekaj povem! S tem ravno vi in s svojim žaljivim pisanjem kažete raven nestrpnosti do drugačnih od vas ! O inteligenci pa...no saj, zazrite se vase! Kaj ste pa mislili kako bomo nekateri reagirali na vaše pisanje? Z razumevanjem?? Pa , če ste že tako prepričani v svojo superiornost, le čemu se potemtakem skrivate za lažnim profilom??? Pa DA, sem normalen človek in nisem se rodil s strahom--do ničesar! ( Si poleg svojih objav tule upam celo pokazati svoj obraz , za razliko od nekaterih drugih) In DA, nihče ne vpliva na moje mišljenje in nisem vodljiv !! Sem kar sem in razmišljam s svojo glavo, vsaj to pravico v tej nori državi še zaenkrat imam!!
__label__Acceptable Otroci so odprti in brez predsodkov.Pr

AttributeError: module 'fasttext' has no attribute 'help'