# Simpletransformers

In [1]:
import pandas as pd
import numpy as np
import torch

train_fname = "../data/lgbt-en.train.tsv"
test_fname = "../data/lgbt-en.test.tsv"

def read_file(fname: str) -> pd.DataFrame:
    """Reads a filename and formats it properly for simpletransformers"""
    df = pd.read_table(fname, sep="\t", header=None, names="text,labels,role".split(","))
    offensive_ids = df.labels != "Acceptable speech"

    df.labels[offensive_ids] = 1
    df.labels[~offensive_ids] = 0
    
    df["labels"] = df.labels.astype(np.int8)
    df = df.drop(columns=["role"])
    return df


train = read_file(train_fname)
test = read_file(test_fname)

In [2]:
%%time
from simpletransformers.classification import ClassificationModel

model_args = {
    "num_train_epochs": 5,
    "learning_rate": 1e-5,
    "overwrite_output_dir": True,
    "train_batch_size": 40
}

model = ClassificationModel(
    "roberta", "roberta-base", use_cuda=True,
    args=model_args
    
)

model.overwrite_output_dir = True
model.train_model(train, )

print(model.eval_model(test))
from sklearn.metrics import accuracy_score, f1_score
y_true = test["labels"]
y_pred = model.predict(list(test["text"].values))[0]

accuracy = accuracy_score(y_true, y_pred)
print("Accuracy: ", accuracy)
f1 = f1_score(y_true, y_pred)
print("F1 score: ", f1)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.

HBox(children=(FloatProgress(value=0.0, max=4819.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=5.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0 of 5', max=121.0, style=ProgressStyle(des…

  torch.nn.utils.clip_grad_norm_(





HBox(children=(FloatProgress(value=0.0, description='Running Epoch 1 of 5', max=121.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 2 of 5', max=121.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 3 of 5', max=121.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 4 of 5', max=121.0, style=ProgressStyle(des…





HBox(children=(FloatProgress(value=0.0, max=1017.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Evaluation', max=128.0, style=ProgressStyle(descr…


({'mcc': 0.5917537810251161, 'tp': 180, 'tn': 678, 'fp': 62, 'fn': 97, 'auroc': 0.8883622792467558, 'auprc': 0.7867289171575443, 'eval_loss': 0.3954293243587017}, array([[ 1.68457031, -1.86621094],
       [ 2.43554688, -2.44726562],
       [ 0.55029297, -0.77636719],
       ...,
       [ 1.85546875, -1.91699219],
       [-1.25390625,  1.21289062],
       [ 1.66015625, -1.8671875 ]]), [])


HBox(children=(FloatProgress(value=0.0, max=1017.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=128.0), HTML(value='')))


Accuracy:  0.8436578171091446
F1 score:  0.6936416184971097


## Slovenian

In [4]:
%%time
train_fname = "../data/lgbt-sl.train.tsv"
test_fname = "../data/lgbt-sl.test.tsv"

train = read_file(train_fname)
test = read_file(test_fname)


model_args = {
    "num_train_epochs": 5,
    "learning_rate": 1e-5,
    "overwrite_output_dir": True,
    "train_batch_size": 40
}

model = ClassificationModel(
    "roberta", "roberta-base", use_cuda=True,
    args=model_args
    
)

model.overwrite_output_dir = True
model.train_model(train, )

print(model.eval_model(test))
from sklearn.metrics import accuracy_score, f1_score
y_true = test["labels"]
y_pred = model.predict(list(test["text"].values))[0]

accuracy = accuracy_score(y_true, y_pred)
print("Accuracy: ", accuracy)
f1 = f1_score(y_true, y_pred)
print("F1 score: ", f1)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.

HBox(children=(FloatProgress(value=0.0, max=2844.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=5.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0 of 5', max=72.0, style=ProgressStyle(desc…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 1 of 5', max=72.0, style=ProgressStyle(desc…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 2 of 5', max=72.0, style=ProgressStyle(desc…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 3 of 5', max=72.0, style=ProgressStyle(desc…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 4 of 5', max=72.0, style=ProgressStyle(desc…





HBox(children=(FloatProgress(value=0.0, max=900.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Evaluation', max=113.0, style=ProgressStyle(descr…


({'mcc': 0.21212251879575894, 'tp': 277, 'tn': 261, 'fp': 128, 'fn': 234, 'auroc': 0.6601024253064961, 'auprc': 0.7018209307587928, 'eval_loss': 0.7219902477433197}, array([[-0.41552734, -0.00978088],
       [ 0.73535156, -0.83007812],
       [-0.60546875,  0.28930664],
       ...,
       [-0.4597168 ,  0.08905029],
       [-0.68798828,  0.55419922],
       [ 0.31982422, -0.94628906]]), [])


HBox(children=(FloatProgress(value=0.0, max=900.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=113.0), HTML(value='')))


Accuracy:  0.5977777777777777
F1 score:  0.6048034934497817
CPU times: user 46.5 s, sys: 21.9 s, total: 1min 8s
Wall time: 1min 8s


## Croatian

In [5]:
%%time
train_fname = "../data/lgbt-hr.train.tsv"
test_fname = "../data/lgbt-hr.test.tsv"

train = read_file(train_fname)
test = read_file(test_fname)


model_args = {
    "num_train_epochs": 5,
    "learning_rate": 1e-5,
    "overwrite_output_dir": True,
    "train_batch_size": 40
}

model = ClassificationModel(
    "roberta", "roberta-base", use_cuda=True,
    args=model_args
    
)

model.overwrite_output_dir = True
model.train_model(train, )

print(model.eval_model(test))
from sklearn.metrics import accuracy_score, f1_score
y_true = test["labels"]
y_pred = model.predict(list(test["text"].values))[0]

accuracy = accuracy_score(y_true, y_pred)
print("Accuracy: ", accuracy)
f1 = f1_score(y_true, y_pred)
print("F1 score: ", f1)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.

HBox(children=(FloatProgress(value=0.0, max=4495.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=5.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0 of 5', max=113.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 1 of 5', max=113.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 2 of 5', max=113.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 3 of 5', max=113.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 4 of 5', max=113.0, style=ProgressStyle(des…





HBox(children=(FloatProgress(value=0.0, max=1142.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Evaluation', max=143.0, style=ProgressStyle(descr…


({'mcc': 0.5104076264731185, 'tp': 629, 'tn': 262, 'fp': 136, 'fn': 115, 'auroc': 0.8396873480304751, 'auprc': 0.9017786613640991, 'eval_loss': 0.5006679700804757}, array([[ 1.5703125 , -1.86328125],
       [-2.17578125,  1.86914062],
       [-0.23706055,  0.10772705],
       ...,
       [-1.36425781,  1.35253906],
       [-0.01015472,  0.04400635],
       [ 1.47753906, -1.82324219]]), [])


HBox(children=(FloatProgress(value=0.0, max=1142.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=143.0), HTML(value='')))


Accuracy:  0.7802101576182137
F1 score:  0.8336646785950961
CPU times: user 1min 7s, sys: 27.7 s, total: 1min 35s
Wall time: 1min 35s


# "sangrimlee/bert-base-multilingual-cased-nsmc"

In [2]:
import pandas as pd
import numpy as np
import torch

train_fname = "../data/lgbt-en.train.tsv"
test_fname = "../data/lgbt-en.test.tsv"

def read_file(fname: str) -> pd.DataFrame:
    """Reads a filename and formats it properly for simpletransformers"""
    df = pd.read_table(fname, sep="\t", header=None, names="text,labels,role".split(","))
    offensive_ids = df.labels != "Acceptable speech"

    df.labels[offensive_ids] = 1
    df.labels[~offensive_ids] = 0
    
    df["labels"] = df.labels.astype(np.int8)
    df = df.drop(columns=["role"])
    return df


train = read_file(train_fname)
test = read_file(test_fname)

In [3]:
%%time
from simpletransformers.classification import ClassificationModel

model_args = {
    "num_train_epochs": 5,
    "learning_rate": 1e-5,
    "overwrite_output_dir": True,
    "train_batch_size": 40
}

model = ClassificationModel(
    "bert", "sangrimlee/bert-base-multilingual-cased-nsmc", use_cuda=True,
    args=model_args
    
)

model.overwrite_output_dir = True
model.train_model(train, )

print(model.eval_model(test))
from sklearn.metrics import accuracy_score, f1_score
y_true = test["labels"]
y_pred = model.predict(list(test["text"].values))[0]

accuracy = accuracy_score(y_true, y_pred)
print("Accuracy: ", accuracy)
f1 = f1_score(y_true, y_pred)
print("F1 score: ", f1)

HBox(children=(FloatProgress(value=0.0, max=4819.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=5.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0 of 5', max=121.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 1 of 5', max=121.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 2 of 5', max=121.0, style=ProgressStyle(des…

  torch.nn.utils.clip_grad_norm_(





HBox(children=(FloatProgress(value=0.0, description='Running Epoch 3 of 5', max=121.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 4 of 5', max=121.0, style=ProgressStyle(des…





HBox(children=(FloatProgress(value=0.0, max=1017.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Evaluation', max=128.0, style=ProgressStyle(descr…


({'mcc': 0.45394849183062413, 'tp': 147, 'tn': 661, 'fp': 79, 'fn': 130, 'auroc': 0.8198653527173383, 'auprc': 0.6648347142606493, 'eval_loss': 0.5325278723612428}, array([[ 1.640625  , -1.56640625],
       [ 2.69140625, -2.86523438],
       [-0.31860352,  0.42211914],
       ...,
       [ 1.98632812, -2.06054688],
       [-1.21484375,  1.14746094],
       [ 1.5390625 , -1.72363281]]), [])


HBox(children=(FloatProgress(value=0.0, max=1017.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=128.0), HTML(value='')))


Accuracy:  0.7944936086529006
F1 score:  0.584493041749503
CPU times: user 1min 21s, sys: 36.8 s, total: 1min 58s
Wall time: 2min 5s


## Slovenian

In [4]:
%%time
train_fname = "../data/lgbt-sl.train.tsv"
test_fname = "../data/lgbt-sl.test.tsv"

train = read_file(train_fname)
test = read_file(test_fname)


model_args = {
    "num_train_epochs": 5,
    "learning_rate": 1e-5,
    "overwrite_output_dir": True,
    "train_batch_size": 40
}

model = ClassificationModel(
    "bert", "sangrimlee/bert-base-multilingual-cased-nsmc", use_cuda=True,
    args=model_args
    
)

model.overwrite_output_dir = True
model.train_model(train, )

print(model.eval_model(test))
from sklearn.metrics import accuracy_score, f1_score
y_true = test["labels"]
y_pred = model.predict(list(test["text"].values))[0]

accuracy = accuracy_score(y_true, y_pred)
print("Accuracy: ", accuracy)
f1 = f1_score(y_true, y_pred)
print("F1 score: ", f1)

HBox(children=(FloatProgress(value=0.0, max=2844.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=5.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0 of 5', max=72.0, style=ProgressStyle(desc…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 1 of 5', max=72.0, style=ProgressStyle(desc…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 2 of 5', max=72.0, style=ProgressStyle(desc…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 3 of 5', max=72.0, style=ProgressStyle(desc…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 4 of 5', max=72.0, style=ProgressStyle(desc…





HBox(children=(FloatProgress(value=0.0, max=900.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Evaluation', max=113.0, style=ProgressStyle(descr…


({'mcc': 0.32941565551096874, 'tp': 310, 'tn': 282, 'fp': 107, 'fn': 201, 'auroc': 0.7319787301475509, 'auprc': 0.7756147838560125, 'eval_loss': 0.7730572930479471}, array([[ 0.75537109, -0.97167969],
       [-1.00878906,  0.87939453],
       [-1.76269531,  1.63085938],
       ...,
       [ 0.52490234, -0.86767578],
       [-1.01171875,  0.80761719],
       [ 0.58056641, -1.03222656]]), [['Edina golazen trenutno tukaj si ti, me pa res zanima kdo se za tem fb profilom res skriva 🙄', 'KARLO KANTARE....', 'Pedri so lejzbike napadli, svašta😂', 'da bi biv na njegovem mesti, štrik pa pod jabko, čaooo !!', 'Če hoče biti ženska, prov , spremeni spol pa je to to. Ne pa da je ženska in moški, groza.Kako bodo starši majhnim otrokom razložil kaj predstavlja, maškaro , zbudite se. Nisem starokopitna , zadrgnjena pa ne vem kako me boste še poimenovali, samo vsak mora bit v eni koži ne v dveh :(', 'Muslimanska vira,sploh ni in ne bo priznana v Sloveniji.Gdo to piše je velika budala......', 'Določene

HBox(children=(FloatProgress(value=0.0, max=900.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=113.0), HTML(value='')))


Accuracy:  0.6577777777777778
F1 score:  0.6681034482758621
CPU times: user 52.5 s, sys: 28.5 s, total: 1min 21s
Wall time: 1min 28s


## Croatian

In [5]:
%%time
train_fname = "../data/lgbt-hr.train.tsv"
test_fname = "../data/lgbt-hr.test.tsv"

train = read_file(train_fname)
test = read_file(test_fname)


model_args = {
    "num_train_epochs": 5,
    "learning_rate": 1e-5,
    "overwrite_output_dir": True,
    "train_batch_size": 40
}

model = ClassificationModel(
    "bert", "sangrimlee/bert-base-multilingual-cased-nsmc", use_cuda=True,
    args=model_args
    
)

model.overwrite_output_dir = True
model.train_model(train, )

print(model.eval_model(test))
from sklearn.metrics import accuracy_score, f1_score
y_true = test["labels"]
y_pred = model.predict(list(test["text"].values))[0]

accuracy = accuracy_score(y_true, y_pred)
print("Accuracy: ", accuracy)
f1 = f1_score(y_true, y_pred)
print("F1 score: ", f1)

HBox(children=(FloatProgress(value=0.0, max=4495.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=5.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0 of 5', max=113.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 1 of 5', max=113.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 2 of 5', max=113.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 3 of 5', max=113.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 4 of 5', max=113.0, style=ProgressStyle(des…





HBox(children=(FloatProgress(value=0.0, max=1142.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Evaluation', max=143.0, style=ProgressStyle(descr…


({'mcc': 0.5328066215514191, 'tp': 620, 'tn': 279, 'fp': 119, 'fn': 124, 'auroc': 0.8692403550008105, 'auprc': 0.9228963119282649, 'eval_loss': 0.5030829085336699}, array([[ 3.02929688, -3.21289062],
       [-2.8671875 ,  3.30664062],
       [ 0.77197266, -0.9375    ],
       ...,
       [-0.52441406,  0.45507812],
       [ 0.79052734, -1.11230469],
       [ 1.625     , -1.89453125]]), [])


HBox(children=(FloatProgress(value=0.0, max=1142.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=143.0), HTML(value='')))


Accuracy:  0.787215411558669
F1 score:  0.8361429534726905
CPU times: user 1min 12s, sys: 35.5 s, total: 1min 48s
Wall time: 1min 55s


#     "roberta", "unitary/multilingual-toxic-xlm-roberta", use_cuda=True,

In [12]:
import pandas as pd
import numpy as np
import torch

train_fname = "../data/lgbt-en.train.tsv"
test_fname = "../data/lgbt-en.test.tsv"

def read_file(fname: str) -> pd.DataFrame:
    """Reads a filename and formats it properly for simpletransformers"""
    df = pd.read_table(fname, sep="\t", header=None, names="text,labels,role".split(","))
    offensive_ids = df.labels != "Acceptable speech"

    df.labels[offensive_ids] = 1
    df.labels[~offensive_ids] = 0
    
    df["labels"] = df.labels.astype(np.int8)
    df = df.drop(columns=["role"])
    return df


train = read_file(train_fname)
test = read_file(test_fname)

In [18]:
%%time
from simpletransformers.classification import ClassificationModel

model_args = {
    "num_train_epochs": 5,
    "learning_rate": 1e-5,
    "overwrite_output_dir": True,
    "train_batch_size": 40
}

model = ClassificationModel(
    "roberta", "unitary/multilingual-toxic-xlm-roberta", use_cuda=True,
    args=model_args
    
)

model.overwrite_output_dir = True
model.train_model(train, )

print(model.eval_model(test))
from sklearn.metrics import accuracy_score, f1_score
y_true = test["labels"]
y_pred = model.predict(list(test["text"].values))[0]

accuracy = accuracy_score(y_true, y_pred)
print("Accuracy: ", accuracy)
f1 = f1_score(y_true, y_pred)
print("F1 score: ", f1)

You are using a model of type xlm-roberta to instantiate a model of type roberta. This is not supported for all configurations of models and can yield errors.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at unitary/multilingual-toxic-xlm-roberta and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'XLMRobertaTokenizer'. 
The class this function is called from is 'RobertaTokenizer'.


TypeError: expected str, bytes or os.PathLike object, not NoneType

## Slovenian

In [11]:
%%time
train_fname = "../data/lgbt-sl.train.tsv"
test_fname = "../data/lgbt-sl.test.tsv"

train = read_file(train_fname)
test = read_file(test_fname)


model_args = {
    "num_train_epochs": 5,
    "learning_rate": 1e-5,
    "overwrite_output_dir": True,
    "train_batch_size": 40
}

model = ClassificationModel(
    "roberta", "unitary/multilingual-toxic-xlm-roberta", use_cuda=True,
    args=model_args
    
)

model.overwrite_output_dir = True
model.train_model(train, )

print(model.eval_model(test))
from sklearn.metrics import accuracy_score, f1_score
y_true = test["labels"]
y_pred = model.predict(list(test["text"].values))[0]

accuracy = accuracy_score(y_true, y_pred)
print("Accuracy: ", accuracy)
f1 = f1_score(y_true, y_pred)
print("F1 score: ", f1)

You are using a model of type xlm-roberta to instantiate a model of type roberta. This is not supported for all configurations of models and can yield errors.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at unitary/multilingual-toxic-xlm-roberta and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'XLMRobertaTokenizer'. 
The class this function is called from is 'RobertaTokenizer'.


TypeError: expected str, bytes or os.PathLike object, not NoneType

## Croatian

In [5]:
%%time
train_fname = "../data/lgbt-hr.train.tsv"
test_fname = "../data/lgbt-hr.test.tsv"

train = read_file(train_fname)
test = read_file(test_fname)


model_args = {
    "num_train_epochs": 5,
    "learning_rate": 1e-5,
    "overwrite_output_dir": True,
    "train_batch_size": 40
}

model = ClassificationModel(
    "roberta", "unitary/multilingual-toxic-xlm-roberta", use_cuda=True,
    args=model_args
    
)

model.overwrite_output_dir = True
model.train_model(train, )

print(model.eval_model(test))
from sklearn.metrics import accuracy_score, f1_score
y_true = test["labels"]
y_pred = model.predict(list(test["text"].values))[0]

accuracy = accuracy_score(y_true, y_pred)
print("Accuracy: ", accuracy)
f1 = f1_score(y_true, y_pred)
print("F1 score: ", f1)

HBox(children=(FloatProgress(value=0.0, max=4495.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=5.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0 of 5', max=113.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 1 of 5', max=113.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 2 of 5', max=113.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 3 of 5', max=113.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 4 of 5', max=113.0, style=ProgressStyle(des…





HBox(children=(FloatProgress(value=0.0, max=1142.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Evaluation', max=143.0, style=ProgressStyle(descr…


({'mcc': 0.5328066215514191, 'tp': 620, 'tn': 279, 'fp': 119, 'fn': 124, 'auroc': 0.8692403550008105, 'auprc': 0.9228963119282649, 'eval_loss': 0.5030829085336699}, array([[ 3.02929688, -3.21289062],
       [-2.8671875 ,  3.30664062],
       [ 0.77197266, -0.9375    ],
       ...,
       [-0.52441406,  0.45507812],
       [ 0.79052734, -1.11230469],
       [ 1.625     , -1.89453125]]), [])


HBox(children=(FloatProgress(value=0.0, max=1142.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=143.0), HTML(value='')))


Accuracy:  0.787215411558669
F1 score:  0.8361429534726905
CPU times: user 1min 12s, sys: 35.5 s, total: 1min 48s
Wall time: 1min 55s


# Distilbert

## Slovenian

In [6]:
%%time
train_fname = "../data/lgbt-sl.train.tsv"
test_fname = "../data/lgbt-sl.test.tsv"

train = read_file(train_fname)
test = read_file(test_fname)


model_args = {
    "num_train_epochs": 5,
    "learning_rate": 1e-5,
    "overwrite_output_dir": True,
    "train_batch_size": 40
}

model = ClassificationModel(
     "distilbert", "distilbert-base-uncased-finetuned-sst-2-english", use_cuda=True,
    args=model_args
    
)

model.overwrite_output_dir = True
model.train_model(train, )

print(model.eval_model(test))
from sklearn.metrics import accuracy_score, f1_score
y_true = test["labels"]
y_pred = model.predict(list(test["text"].values))[0]

accuracy = accuracy_score(y_true, y_pred)
print("Accuracy: ", accuracy)
f1 = f1_score(y_true, y_pred)
print("F1 score: ", f1)

HBox(children=(FloatProgress(value=0.0, max=2844.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=5.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0 of 5', max=72.0, style=ProgressStyle(desc…






HBox(children=(FloatProgress(value=0.0, description='Running Epoch 1 of 5', max=72.0, style=ProgressStyle(desc…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 2 of 5', max=72.0, style=ProgressStyle(desc…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 3 of 5', max=72.0, style=ProgressStyle(desc…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 4 of 5', max=72.0, style=ProgressStyle(desc…





HBox(children=(FloatProgress(value=0.0, max=900.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Evaluation', max=113.0, style=ProgressStyle(descr…


({'mcc': 0.0865368177806467, 'tp': 248, 'tn': 234, 'fp': 155, 'fn': 263, 'auroc': 0.5793947046720227, 'auprc': 0.6323780813126891, 'eval_loss': 0.7905932401133849}, array([[-0.14331055,  0.19824219],
       [ 0.79785156, -0.68652344],
       [-0.32250977,  0.31982422],
       ...,
       [-0.04086304,  0.07098389],
       [-0.09295654,  0.17541504],
       [-0.07128906,  0.1385498 ]]), [])


HBox(children=(FloatProgress(value=0.0, max=900.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=113.0), HTML(value='')))


Accuracy:  0.5355555555555556
F1 score:  0.5426695842450766
CPU times: user 26.9 s, sys: 12.9 s, total: 39.9 s
Wall time: 42.9 s


## English

In [7]:
%%time
train_fname = "../data/lgbt-en.train.tsv"
test_fname = "../data/lgbt-en.test.tsv"

train = read_file(train_fname)
test = read_file(test_fname)


model_args = {
    "num_train_epochs": 5,
    "learning_rate": 1e-5,
    "overwrite_output_dir": True,
    "train_batch_size": 40
}

model = ClassificationModel(
     "distilbert", "distilbert-base-uncased-finetuned-sst-2-english", use_cuda=True,
    args=model_args
    
)

model.overwrite_output_dir = True
model.train_model(train, )

print(model.eval_model(test))
from sklearn.metrics import accuracy_score, f1_score
y_true = test["labels"]
y_pred = model.predict(list(test["text"].values))[0]

accuracy = accuracy_score(y_true, y_pred)
print("Accuracy: ", accuracy)
f1 = f1_score(y_true, y_pred)
print("F1 score: ", f1)

HBox(children=(FloatProgress(value=0.0, max=4819.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=5.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0 of 5', max=121.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 1 of 5', max=121.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 2 of 5', max=121.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 3 of 5', max=121.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 4 of 5', max=121.0, style=ProgressStyle(des…





HBox(children=(FloatProgress(value=0.0, max=1017.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Evaluation', max=128.0, style=ProgressStyle(descr…


({'mcc': 0.3558397739830577, 'tp': 106, 'tn': 677, 'fp': 63, 'fn': 171, 'auroc': 0.7634427749048687, 'auprc': 0.5896695167242314, 'eval_loss': 0.5030838921666145}, array([[ 0.1529541 , -0.22094727],
       [ 1.92578125, -1.66699219],
       [-0.19128418,  0.15405273],
       ...,
       [ 0.29345703, -0.31591797],
       [ 0.16845703, -0.14221191],
       [ 0.72607422, -0.66650391]]), [])


HBox(children=(FloatProgress(value=0.0, max=1017.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=128.0), HTML(value='')))


Accuracy:  0.7699115044247787
F1 score:  0.47533632286995514
CPU times: user 40.7 s, sys: 16.3 s, total: 57 s
Wall time: 59.3 s


## Croatian

In [8]:
%%time
train_fname = "../data/lgbt-hr.train.tsv"
test_fname = "../data/lgbt-hr.test.tsv"

train = read_file(train_fname)
test = read_file(test_fname)


model_args = {
    "num_train_epochs": 5,
    "learning_rate": 1e-5,
    "overwrite_output_dir": True,
    "train_batch_size": 40
}

model = ClassificationModel(
     "distilbert", "distilbert-base-uncased-finetuned-sst-2-english", use_cuda=True,
    args=model_args
    
)

model.overwrite_output_dir = True
model.train_model(train, )

print(model.eval_model(test))
from sklearn.metrics import accuracy_score, f1_score
y_true = test["labels"]
y_pred = model.predict(list(test["text"].values))[0]

accuracy = accuracy_score(y_true, y_pred)
print("Accuracy: ", accuracy)
f1 = f1_score(y_true, y_pred)
print("F1 score: ", f1)

HBox(children=(FloatProgress(value=0.0, max=4495.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=5.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0 of 5', max=113.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 1 of 5', max=113.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 2 of 5', max=113.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 3 of 5', max=113.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 4 of 5', max=113.0, style=ProgressStyle(des…





HBox(children=(FloatProgress(value=0.0, max=1142.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Evaluation', max=143.0, style=ProgressStyle(descr…


({'mcc': 0.3131724338905248, 'tp': 608, 'tn': 191, 'fp': 207, 'fn': 136, 'auroc': 0.770757348570811, 'auprc': 0.8671966233424704, 'eval_loss': 0.5458229280018306}, array([[ 1.77246094, -1.58496094],
       [-2.89648438,  3.1328125 ],
       [ 0.03710938,  0.0682373 ],
       ...,
       [ 0.10119629,  0.04788208],
       [-0.48754883,  0.66259766],
       [ 1.17773438, -1.03027344]]), [])


HBox(children=(FloatProgress(value=0.0, max=1142.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=143.0), HTML(value='')))


Accuracy:  0.6996497373029772
F1 score:  0.7799871712636306
CPU times: user 38.3 s, sys: 15.7 s, total: 54.1 s
Wall time: 56.6 s


## Slovenian checkpoint

In [10]:
%%time
train_fname = "../data/lgbt-sl.train.tsv"
test_fname = "../data/lgbt-sl.test.tsv"

train = read_file(train_fname)
test = read_file(test_fname)


model_args = {
    "num_train_epochs": 5,
    "learning_rate": 1e-5,
    "overwrite_output_dir": True,
    "train_batch_size": 40
}

model = ClassificationModel(
     "bert", "IMSyPP/hate_speech_slo", use_cuda=True,
    args=model_args
    
)

model.overwrite_output_dir = True
model.train_model(train, )

print(model.eval_model(test))
from sklearn.metrics import accuracy_score, f1_score
y_true = test["labels"]
y_pred = model.predict(list(test["text"].values))[0]

accuracy = accuracy_score(y_true, y_pred)
print("Accuracy: ", accuracy)
f1 = f1_score(y_true, y_pred)
print("F1 score: ", f1)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=496602227.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=329119.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=112.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=86.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, max=2844.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=5.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0 of 5', max=72.0, style=ProgressStyle(desc…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 1 of 5', max=72.0, style=ProgressStyle(desc…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 2 of 5', max=72.0, style=ProgressStyle(desc…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 3 of 5', max=72.0, style=ProgressStyle(desc…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 4 of 5', max=72.0, style=ProgressStyle(desc…





HBox(children=(FloatProgress(value=0.0, max=900.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Evaluation', max=113.0, style=ProgressStyle(descr…


({'mcc': 0.1578962831607748, 'eval_loss': 0.7648846195862357}, array([[ 1.88671875,  3.93164062, -2.69140625, -3.8671875 ],
       [ 2.8515625 ,  2.21875   , -2.37890625, -4.2109375 ],
       [ 3.97460938,  1.07226562, -2.33007812, -3.59765625],
       ...,
       [ 3.18945312,  2.57617188, -2.85351562, -3.94140625],
       [ 1.28320312,  3.31445312, -1.29492188, -3.54296875],
       [ 3.4921875 ,  2.39453125, -3.40820312, -3.734375  ]]), [])


HBox(children=(FloatProgress(value=0.0, max=900.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=113.0), HTML(value='')))


Accuracy:  0.5588888888888889
F1 score:  0.5323910482921084
CPU times: user 52.9 s, sys: 23.8 s, total: 1min 16s
Wall time: 1min 36s


#  Hugging face

In [35]:

import torch
from transformers import AdamW, AutoTokenizer, AutoModelForSequenceClassification

# Same as before
checkpoint = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
sequences = list(train.text.values)
batch = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")

# This is new
batch["labels"] = torch.tensor(list(train.labels.values))

optimizer = AdamW(model.parameters())
loss = model(**batch).loss
loss.backward()
optimizer.step()

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

RuntimeError: [enforce fail at CPUAllocator.cpp:71] . DefaultCPUAllocator: can't allocate memory: you tried to allocate 60637052928 bytes. Error code 12 (Cannot allocate memory)

In [16]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW
  
train_fname = "../data/lgbt-sl.train.tsv"
test_fname = "../data/lgbt-sl.test.tsv"

train = read_file(train_fname)
test = read_file(test_fname)


tokenizer = AutoTokenizer.from_pretrained("IMSyPP/hate_speech_slo")

model = AutoModelForSequenceClassification.from_pretrained("IMSyPP/hate_speech_slo")

sequences = list(train.text.values)
batch = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")

# This is new
batch["labels"] = torch.tensor(list(train.labels.values))

optimizer = AdamW(model.parameters())
loss = model(**batch).loss
loss.backward()
optimizer.step()

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


RuntimeError: The size of tensor a (1910) must match the size of tensor b (512) at non-singleton dimension 1

In [20]:
train.shape

(2844, 2)

In [21]:
len(sequences)

2844