In [1]:
import pandas as pd
import numpy as np

train_fname = "../data/lgbt-en.train.tsv"
test_fname = "../data/lgbt-en.test.tsv"

def read_file(fname: str) -> pd.DataFrame:
    """Reads a filename and formats it properly for simpletransformers"""
    df = pd.read_table(fname, sep="\t", header=None, names="text,labels,role".split(","))
    offensive_ids = df.labels != "Acceptable speech"

    df.labels[offensive_ids] = 1
    df.labels[~offensive_ids] = 0
    
    df["labels"] = df.labels.astype(np.int8)
    df = df.drop(columns=["role"])
    return df


train = read_file(train_fname)
test = read_file(test_fname)

In [17]:
from simpletransformers.classification import ClassificationModel

model_args = {
    "num_train_epochs": 5,
    "learning_rate": 1e-5,
    "overwrite_output_dir": True,
    "train_batch_size": 40
}

model = ClassificationModel(
    "roberta", "roberta-base", use_cuda=True,
    args=model_args
    
)

model.overwrite_output_dir = True
model.train_model(train, )

print(model.eval_model(test))
from sklearn.metrics import accuracy_score, f1_score
y_true = test["labels"]
y_pred = model.predict(list(test["text"].values))[0]

accuracy = accuracy_score(y_true, y_pred)
print("Accuracy: ", accuracy)
f1 = f1_score(y_true, y_pred)
print("F1 score: ", f1)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.out_p

HBox(children=(FloatProgress(value=0.0, max=4819.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=5.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0 of 5', max=121.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 1 of 5', max=121.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 2 of 5', max=121.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 3 of 5', max=121.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 4 of 5', max=121.0, style=ProgressStyle(des…





HBox(children=(FloatProgress(value=0.0, max=1017.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Evaluation', max=128.0, style=ProgressStyle(descr…


({'mcc': 0.581372370408602, 'tp': 181, 'tn': 672, 'fp': 68, 'fn': 96, 'auroc': 0.903768660357108, 'auprc': 0.8075992112897901, 'eval_loss': 0.3702339269220829}, array([[ 1.77050781, -1.75585938],
       [ 2.47460938, -2.47851562],
       [ 0.47949219, -0.50244141],
       ...,
       [ 2.48046875, -2.0703125 ],
       [-0.59765625,  0.74609375],
       [ 2.27148438, -1.90625   ]]), [])


HBox(children=(FloatProgress(value=0.0, max=1017.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=128.0), HTML(value='')))


Accuracy:  0.8387413962635202
F1 score:  0.688212927756654


## Distilbert

In [6]:
from simpletransformers.classification import ClassificationModel

model_args = {
    
    "learning_rate": 1e-5,
    "batch_size": 32,
    "warmup": 600,
    "max_seq_length": 128,
    "num_train_epochs": 3.0,

    "overwrite_output_dir": True,
    "train_batch_size": 40
}

model = ClassificationModel(
    "distilbert", "distilbert-base-uncased-finetuned-sst-2-english", use_cuda=True,
    args=model_args
    
)

model.overwrite_output_dir = True
model.train_model(train, )
model.eval_model(test)

HBox(children=(FloatProgress(value=0.0, max=4819.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=3.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0 of 3.0', max=121.0, style=ProgressStyle(d…






HBox(children=(FloatProgress(value=0.0, description='Running Epoch 1 of 3.0', max=121.0, style=ProgressStyle(d…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 2 of 3.0', max=121.0, style=ProgressStyle(d…





HBox(children=(FloatProgress(value=0.0, max=1017.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Evaluation', max=128.0, style=ProgressStyle(descr…




({'mcc': 0.2539314709260029,
  'tp': 64,
  'tn': 698,
  'fp': 42,
  'fn': 213,
  'auroc': 0.7369914137964679,
  'auprc': 0.542618778599848,
  'eval_loss': 0.5089862793684006},
 array([[ 0.41552734, -0.42919922],
        [ 1.37109375, -1.20898438],
        [ 0.36621094, -0.35498047],
        ...,
        [ 0.46533203, -0.46166992],
        [ 0.22546387, -0.22595215],
        [ 0.58984375, -0.55957031]]),
 [])

In [14]:
from sklearn.metrics import accuracy_score, f1_score
y_true = test["labels"]
y_pred = model.predict(list(test["text"].values))[0]

accuracy = accuracy_score(y_true, y_pred)
print("Accuracy: ", accuracy)
f1 = f1_score(y_true, y_pred)
print("F1 score: ", f1)

HBox(children=(FloatProgress(value=0.0, max=1017.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=128.0), HTML(value='')))


Accuracy:  0.7492625368731564
F1 score:  0.3342036553524804


In [12]:
y_pred[0]

array([0, 0, 0, ..., 0, 0, 0])