In [7]:
import pandas as pd
import numpy as np
import torch



def read_file(fname: str, correct_labels=False) -> pd.DataFrame:
    """Read a tab-separated, header-less file into a DataFrame.

    The file is parsed as three columns named ``text``, ``labels`` and
    ``role``; the ``role`` column is dropped before returning.

    Args:
        fname: Path of the TSV file to read.
        correct_labels: If True, binarise ``labels``: every value other than
            the string "Acceptable speech" becomes 1, "Acceptable speech"
            becomes 0, and the column is stored as integers.

    Returns:
        A DataFrame with the columns ``text`` and ``labels``.
    """
    df = pd.read_table(fname, sep="\t", header=None, names=["text", "labels", "role"])
    if correct_labels:
        # Build the binary column in one vectorised step and assign it to the
        # frame directly. Chained assignment (``df.labels[mask] = 1``) writes
        # through an intermediate Series: it triggers SettingWithCopyWarning
        # and does not modify ``df`` at all under pandas Copy-on-Write.
        df["labels"] = (df["labels"] != "Acceptable speech").astype(int)
    return df.drop(columns=["role"])

# Test / train split file paths, one pair per language code (en, hr, sl).
en_test = "../data/merged-en.test.tsv"
en_train = "../data/merged-en.train.tsv"
hr_test = "../data/merged-hr.test.tsv"
hr_train = "../data/merged-hr.train.tsv"
sl_test = "../data/merged-sl.test.tsv"
sl_train = "../data/merged-sl.train.tsv"


In [5]:
def evaluate_HF_model(model_type, model_name, language):
    """Load a classification model and score it on one language's test split.

    Args:
        model_type: Architecture name passed to ``ClassificationModel``
            (e.g. "electra").
        model_name: Model identifier or checkpoint directory passed to
            ``ClassificationModel``.
        language: Language code used to build the test-file path
            ``../data/merged-{language}.test.tsv``.

    Returns:
        A tuple ``(accuracy, f1)``; ``f1`` is the macro-averaged F1 score.
    """
    # Imported inside the function, as in the original cell.
    from simpletransformers.classification import ClassificationModel
    from sklearn.metrics import accuracy_score, f1_score

    model = ClassificationModel(model_type, model_name)

    # Build the path from the ``language`` argument. The previous version read
    # the notebook-global ``lang`` here, so the parameter was silently ignored
    # and the function failed with NameError when that global was not defined.
    test_fname = f"../data/merged-{language}.test.tsv"
    test = read_file(test_fname, correct_labels=True)

    y_true = test["labels"]
    # The first element of ``predict``'s return value is used as the labels.
    y_pred = model.predict(test["text"].tolist())[0]

    accuracy = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred, average="macro")

    return accuracy, f1


import gc

# Settings for this evaluation cell.
lang = "hr"
model_type = "electra"
model_name = "./finetuned_models/HR_hate___classla_bcms-bertic_6/"

# One accuracy / macro-F1 entry is collected per pass (a single pass here).
accs = []
f1s = []
for i in range(1):
    # Run a garbage-collection sweep before each evaluation pass.
    gc.collect()
    a, f = evaluate_HF_model(model_type, model_name, lang)
    accs.append(a)
    f1s.append(f)


Some weights of the model checkpoint at ./finetuned_models/HR_hate___classla_bcms-bertic_6/ were not used when initializing ElectraForSequenceClassification: ['classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at ./finetuned_models/HR_hate___classla_bcms-bertic_6/ and are newly initialized: ['pooler.dense.weight', 'classifier.weight', 'pooler.d

HBox(children=(FloatProgress(value=0.0, max=2120.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=265.0), HTML(value='')))




# Added pretraining:

In [11]:
def train_and_evaluate_HF_model(model_type, model_name, language):
    """Train a classification model and score it on the test split, 10 times.

    The train and test splits are read from
    ``../data/merged-{language}.train.tsv`` and
    ``../data/merged-{language}.test.tsv``. After each training pass the model
    predicts the test texts and the accuracy and macro F1 are recorded. A
    markdown table with one row per pass is printed at the end.

    Args:
        model_type: Architecture name passed to ``ClassificationModel``
            (e.g. "electra").
        model_name: Model identifier or checkpoint directory passed to
            ``ClassificationModel``; also printed in the table heading.
        language: Language code used to build both file paths and shown in
            the first column of the printed table.

    Returns:
        A tuple ``(model, accs, f1s)``: the model object and the lists of
        accuracies and macro-F1 scores, one entry per pass.
    """
    # Imported inside the function, as in the original cell.
    from simpletransformers.classification import ClassificationModel
    from sklearn.metrics import accuracy_score, f1_score

    # Use the ``language`` argument for the paths; the previous version read
    # the notebook-global ``lang`` and ignored the parameter.
    train_fname = f"../data/merged-{language}.train.tsv"
    train = read_file(train_fname, correct_labels=True)

    model_args = {
        "num_train_epochs": 5,
        "learning_rate": 1e-5,
        "overwrite_output_dir": True,
        "train_batch_size": 40,
    }

    model = ClassificationModel(
        model_type,
        model_name,
        use_cuda=True,
        args=model_args,
    )

    # NOTE(review): this sets a plain attribute on the model object; the
    # option is already passed through ``model_args`` above, so this line is
    # presumably redundant -- confirm before removing.
    model.overwrite_output_dir = True

    test_fname = f"../data/merged-{language}.test.tsv"
    test = read_file(test_fname, correct_labels=True)

    y_true = test["labels"]
    # The test texts do not change between passes, so build the list once.
    test_texts = test["text"].tolist()

    accs = []
    f1s = []
    for _ in range(10):
        # NOTE(review): ``train_model`` is called on the same model object on
        # every pass. Presumably each pass continues from the previous pass's
        # weights, so the rows would not be independent runs -- confirm.
        model.train_model(train)
        y_pred = model.predict(test_texts)[0]
        accs.append(accuracy_score(y_true, y_pred))
        f1s.append(f1_score(y_true, y_pred, average="macro"))

    # Print the table header on its own lines. The previous header string
    # ended with four spaces and was printed with end="", which indented the
    # first data row and broke the markdown table.
    print(f"\n# Model: {model_name}\n\n|language|accuracy|f1 score|\n|---|---|---|")
    for a, f in zip(accs, f1s):
        print(f"|{language}|{a:0.3}|{f:0.3}|")
    return model, accs, f1s


# Settings for this training + evaluation cell.
lang = "hr"
model_type = "electra"
model_name = "./finetuned_models/HR_hate___classla_bcms-bertic_6/"

model, accs, f1s = train_and_evaluate_HF_model(model_type, model_name, lang)

# Second model identifier; it is defined here but not evaluated in this cell.
model_name2 = "classla/bcms-bertic"

# Raw (unrounded) scores for the evaluated model.
print(
    model_name,
    "\nAccuracies: ", accs,
    "\nF1s: ", f1s,
)

Some weights of the model checkpoint at ./finetuned_models/HR_hate___classla_bcms-bertic_6/ were not used when initializing ElectraForSequenceClassification: ['classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at ./finetuned_models/HR_hate___classla_bcms-bertic_6/ and are newly initialized: ['pooler.dense.weight', 'classifier.weight', 'pooler.d

HBox(children=(FloatProgress(value=0.0, max=8851.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=5.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0 of 5', max=222.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 1 of 5', max=222.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 2 of 5', max=222.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 3 of 5', max=222.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 4 of 5', max=222.0, style=ProgressStyle(des…





HBox(children=(FloatProgress(value=0.0, max=2120.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=265.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=8851.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=5.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0 of 5', max=222.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 1 of 5', max=222.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 2 of 5', max=222.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 3 of 5', max=222.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 4 of 5', max=222.0, style=ProgressStyle(des…





HBox(children=(FloatProgress(value=0.0, max=2120.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=265.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=8851.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=5.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0 of 5', max=222.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 1 of 5', max=222.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 2 of 5', max=222.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 3 of 5', max=222.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 4 of 5', max=222.0, style=ProgressStyle(des…





HBox(children=(FloatProgress(value=0.0, max=2120.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=265.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=8851.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=5.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0 of 5', max=222.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 1 of 5', max=222.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 2 of 5', max=222.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 3 of 5', max=222.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 4 of 5', max=222.0, style=ProgressStyle(des…





HBox(children=(FloatProgress(value=0.0, max=2120.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=265.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=8851.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=5.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0 of 5', max=222.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 1 of 5', max=222.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 2 of 5', max=222.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 3 of 5', max=222.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 4 of 5', max=222.0, style=ProgressStyle(des…





HBox(children=(FloatProgress(value=0.0, max=2120.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=265.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=8851.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=5.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0 of 5', max=222.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 1 of 5', max=222.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 2 of 5', max=222.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 3 of 5', max=222.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 4 of 5', max=222.0, style=ProgressStyle(des…





HBox(children=(FloatProgress(value=0.0, max=2120.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=265.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=8851.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=5.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0 of 5', max=222.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 1 of 5', max=222.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 2 of 5', max=222.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 3 of 5', max=222.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 4 of 5', max=222.0, style=ProgressStyle(des…





HBox(children=(FloatProgress(value=0.0, max=2120.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=265.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=8851.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=5.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0 of 5', max=222.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 1 of 5', max=222.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 2 of 5', max=222.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 3 of 5', max=222.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 4 of 5', max=222.0, style=ProgressStyle(des…





HBox(children=(FloatProgress(value=0.0, max=2120.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=265.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=8851.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=5.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0 of 5', max=222.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 1 of 5', max=222.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 2 of 5', max=222.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 3 of 5', max=222.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 4 of 5', max=222.0, style=ProgressStyle(des…





HBox(children=(FloatProgress(value=0.0, max=2120.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=265.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=8851.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=5.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0 of 5', max=222.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 1 of 5', max=222.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 2 of 5', max=222.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 3 of 5', max=222.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 4 of 5', max=222.0, style=ProgressStyle(des…





HBox(children=(FloatProgress(value=0.0, max=2120.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=265.0), HTML(value='')))



# Model: ./finetuned_models/HR_hate___classla_bcms-bertic_6/

|language|accuracy|f1 score|
|---|---|---|
    |hr|0.82|0.81|
|hr|0.824|0.814|
|hr|0.823|0.813|
|hr|0.825|0.817|
|hr|0.823|0.814|
|hr|0.823|0.814|
|hr|0.825|0.817|
|hr|0.822|0.814|
|hr|0.817|0.808|
|hr|0.818|0.808|
./finetuned_models/HR_hate___classla_bcms-bertic_6/ 
Accuracies:  [0.8202830188679245, 0.8235849056603773, 0.8226415094339623, 0.825, 0.8226415094339623, 0.8226415094339623, 0.8254716981132075, 0.8221698113207547, 0.8169811320754717, 0.8179245283018868] 
F1s:  [0.8103576396837415, 0.8137994328125191, 0.8131421246925632, 0.8166274246122405, 0.814428362438319, 0.8142733987278361, 0.8173135685432784, 0.8135829032916411, 0.8076892503677602, 0.807648731871283]
