In [1]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset
from transformers import ElectraTokenizerFast, ElectraForSequenceClassification
from transformers import Trainer, TrainingArguments



def read_file(fname: str, correct_labels=False) -> pd.DataFrame:
    """Read a tab-separated dataset file and return its text and labels.

    The file is parsed without a header row; its columns are named
    ``text``, ``labels`` and ``role`` in that order, and ``role`` is dropped
    from the returned frame.

    Args:
        fname (str): Path of the tab-separated file to read.
        correct_labels (bool, optional): If True, the labels are binarised:
            rows whose label is the string "Acceptable speech" get 0 and every
            other row gets 1 (integer dtype). If False the labels are returned
            unchanged. Defaults to False.

    Returns:
        pd.DataFrame: resulting dataframe with columns: text, labels
    """
    df = pd.read_table(fname, sep="\t", header=None, names=["text", "labels", "role"])
    if correct_labels:
        # Compute the binary column in one vectorised step and assign it to
        # the frame itself. The chained form ``df.labels[mask] = 1`` writes
        # through an intermediate Series: it triggers SettingWithCopyWarning
        # and silently does nothing when pandas Copy-on-Write is active.
        is_offensive = df["labels"] != "Acceptable speech"
        df["labels"] = is_offensive.astype(int)
    df = df.drop(columns=["role"])
    return df

# Train/test TSV paths, one pair per language code (en, hr, sl).
en_test = "../data/merged-en.test.tsv"
en_train = "../data/merged-en.train.tsv"
hr_test = "../data/merged-hr.test.tsv"
hr_train = "../data/merged-hr.train.tsv"
sl_test = "../data/merged-sl.test.tsv"
sl_train = "../data/merged-sl.train.tsv"

# Checkpoint identifier passed to the tokenizer and model `from_pretrained` calls.
model_name = "classla/bcms-bertic"

# Load the `hr` splits with labels converted to 0/1 by read_file.
train_df = read_file(hr_train, correct_labels=True)
test_df = read_file(hr_test, correct_labels=True)

# Plain Python lists of texts and labels, as consumed by the tokenizer and
# the dataset wrapper.
train_texts = train_df["text"].values.tolist()
train_labels = train_df["labels"].values.tolist()
test_texts = test_df["text"].values.tolist()
test_labels = test_df["labels"].values.tolist()

class MergedHateDataset(Dataset):
    """Dataset wrapper for the merged hatespeech dataset (Frank).

    Holds tokenizer encodings (a mapping from field name to a per-example
    sequence) together with one label per example, and serves each example
    as a dict of tensors.
    """

    def __init__(self, encodings, labels) -> None:
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The number of examples is defined by the number of labels.
        return len(self.labels)

    def __getitem__(self, index):
        # Convert every encoding field of the requested example to a tensor,
        # then attach the label under the "labels" key.
        example = {}
        for field, values in self.encodings.items():
            example[field] = torch.tensor(values[index])
        example["labels"] = torch.tensor(self.labels[index])
        return example

# Tokenizer loaded from the same checkpoint name as the model.
tokenizer = ElectraTokenizerFast.from_pretrained(model_name)

# Encode both splits with truncation and padding enabled.
train_encodings = tokenizer(
    train_texts,
    padding=True,
    truncation=True,
)
test_encodings = tokenizer(
    test_texts,
    padding=True,
    truncation=True,
)

# Pair the encodings with their labels.
train_dataset = MergedHateDataset(train_encodings, train_labels)
test_dataset = MergedHateDataset(test_encodings, test_labels)

# Fine-tuning hyperparameters plus output and logging locations.
training_args = TrainingArguments(
    output_dir="./outputs",
    overwrite_output_dir=True,
    logging_dir="./runs",
    num_train_epochs=7,
    per_device_train_batch_size=4,
    learning_rate=3e-5,
    warmup_steps=100,
)

# Sequence-classification model initialised from the pretrained checkpoint.
model = ElectraForSequenceClassification.from_pretrained(model_name)

# The test split is handed to the Trainer as its evaluation dataset.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Kept as the cell's final expression so the training result is displayed.
trainer.train()

Some weights of the model checkpoint at classla/bcms-bertic were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at classla/bcms-bertic and are newly initialized: ['classifier.dense.weight', 'classifier.ou

Step,Training Loss
500,0.5861
1000,0.5305
1500,0.5425
2000,0.5522
2500,0.4534
3000,0.4746
3500,0.4561
4000,0.4509
4500,0.428
5000,0.277


Saving model checkpoint to ./outputs/checkpoint-500
Configuration saved in ./outputs/checkpoint-500/config.json
Model weights saved in ./outputs/checkpoint-500/pytorch_model.bin
Saving model checkpoint to ./outputs/checkpoint-1000
Configuration saved in ./outputs/checkpoint-1000/config.json
Model weights saved in ./outputs/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to ./outputs/checkpoint-1500
Configuration saved in ./outputs/checkpoint-1500/config.json
Model weights saved in ./outputs/checkpoint-1500/pytorch_model.bin
Saving model checkpoint to ./outputs/checkpoint-2000
Configuration saved in ./outputs/checkpoint-2000/config.json
Model weights saved in ./outputs/checkpoint-2000/pytorch_model.bin
Saving model checkpoint to ./outputs/checkpoint-2500
Configuration saved in ./outputs/checkpoint-2500/config.json
Model weights saved in ./outputs/checkpoint-2500/pytorch_model.bin
Saving model checkpoint to ./outputs/checkpoint-3000
Configuration saved in ./outputs/checkpoint-3

TrainOutput(global_step=15491, training_loss=0.2286161774008664, metrics={'train_runtime': 2304.3218, 'train_samples_per_second': 26.887, 'train_steps_per_second': 6.723, 'total_flos': 1.630157165693952e+16, 'train_loss': 0.2286161774008664, 'epoch': 7.0})

# Saving the model

In [2]:
# Target directory for the export; the model and the tokenizer are both
# written to this same location.
filename = "finetuned_models/HR_hate___classla_bcms-bertic_5"
model.save_pretrained(filename)
# Left as the cell's last expression, so its return value is shown as output.
tokenizer.save_pretrained(filename)

Configuration saved in finetuned_models/HR_hate___classla_bcms-bertic_5/config.json
Model weights saved in finetuned_models/HR_hate___classla_bcms-bertic_5/pytorch_model.bin
tokenizer config file saved in finetuned_models/HR_hate___classla_bcms-bertic_5/tokenizer_config.json
Special tokens file saved in finetuned_models/HR_hate___classla_bcms-bertic_5/special_tokens_map.json


('finetuned_models/HR_hate___classla_bcms-bertic_5/tokenizer_config.json',
 'finetuned_models/HR_hate___classla_bcms-bertic_5/special_tokens_map.json',
 'finetuned_models/HR_hate___classla_bcms-bertic_5/vocab.txt',
 'finetuned_models/HR_hate___classla_bcms-bertic_5/added_tokens.json',
 'finetuned_models/HR_hate___classla_bcms-bertic_5/tokenizer.json')

tokenizer config file saved in .finetuned_models/HR_hate___classla_bcms-bertic_1/tokenizer_config.json
Special tokens file saved in .finetuned_models/HR_hate___classla_bcms-bertic_1/special_tokens_map.json


('.finetuned_models/HR_hate___classla_bcms-bertic_1/tokenizer_config.json',
 '.finetuned_models/HR_hate___classla_bcms-bertic_1/special_tokens_map.json',
 '.finetuned_models/HR_hate___classla_bcms-bertic_1/vocab.txt',
 '.finetuned_models/HR_hate___classla_bcms-bertic_1/added_tokens.json',
 '.finetuned_models/HR_hate___classla_bcms-bertic_1/tokenizer.json')