In [1]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset
from transformers import ElectraTokenizerFast, ElectraForSequenceClassification
from transformers import Trainer, TrainingArguments



def read_file(fname: str, correct_labels=False) -> pd.DataFrame:
    """Reads a filename, return df with text and labels.

    Args:
        fname (str): Filename to read
        correct_labels (bool, optional): If True, offensive instances get labeled 1
        and acceptable speech gets labeled 0. Else the labels remain unchanged.
        Defaults to False.

    Returns:
        pd.DataFrame: resulting dataframe with columns: text, labels
    """

    df = pd.read_table(fname, sep="\t", header=None, names="text,labels,role".split(","))
    if correct_labels:
        offensive_ids = df.labels != "Acceptable speech"
        df.labels[offensive_ids] = 1
        df.labels[~offensive_ids] = 0
        df["labels"] = df.labels.astype(int)
    df = df.drop(columns=["role"])
    return df

en_test, en_train = "../data/merged-en.test.tsv" , "../data/merged-en.train.tsv"
hr_test, hr_train = "../data/merged-hr.test.tsv" , "../data/merged-hr.train.tsv"
sl_test, sl_train = "../data/merged-sl.test.tsv",  "../data/merged-sl.train.tsv"

model_name = "classla/bcms-bertic"

train_df = read_file(hr_train, correct_labels=True)
test_df = read_file(hr_test, correct_labels=True)

train_texts, train_labels = train_df.text.values.tolist(), train_df.labels.values.tolist()
test_texts, test_labels = test_df.text.values.tolist(), test_df.labels.values.tolist()

class MergedHateDataset(Dataset):
    """ A dataset class for the merged hatespeech dataset (Frank)
    """    
    def __init__(self, encodings, labels) -> None:
        self.encodings = encodings
        self.labels = labels
    def __getitem__(self, index):
        item = {key: torch.tensor(val[index]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[index])
        return item
    def __len__(self):
        return len(self.labels)

tokenizer = ElectraTokenizerFast.from_pretrained(model_name)


train_encodings = tokenizer(train_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)

train_dataset = MergedHateDataset(train_encodings, train_labels)
test_dataset = MergedHateDataset(test_encodings, test_labels)

training_args = TrainingArguments(
    output_dir = "./outputs",
    num_train_epochs = 7,
    per_device_train_batch_size = 4,
    warmup_steps = 100,
    learning_rate = 3e-5,
    logging_dir = "./runs",
    overwrite_output_dir=True
)

model = ElectraForSequenceClassification.from_pretrained(model_name)

trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = train_dataset,
    eval_dataset = test_dataset

)

trainer.train()

Some weights of the model checkpoint at classla/bcms-bertic were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at classla/bcms-bertic and are newly initialized: ['classifier.dense.weight', 'classifier.ou

Step,Training Loss
500,0.5861
1000,0.5305
1500,0.5425
2000,0.5522
2500,0.4534
3000,0.4746
3500,0.4561
4000,0.4509
4500,0.428
5000,0.277


Saving model checkpoint to ./outputs/checkpoint-500
Configuration saved in ./outputs/checkpoint-500/config.json
Model weights saved in ./outputs/checkpoint-500/pytorch_model.bin
Saving model checkpoint to ./outputs/checkpoint-1000
Configuration saved in ./outputs/checkpoint-1000/config.json
Model weights saved in ./outputs/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to ./outputs/checkpoint-1500
Configuration saved in ./outputs/checkpoint-1500/config.json
Model weights saved in ./outputs/checkpoint-1500/pytorch_model.bin
Saving model checkpoint to ./outputs/checkpoint-2000
Configuration saved in ./outputs/checkpoint-2000/config.json
Model weights saved in ./outputs/checkpoint-2000/pytorch_model.bin
Saving model checkpoint to ./outputs/checkpoint-2500
Configuration saved in ./outputs/checkpoint-2500/config.json
Model weights saved in ./outputs/checkpoint-2500/pytorch_model.bin
Saving model checkpoint to ./outputs/checkpoint-3000
Configuration saved in ./outputs/checkpoint-3

TrainOutput(global_step=15491, training_loss=0.2286161774008664, metrics={'train_runtime': 2304.3218, 'train_samples_per_second': 26.887, 'train_steps_per_second': 6.723, 'total_flos': 1.630157165693952e+16, 'train_loss': 0.2286161774008664, 'epoch': 7.0})

# Saving the model

In [1]:
out_filename = "finetuned_models/HR_hate___classla_bcms-bertic_5"
model.save_pretrained(out_filename)
tokenizer.save_pretrained(out_filename)

NameError: name 'model' is not defined

# Repeated training

In [12]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset
from transformers import ElectraTokenizerFast, ElectraForSequenceClassification
from transformers import Trainer, TrainingArguments

in_filename = "finetuned_models/HR_hate___classla_bcms-bertic_10"

def read_file(fname: str, correct_labels=False) -> pd.DataFrame:
    """Reads a filename, return df with text and labels.

    Args:
        fname (str): Filename to read
        correct_labels (bool, optional): If True, offensive instances get labeled 1
        and acceptable speech gets labeled 0. Else the labels remain unchanged.
        Defaults to False.

    Returns:
        pd.DataFrame: resulting dataframe with columns: text, labels
    """

    df = pd.read_table(fname, sep="\t", header=None, names="text,labels,role".split(","))
    if correct_labels:
        offensive_ids = df.labels != "Acceptable speech"
        df.labels[offensive_ids] = 1
        df.labels[~offensive_ids] = 0
        df["labels"] = df.labels.astype(int)
    df = df.drop(columns=["role"])
    return df

en_test, en_train = "../data/merged-en.test.tsv" , "../data/merged-en.train.tsv"
hr_test, hr_train = "../data/merged-hr.test.tsv" , "../data/merged-hr.train.tsv"
sl_test, sl_train = "../data/merged-sl.test.tsv",  "../data/merged-sl.train.tsv"

model_name = in_filename

train_df = read_file(hr_train, correct_labels=True)
test_df = read_file(hr_test, correct_labels=True)

train_texts, train_labels = train_df.text.values.tolist(), train_df.labels.values.tolist()
test_texts, test_labels = test_df.text.values.tolist(), test_df.labels.values.tolist()

class MergedHateDataset(Dataset):
    """ A dataset class for the merged hatespeech dataset (Frank)
    """    
    def __init__(self, encodings, labels) -> None:
        self.encodings = encodings
        self.labels = labels
    def __getitem__(self, index):
        item = {key: torch.tensor(val[index]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[index])
        return item
    def __len__(self):
        return len(self.labels)

tokenizer = ElectraTokenizerFast.from_pretrained(model_name)


train_encodings = tokenizer(train_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)

train_dataset = MergedHateDataset(train_encodings, train_labels)
test_dataset = MergedHateDataset(test_encodings, test_labels)

training_args = TrainingArguments(
    output_dir = "./outputs",
    num_train_epochs = 5,
    per_device_train_batch_size = 4,
    warmup_steps = 100,
    learning_rate = 3e-5,
    logging_dir = "./runs",
    overwrite_output_dir=True
)

model = ElectraForSequenceClassification.from_pretrained(model_name)

trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = train_dataset,
    eval_dataset = test_dataset

)

trainer.train()

Didn't find file finetuned_models/HR_hate___classla_bcms-bertic_10/added_tokens.json. We won't load it.
loading file finetuned_models/HR_hate___classla_bcms-bertic_10/vocab.txt
loading file finetuned_models/HR_hate___classla_bcms-bertic_10/tokenizer.json
loading file None
loading file finetuned_models/HR_hate___classla_bcms-bertic_10/special_tokens_map.json
loading file finetuned_models/HR_hate___classla_bcms-bertic_10/tokenizer_config.json
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
loading configuration file finetuned_models/HR_hate___classla_bcms-bertic_10/config.json
Model config ElectraConfig {
  "_name_or_path": "finetuned_models/HR_hate___classla_bcms-bertic_9",
  "architectures": [
    "ElectraForSequenceClassification"
  ],
  "at

Step,Training Loss
500,0.036
1000,0.0463
1500,0.019
2000,0.0332
2500,0.0303
3000,0.0424
3500,0.0031
4000,0.0154
4500,0.0268
5000,0.0325


Saving model checkpoint to ./outputs/checkpoint-500
Configuration saved in ./outputs/checkpoint-500/config.json
Model weights saved in ./outputs/checkpoint-500/pytorch_model.bin
Saving model checkpoint to ./outputs/checkpoint-1000
Configuration saved in ./outputs/checkpoint-1000/config.json
Model weights saved in ./outputs/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to ./outputs/checkpoint-1500
Configuration saved in ./outputs/checkpoint-1500/config.json
Model weights saved in ./outputs/checkpoint-1500/pytorch_model.bin
Saving model checkpoint to ./outputs/checkpoint-2000
Configuration saved in ./outputs/checkpoint-2000/config.json
Model weights saved in ./outputs/checkpoint-2000/pytorch_model.bin
Saving model checkpoint to ./outputs/checkpoint-2500
Configuration saved in ./outputs/checkpoint-2500/config.json
Model weights saved in ./outputs/checkpoint-2500/pytorch_model.bin
Saving model checkpoint to ./outputs/checkpoint-3000
Configuration saved in ./outputs/checkpoint-3

TrainOutput(global_step=11065, training_loss=0.020669079484090618, metrics={'train_runtime': 1631.2458, 'train_samples_per_second': 27.13, 'train_steps_per_second': 6.783, 'total_flos': 1.16439797549568e+16, 'train_loss': 0.020669079484090618, 'epoch': 5.0})

In [13]:
out_filename_after_additional_training = "finetuned_models/HR_hate___classla_bcms-bertic_11"
model.save_pretrained(out_filename_after_additional_training)
tokenizer.save_pretrained(out_filename_after_additional_training)

Configuration saved in finetuned_models/HR_hate___classla_bcms-bertic_11/config.json
Model weights saved in finetuned_models/HR_hate___classla_bcms-bertic_11/pytorch_model.bin
tokenizer config file saved in finetuned_models/HR_hate___classla_bcms-bertic_11/tokenizer_config.json
Special tokens file saved in finetuned_models/HR_hate___classla_bcms-bertic_11/special_tokens_map.json


('finetuned_models/HR_hate___classla_bcms-bertic_11/tokenizer_config.json',
 'finetuned_models/HR_hate___classla_bcms-bertic_11/special_tokens_map.json',
 'finetuned_models/HR_hate___classla_bcms-bertic_11/vocab.txt',
 'finetuned_models/HR_hate___classla_bcms-bertic_11/added_tokens.json',
 'finetuned_models/HR_hate___classla_bcms-bertic_11/tokenizer.json')

## EMBEDDIA/crosloengual-bert

In [2]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset
from transformers import ElectraTokenizerFast, ElectraForSequenceClassification, BertTokenizer, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
import os

def read_file(fname: str, correct_labels=False) -> pd.DataFrame:
    """Reads a filename, return df with text and labels.

    Args:
        fname (str): Filename to read
        correct_labels (bool, optional): If True, offensive instances get labeled 1
        and acceptable speech gets labeled 0. Else the labels remain unchanged.
        Defaults to False.

    Returns:
        pd.DataFrame: resulting dataframe with columns: text, labels
    """

    df = pd.read_table(fname, sep="\t", header=None, names="text,labels,role".split(","))
    if correct_labels:
        offensive_ids = df.labels != "Acceptable speech"
        df.labels[offensive_ids] = 1
        df.labels[~offensive_ids] = 0
        df["labels"] = df.labels.astype(int)
    df = df.drop(columns=["role"])
    return df

en_test, en_train = "../data/merged-en.test.tsv" , "../data/merged-en.train.tsv"
hr_test, hr_train = "../data/merged-hr.test.tsv" , "../data/merged-hr.train.tsv"
sl_test, sl_train = "../data/merged-sl.test.tsv",  "../data/merged-sl.train.tsv"




train_df = read_file(hr_train, correct_labels=True)
test_df = read_file(hr_test, correct_labels=True)

train_texts, train_labels = train_df.text.values.tolist(), train_df.labels.values.tolist()
test_texts, test_labels = test_df.text.values.tolist(), test_df.labels.values.tolist()

class MergedHateDataset(Dataset):
    """ A dataset class for the merged hatespeech dataset (Frank)
    """    
    def __init__(self, encodings, labels) -> None:
        self.encodings = encodings
        self.labels = labels
    def __getitem__(self, index):
        item = {key: torch.tensor(val[index]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[index])
        return item
    def __len__(self):
        return len(self.labels)

initial_filename = "EMBEDDIA/crosloengual-bert"
in_filename = initial_filename
out_filename_after_additional_training = "finetuned_models/HR_hate___EMBEDDIA/crosloengual-bert_5_second_attempt"


for i in range(10):
    if i == 0:
        model_name = initial_filename
    else:
        model_name = in_filename
    tokenizer = BertTokenizer.from_pretrained(model_name)
    train_encodings = tokenizer(train_texts, truncation=True, padding=True)
    test_encodings = tokenizer(test_texts, truncation=True, padding=True)

    train_dataset = MergedHateDataset(train_encodings, train_labels)
    test_dataset = MergedHateDataset(test_encodings, test_labels)

    training_args = TrainingArguments(
        output_dir = "./outputs",
        num_train_epochs = 5,
        per_device_train_batch_size = 4,
        warmup_steps = 100,
        learning_rate = 3e-5,
        logging_dir = "./runs",
        overwrite_output_dir=True
    )

    model = BertForSequenceClassification.from_pretrained(model_name)

    trainer = Trainer(
        model = model,
        args = training_args,
        train_dataset = train_dataset,
        eval_dataset = test_dataset

    )

    trainer.train()
    model.save_pretrained(out_filename_after_additional_training)
    tokenizer.save_pretrained(out_filename_after_additional_training)
    
    os.system("for file in {~/macocu/task1/task2/runs,~/macocu/task1/task2/outputs,~/macocu/task1/task2/wandb,~/macocu/task1/task2/cache_dir,}; do rm -rf $file; done")

loading file https://huggingface.co/EMBEDDIA/crosloengual-bert/resolve/main/vocab.txt from cache at /home/peterr/.cache/huggingface/transformers/09f73beefae6e0412fcfe3cc0d7cdd26efc944b005bfc2a9578ddbe1236ff2b5.32b339c0808458d0322a520d66b9be44f71818893fd19ec5a2e21e19799521ed
loading file https://huggingface.co/EMBEDDIA/crosloengual-bert/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/EMBEDDIA/crosloengual-bert/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/EMBEDDIA/crosloengual-bert/resolve/main/tokenizer_config.json from cache at /home/peterr/.cache/huggingface/transformers/6df9a50970eafe70edbc4e0098fbe8781b666e868e9b9b36ceb9890bb7fcf0bf.268a1e35e9054a0c7fb4bc185891f11122ab5e15883e3333534cdf3d76681112
loading file https://huggingface.co/EMBEDDIA/crosloengual-bert/resolve/main/tokenizer.json from cache at None
loading configuration file https://huggingface.co/EMBEDDIA/crosloengual-bert/resolve/main/config

Step,Training Loss
500,0.6671
1000,0.5436
1500,0.5645
2000,0.5577
2500,0.4793
3000,0.4291
3500,0.4659
4000,0.4764
4500,0.4124
5000,0.2201


Saving model checkpoint to ./outputs/checkpoint-500
Configuration saved in ./outputs/checkpoint-500/config.json
Model weights saved in ./outputs/checkpoint-500/pytorch_model.bin
Saving model checkpoint to ./outputs/checkpoint-1000
Configuration saved in ./outputs/checkpoint-1000/config.json
Model weights saved in ./outputs/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to ./outputs/checkpoint-1500
Configuration saved in ./outputs/checkpoint-1500/config.json
Model weights saved in ./outputs/checkpoint-1500/pytorch_model.bin
Saving model checkpoint to ./outputs/checkpoint-2000
Configuration saved in ./outputs/checkpoint-2000/config.json
Model weights saved in ./outputs/checkpoint-2000/pytorch_model.bin
Saving model checkpoint to ./outputs/checkpoint-2500
Configuration saved in ./outputs/checkpoint-2500/config.json
Model weights saved in ./outputs/checkpoint-2500/pytorch_model.bin
Saving model checkpoint to ./outputs/checkpoint-3000
Configuration saved in ./outputs/checkpoint-3

Step,Training Loss
500,0.6247
1000,0.5373
1500,0.554
2000,0.554
2500,0.4469
3000,0.3862
3500,0.449
4000,0.458
4500,0.3847
5000,0.1629


Saving model checkpoint to ./outputs/checkpoint-500
Configuration saved in ./outputs/checkpoint-500/config.json
Model weights saved in ./outputs/checkpoint-500/pytorch_model.bin
Saving model checkpoint to ./outputs/checkpoint-1000
Configuration saved in ./outputs/checkpoint-1000/config.json
Model weights saved in ./outputs/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to ./outputs/checkpoint-1500
Configuration saved in ./outputs/checkpoint-1500/config.json
Model weights saved in ./outputs/checkpoint-1500/pytorch_model.bin
Saving model checkpoint to ./outputs/checkpoint-2000
Configuration saved in ./outputs/checkpoint-2000/config.json
Model weights saved in ./outputs/checkpoint-2000/pytorch_model.bin
Saving model checkpoint to ./outputs/checkpoint-2500
Configuration saved in ./outputs/checkpoint-2500/config.json
Model weights saved in ./outputs/checkpoint-2500/pytorch_model.bin
Saving model checkpoint to ./outputs/checkpoint-3000
Configuration saved in ./outputs/checkpoint-3

Step,Training Loss
500,0.6247
1000,0.5373
1500,0.554
2000,0.554
2500,0.4469
3000,0.3862
3500,0.449
4000,0.458
4500,0.3847
5000,0.1629


Saving model checkpoint to ./outputs/checkpoint-500
Configuration saved in ./outputs/checkpoint-500/config.json
Model weights saved in ./outputs/checkpoint-500/pytorch_model.bin
Saving model checkpoint to ./outputs/checkpoint-1000
Configuration saved in ./outputs/checkpoint-1000/config.json
Model weights saved in ./outputs/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to ./outputs/checkpoint-1500
Configuration saved in ./outputs/checkpoint-1500/config.json
Model weights saved in ./outputs/checkpoint-1500/pytorch_model.bin
Saving model checkpoint to ./outputs/checkpoint-2000
Configuration saved in ./outputs/checkpoint-2000/config.json
Model weights saved in ./outputs/checkpoint-2000/pytorch_model.bin
Saving model checkpoint to ./outputs/checkpoint-2500
Configuration saved in ./outputs/checkpoint-2500/config.json
Model weights saved in ./outputs/checkpoint-2500/pytorch_model.bin
Saving model checkpoint to ./outputs/checkpoint-3000
Configuration saved in ./outputs/checkpoint-3

Step,Training Loss
500,0.6247
1000,0.5373
1500,0.554
2000,0.554
2500,0.4469
3000,0.3862
3500,0.449
4000,0.458
4500,0.3847
5000,0.1629


Saving model checkpoint to ./outputs/checkpoint-500
Configuration saved in ./outputs/checkpoint-500/config.json
Model weights saved in ./outputs/checkpoint-500/pytorch_model.bin
Saving model checkpoint to ./outputs/checkpoint-1000
Configuration saved in ./outputs/checkpoint-1000/config.json
Model weights saved in ./outputs/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to ./outputs/checkpoint-1500
Configuration saved in ./outputs/checkpoint-1500/config.json
Model weights saved in ./outputs/checkpoint-1500/pytorch_model.bin
Saving model checkpoint to ./outputs/checkpoint-2000
Configuration saved in ./outputs/checkpoint-2000/config.json
Model weights saved in ./outputs/checkpoint-2000/pytorch_model.bin
Saving model checkpoint to ./outputs/checkpoint-2500
Configuration saved in ./outputs/checkpoint-2500/config.json
Model weights saved in ./outputs/checkpoint-2500/pytorch_model.bin
Saving model checkpoint to ./outputs/checkpoint-3000
Configuration saved in ./outputs/checkpoint-3

Step,Training Loss
500,0.6247
1000,0.5373
1500,0.554
2000,0.554
2500,0.4469
3000,0.3862
3500,0.449
4000,0.458
4500,0.3847
5000,0.1629


Saving model checkpoint to ./outputs/checkpoint-500
Configuration saved in ./outputs/checkpoint-500/config.json
Model weights saved in ./outputs/checkpoint-500/pytorch_model.bin
Saving model checkpoint to ./outputs/checkpoint-1000
Configuration saved in ./outputs/checkpoint-1000/config.json
Model weights saved in ./outputs/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to ./outputs/checkpoint-1500
Configuration saved in ./outputs/checkpoint-1500/config.json
Model weights saved in ./outputs/checkpoint-1500/pytorch_model.bin
Saving model checkpoint to ./outputs/checkpoint-2000
Configuration saved in ./outputs/checkpoint-2000/config.json
Model weights saved in ./outputs/checkpoint-2000/pytorch_model.bin
Saving model checkpoint to ./outputs/checkpoint-2500
Configuration saved in ./outputs/checkpoint-2500/config.json
Model weights saved in ./outputs/checkpoint-2500/pytorch_model.bin
Saving model checkpoint to ./outputs/checkpoint-3000
Configuration saved in ./outputs/checkpoint-3

Step,Training Loss
500,0.6247
1000,0.5373
1500,0.554
2000,0.554
2500,0.4469
3000,0.3862
3500,0.449
4000,0.458
4500,0.3847
5000,0.1629


Saving model checkpoint to ./outputs/checkpoint-500
Configuration saved in ./outputs/checkpoint-500/config.json
Model weights saved in ./outputs/checkpoint-500/pytorch_model.bin
Saving model checkpoint to ./outputs/checkpoint-1000
Configuration saved in ./outputs/checkpoint-1000/config.json
Model weights saved in ./outputs/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to ./outputs/checkpoint-1500
Configuration saved in ./outputs/checkpoint-1500/config.json
Model weights saved in ./outputs/checkpoint-1500/pytorch_model.bin
Saving model checkpoint to ./outputs/checkpoint-2000
Configuration saved in ./outputs/checkpoint-2000/config.json
Model weights saved in ./outputs/checkpoint-2000/pytorch_model.bin
Saving model checkpoint to ./outputs/checkpoint-2500
Configuration saved in ./outputs/checkpoint-2500/config.json
Model weights saved in ./outputs/checkpoint-2500/pytorch_model.bin
Saving model checkpoint to ./outputs/checkpoint-3000
Configuration saved in ./outputs/checkpoint-3

Step,Training Loss
500,0.6247
1000,0.5373
1500,0.554
2000,0.554
2500,0.4469
3000,0.3862
3500,0.449
4000,0.458
4500,0.3847
5000,0.1629


Saving model checkpoint to ./outputs/checkpoint-500
Configuration saved in ./outputs/checkpoint-500/config.json
Model weights saved in ./outputs/checkpoint-500/pytorch_model.bin
Saving model checkpoint to ./outputs/checkpoint-1000
Configuration saved in ./outputs/checkpoint-1000/config.json
Model weights saved in ./outputs/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to ./outputs/checkpoint-1500
Configuration saved in ./outputs/checkpoint-1500/config.json
Model weights saved in ./outputs/checkpoint-1500/pytorch_model.bin
Saving model checkpoint to ./outputs/checkpoint-2000
Configuration saved in ./outputs/checkpoint-2000/config.json
Model weights saved in ./outputs/checkpoint-2000/pytorch_model.bin
Saving model checkpoint to ./outputs/checkpoint-2500
Configuration saved in ./outputs/checkpoint-2500/config.json
Model weights saved in ./outputs/checkpoint-2500/pytorch_model.bin
Saving model checkpoint to ./outputs/checkpoint-3000
Configuration saved in ./outputs/checkpoint-3

Step,Training Loss
500,0.6247
1000,0.5373
1500,0.554
2000,0.554
2500,0.4469
3000,0.3862
3500,0.449
4000,0.458
4500,0.3847
5000,0.1629


Saving model checkpoint to ./outputs/checkpoint-500
Configuration saved in ./outputs/checkpoint-500/config.json
Model weights saved in ./outputs/checkpoint-500/pytorch_model.bin
Saving model checkpoint to ./outputs/checkpoint-1000
Configuration saved in ./outputs/checkpoint-1000/config.json
Model weights saved in ./outputs/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to ./outputs/checkpoint-1500
Configuration saved in ./outputs/checkpoint-1500/config.json
Model weights saved in ./outputs/checkpoint-1500/pytorch_model.bin
Saving model checkpoint to ./outputs/checkpoint-2000
Configuration saved in ./outputs/checkpoint-2000/config.json
Model weights saved in ./outputs/checkpoint-2000/pytorch_model.bin
Saving model checkpoint to ./outputs/checkpoint-2500
Configuration saved in ./outputs/checkpoint-2500/config.json
Model weights saved in ./outputs/checkpoint-2500/pytorch_model.bin
Saving model checkpoint to ./outputs/checkpoint-3000
Configuration saved in ./outputs/checkpoint-3

Step,Training Loss
500,0.6247
1000,0.5373
1500,0.554
2000,0.554
2500,0.4469
3000,0.3862
3500,0.449
4000,0.458
4500,0.3847
5000,0.1629


Saving model checkpoint to ./outputs/checkpoint-500
Configuration saved in ./outputs/checkpoint-500/config.json
Model weights saved in ./outputs/checkpoint-500/pytorch_model.bin
Saving model checkpoint to ./outputs/checkpoint-1000
Configuration saved in ./outputs/checkpoint-1000/config.json
Model weights saved in ./outputs/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to ./outputs/checkpoint-1500
Configuration saved in ./outputs/checkpoint-1500/config.json
Model weights saved in ./outputs/checkpoint-1500/pytorch_model.bin
Saving model checkpoint to ./outputs/checkpoint-2000
Configuration saved in ./outputs/checkpoint-2000/config.json
Model weights saved in ./outputs/checkpoint-2000/pytorch_model.bin
Saving model checkpoint to ./outputs/checkpoint-2500
Configuration saved in ./outputs/checkpoint-2500/config.json
Model weights saved in ./outputs/checkpoint-2500/pytorch_model.bin
Saving model checkpoint to ./outputs/checkpoint-3000
Configuration saved in ./outputs/checkpoint-3

Step,Training Loss
500,0.6247
1000,0.5373
1500,0.554
2000,0.554
2500,0.4469
3000,0.3862
3500,0.449
4000,0.458
4500,0.3847
5000,0.1629


Saving model checkpoint to ./outputs/checkpoint-500
Configuration saved in ./outputs/checkpoint-500/config.json
Model weights saved in ./outputs/checkpoint-500/pytorch_model.bin
Saving model checkpoint to ./outputs/checkpoint-1000
Configuration saved in ./outputs/checkpoint-1000/config.json
Model weights saved in ./outputs/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to ./outputs/checkpoint-1500
Configuration saved in ./outputs/checkpoint-1500/config.json
Model weights saved in ./outputs/checkpoint-1500/pytorch_model.bin
Saving model checkpoint to ./outputs/checkpoint-2000
Configuration saved in ./outputs/checkpoint-2000/config.json
Model weights saved in ./outputs/checkpoint-2000/pytorch_model.bin
Saving model checkpoint to ./outputs/checkpoint-2500
Configuration saved in ./outputs/checkpoint-2500/config.json
Model weights saved in ./outputs/checkpoint-2500/pytorch_model.bin
Saving model checkpoint to ./outputs/checkpoint-3000
Configuration saved in ./outputs/checkpoint-3

## Repeated run with different choice of tokenizers and models imported:

In [3]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModelForPreTraining
from transformers import Trainer, TrainingArguments
import os

def read_file(fname: str, correct_labels=False) -> pd.DataFrame:
    """Reads a filename, return df with text and labels.

    Args:
        fname (str): Filename to read
        correct_labels (bool, optional): If True, offensive instances get labeled 1
        and acceptable speech gets labeled 0. Else the labels remain unchanged.
        Defaults to False.

    Returns:
        pd.DataFrame: resulting dataframe with columns: text, labels
    """

    df = pd.read_table(fname, sep="\t", header=None, names="text,labels,role".split(","))
    if correct_labels:
        offensive_ids = df.labels != "Acceptable speech"
        df.labels[offensive_ids] = 1
        df.labels[~offensive_ids] = 0
        df["labels"] = df.labels.astype(int)
    df = df.drop(columns=["role"])
    return df

en_test, en_train = "../data/merged-en.test.tsv" , "../data/merged-en.train.tsv"
hr_test, hr_train = "../data/merged-hr.test.tsv" , "../data/merged-hr.train.tsv"
sl_test, sl_train = "../data/merged-sl.test.tsv",  "../data/merged-sl.train.tsv"




train_df = read_file(hr_train, correct_labels=True)
test_df = read_file(hr_test, correct_labels=True)

train_texts, train_labels = train_df.text.values.tolist(), train_df.labels.values.tolist()
test_texts, test_labels = test_df.text.values.tolist(), test_df.labels.values.tolist()

class MergedHateDataset(Dataset):
    """ A dataset class for the merged hatespeech dataset (Frank)
    """    
    def __init__(self, encodings, labels) -> None:
        self.encodings = encodings
        self.labels = labels
    def __getitem__(self, index):
        item = {key: torch.tensor(val[index]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[index])
        return item
    def __len__(self):
        return len(self.labels)

initial_filename = "EMBEDDIA/crosloengual-bert"
in_filename = initial_filename
out_filename_after_additional_training = "finetuned_models/HR_hate___EMBEDDIA/crosloengual-bert_5_third_attempt"


for i in range(10):
    if i == 0:
        model_name = initial_filename
    else:
        model_name = in_filename
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    train_encodings = tokenizer(train_texts, truncation=True, padding=True)
    test_encodings = tokenizer(test_texts, truncation=True, padding=True)

    train_dataset = MergedHateDataset(train_encodings, train_labels)
    test_dataset = MergedHateDataset(test_encodings, test_labels)

    training_args = TrainingArguments(
        output_dir = "./outputs",
        num_train_epochs = 5,
        per_device_train_batch_size = 4,
        warmup_steps = 100,
        learning_rate = 3e-5,
        logging_dir = "./runs",
        overwrite_output_dir=True
    )

    model = AutoModelForPreTraining.from_pretrained(model_name)
    model.overwrite_output_dir = True
    trainer = Trainer(
        model = model,
        args = training_args,
        train_dataset = train_dataset,
        eval_dataset = test_dataset

    )

    trainer.train()
    model.save_pretrained(out_filename_after_additional_training)
    tokenizer.save_pretrained(out_filename_after_additional_training)
    
    os.system("for file in {~/macocu/task1/task2/runs,~/macocu/task1/task2/outputs,~/macocu/task1/task2/wandb,~/macocu/task1/task2/cache_dir,}; do rm -rf $file; done")

loading configuration file https://huggingface.co/EMBEDDIA/crosloengual-bert/resolve/main/config.json from cache at /home/peterr/.cache/huggingface/transformers/dbc299fc4baf62000b5932d0e8358b2438cc8db0536056ca6ba6d07d6d484599.62f1d55e869204a000e86539f3cf99f1ea413d915c08dba36e27842d16d08c2d
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.9.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 49601
}

loading file https://huggingface.co/EMBEDDIA/crosloengual-bert/resolve/main/vocab.txt from cache at /home/peterr/.cache

KeyError: 'loss'

In [16]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset
from transformers import BertTokenizer, BertModel
from transformers import Trainer, TrainingArguments
import os

def read_file(fname: str, correct_labels=False) -> pd.DataFrame:
    """Reads a filename, return df with text and labels.

    Args:
        fname (str): Filename to read
        correct_labels (bool, optional): If True, offensive instances get labeled 1
        and acceptable speech gets labeled 0. Else the labels remain unchanged.
        Defaults to False.

    Returns:
        pd.DataFrame: resulting dataframe with columns: text, labels
    """

    df = pd.read_table(fname, sep="\t", header=None, names="text,labels,role".split(","))
    if correct_labels:
        offensive_ids = df.labels != "Acceptable speech"
        df.labels[offensive_ids] = 1
        df.labels[~offensive_ids] = 0
        df["labels"] = df.labels.astype(int)
    df = df.drop(columns=["role"])
    return df

en_test, en_train = "../data/merged-en.test.tsv" , "../data/merged-en.train.tsv"
hr_test, hr_train = "../data/merged-hr.test.tsv" , "../data/merged-hr.train.tsv"
sl_test, sl_train = "../data/merged-sl.test.tsv",  "../data/merged-sl.train.tsv"




train_df = read_file(hr_train, correct_labels=True)
test_df = read_file(hr_test, correct_labels=True)

train_texts, train_labels = train_df.text.values.tolist(), train_df.labels.values.tolist()
test_texts, test_labels = test_df.text.values.tolist(), test_df.labels.values.tolist()

class MergedHateDataset(Dataset):
    """ A dataset class for the merged hatespeech dataset (Frank)
    """    
    def __init__(self, encodings, labels) -> None:
        self.encodings = encodings
        self.labels = labels
    def __getitem__(self, index):
        item = {key: torch.tensor(val[index]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[index])
        return item
    def __len__(self):
        return len(self.labels)

initial_filename = "EMBEDDIA/crosloengual-bert"
in_filename = initial_filename
out_filename_after_additional_training = "finetuned_models/HR_hate___EMBEDDIA/crosloengual-bert_5_third_attempt"


for i in range(10):
    if i == 0:
        model_name = initial_filename
    else:
        model_name = in_filename
    tokenizer = BertTokenizer.from_pretrained(model_name)
    train_encodings = tokenizer(train_texts, truncation=True, padding=True)
    test_encodings = tokenizer(test_texts, truncation=True, padding=True)

    train_dataset = MergedHateDataset(train_encodings, train_labels)
    test_dataset = MergedHateDataset(test_encodings, test_labels)

    training_args = TrainingArguments(
        output_dir = "./outputs",
        num_train_epochs = 5,
        per_device_train_batch_size = 4,
        warmup_steps = 100,
        learning_rate = 3e-5,
        logging_dir = "./runs",
        overwrite_output_dir=True
    )

    model = BertModel.from_pretrained(model_name)
    model.overwrite_output_dir = True
    trainer = Trainer(
        model = model,
        args = training_args,
        train_dataset = train_dataset,
        eval_dataset = test_dataset

    )

    trainer.train()
    model.save_pretrained(out_filename_after_additional_training)
    tokenizer.save_pretrained(out_filename_after_additional_training)
    
    os.system("for file in {~/macocu/task1/task2/runs,~/macocu/task1/task2/outputs,~/macocu/task1/task2/wandb,~/macocu/task1/task2/cache_dir,}; do rm -rf $file; done")

AttributeError: 'int' object has no attribute 'read'