This notebook inspects optimal hyperparameters for classification models finetuning.

In [1]:

# import torch
# from numba import cuda
# cuda.select_device(0)
# cuda.close()
# cuda.select_device(0)
# torch.cuda.empty_cache()

In [1]:
label_set = ['Negative', 'Positive', 'Neutral']
STR_TO_NUM = {k: i for i, k in enumerate(label_set)}
NUM_TO_STR = {i:k for i, k in enumerate(label_set)}

import pandas as pd
df = pd.read_json("bcs_polsent.jsonl", orient="records", lines=True)
df["label"] = df.label.apply(lambda s: STR_TO_NUM[s])
df = df[["sentence", "label", "split"]].rename(columns={"sentence": "text", "label":"labels"})
train = df[df.split=="train"].drop(columns=["split"])
dev = df[df.split=="dev"].drop(columns=["split"])
test = df[df.split=="test"].drop(columns=["split"])


In [2]:
from transformers import AutoTokenizer
from datasets import Dataset
train = Dataset.from_pandas(train)
dev = Dataset.from_pandas(dev)


In [5]:
def train_and_eval(model_name, num_epoch, batch_size):

    tokenizer = AutoTokenizer.from_pretrained(model_name, padding=512)
    def preprocess_function(examples):
        return tokenizer(examples["text"], truncation=True)
    tokenized_train = train.map(preprocess_function, batched=True)
    tokenized_dev = dev.map(preprocess_function, batched=True)

    from transformers import DataCollatorWithPadding
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")
    from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)
    
    training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=8,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_dev,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    trainer.train()

    def predict(model, text):
        import torch 
        inputs = tokenizer("Hello, my dog is cute", return_tensors="pt").input_ids.cuda()
        logits = model(inputs).logits
        predicted_ids = torch.argmax(logits, dim=-1)
        return int(predicted_ids[0])

    y_pred = [NUM_TO_STR[predict(model,i)] for i in dev["text"]]
    y_true = [NUM_TO_STR[i] for i in dev["labels"]]

    from sklearn.metrics import f1_score, accuracy_score, confusion_matrix
    f1 = f1_score(y_true, y_pred, labels = label_set, average="macro")
    acc = accuracy_score(y_true, y_pred )
    cm = confusion_matrix(y_true, y_pred, labels=label_set)

    with open("results.csv", "a") as f:
        f.write(f"{model_name},{batch_size},{num_epoch},{f1},{y_true},{y_pred}")

models = ["xlm-roberta-base", "xlm-roberta-large", "classla/bcms-bertic", "EMBEDDIA/crosloengual-bert"]
epochs = [3,5,9,15,30,60]
batch_sizes = [8,16]

for model in models:
    for batch_size in batch_sizes:
        for epoch in epochs:
            train_and_eval(model, epoch, batch_size)

    

Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file https://huggingface.co/xlm-roberta-base/resolve/main/config.json from cache at /home/peterr/.cache/huggingface/transformers/87683eb92ea383b0475fecf99970e950a03c9ff5e51648d6eee56fb754612465.dfaaaedc7c1c475302398f09706cbb21e23951b73c6e2b3162c1c8a99bb3b62a
Model config XLMRobertaConfig {
  "_name_or_path": "xlm-roberta-base",
  "architectures": [
    "XLMRobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

loading configuration file https://huggingface.co/xlm-roberta-base/resolve/main/config.json from cache at /home/peterr/.cache/huggingface/transformers/87683eb92ea383b0475fecf99970e950a03c9ff5e51648d6eee56fb754612465.dfaaaedc7c1c475302398f09706cbb21e23951b73c6e2b3162c1c8a99bb3b62a
Model config XLMRobertaConfig {
  "_name_or_path": "xlm-roberta-base",
  "architectures": [
    "XLMRobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 1,
  "

Step,Training Loss
500,0.7252
1000,0.2066


Saving model checkpoint to ./results/checkpoint-500
Configuration saved in ./results/checkpoint-500/config.json
Model weights saved in ./results/checkpoint-500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-500/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-1000
Configuration saved in ./results/checkpoint-1000/config.json
Model weights saved in ./results/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-1000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-1000/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)




0.16145637162586315