In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
from datasets import load_dataset, Dataset
from sklearn.metrics import accuracy_score
from itertools import product
import pandas as pd
import numpy as np
import torch
import os
import re

In [2]:
# Choose the compute device once for the whole notebook.
# Preference order: CUDA GPU, then Apple-Silicon MPS, then CPU. On machines
# without CUDA this resolves exactly as before ("mps" if available, else "cpu").
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

In [None]:
# CSV file for each split, keyed by split name.
data_files = dict(
    train="sentences_train.csv",
    validation="sentences_valid.csv",
    test="sentences_test.csv",
)

# Load the three splits, encode the "persona.age" column as a class label,
# and rename it to "label".
dataset = (
    load_dataset("csv", data_files=data_files)
    .class_encode_column("persona.age")
    .rename_column("persona.age", "label")
)

In [13]:
# Sanity check: show the encoded labels of five training rows (760..764).
dataset["train"][760:765]["label"]

[3, 6, 2, 4, 3]

In [24]:

def compute_metrics(eval_pred):
    """Compute classification accuracy for an evaluation pass.

    Args:
        eval_pred: A pair that unpacks into ``(logits, labels)``.

    Returns:
        A dict with a single key ``"accuracy"``, computed by comparing the
        labels against the argmax of the logits along axis 1.
    """
    scores, gold = eval_pred
    predicted = np.asarray(scores).argmax(axis=1)
    accuracy = accuracy_score(gold, predicted)
    return {"accuracy": accuracy}

In [25]:

# Checkpoint name -> short tag. Insertion order defines the sweep order.
model_abbr = dict([
    ("bert-base-uncased", "bert"),
    ("distilbert-base-uncased", "distilbert"),
    ("roberta-base", "roberta"),
    ("albert-base-v2", "albert"),
])
# Checkpoints to fine-tune, in the same order as the mapping above.
model_names = list(model_abbr)

# Hyperparameter grid (one value each for learning rate and batch size,
# two values for gradient accumulation).
learning_rates = [3e-5]
batch_sizes = [16]
accum_steps = [2, 4]

In [None]:
# One configuration dict per (model, batch size, accumulation steps, learning
# rate) combination, in itertools.product order.
experiments = [
    {
        "name": f"{model_abbr[checkpoint]}_bs{batch}_acc{accum}_lr{rate}",
        "model_name": checkpoint,
        "batch_size": batch,
        "acc_steps": accum,
        "lr": rate,
    }
    for checkpoint, batch, accum, rate in product(
        model_names, batch_sizes, accum_steps, learning_rates
    )
]


In [27]:
# CSV file that collects one result row per experiment.
save_path = "./Classifier/bert_fulltrain_results.csv"
# Snapshot taken when this cell runs: a header is needed only if the file
# does not exist yet.
results_file_present = os.path.exists(save_path)
write_header = not results_file_present

In [None]:


# Fine-tune and evaluate every configuration in `experiments`.
# For each run: tokenize the data with the run's tokenizer, train with early
# stopping on validation accuracy, evaluate on the test split, and append one
# summary row to the CSV at `save_path`.
for exp in experiments:
    print(f"\nTraining: {exp['name']}")

    # The tokenizer is loaded from the same checkpoint name as the model.
    tokenizer = AutoTokenizer.from_pretrained(exp["model_name"])

    def tokenize_fn(batch):
        """Tokenize a batch of sentences, truncated/padded to 128 tokens."""
        return tokenizer(batch["output.sentences"], truncation=True, padding="max_length", max_length=128)

    dataset_tokenized = dataset.map(tokenize_fn, batched=True)
    dataset_tokenized.set_format("torch", columns=["input_ids", "attention_mask", "label"])

    # The classification head size comes from the encoded label column.
    model = AutoModelForSequenceClassification.from_pretrained(
        exp["model_name"], num_labels=dataset["train"].features["label"].num_classes
    ).to(device)

    args = TrainingArguments(
        output_dir=f"./Classifier/{exp['name']}",
        per_device_train_batch_size=exp["batch_size"],
        gradient_accumulation_steps=exp["acc_steps"],
        learning_rate=exp["lr"],

        weight_decay=0.01,
        num_train_epochs=10,
        eval_strategy="epoch",
        save_strategy="epoch",

        # Keep the checkpoint with the best validation accuracy and reload it
        # when training finishes, so the test evaluation below uses it.
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        lr_scheduler_type="cosine",
        warmup_ratio=0.1,
        report_to="none",
        logging_steps=50
    )

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=dataset_tokenized["train"],
        eval_dataset=dataset_tokenized["validation"],
        # `tokenizer=` is deprecated in Trainer (it emitted a FutureWarning on
        # every run); `processing_class=` is its replacement.
        processing_class=tokenizer,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
    )

    trainer.train()
    eval_result = trainer.evaluate(eval_dataset=dataset_tokenized["test"])

    # Recover the epoch of the best checkpoint from its "checkpoint-<step>"
    # directory name. If no best checkpoint was recorded (or the name does not
    # match), report None instead of crashing after a long training run.
    best_checkpoint = trainer.state.best_model_checkpoint
    match = re.search(r"checkpoint-(\d+)", best_checkpoint) if best_checkpoint else None
    if match and trainer.state.epoch:
        best_step = int(match.group(1))
        steps_per_epoch = trainer.state.global_step / trainer.state.epoch
        best_epoch = round(best_step / steps_per_epoch, 2)
    else:
        best_epoch = None

    row = {
        "name": exp["name"],
        "model": exp["model_name"],
        "lr": exp["lr"],
        "bs": exp["batch_size"],
        "accum": exp["acc_steps"],
        "acc": eval_result["eval_accuracy"],
        "loss": eval_result["eval_loss"],
        "epoch": best_epoch,
        "best_checkpoint": best_checkpoint
    }

    # Decide at write time whether the header is needed. A flag computed once
    # before the loop would repeat the header line for every experiment when
    # the file starts out missing.
    os.makedirs(os.path.dirname(save_path) or ".", exist_ok=True)
    needs_header = (not os.path.exists(save_path)) or os.path.getsize(save_path) == 0
    pd.DataFrame([row]).to_csv(save_path, mode='a', header=needs_header, index=False)




Training: bert_bs16_acc2_lr3e-05


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,1.3968,1.379088,0.425564
2,1.3355,1.341292,0.45213
3,1.1008,1.410914,0.439599
4,0.7865,1.612345,0.439098



Training: bert_bs16_acc4_lr3e-05


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,1.3804,1.387585,0.415789
2,1.3379,1.337824,0.448872
3,1.1586,1.392938,0.443108
4,0.882,1.509911,0.439098



Training: distilbert_bs16_acc2_lr3e-05


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/31806 [00:00<?, ? examples/s]

Map:   0%|          | 0/3990 [00:00<?, ? examples/s]

Map:   0%|          | 0/3982 [00:00<?, ? examples/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,1.3971,1.378139,0.43183
2,1.3386,1.337554,0.450376
3,1.1199,1.393803,0.444862
4,0.8841,1.589024,0.430576



Training: distilbert_bs16_acc4_lr3e-05


Map:   0%|          | 0/3990 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,1.3856,1.386006,0.421303
2,1.3495,1.342322,0.449875
3,1.1989,1.376148,0.440351
4,0.9705,1.481101,0.4401



Training: roberta_bs16_acc2_lr3e-05


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Map:   0%|          | 0/31806 [00:00<?, ? examples/s]

Map:   0%|          | 0/3990 [00:00<?, ? examples/s]

Map:   0%|          | 0/3982 [00:00<?, ? examples/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,1.3948,1.38402,0.42782
2,1.3485,1.340462,0.447368
3,1.2115,1.363857,0.427068
4,1.0963,1.403694,0.461153
5,0.8859,1.556428,0.450125
6,0.7407,1.717927,0.457393



Training: roberta_bs16_acc4_lr3e-05


Map:   0%|          | 0/3990 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,1.3924,1.378536,0.428321
2,1.3485,1.340837,0.452381
3,1.2494,1.351233,0.424561
4,1.1297,1.401154,0.44812



Training: albert_bs16_acc2_lr3e-05


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

Map:   0%|          | 0/31806 [00:00<?, ? examples/s]

Map:   0%|          | 0/3990 [00:00<?, ? examples/s]

Map:   0%|          | 0/3982 [00:00<?, ? examples/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/47.4M [00:00<?, ?B/s]

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,1.3999,1.392232,0.421805
2,1.371,1.350803,0.442857
3,1.2497,1.358995,0.427318
4,1.1358,1.379843,0.445363
5,0.8931,1.53737,0.433584
6,0.5594,1.973407,0.423559



Training: albert_bs16_acc4_lr3e-05


Map:   0%|          | 0/3990 [00:00<?, ? examples/s]

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,1.3983,1.397079,0.418546
2,1.3708,1.356142,0.43183
3,1.3097,1.359291,0.431579
4,1.1853,1.388441,0.450125
5,0.9844,1.535256,0.422556
6,0.6219,1.826463,0.412281
