In [8]:
from datasets import load_dataset

train_file = "./cache/unified_train.jsonl"
eval_file = "./cache/unified_eval.jsonl"

# Load each JSONL file as its own dataset. split="train" is requested for the
# eval file as well — presumably because `load_dataset` places un-named
# `data_files` in a split called "train"; confirm against the datasets docs.
dataset_train, dataset_eval = (
    load_dataset("json", split="train", data_files=jsonl_path)
    for jsonl_path in (train_file, eval_file)
)

# Sanity check: [number of training rows, number of evaluation rows].
print([dataset_train.num_rows, dataset_eval.num_rows])

[79998, 18000]


In [9]:
# Peek at the first training record to check its schema: a dict holding an
# integer 'labels' id and the raw 'text' string.
dataset_train[0]

{'labels': 0,
 'text': 'For a car, what scams can be plotted with 0% financing vs rebate?'}

In [10]:
from transformers import RobertaTokenizer

# Tokenizer for the "roberta-base" checkpoint, shared by the encoding step and
# the inference pipeline later in this notebook. force_download=False is passed
# explicitly (presumably to reuse any locally cached files — confirm in the docs).
tokenizer = RobertaTokenizer.from_pretrained("roberta-base", force_download=False)


def encode_batch(batch):
    """Tokenize the "text" entries of a batch with the notebook's tokenizer.

    The tokenizer is asked to truncate each sequence to at most 512 tokens
    and to pad to that same maximum length.
    """
    texts = batch["text"]
    return tokenizer(
        texts,
        padding="max_length",
        truncation=True,
        max_length=512,
    )


# Tokenize both splits in batches. The records already carry a "labels"
# column, so no column rename is needed before formatting.
dataset_train = dataset_train.map(encode_batch, batched=True)
dataset_eval = dataset_eval.map(encode_batch, batched=True)

# Return only the columns the model consumes, as torch tensors.
for tokenized_split in (dataset_train, dataset_eval):
    tokenized_split.set_format(
        type="torch",
        columns=["input_ids", "attention_mask", "labels"],
    )



In [11]:
from adapters import AutoAdapterModel
from transformers import AutoConfig, AutoTokenizer

# `config` must be a model configuration. It was previously built with
# AutoTokenizer.from_pretrained, which returns a tokenizer rather than a
# config; AutoConfig is the matching loader and accepts `num_labels` as a
# configuration override.
config = AutoConfig.from_pretrained("roberta-base", num_labels=8, force_download=False)

# Load the pre-trained RoBERTa weights into an adapter-capable model class.
model = AutoAdapterModel.from_pretrained("roberta-base", config=config, force_download=False)

Some weights of RobertaAdapterModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['heads.default.3.bias', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# Name shared by the adapter and its classification head.
adapter_name = "moa_classifier"

# Class names in label-id order: index 0 is "finance", index 7 is "stackoverflow".
label_names = [
    "finance",
    "medicine",
    "leetcode",
    "exam",
    "webgpt",
    "gpt4tools",
    "cot",
    "stackoverflow",
]

# Add a new adapter
model.add_adapter(adapter_name)

# Add a matching classification head with one output per label name.
model.add_classification_head(
    adapter_name,
    num_labels=len(label_names),
    id2label=dict(enumerate(label_names)),
)

# Activate the adapter for training.
model.train_adapter(adapter_name)

In [13]:
import numpy as np
from adapters import AdapterTrainer
from transformers import TrainingArguments, EvalPrediction

training_args = TrainingArguments(
    # Output location; its existing contents may be overwritten.
    output_dir="./training_output",
    overwrite_output_dir=True,
    # Optimisation schedule.
    num_train_epochs=2,
    learning_rate=1e-4,
    # Per-device batch sizes for training and evaluation.
    per_device_train_batch_size=32,
    per_device_eval_batch_size=8,
    # Emit a log entry every 100 steps.
    logging_steps=100,
    # Do not let the trainer drop dataset columns: this is needed so that the
    # dataset labels are properly passed to the model.
    remove_unused_columns=False,
)


def compute_accuracy(p: EvalPrediction):
    """Return the share of correct predictions as ``{"acc": value}``.

    The predicted class is the argmax of ``p.predictions`` along axis 1;
    it is compared element-wise with ``p.label_ids`` and the matches are
    averaged.
    """
    is_correct = np.argmax(p.predictions, axis=1) == p.label_ids
    return {"acc": is_correct.mean()}


# Trainer for the adapter model: trains on dataset_train, evaluates on
# dataset_eval, and reports accuracy through compute_accuracy.
trainer = AdapterTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train,
    eval_dataset=dataset_eval,
    compute_metrics=compute_accuracy,
)

# NOTE: a trailing `dataloader_config = DataLoaderConfiguration(...)` statement
# was removed here. `DataLoaderConfiguration` is never imported in this
# notebook (NameError on a fresh kernel) and the resulting variable was not
# passed to the trainer or used anywhere else.


In [None]:
# Run training with the trainer configured above, then write the
# "moa_classifier" adapter to ./final_adapter. Order matters: saving must
# happen after training so the trained weights are what gets written.
trainer.train()
model.save_adapter("./final_adapter", "moa_classifier")

Step,Training Loss


In [None]:
# Evaluate on the eval dataset given to the trainer (dataset_eval), using the
# compute_accuracy metric function configured on it.
trainer.evaluate()

In [None]:
from transformers import TextClassificationPipeline

# Reuse the device the training arguments resolved to; only its index is
# handed to the pipeline.
inference_device = training_args.device.index

# Wrap the trained model and its tokenizer in a text-classification pipeline.
classifier = TextClassificationPipeline(
    model=model,
    tokenizer=tokenizer,
    device=inference_device,
)

# Smoke test on a single sentence; the result is shown as the cell output.
classifier("This is awesome!")