In [15]:
from datasets import load_dataset

# Locations of the cached JSONL files for the training and evaluation examples.
train_file = "./cache/unified_train.jsonl"
eval_file = "./cache/unified_eval.jsonl"

# Each file is read with the "json" builder and requested as split="train"
# (the evaluation file included); training is loaded first, then evaluation.
dataset_train, dataset_eval = (
    load_dataset("json", data_files=path, split="train")
    for path in (train_file, eval_file)
)

# Row counts of both datasets, as a quick sanity check.
print([dataset_train.num_rows, dataset_eval.num_rows])

[39999, 16000]


In [16]:
# Display the first training record to check which fields each example carries
# before the tokenization step below adds more.
dataset_train[0]

{'labels': 0,
 'text': 'For a car, what scams can be plotted with 0% financing vs rebate?'}

In [17]:
from transformers import RobertaTokenizer

# Tokenizer for the "roberta-base" checkpoint; force_download=False is passed explicitly.
tokenizer = RobertaTokenizer.from_pretrained("roberta-base", force_download=False)


def encode_batch(batch):
    """Tokenize the ``text`` entries of a batch with the module-level tokenizer.

    Every entry is truncated and padded to a fixed length of 512 tokens.
    Returns whatever the tokenizer call returns for the batch.
    """
    tokenizer_options = {
        "max_length": 512,
        "truncation": True,
        "padding": "max_length",
    }
    return tokenizer(batch["text"], **tokenizer_options)


# Tokenize the training examples in batches, then expose only the model inputs
# and the labels as torch tensors. The label column is already called "labels",
# so no rename step is needed.
dataset_train = dataset_train.map(function=encode_batch, batched=True)
dataset_train.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "labels"],
)

# Same treatment for the evaluation examples.
dataset_eval = dataset_eval.map(function=encode_batch, batched=True)
dataset_eval.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "labels"],
)



In [18]:
from adapters import AutoAdapterModel
from transformers import AutoConfig

# Class index -> domain name for the 8-way classifier; the position in the list
# is the class id.
# (A 4-class alternative -- finance, medicine, cot, stackoverflow -- was
# previously kept here as commented-out code.)
id2label = dict(
    enumerate(
        [
            "finance",
            "medicine",
            "leetcode",
            "exam",
            "webgpt",
            "gpt4tools",
            "cot",
            "stackoverflow",
        ]
    )
)

# Build the roberta-base config with one output per label, then load the
# adapter-capable model from the same checkpoint using that config.
config = AutoConfig.from_pretrained(
    "roberta-base",
    force_download=False,
    num_labels=len(id2label),
)
model = AutoAdapterModel.from_pretrained(
    "roberta-base",
    force_download=False,
    config=config,
)

Some weights of RobertaAdapterModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['heads.default.3.bias', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
# Register a new adapter called "moa_classifier" on the model.
model.add_adapter("moa_classifier")

# Attach a classification head under the same name, sized to the label set
# and carrying the id -> label mapping.
model.add_classification_head("moa_classifier", id2label=id2label, num_labels=len(id2label))

# Select the "moa_classifier" adapter for training.
model.train_adapter("moa_classifier")

In [20]:
import numpy as np
from adapters import AdapterTrainer
from transformers import TrainingArguments, EvalPrediction

# Settings for the fine-tuning run, grouped by purpose.
training_args = TrainingArguments(
    # Where outputs are written (an existing directory is overwritten).
    output_dir="./training_output",
    overwrite_output_dir=True,
    # Optimization schedule.
    num_train_epochs=2,
    learning_rate=1e-4,
    # Batch sizes per device.
    per_device_train_batch_size=32,
    per_device_eval_batch_size=8,
    # Report the training loss every 100 steps.
    logging_steps=100,
    # Important: keeps the dataset columns intact so the labels are properly
    # passed to the model.
    remove_unused_columns=False,
)


def compute_accuracy(p: EvalPrediction):
    """Compute classification accuracy for the trainer's evaluation loop.

    Args:
        p: Evaluation output exposing ``predictions`` (class scores with one
            row per example, or a tuple whose first element is that array)
            and ``label_ids`` (the reference class ids).

    Returns:
        A dict ``{"acc": value}`` where ``value`` is the fraction of examples
        whose highest-scoring class equals the reference label.
    """
    # When the model returns extra outputs next to the logits, ``predictions``
    # arrives as a tuple; the logits are taken to be its first element.
    logits = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    preds = np.argmax(logits, axis=1)
    return {"acc": (preds == p.label_ids).mean()}


# Wire the model, the training arguments, both datasets and the accuracy
# metric into the adapter-aware trainer.
trainer = AdapterTrainer(
    args=training_args,
    model=model,
    compute_metrics=compute_accuracy,
    eval_dataset=dataset_eval,
    train_dataset=dataset_train,
)

# NOTE(review): this object is created after the trainer and is not handed to
# it in this cell, and `DataLoaderConfiguration` is not imported in the cells
# visible here -- confirm where the name comes from and whether this line is
# still needed.
dataloader_config = DataLoaderConfiguration(
    even_batches=True,
    use_seedable_sampler=True,
    split_batches=False,
    dispatch_batches=None,
)


In [21]:
# Run the fine-tuning loop, then write the trained "moa_classifier" adapter
# to ./final_adapter.
trainer.train()
model.save_adapter(save_directory="./final_adapter", adapter_name="moa_classifier")

Step,Training Loss
100,0.6742
200,0.0091
300,0.0108
400,0.0048
500,0.0022
600,0.0031
700,0.004
800,0.004
900,0.0034
1000,0.0007


In [22]:
# Score the model on the evaluation dataset that was passed to the trainer.
# NOTE(review): the recorded run shows eval_acc ~0.50 and eval_loss ~5.17 while
# the logged training loss dropped below 0.01 -- a gap this large deserves a
# look before trusting the adapter. One possibility (unconfirmed) is that the
# evaluation file's label ids follow a different mapping than the training
# file's; verify both JSONL files were built with the same id2label.
trainer.evaluate()

{'eval_loss': 5.169083118438721,
 'eval_acc': 0.4998125,
 'eval_runtime': 80.0482,
 'eval_samples_per_second': 199.88,
 'eval_steps_per_second': 24.985,
 'epoch': 2.0}

In [23]:
from transformers import TextClassificationPipeline

# Wrap the trained model in a text-classification pipeline for a quick manual check.
# The full torch.device is passed instead of `device.index`: the index is None
# for CPU, MPS or an unindexed device, which would leave the pipeline to choose
# its own default device rather than the one the model was trained on.
classifier = TextClassificationPipeline(model=model, tokenizer=tokenizer, device=training_args.device)

# Single example query; the cell displays the predicted label and its score.
classifier("What are the tax rates in Russia?")

The model 'RobertaAdapterModel' is not supported for . Supported models are ['AlbertForSequenceClassification', 'BartForSequenceClassification', 'BertForSequenceClassification', 'BigBirdForSequenceClassification', 'BigBirdPegasusForSequenceClassification', 'BioGptForSequenceClassification', 'BloomForSequenceClassification', 'CamembertForSequenceClassification', 'CanineForSequenceClassification', 'LlamaForSequenceClassification', 'ConvBertForSequenceClassification', 'CTRLForSequenceClassification', 'Data2VecTextForSequenceClassification', 'DebertaForSequenceClassification', 'DebertaV2ForSequenceClassification', 'DistilBertForSequenceClassification', 'ElectraForSequenceClassification', 'ErnieForSequenceClassification', 'ErnieMForSequenceClassification', 'EsmForSequenceClassification', 'FalconForSequenceClassification', 'FlaubertForSequenceClassification', 'FNetForSequenceClassification', 'FunnelForSequenceClassification', 'GemmaForSequenceClassification', 'GPT2ForSequenceClassification',

[{'label': 'finance', 'score': 0.9999765157699585}]