In [1]:
import os

# Configure CUDA through the environment *before* torch is imported, so the
# settings are already in place whenever the CUDA runtime gets initialised.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # expose only the first GPU to this process
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"  # synchronous kernel launches -> usable CUDA tracebacks
# NOTE(review): presumably only has an effect on torch builds compiled with
# device-side assertions enabled — confirm.
os.environ["TORCH_USE_CUDA_DSA"] = "1"

import torch
from transformers import AutoModel

# Fall back to the CPU when no GPU is visible instead of failing inside `.to()`.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# NOTE(review): `model` is rebound further down by
# `AutoAdapterModel.from_pretrained(...)`, so this plain encoder is only used
# until that cell runs.
model = AutoModel.from_pretrained("xlm-roberta-base")
model = model.to(device)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from datasets import load_dataset
from adapters.composition import Stack

# Load the "copa" configuration of the "super_glue" dataset (all splits).
dataset_en = load_dataset("super_glue", "copa")
# Last expression of the cell: shows the number of rows in every split.
dataset_en.num_rows

{'train': 400, 'validation': 100, 'test': 500}

In [3]:
# Inspect the column schema of the training split before tokenization.
dataset_en["train"].features

{'premise': Value(dtype='string', id=None),
 'choice1': Value(dtype='string', id=None),
 'choice2': Value(dtype='string', id=None),
 'question': Value(dtype='string', id=None),
 'idx': Value(dtype='int32', id=None),
 'label': ClassLabel(names=['choice1', 'choice2'], id=None)}

In [3]:
from transformers import AutoTokenizer

# Tokenizer for the same "xlm-roberta-base" checkpoint the model is loaded from.
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

def encode_batch(examples):
  """Tokenize a batch of examples as (context, choice) pairs.

  Args:
    examples: Mapping from column name to a list of values. The "premise",
      "question", "choice1" and "choice2" columns are read.

  Returns:
    Dict with the keys "input_ids" and "attention_mask". Each value holds one
    entry per example, and every entry contains the encodings of the two
    (premise + question, choice) pairs of that example.
  """
  input_ids, attention_masks = [], []
  rows = zip(
      examples["premise"],
      examples["question"],
      examples["choice1"],
      examples["choice2"],
  )
  for premise, question, first_choice, second_choice in rows:
    # The same context is paired with each of the two answer candidates, which
    # is the layout the multiple-choice prediction head expects.
    context = premise + " " + question
    pair_encoding = tokenizer(
        [context, context],
        [first_choice, second_choice],
        max_length=60,
        truncation=True,
        padding="max_length",
    )
    input_ids.append(pair_encoding["input_ids"])
    attention_masks.append(pair_encoding["attention_mask"])
  return {"input_ids": input_ids, "attention_mask": attention_masks}

def preprocess_dataset(dataset):
  """Tokenize a dataset and expose only the columns the model consumes.

  Args:
    dataset: Object providing `map`, `rename_column` and `set_format` (the
      notebook passes the result of `load_dataset`). It must contain the
      columns read by `encode_batch` plus a "label" column.

  Returns:
    The encoded dataset with "label" renamed to "labels" and its output
    format restricted to "input_ids", "attention_mask" and "labels" as
    PyTorch tensors.
  """
  # Encode the input data
  encoded = dataset.map(encode_batch, batched=True)
  # The transformers model expects the target class column to be named "labels"
  renamed = encoded.rename_column("label", "labels")
  # Transform to pytorch tensors and only output the required columns.
  # `type="torch"` is what actually requests tensors; without it only the
  # column selection is applied. `set_format` modifies `renamed` in place.
  renamed.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
  return renamed

# Tokenize every split and rename "label" -> "labels".
# NOTE(review): the name is rebound to the processed data, so re-running this
# cell would presumably fail at the rename step (no "label" column left).
dataset_en = preprocess_dataset(dataset_en)

In [5]:
# Sanity check: the splits should now list "labels", "input_ids" and "attention_mask".
print(dataset_en)

DatasetDict({
    train: Dataset({
        features: ['premise', 'choice1', 'choice2', 'question', 'idx', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 400
    })
    validation: Dataset({
        features: ['premise', 'choice1', 'choice2', 'question', 'idx', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 100
    })
    test: Dataset({
        features: ['premise', 'choice1', 'choice2', 'question', 'idx', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 500
    })
})


In [6]:
from adapters import AutoAdapterModel
from transformers import AutoConfig

# Load the configuration of the "xlm-roberta-base" checkpoint.
config = AutoConfig.from_pretrained(
    "xlm-roberta-base",
)
# Rebinds `model`: the plain AutoModel created in the first cell is replaced by
# an adapter-capable model built from the same checkpoint.
model = AutoAdapterModel.from_pretrained(
    "xlm-roberta-base",
    config=config,
)

Some weights of XLMRobertaAdapterModel were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
from adapters import AdapterConfig

# Load the language adapters
# Both language adapters are loaded with the same "seq_bn" configuration
# (reduction_factor=2).
# NOTE(review): the cell output warns that the `config` argument belongs to
# the legacy Hub repo and will be removed — plan a switch to HF Model Hub
# identifiers.
lang_adapter_config = AdapterConfig.load("seq_bn", reduction_factor=2)
model.load_adapter("en/wiki@ukp", config=lang_adapter_config)
model.load_adapter("zh/wiki@ukp", config=lang_adapter_config)

# Add a new task adapter
model.add_adapter("copa")

# Add a classification head for our target task
# Two choices per example, matching the choice1/choice2 columns of the data.
model.add_multiple_choice_head("copa", num_choices=2, overwrite_ok=True)

The 'config' and 'model_name' arguments are specific to the now unsupported legacy Hub repo and will be removed.Please switch to only providing the HF Model Hub identifier.
Fetching 4 files: 100%|██████████| 4/4 [00:00<?, ?it/s]
The 'config' and 'model_name' arguments are specific to the now unsupported legacy Hub repo and will be removed.Please switch to only providing the HF Model Hub identifier.
Fetching 4 files: 100%|██████████| 4/4 [00:00<?, ?it/s]


In [8]:
# Put the model into training mode for the "copa" task adapter.
# NOTE(review): presumably this freezes all other weights — confirm against
# the adapters documentation.
model.train_adapter("copa")

There are adapters available but none are activated for the forward pass.


In [9]:
# Unfreeze and activate stack setup
# The forward pass runs through the "en" adapter and then the "copa" task
# adapter; the preceding cell's output reported that no adapter was active yet.
model.active_adapters = Stack("en", "copa")

In [10]:
from adapters import AdapterTrainer
from transformers import TrainingArguments
from datasets import concatenate_datasets

# Hyperparameters for training the task adapter; checkpoints and logs go to
# ./training_output (an existing directory is overwritten).
training_args = TrainingArguments(
    learning_rate=1e-4,
    num_train_epochs=20,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    logging_steps=100,
    output_dir="./training_output",
    overwrite_output_dir=True,
    # The next line is important to ensure the dataset labels are properly passed to the model
    remove_unused_columns=False,
)

# Train on the train and validation splits combined (400 + 100 rows according
# to the split sizes shown earlier). No eval_dataset is passed to the trainer,
# so nothing is held out during training.
train_dataset = concatenate_datasets([dataset_en["train"], dataset_en["validation"]])

trainer = AdapterTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
)




In [11]:
# Run training; the returned TrainOutput is displayed as the cell result.
trainer.train()

Step,Training Loss
100,0.6957
200,0.6931
300,0.6875


TrainOutput(global_step=320, training_loss=0.6918656349182128, metrics={'train_runtime': 162.1485, 'train_samples_per_second': 61.672, 'train_steps_per_second': 1.974, 'total_flos': 739801173600000.0, 'train_loss': 0.6918656349182128, 'epoch': 20.0})

In [12]:
# Persist the model and the tokenizer side by side in one directory.
model.save_pretrained("./saved_model_with_adapters")
tokenizer.save_pretrained("./saved_model_with_adapters")
print("Full model + adapters saved at ./saved_model_with_adapters")

Full model + adapters saved at ./saved_model_with_adapters


In [20]:
# Load the "zh" configuration of the "xcopa" dataset; verification_mode="no_checks"
# turns off the dataset verification step on load.
dataset_zh = load_dataset("xcopa", "zh", verification_mode="no_checks")
# Printed before preprocessing, so the output still lists the raw "label" column.
print(dataset_zh)
# Apply the same tokenization / column renaming used for the English data.
dataset_zh = preprocess_dataset(dataset_zh)

DatasetDict({
    validation: Dataset({
        features: ['premise', 'choice1', 'choice2', 'question', 'label', 'idx', 'changed'],
        num_rows: 100
    })
    test: Dataset({
        features: ['premise', 'choice1', 'choice2', 'question', 'label', 'idx', 'changed'],
        num_rows: 500
    })
})


In [19]:
# Sanity check after preprocessing: "labels", "input_ids" and "attention_mask" should be listed.
print(dataset_zh)

DatasetDict({
    validation: Dataset({
        features: ['premise', 'choice1', 'choice2', 'question', 'labels', 'idx', 'changed', 'input_ids', 'attention_mask'],
        num_rows: 100
    })
    test: Dataset({
        features: ['premise', 'choice1', 'choice2', 'question', 'labels', 'idx', 'changed', 'input_ids', 'attention_mask'],
        num_rows: 500
    })
})


In [14]:
# Swap the language adapter for evaluation: "zh" replaces "en" in the stack,
# while the trained "copa" task adapter is kept.
model.active_adapters = Stack("zh", "copa")

In [15]:
import numpy as np
from transformers import EvalPrediction


def compute_accuracy(p: "EvalPrediction"):
    """Compute classification accuracy, skipping labels equal to -100.

    Args:
        p: Evaluation output exposing ``predictions`` (per-class scores, or a
            tuple whose first element holds them) and ``label_ids``.

    Returns:
        ``{"acc": value}`` where ``value`` is the fraction of non-ignored
        examples whose arg-max prediction equals the label, or ``0.0`` when
        every label is ignored.
    """
    scores = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    labels = np.asarray(p.label_ids)
    preds = np.argmax(scores, axis=1)
    # -100 marks labels to ignore; they must count neither as hits nor in the
    # denominator.
    mask = labels != -100
    n_valid = int(mask.sum())
    if n_valid == 0:
        return {"acc": 0.0}
    correct = (preds == labels) & mask
    return {"acc": float(correct.sum()) / n_valid}


# Separate trainer used only for evaluation on the "zh" test split; metrics
# come from `compute_accuracy`.
eval_trainer = AdapterTrainer(
    model=model,
    args=TrainingArguments(
        output_dir="./eval_output",
        # Keep every dataset column so the labels reach the model.
        remove_unused_columns=False,
    ),
    eval_dataset=dataset_zh["test"],
    compute_metrics=compute_accuracy,
)
# Last expression of the cell: the metrics dict is displayed as the result.
eval_trainer.evaluate()

{'eval_loss': 0.685754656791687,
 'eval_model_preparation_time': 0.007,
 'eval_acc': 0.586,
 'eval_runtime': 5.4054,
 'eval_samples_per_second': 92.5,
 'eval_steps_per_second': 11.655}

In [47]:
premise = "她拿着钥匙开门。"
question = "为什么？"
choices = ["她想进屋。", "她想出去。"]

# Build multiple-choice input: the same context is paired with every candidate.
context = premise + " " + question
sentences_a = [context for _ in choices]
sentences_b = choices
# Tokenize
encoded = tokenizer(
    sentences_a, sentences_b, return_tensors="pt", padding=True, truncation=True
)
# Move to correct device
encoded = {k: v.to(model.device) for k, v in encoded.items()}
# Run model in inference mode: dropout disabled, no autograd graph built.
model.eval()
with torch.no_grad():
    outputs = model(**encoded)
# Presumably shaped [1, len(choices)]; `.item()` below requires exactly one
# prediction.
logits = outputs.logits
# Get prediction
pred = logits.argmax(dim=1).item()
print(f"Prediction: {choices[pred]}")

Prediction: 她想出去。
