In [None]:
!pip install datasets transformers[torch] --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m290.1/290.1 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m823.6/823.6 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.1/14.1 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━

In [None]:
import numpy as np
import pandas as pd
import os
import json
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from transformers.modeling_outputs import SequenceClassifierOutput
from transformers import (
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    set_seed,
    DataCollatorWithPadding,
    DefaultDataCollator
)
from datasets import load_dataset, load_metric
from dataclasses import dataclass, field
from typing import Optional, Union

In [None]:
@dataclass
class ModelArguments:
    """
    Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.

    Only `model_name_or_path` is required; every other field has a default.
    Each field carries a human-readable description under `metadata["help"]`.
    """

    model_name_or_path: str = field(
        metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
    )
    config_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
    )
    tokenizer_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
    )
    cache_dir: Optional[str] = field(
        default=None,
        metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
    )
    use_fast_tokenizer: bool = field(
        default=True,
        metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."},
    )
    model_revision: str = field(
        default="main",
        metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
    )
    # Defaults to None, so the annotation has to be Optional.
    token: Optional[str] = field(
        default=None,
        metadata={
            "help": (
                "The token to use as HTTP bearer authorization for remote files. If not specified, will use the token "
                "generated when running `huggingface-cli login` (stored in `~/.huggingface`)."
            )
        },
    )
    # Defaults to None (meaning "not given"), so the annotation has to be Optional.
    use_auth_token: Optional[bool] = field(
        default=None,
        metadata={
            "help": "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token` instead."
        },
    )
    trust_remote_code: bool = field(
        default=False,
        metadata={
            "help": (
                # Trailing space added after "option": the adjacent literals used to join as "optionshould".
                "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
                "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
                "execute code present on the Hub on your local machine."
            )
        },
    )

In [None]:
# Model/tokenizer selection for this run: BERT base (uncased), fast tokenizer,
# with the current working directory configured as cache location.
model_args = ModelArguments(
    model_name_or_path="bert-base-uncased",
    tokenizer_name="bert-base-uncased",
    cache_dir="./",
    use_fast_tokenizer=True,
)


In [None]:
@dataclass
class DataTrainingArguments:
    """
    Settings describing the data fed to the model for training and evaluation.

    Every field is optional; the description of each one is stored under
    `metadata["help"]`.
    """

    train_file: Optional[str] = field(
        default=None,
        metadata=dict(help="The input training data file (a text file)."),
    )
    validation_file: Optional[str] = field(
        default=None,
        metadata=dict(
            help="An optional input evaluation data file to evaluate the perplexity on (a text file)."
        ),
    )
    overwrite_cache: bool = field(
        default=False,
        metadata=dict(help="Overwrite the cached training and evaluation sets"),
    )
    preprocessing_num_workers: Optional[int] = field(
        default=None,
        metadata=dict(help="The number of processes to use for the preprocessing."),
    )
    max_seq_length: Optional[int] = field(
        default=None,
        metadata=dict(
            help="The maximum total input sequence length after tokenization. If passed, sequences longer than this will be truncated, sequences shorter will be padded."
        ),
    )
    pad_to_max_length: bool = field(
        default=False,
        metadata=dict(
            help="Whether to pad all samples to the maximum sentence length. If False, will pad the samples dynamically when batching to the maximum length in the batch. More efficient on GPU but very bad for TPU."
        ),
    )
    max_train_samples: Optional[int] = field(
        default=None,
        metadata=dict(
            help="For debugging purposes or quicker training, truncate the number of training examples to this value if set."
        ),
    )
    max_eval_samples: Optional[int] = field(
        default=None,
        metadata=dict(
            help="For debugging purposes or quicker training, truncate the number of evaluation examples to this value if set."
        ),
    )

In [None]:
# Fetch the SciQ dataset from the Hugging Face Hub.
sciq = load_dataset(path="sciq")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/7.02k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.99M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/339k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/343k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/11679 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [None]:
# Show the loaded splits with their column names and row counts.
sciq

DatasetDict({
    train: Dataset({
        features: ['question', 'distractor3', 'distractor1', 'distractor2', 'correct_answer', 'support'],
        num_rows: 11679
    })
    validation: Dataset({
        features: ['question', 'distractor3', 'distractor1', 'distractor2', 'correct_answer', 'support'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['question', 'distractor3', 'distractor1', 'distractor2', 'correct_answer', 'support'],
        num_rows: 1000
    })
})

In [None]:
def update_columns(example):
    """Add the derived columns needed for multiple-choice fine-tuning to one SciQ row.

    Args:
        example: dict with the string fields `question`, `support`,
            `correct_answer`, `distractor1`, `distractor2` and `distractor3`.

    Returns:
        The same dict, updated in place, with these extra keys:
        - `answer`: copy of `correct_answer`.
        - `question_with_text`: "Question: <question> Supporting Text: <support>".
          A full stop is appended to each part only when it does not already end
          with sentence punctuation, so no "?." or ".." appears in the model input.
        - `choice_list`: the three distractors followed by the correct answer.
        - `label`: index of the correct answer inside `choice_list`.
    """

    def _as_sentence(text):
        # Close the text with "." unless it is empty or already ends a sentence.
        text = text.strip()
        if text and text[-1] not in ".?!":
            text += "."
        return text

    example["answer"] = example["correct_answer"]
    example["question_with_text"] = (
        "Question: " + _as_sentence(example["question"])
        + " Supporting Text: " + _as_sentence(example["support"])
    )
    example["choice_list"] = [
        example["distractor3"],
        example["distractor2"],
        example["distractor1"],
        example["correct_answer"],
    ]
    # The correct answer is always stored last, so the label is the last index (3).
    # NOTE(review): a constant label position is fine only if nothing downstream can
    # exploit answer order; consider shuffling the choices per row -- TODO confirm.
    example["label"] = len(example["choice_list"]) - 1
    return example

In [None]:
# Apply update_columns to every split (processed in the order train, test, validation).
sciq_train, sciq_test, sciq_val = (
    sciq[split_name].map(update_columns)
    for split_name in ("train", "test", "validation")
)

Map:   0%|          | 0/11679 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [None]:

# Shorter aliases for the prepared splits, used by the rest of the notebook.
train_dataset, validation_dataset, test_dataset = sciq_train, sciq_val, sciq_test

In [None]:
# The splits are already loaded in memory, so the file-path fields (`train_file`,
# `validation_file`, both typed as strings) stay unset instead of receiving Dataset
# objects. Only the sample caps, read later when reporting metrics, are filled in.
data_args = DataTrainingArguments(
    max_train_samples=len(train_dataset),
    max_eval_samples=len(validation_dataset),
)

In [None]:
# Use the last path component of the checkpoint id as prefix of the output directory.
model_name = model_args.model_name_or_path.split("/")[-1]

training_args = TrainingArguments(
    output_dir=f"{model_name}-finetuned-demo_withsupport",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=1,
    weight_decay=0.01,
    do_train=True,
    do_eval=True,
    do_predict=True,
)

# Seed the random number generators before the model is instantiated, so that the
# newly initialised classification head is reproducible across fresh runs.
set_seed(training_args.seed)

In [None]:
from transformers import AutoTokenizer

# `tokenizer_name` is optional: fall back to the model checkpoint when it is not set.
tokenizer = AutoTokenizer.from_pretrained(
    model_args.tokenizer_name or model_args.model_name_or_path,
    use_fast=model_args.use_fast_tokenizer,
)

print(tokenizer)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

BertTokenizerFast(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}


In [None]:
# Inspect one validation example and tokenize that SAME example, so the printed
# text and the token ids shown below belong together (the next cell also uses row 0).
sample = validation_dataset[0]
print(sample['question_with_text'])
print(sample['choice_list'])

# Encode the bare question paired with the first answer choice.
tokenizer(sample['question'], sample['choice_list'][0])

Question: What are arteries, veins, and capillaries examples of?. Supporting Text: Blood vessels include arteries, veins, and capillaries..
['tissue', 'organs', 'muscles', 'blood vessels']


{'input_ids': [101, 2040, 3818, 1996, 3399, 1997, 6622, 2011, 3019, 4989, 1029, 102, 9531, 2015, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [None]:
# Pair the same question with each of its four answer choices in one tokenizer call.
first_val_example = validation_dataset[0]
repeated_question = [first_val_example['question'] for _ in range(4)]
tokenizer(repeated_question, first_val_example['choice_list'])

{'input_ids': [[101, 2040, 3818, 1996, 3399, 1997, 6622, 2011, 3019, 4989, 1029, 102, 9531, 2015, 102], [101, 2040, 3818, 1996, 3399, 1997, 6622, 2011, 3019, 4989, 1029, 102, 8233, 102], [101, 2040, 3818, 1996, 3399, 1997, 6622, 2011, 3019, 4989, 1029, 102, 21610, 102], [101, 2040, 3818, 1996, 3399, 1997, 6622, 2011, 3019, 4989, 1029, 102, 11534, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [None]:
# # repeat each question four times, add it to first_sentences list
# examples = validation_dataset
# first_sentences = []
# for example in examples:
#   for i in range(4):
#     first_sentences.append("Question: "+example['question'] +" Supporting Text: "+ example['support'])

# # add choices to second_sentences list
# second_sentences = []
# for example in validation_dataset:
#   for choice in example['choice_list']:
#     second_sentences.append(choice)

In [None]:
# def get_encoded_dictionary(dataset):
#   examples = dataset
#   first_sentences = []
#   for example in examples:
#     for i in range(4):
#       first_sentences.append("Question: "+example['question'] +" Supporting Text: "+ example['support'])

#   # add choices to second_sentences list
#   second_sentences = []
#   for example in dataset:
#     for choice in example['choice_list']:
#       second_sentences.append(choice)

#   tokenized_examples = tokenizer(first_sentences, second_sentences, truncation=True, max_length=max_seq_length)

#   # Un-flatten
#   dic = {'input_ids':[], 'token_type_ids':[], 'attention_mask':[]}

#   for k, v in tokenized_examples.items():
#       for i in range(0, len(v), 4):
#           dic[k].append(v[i:i+4])
#   return dic

# encoded_train = get_encoded_dictionary(train_dataset)
# encoded_validation = get_encoded_dictionary(validation_dataset)
# encoded_test = get_encoded_dictionary(test_dataset)

In [None]:
max_seq_length = tokenizer.model_max_length


def preprocess_function(examples):
    """Tokenize a batch of examples as (question + support, choice) pairs.

    Relies on the module-level `tokenizer` and `max_seq_length`.

    Args:
        examples: batched mapping with a `question_with_text` list (one string per
            example) and a `choice_list` list (one list of choices per example).
            The code repeats each question four times, so it presumes four choices
            per example -- TODO confirm for other datasets.

    Returns:
        dict with one entry per key the tokenizer produced; each value holds, per
        example, a list with the encodings of its four (question, choice) pairs.
    """
    num_choices = 4

    # Repeat every question once per choice so both lists line up pairwise.
    first_sentences = [
        question
        for question in examples["question_with_text"]
        for _ in range(num_choices)
    ]
    # Flatten the per-example choice lists in the same order.
    second_sentences = [
        choice
        for choice_list in examples["choice_list"]
        for choice in choice_list
    ]

    tokenized_examples = tokenizer(
        first_sentences, second_sentences, truncation=True, max_length=max_seq_length
    )

    # Un-flatten: regroup every run of `num_choices` encodings into one example.
    # Iterating over the keys the tokenizer actually returned (instead of a fixed
    # key list) keeps this working for tokenizers that emit no `token_type_ids`.
    return {
        key: [values[i:i + num_choices] for i in range(0, len(values), num_choices)]
        for key, values in tokenized_examples.items()
    }


# Tokenize each split in batches and drop raw text columns listed per split.
encoded_train = train_dataset.map(
    function=preprocess_function,
    batched=True,
    remove_columns=['question', 'answer', 'choice_list'],
)
encoded_validation = validation_dataset.map(
    function=preprocess_function,
    batched=True,
    remove_columns=['question', 'answer', 'choice_list'],
)
encoded_test = test_dataset.map(
    function=preprocess_function,
    batched=True,
    remove_columns=['question', 'choice_list'],
)

Map:   0%|          | 0/11679 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [None]:
## flatten: convert  {input_ids: [[],[],[],[]], attention_mask:[[],[],[],[]]} to
## {input_ids: [], attention_mask:[]}, {input_ids: [], attention_mask:[]}, {input_ids: [], attention_mask:[]}, {input_ids: [], attention_mask:[]}

accepted_keys = ["input_ids", "attention_mask", "label", "token_type_ids"]

# Keep only the accepted keys of every validation row and split the label off.
features = []
labels = []
for row in encoded_validation:
    labels.append(row['label'])
    features.append({k: v for k, v in row.items() if k in accepted_keys and k != 'label'})

## to flatten: one dict per (example, choice) pair, choices of an example kept adjacent
flattened_features = [
    {key: per_choice[choice_idx] for key, per_choice in feature.items()}
    for feature in features
    for choice_idx in range(4)
]

In [None]:
# max_length=None means that the sequences will not be truncated at all.
# The sequences will only be padded to match the longest sequence in the batch,
# but not truncated to a fixed length.
batch = tokenizer.pad(
    encoded_inputs=flattened_features,
    padding=True,
    max_length=None,
    pad_to_multiple_of=None,
    return_tensors="pt",
)

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [None]:
# Un-flatten: give every padded tensor a leading (example, choice) pair of axes.
num_choices = 4
batch_size = len(encoded_validation)
batch = {
    name: tensor.view(batch_size, num_choices, -1)
    for name, tensor in batch.items()
}

In [None]:
from dataclasses import dataclass
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from typing import Optional, Union
import torch

@dataclass
class DataCollatorForMultipleChoice:
    """
    Data collator that will dynamically pad the inputs for multiple choice received.

    Each incoming feature maps a key (e.g. `input_ids`) to one sequence per answer
    choice. The collator flattens all (example, choice) pairs, pads them together
    with `tokenizer.pad`, and views every resulting tensor as
    (batch_size, num_choices, -1). If the features carry a `label` (or `labels`)
    entry it is returned under `labels` as an int64 tensor.
    """

    tokenizer: PreTrainedTokenizerBase

    # Forwarded to `tokenizer.pad`. May be a bool, a string, or a PaddingStrategy
    # member; the default is True.
    padding: Union[bool, str, PaddingStrategy] = True

    # Both forwarded unchanged to `tokenizer.pad`.
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        """Collate a list of per-example feature dicts into one padded batch dict."""
        # Accept either spelling of the label column; None means "no labels given".
        label_key = next((key for key in ("label", "labels") if key in features[0]), None)
        if label_key is not None:
            labels = [feature[label_key] for feature in features]
            # Build label-free copies instead of popping, so the caller's dicts
            # are left untouched and can be collated again.
            features = [
                {k: v for k, v in feature.items() if k != label_key}
                for feature in features
            ]

        batch_size = len(features)
        num_choices = len(features[0]["input_ids"])

        # One dict per (example, choice) pair, built in a single pass
        # (no repeated list concatenation).
        flattened_features = [
            {k: v[i] for k, v in feature.items()}
            for feature in features
            for i in range(num_choices)
        ]

        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        # Un-flatten
        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        # Add back labels
        if label_key is not None:
            batch["labels"] = torch.tensor(labels, dtype=torch.int64)
        return batch

In [None]:
from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer


# Load the checkpoint named by model_args.model_name_or_path through the
# multiple-choice auto class.
model = AutoModelForMultipleChoice.from_pretrained(model_args.model_name_or_path)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForMultipleChoice were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
import numpy as np

def compute_metrics(eval_predictions):
    """Compute accuracy from a `(predictions, label_ids)` pair.

    The predicted class of each row is the argmax along axis 1 of the predictions;
    the result is `{"accuracy": <fraction of rows whose prediction equals the label>}`
    as a plain Python float.
    """
    scores, gold_labels = eval_predictions
    is_correct = np.equal(np.argmax(scores, axis=1), gold_labels)
    accuracy = is_correct.astype(np.float32).mean()
    return {"accuracy": accuracy.item()}

In [None]:
# Collator that pads the (example, choice) pairs of each batch dynamically.
multiple_choice_collator = DataCollatorForMultipleChoice(tokenizer)

# Wire model, data, collator and metric function into the Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=multiple_choice_collator,
    train_dataset=encoded_train,
    eval_dataset=encoded_validation,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [None]:
# Training: fine-tune, persist the model, then record the training metrics.
train_result = trainer.train()
trainer.save_model()  # Saves the tokenizer too for easy upload

metrics = train_result.metrics

# Report how many training samples were used, capped by the configured limit.
if data_args.max_train_samples is None:
    max_train_samples = len(train_dataset)
else:
    max_train_samples = data_args.max_train_samples
metrics["train_samples"] = min(max_train_samples, len(train_dataset))

trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)
trainer.save_state()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.2548,0.239537,0.917


***** train metrics *****
  epoch                    =        1.0
  total_flos               =  4864783GF
  train_loss               =     0.3174
  train_runtime            = 0:31:44.06
  train_samples            =      11679
  train_samples_per_second =      6.134
  train_steps_per_second   =      1.534


In [None]:
# Evaluation: only runs when evaluation is enabled in the training arguments.
if training_args.do_eval:
    metrics = trainer.evaluate()

    # Report how many evaluation samples were used, capped by the configured limit.
    if data_args.max_eval_samples is None:
        max_eval_samples = len(encoded_validation)
    else:
        max_eval_samples = data_args.max_eval_samples
    metrics["eval_samples"] = min(max_eval_samples, len(encoded_validation))

    trainer.log_metrics("eval", metrics)
    trainer.save_metrics("eval", metrics)

***** eval metrics *****
  epoch                   =        1.0
  eval_accuracy           =      0.917
  eval_loss               =     0.2395
  eval_runtime            = 0:00:52.53
  eval_samples            =       1000
  eval_samples_per_second =     19.036
  eval_steps_per_second   =      4.759


In [None]:
# Run the fine-tuned model on the test split; the first element of the result
# holds the per-choice scores, whose argmax is the predicted choice index.
test_results = trainer.predict(encoded_test)
test_scores = test_results[0]
argmax_idxs = np.argmax(test_scores, axis=1)

In [None]:
# Show the predicted choice index of every test example.
argmax_idxs

array([3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 0, 3, 3, 3, 3, 3, 3, 3, 3, 1, 3, 3, 2, 3, 3, 3, 3, 3, 1, 1, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 3, 3, 3,
       2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 0, 3, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 1, 3, 3, 3, 3, 3, 3, 3, 0, 3, 3, 3, 3, 1, 3, 3,
       2, 3, 3, 3, 3, 3, 3, 3, 3, 0, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3,
       1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 1, 3, 3, 2, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3,

In [None]:
# Check which object type the test split is before converting it to pandas.
type(test_dataset)

In [None]:
# Convert the (un-tokenized) test split to a DataFrame and display it.
test_df = test_dataset.to_pandas()
test_df

Unnamed: 0,question,distractor3,distractor1,distractor2,correct_answer,support,answer,question_with_text,choice_list,label
0,Compounds that are capable of accepting electr...,residues,antioxidants,Oxygen,oxidants,Oxidants and Reductants Compounds that are cap...,oxidants,Question: Compounds that are capable of accept...,"[residues, Oxygen, antioxidants, oxidants]",3
1,What term in biotechnology means a genetically...,phenotype,adult,male,clone,But transgenic animals just have one novel gen...,clone,Question: What term in biotechnology means a g...,"[phenotype, male, adult, clone]",3
2,Vertebrata are characterized by the presence o...,Thumbs,Bones,Muscles,backbone,Figure 29.7 Vertebrata are characterized by th...,backbone,Question: Vertebrata are characterized by the ...,"[Thumbs, Muscles, Bones, backbone]",3
3,What is the height above or below sea level ca...,variation,depth,latitude,elevation,"As you know, the surface of Earth is not flat....",elevation,Question: What is the height above or below se...,"[variation, latitude, depth, elevation]",3
4,"Ice cores, varves and what else indicate the e...",magma,mountain ranges,fossils,tree rings,"Tree rings, ice cores, and varves indicate the...",tree rings,"Question: Ice cores, varves and what else indi...","[magma, fossils, mountain ranges, tree rings]",3
...,...,...,...,...,...,...,...,...,...,...
995,"In the case of the moose, predation is an addi...",weight,speed,color,population,,population,"Question: In the case of the moose, predation ...","[weight, color, speed, population]",3
996,Where do short period comets come from?,photon belt,milky way,vesta belt,kuiper belt,"Short-period comets come from the Kuiper belt,...",kuiper belt,Question: Where do short period comets come fr...,"[photon belt, vesta belt, milky way, kuiper belt]",3
997,Only after implantation can an embryo develop ...,living thing,mammal,humans,fetus,,fetus,Question: Only after implantation can an embry...,"[living thing, humans, mammal, fetus]",3
998,What are atoms with unstable nuclei are consid...,destructive,ions,unstable,radioactive,Atoms with unstable nuclei are radioactive. To...,radioactive,Question: What are atoms with unstable nuclei ...,"[destructive, unstable, ions, radioactive]",3


In [None]:

def get_answer(choice_list, predicted_index):
    """Return the entry of `choice_list` found at position `predicted_index`."""
    return choice_list[predicted_index]


# Attach the predicted index, then look up the matching answer text row by row.
test_df['predicted_index'] = argmax_idxs
test_df['predicted_answer'] = [
    get_answer(choices, chosen)
    for choices, chosen in zip(test_df['choice_list'], test_df['predicted_index'])
]

In [None]:
# Display the test rows together with the new prediction columns.
test_df

Unnamed: 0,question,distractor3,distractor1,distractor2,correct_answer,support,answer,question_with_text,choice_list,label,predicted_index,predicted_answer
0,Compounds that are capable of accepting electr...,residues,antioxidants,Oxygen,oxidants,Oxidants and Reductants Compounds that are cap...,oxidants,Question: Compounds that are capable of accept...,"[residues, Oxygen, antioxidants, oxidants]",3,3,oxidants
1,What term in biotechnology means a genetically...,phenotype,adult,male,clone,But transgenic animals just have one novel gen...,clone,Question: What term in biotechnology means a g...,"[phenotype, male, adult, clone]",3,3,clone
2,Vertebrata are characterized by the presence o...,Thumbs,Bones,Muscles,backbone,Figure 29.7 Vertebrata are characterized by th...,backbone,Question: Vertebrata are characterized by the ...,"[Thumbs, Muscles, Bones, backbone]",3,3,backbone
3,What is the height above or below sea level ca...,variation,depth,latitude,elevation,"As you know, the surface of Earth is not flat....",elevation,Question: What is the height above or below se...,"[variation, latitude, depth, elevation]",3,3,elevation
4,"Ice cores, varves and what else indicate the e...",magma,mountain ranges,fossils,tree rings,"Tree rings, ice cores, and varves indicate the...",tree rings,"Question: Ice cores, varves and what else indi...","[magma, fossils, mountain ranges, tree rings]",3,3,tree rings
...,...,...,...,...,...,...,...,...,...,...,...,...
995,"In the case of the moose, predation is an addi...",weight,speed,color,population,,population,"Question: In the case of the moose, predation ...","[weight, color, speed, population]",3,3,population
996,Where do short period comets come from?,photon belt,milky way,vesta belt,kuiper belt,"Short-period comets come from the Kuiper belt,...",kuiper belt,Question: Where do short period comets come fr...,"[photon belt, vesta belt, milky way, kuiper belt]",3,3,kuiper belt
997,Only after implantation can an embryo develop ...,living thing,mammal,humans,fetus,,fetus,Question: Only after implantation can an embry...,"[living thing, humans, mammal, fetus]",3,3,fetus
998,What are atoms with unstable nuclei are consid...,destructive,ions,unstable,radioactive,Atoms with unstable nuclei are radioactive. To...,radioactive,Question: What are atoms with unstable nuclei ...,"[destructive, unstable, ions, radioactive]",3,3,radioactive


In [None]:
from sklearn.metrics import accuracy_score
# Share of test rows whose predicted answer text equals the reference answer.
accuracy_score(y_true=test_df["answer"], y_pred=test_df["predicted_answer"])

0.915

In [None]:
# Write the test rows, including the prediction columns, to a CSV file in the working directory.
test_df.to_csv("bert_base_uncased_questions_support_text_answers.csv")