In [1]:
import os
import transformers
import torch
import numpy as np
from datasets import Dataset, DatasetDict
from eval import *
from dataclasses import dataclass
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from typing import Optional, Union



# Root directory of the homework workspace; every other path is derived from it.
dir_path = "/data/NFS/andy/course/ADL/hw2/"
# Joined with os.path.join so that the trailing slash on dir_path no longer
# produces a doubled separator ("hw2//dataset/...") in the resulting paths.
data_path = os.path.join(dir_path, "dataset", "public.json")
context_path = os.path.join(dir_path, "dataset", "context.json")
# Directory holding the saved multiple-choice model; trailing slash kept so the
# value is the same string as before.
weights_path = os.path.join(dir_path, "mc_weights/")

# Hub name of the tokenizer to load.
model_checkpoint = "bert-base-chinese"
# Expose only GPU index 1 to this process.
# NOTE(review): this runs after `import torch`; it is assumed CUDA has not been
# initialised yet at this point -- confirm if the device selection is ignored.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

In [2]:
def to_same_paragraphs(data, num_choices=7):
    """Pad every list in ``data["paragraphs"]`` with "" up to ``num_choices`` items.

    The inner lists are extended in place and the same ``data`` object is
    returned, so existing callers that rely on the mutation keep working.

    Args:
        data: Mapping with a ``"paragraphs"`` entry holding a list of lists.
        num_choices: Target length of every inner list (defaults to 7).

    Returns:
        The ``data`` object that was passed in.

    Raises:
        ValueError: If an inner list already holds more than ``num_choices``
            items. The previous ``while len(...) != 7`` loop never terminated
            in that situation.
    """
    for idx, paragraphs in enumerate(data["paragraphs"]):
        missing = num_choices - len(paragraphs)
        if missing < 0:
            raise ValueError(
                f"example {idx} has {len(paragraphs)} paragraphs, "
                f"more than the expected {num_choices}"
            )
        # A non-positive multiplier yields an empty list, so full lists are untouched.
        paragraphs.extend([""] * missing)
    return data

def data_transfer(data):
    """Turn a list of question records into a column-oriented dict.

    Every record contributes its ``"question"``, the texts looked up in the
    module-level ``context`` for each id in ``"paragraphs"``, and the position
    of ``"relevant"`` inside ``"paragraphs"`` as the label. The assembled dict
    is passed through ``to_same_paragraphs`` before being returned.

    Args:
        data: Iterable of dicts with ``"question"``, ``"paragraphs"`` and
            ``"relevant"`` entries.

    Returns:
        Dict with the keys ``"question"``, ``"paragraphs"`` and ``"label"``.
    """
    questions, paragraph_texts, labels = [], [], []
    for record in data:
        questions.append(record["question"])
        candidate_ids = record["paragraphs"]
        # `context` is a notebook-level global populated in a later cell.
        paragraph_texts.append([context[pid] for pid in candidate_ids])
        labels.append(candidate_ids.index(record["relevant"]))

    columns = {"question": questions, "paragraphs": paragraph_texts, "label": labels}
    return to_same_paragraphs(columns)

def preprocess_function(examples, num_choices=7):
    """Tokenize each (question, paragraph) pair and regroup the result per example.

    The question of every example is repeated once per candidate paragraph, the
    paragraphs of all examples are flattened into one list, and both lists are
    passed to the module-level ``tokenizer`` as sentence pairs (truncated to
    512 tokens). Each tokenizer output field is then cut back into chunks of
    ``num_choices`` so that entry ``i`` holds the encodings of example ``i``.

    Args:
        examples: Batch mapping with ``"question"`` (list of str) and
            ``"paragraphs"`` (list of lists of str).
        num_choices: Number of candidate paragraphs per example (defaults to 7).

    Returns:
        Dict mapping every tokenizer output key to a list with one
        ``num_choices``-long list per example.

    Raises:
        ValueError: If an example does not hold exactly ``num_choices``
            paragraphs; pairing and regrouping would otherwise be misaligned.
    """
    for idx, paragraphs in enumerate(examples["paragraphs"]):
        if len(paragraphs) != num_choices:
            raise ValueError(
                f"example {idx} has {len(paragraphs)} paragraphs, expected {num_choices}"
            )

    # Flat comprehensions replace the former `sum(list_of_lists, [])`, which
    # re-copied the accumulated list on every addition (quadratic in batch size).
    questions = [q for q in examples["question"] for _ in range(num_choices)]
    sub_contexts = [p for paragraphs in examples["paragraphs"] for p in paragraphs]

    max_length = 512
    tokenized_examples = tokenizer(questions, sub_contexts, max_length=max_length, truncation=True)
    return {
        k: [v[i:i + num_choices] for i in range(0, len(v), num_choices)]
        for k, v in tokenized_examples.items()
    }

@dataclass
class DataCollatorForMultipleChoice:
    """
    Data collator that dynamically pads multiple-choice inputs.

    Every incoming feature is a dict whose non-label values hold one entry per
    choice. The choices of all features are flattened into one list, padded
    together through ``tokenizer.pad`` and reshaped to
    ``(batch_size, num_choices, -1)``. Labels, when present under ``"label"``
    or ``"labels"``, are returned as an int64 tensor under ``"labels"``.
    """

    # Tokenizer whose ``pad`` method performs the padding.
    tokenizer: PreTrainedTokenizerBase
    # The three options below are forwarded unchanged to ``tokenizer.pad``.
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        """Collate a list of per-example feature dicts into one padded batch.

        Args:
            features: Non-empty list of dicts; each must contain ``"input_ids"``
                with one entry per choice.

        Returns:
            Dict of tensors shaped ``(batch_size, num_choices, -1)``, plus
            ``"labels"`` when the first feature carries a label.

        Raises:
            ValueError: If the features do not all have the same number of
                choices (the final reshape would otherwise fail or mis-group).
        """
        first = features[0]
        label_name = "label" if "label" in first else "labels"
        # Unlabelled batches (e.g. pure inference) are collated without labels.
        has_labels = label_name in first
        batch_size = len(features)
        num_choices = len(first["input_ids"])

        labels = []
        flattened_features = []
        for feature in features:
            if len(feature["input_ids"]) != num_choices:
                raise ValueError(
                    f"all features must have {num_choices} choices, "
                    f"got {len(feature['input_ids'])}"
                )
            if has_labels:
                # Read (rather than pop) the detected key: the caller's dicts are
                # left untouched and the "labels" spelling is honoured too.
                labels.append(feature[label_name])
            flattened_features.extend(
                {k: v[j] for k, v in feature.items() if k != label_name}
                for j in range(num_choices)
            )

        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        # Un-flatten
        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        # Add back labels
        if has_labels:
            batch["labels"] = torch.tensor(labels, dtype=torch.int64)
        return batch

def compute_metrics(eval_predictions):
    """Compute accuracy from a ``(predictions, label_ids)`` pair.

    The predicted class of each row is the argmax of ``predictions`` along
    axis 1; accuracy is the float32 mean of the element-wise match with
    ``label_ids``, returned as a plain Python float.

    Returns:
        ``{"accuracy": <float>}``
    """
    scores, gold = eval_predictions
    chosen = np.argmax(scores, axis=1)
    hits = (chosen == gold).astype(np.float32)
    accuracy = hits.mean().item()
    return {"accuracy": accuracy}

In [3]:
# Read the question records and the paragraph store from disk.
# `load_json` is not defined in this notebook; presumably it is provided by
# `from eval import *` in the import cell -- verify.
public_data = load_json(data_path)
# `context` has to exist as a notebook-level name before `data_transfer` is
# called, because that function indexes into it as a global.
context = load_json(context_path)

# Convert the records into column lists (question / paragraphs / label).
# This rebinds `public_data`, so the raw records are not reachable afterwards.
public_data = data_transfer(public_data)
# NOTE(review): the variable is called `datasets`, the same name as the library
# that `Dataset` is imported from; later cells refer to it by this name.
datasets = Dataset.from_dict(public_data)

[*] Loading /data/NFS/andy/course/ADL/hw2//dataset/public.json...done
[*] Loading /data/NFS/andy/course/ADL/hw2//dataset/context.json...done


In [4]:
# The tokenizer is bound at notebook level because `preprocess_function`
# reads the global name `tokenizer` rather than taking it as an argument.
tokenizer = transformers.AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)
# Fail early if a slow (pure-Python) tokenizer was returned.
assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast)

# Tokenize in batches of 1000 examples; `preprocess_function` receives each
# batch as a mapping of column name -> list of values.
encoded_datasets = datasets.map(preprocess_function, batched=True, batch_size=1000)

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))




In [5]:
# Load the multiple-choice model from the local `weights_path` directory
# (presumably the weights saved by a separate training run -- verify).
model = transformers.AutoModelForMultipleChoice.from_pretrained(weights_path)

In [6]:
# Evaluation-only Trainer: no training dataset and no TrainingArguments are
# passed, so whatever defaults Trainer applies (batch size, output directory)
# are in effect.
trainer = transformers.Trainer(
    model,
    eval_dataset=encoded_datasets,
    tokenizer=tokenizer,
    # The same tokenizer is handed to the collator, which calls `tokenizer.pad`.
    data_collator=DataCollatorForMultipleChoice(tokenizer),
    compute_metrics=compute_metrics,
)

In [7]:
# Run evaluation on `encoded_datasets`; as the last expression of the cell the
# returned metrics dict is displayed as the cell output below.
trainer.evaluate()

{'eval_loss': 0.17410269379615784,
 'eval_accuracy': 0.9415768384933472,
 'eval_runtime': 871.1245,
 'eval_samples_per_second': 4.048}