In [1]:
import os
import transformers
import numpy as np
import torch
from datasets import Dataset, DatasetDict
from eval import *
from dataclasses import dataclass
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from typing import Optional, Union


# Root directory of the homework data. It already ends with a separator, so the
# derived paths are built with os.path.join instead of string concatenation
# (concatenating "/dataset/..." onto it produced doubled "//" segments).
dir_path = "/data/NFS/andy/course/ADL/hw2/"
data_path = os.path.join(dir_path, "dataset", "train.json")
context_path = os.path.join(dir_path, "dataset", "context.json")
# The trailing "" keeps the trailing separator the original string had.
# NOTE(review): "mc_weigths" looks like a misspelling of "mc_weights"; it is kept
# as-is because it names a directory on disk — confirm before renaming.
weights_path = os.path.join(dir_path, "mc_weigths", "")

# Name of the pretrained checkpoint loaded for both the tokenizer and the model.
model_checkpoint = "bert-base-chinese"
# Restrict CUDA to device 0 for this process.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [None]:
def preprocess_function(examples, num_choices=7, max_length=512):
    """Tokenize every (question, paragraph) pair of a batch for multiple choice.

    Relies on the module-level ``tokenizer`` being defined before this function
    is called.

    Args:
        examples: Batch mapping with a ``"question"`` list and a ``"paragraphs"``
            list; each ``"paragraphs"`` entry is a list of ``num_choices``
            candidate paragraphs for the question at the same index.
        num_choices: Number of candidate paragraphs per question (default 7).
        max_length: Maximum sequence length passed to the tokenizer; longer
            pairs are truncated (default 512).

    Returns:
        A dict with the tokenizer's output keys. Each value is a list with one
        entry per question, and each entry is the list of ``num_choices``
        per-candidate encodings for that question.

    Raises:
        ValueError: If any example does not have exactly ``num_choices``
            paragraphs (the questions and paragraphs would otherwise be paired
            up incorrectly).
    """
    questions = examples["question"]
    paragraph_sets = examples["paragraphs"]

    for row, paragraphs in enumerate(paragraph_sets):
        if len(paragraphs) != num_choices:
            raise ValueError(
                f"example {row} has {len(paragraphs)} paragraphs, expected {num_choices}"
            )

    # Repeat each question once per candidate so both flat lists line up
    # element-for-element. Comprehensions keep the flattening linear, unlike
    # sum(list_of_lists, []).
    repeated_questions = [q for q in questions for _ in range(num_choices)]
    flat_paragraphs = [p for paragraphs in paragraph_sets for p in paragraphs]

    tokenized_examples = tokenizer(
        repeated_questions, flat_paragraphs, max_length=max_length, truncation=True
    )

    # Regroup the flat encodings into chunks of num_choices, one chunk per question.
    return {
        key: [values[start:start + num_choices] for start in range(0, len(values), num_choices)]
        for key, values in tokenized_examples.items()
    }

@dataclass
class DataCollatorForMultipleChoice:
    """
    Data collator that dynamically pads multiple-choice inputs.

    Each incoming feature holds, per key, one encoding per choice plus a label
    stored under ``"label"`` or ``"labels"``. The choices of all features are
    flattened, padded together with ``tokenizer.pad`` and reshaped to
    ``(batch_size, num_choices, -1)``.
    """

    # Tokenizer whose ``pad`` method pads the flattened per-choice encodings.
    tokenizer: PreTrainedTokenizerBase
    # The three options below are forwarded unchanged to ``tokenizer.pad``.
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        """Collate a list of feature dicts into one padded batch.

        Args:
            features: Non-empty list of dicts. Every non-label value is a list
                with one entry per choice.

        Returns:
            Dict of tensors shaped ``(batch_size, num_choices, -1)`` plus a
            ``"labels"`` int64 tensor with one label per feature.

        Raises:
            ValueError: If features do not all have the same number of choices.
        """
        label_name = "label" if "label" in features[0].keys() else "labels"
        # Read the label under whichever key is present and leave the caller's
        # dicts untouched (no pop).
        labels = [feature[label_name] for feature in features]
        batch_size = len(features)
        num_choices = len(features[0]["input_ids"])

        # One dict per (feature, choice) pair, in feature-major order.
        flattened_features = []
        for index, feature in enumerate(features):
            if len(feature["input_ids"]) != num_choices:
                raise ValueError(
                    f"feature {index} has {len(feature['input_ids'])} choices, "
                    f"expected {num_choices}"
                )
            for choice in range(num_choices):
                flattened_features.append(
                    {key: value[choice] for key, value in feature.items() if key != label_name}
                )

        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        # Un-flatten
        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        # Add back labels
        batch["labels"] = torch.tensor(labels, dtype=torch.int64)
        return batch

def compute_metrics(eval_predictions):
    """Return the accuracy of the highest-scoring choice against the labels.

    Args:
        eval_predictions: Pair ``(predictions, label_ids)``; the prediction with
            the largest score along axis 1 is compared with each label id.

    Returns:
        ``{"accuracy": value}`` where ``value`` is a Python float.
    """
    scores, gold = eval_predictions
    chosen = np.argmax(scores, axis=1)
    hits = (chosen == gold).astype(np.float32)
    return {"accuracy": hits.mean().item()}

In [5]:
def _build_split(samples, context, num_choices=7):
    """Turn raw samples into column lists for one dataset split.

    Args:
        samples: Sequence of dicts with "question", "paragraphs" (ids used to
            index ``context``) and "relevant" (the id of the correct paragraph).
        context: Container mapping a paragraph id to its text.
        num_choices: Number of paragraphs every example is padded to.

    Returns:
        Dict with "question", "paragraphs" and "label" lists of equal length.
        "label" is the position of the relevant id within the sample's
        "paragraphs" list.

    Raises:
        ValueError: If a sample has more than ``num_choices`` paragraphs (the
            previous ``while len(...) != 7`` padding loop never terminated in
            that case), or if "relevant" is not among the sample's paragraphs.
    """
    split = {"question": [], "paragraphs": [], "label": []}
    for sample in samples:
        paragraph_ids = sample["paragraphs"]
        if len(paragraph_ids) > num_choices:
            raise ValueError(
                f"sample has {len(paragraph_ids)} paragraphs, more than num_choices={num_choices}"
            )
        paragraphs = [context[i] for i in paragraph_ids]
        # Pad short candidate lists with empty paragraphs up to num_choices.
        paragraphs.extend([""] * (num_choices - len(paragraphs)))

        split["question"].append(sample["question"])
        split["paragraphs"].append(paragraphs)
        split["label"].append(paragraph_ids.index(sample["relevant"]))
    return split


# load_json is not defined in this notebook; presumably it comes from the
# `from eval import *` star import — verify.
data = load_json(data_path)
context = load_json(context_path)

# First 80% of the samples (in file order) for training, the rest for evaluation.
length = int(len(data) * 0.8)
train_data = _build_split(data[:length], context)
eval_data = _build_split(data[length:], context)

datasets = DatasetDict({"train": Dataset.from_dict(train_data), "eval": Dataset.from_dict(eval_data)})

In [8]:
# Load the fast tokenizer for the chosen checkpoint and fail early if a slow
# tokenizer was returned instead.
tokenizer = transformers.AutoTokenizer.from_pretrained(
    model_checkpoint,
    use_fast=True,
)
assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast)

# Tokenize both splits, handing preprocess_function batches of 1000 examples.
encoded_datasets = datasets.map(
    preprocess_function,
    batched=True,
    batch_size=1000,
)

In [15]:
# Load the pretrained checkpoint with a multiple-choice head on top.
# NOTE(review): num_labels=7 matches the 7 candidate paragraphs per question;
# whether the multiple-choice head actually uses this value is not visible
# here — confirm it is needed.
model = transformers.AutoModelForMultipleChoice.from_pretrained(model_checkpoint, num_labels=7)

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertForMultipleChoice: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForMultipleChoice from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMultipleChoice from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForMultipleChoice were not initialized from the model checkpoint at bert-base-chinese and are newly

In [16]:
# Per-device batch size, used for both training and evaluation.
batch_size = 2

# With 32 accumulation steps, each optimizer step covers batch_size * 32 = 64
# examples per device.
# NOTE(review): load_best_model_at_end is enabled without an explicit
# save strategy or metric_for_best_model, so which checkpoint counts as "best"
# depends on library defaults — confirm against the installed version.
args = transformers.TrainingArguments(
    output_dir="./mc_checkpoints",
    num_train_epochs=3,
    learning_rate=5e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=32,
    fp16=True,
    evaluation_strategy="epoch",
    load_best_model_at_end=True,
)

# Collator that pads the per-choice encodings of each batch.
mc_collator = DataCollatorForMultipleChoice(tokenizer)

trainer = transformers.Trainer(
    model=model,
    args=args,
    train_dataset=encoded_datasets["train"],
    eval_dataset=encoded_datasets["eval"],
    tokenizer=tokenizer,
    data_collator=mc_collator,
    compute_metrics=compute_metrics,
)

In [22]:
# Run fine-tuning; the returned TrainOutput is displayed as the cell result.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
0,No log,0.145049,0.947153
1,0.172900,0.148919,0.949611
2,0.172900,0.150763,0.955141


TrainOutput(global_step=915, training_loss=0.11664581507281528, metrics={'train_runtime': 7557.67, 'train_samples_per_second': 0.121, 'total_flos': 1.2772164168600779e+17, 'epoch': 3.0})

In [23]:
# Write the model currently held by the trainer to weights_path.
trainer.save_model(weights_path)