# Load Dataset

In [1]:
from datasets import load_dataset

ds = load_dataset("rajpurkar/squad")

In [2]:
# Structure
print(ds)
print("\nTraining set:", ds["train"][0])
print("\nTesting set:", ds["validation"][0])

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10570
    })
})

Training set: {'id': '5733be284776f41900661182', 'title': 'University_of_Notre_Dame', 'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues an

# Tokenization

In [5]:
from transformers import AutoModelForCausalLM, AutoTokenizer, DataCollatorWithPadding

# Tokenization
tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")  # Define model

if tokenizer.pad_token is None:
        tokenizer.add_special_tokens({'pad_token': '[PAD]'})
        model.resize_token_embeddings(len(tokenizer))


def tokenization(example):
    questions = example["question"]
    
    answer_list = example["answers"]
    tmp_answer_list1 = [subdict['text'] for subdict in answer_list]  # [[]...[]]
    answers = ["; ".join(sublist) for sublist in tmp_answer_list1]  # []
    
    return tokenizer(questions, answers, truncation=True, padding="max_length", 
                     max_length=256)


train_tokenized_ds = ds['train'].map(tokenization, batched=True)
valid_tokenized_ds = ds['validation'].map(tokenization, batched=True)

train_ds = train_tokenized_ds.add_column("labels", train_tokenized_ds["input_ids"])
valid_ds = valid_tokenized_ds.add_column("labels", valid_tokenized_ds["input_ids"])
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

# Training

In [6]:
from transformers import TrainingArguments

training_args = TrainingArguments("test-trainer")

In [7]:
from transformers import Trainer

trainer = Trainer(model, training_args, train_dataset=train_ds, eval_dataset=valid_ds, 
                  data_collator=data_collator, processing_class=tokenizer)

In [None]:
trainer.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 50257}.
`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
500,1.5359
