# Load Dataset

In [1]:
from datasets import load_dataset

ds = load_dataset("rajpurkar/squad")

In [2]:
# Structure
print(ds)
print("\nTraining set:", ds["train"][0])
print("\nTesting set:", ds["validation"][0])

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10570
    })
})

Training set: {'id': '5733be284776f41900661182', 'title': 'University_of_Notre_Dame', 'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues an

# Tokenization

In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer, DataCollatorWithPadding

# Tokenization
tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")


def tokenization(example):
    questions = example["question"]
    
    answer_list = example["answers"]
    tmp_answer_list1 = [subdict['text'] for subdict in answer_list]  # [[]...[]]
    answers = ["; ".join(sublist) for sublist in tmp_answer_list1]  # []
    
    return tokenizer(questions, answers, truncation=True)


train_tokenized_ds = ds['train'].map(tokenization, batched=True)
valid_tokenized_ds = ds['validation'].map(tokenization, batched=True)

train_ds = train_tokenized_ds.add_column("labels", train_tokenized_ds["input_ids"])
valid_ds = valid_tokenized_ds.add_column("labels", valid_tokenized_ds["input_ids"])
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [12]:
print(train_ds['labels'])
print(train_ds['input_ids'])

Column([[2514, 4150, 750, 262, 5283, 5335, 7910, 1656, 287, 1248, 3365, 287, 406, 454, 8906, 4881, 30, 48615, 6206, 324, 5857, 311, 12944, 343, 516], [2061, 318, 287, 2166, 286, 262, 23382, 20377, 8774, 11819, 30, 64, 15317, 15207, 286, 1951], [464, 32520, 3970, 286, 262, 17380, 2612, 379, 23382, 20377, 318, 13970, 284, 543, 4645, 30, 1169, 8774, 11819], [2061, 318, 262, 10299, 33955, 379, 23382, 20377, 30, 64, 37919, 1295, 286, 11443, 290, 14580], [2061, 10718, 319, 1353, 286, 262, 8774, 11819, 379, 23382, 20377, 30, 64, 10861, 15207, 286, 262, 5283, 5335]])
Column([[2514, 4150, 750, 262, 5283, 5335, 7910, 1656, 287, 1248, 3365, 287, 406, 454, 8906, 4881, 30, 48615, 6206, 324, 5857, 311, 12944, 343, 516], [2061, 318, 287, 2166, 286, 262, 23382, 20377, 8774, 11819, 30, 64, 15317, 15207, 286, 1951], [464, 32520, 3970, 286, 262, 17380, 2612, 379, 23382, 20377, 318, 13970, 284, 543, 4645, 30, 1169, 8774, 11819], [2061, 318, 262, 10299, 33955, 379, 23382, 20377, 30, 64, 37919, 1295, 286, 1

In [6]:
valid_ds

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 10570
})

# Training

In [7]:
from transformers import TrainingArguments

training_args = TrainingArguments("test-trainer")

In [8]:
# Define model
model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")

In [9]:
from transformers import Trainer

trainer = Trainer(model, training_args, train_dataset=train_ds, eval_dataset=valid_ds, 
                  data_collator=data_collator, processing_class=tokenizer)

In [10]:
if tokenizer.pad_token is None:
        tokenizer.add_special_tokens({'pad_token': '[PAD]'})
        model.resize_token_embeddings(len(tokenizer))

trainer.train()

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 50257}.


ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`labels` in this case) have excessive nesting (inputs type `list` where type `int` is expected).