In [16]:
from transformers import GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling, GPT2LMHeadModel, pipeline, \
                         Trainer, TrainingArguments

In [17]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')  # load up a standard gpt2 model

tokenizer.pad_token = tokenizer.eos_token
# set our pad token to be the eos token. This lets gpt know how to fill space

In [18]:
# load up our data into a dataset

pds_data = TextDataset(
    tokenizer=tokenizer,
    file_path='/content/737362256-Hands-On-Large-Language-Models.txt',  # Principles of Data Science - Sinan Ozdemir
    block_size=64  # length of each chunk of text to use as a datapoint
)

In [19]:
pds_data[0], pds_data[0].shape  # inspect the first point

(tensor([  200,    39,  1746,    12,  2202, 13601, 15417, 32329,   198, 32065,
         28491,   290, 16588,   198,   198,  3152, 12556, 13868,   304, 12106,
            11,   345,   651,  3835,   287,   511, 14555,  1296,   960,  1169,
          1772,   447,   247,    82,   198,  1831,   290,   555, 42131,  2695,
           355,   484,  3551,   960,   568,   345,   460,  1011,  4621,   286,
           777,   198, 23873,  5823,   890,   878,   262,  1743,  2650,   286,
           777,  8714,    13,   198]),
 torch.Size([64]))

In [20]:
print(tokenizer.decode(pds_data[0]))

Hands-On Large Language Models
Language Understanding and Generation

With Early Release ebooks, you get books in their earliest form—the author’s
raw and unedited content as they write—so you can take advantage of these
technologies long before the official release of these titles.



In [21]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False,
    # MLM is Masked Language Modelling (for BERT + auto-encoding tasks)
)

In [22]:
# example of how collator pads data dynamically
collator_example = data_collator([tokenizer('I am an input'), tokenizer('So am I')])

collator_example

{'input_ids': tensor([[   40,   716,   281,  5128],
        [ 2396,   716,   314, 50256]]), 'attention_mask': tensor([[1, 1, 1, 1],
        [1, 1, 1, 0]]), 'labels': tensor([[  40,  716,  281, 5128],
        [2396,  716,  314, -100]])}

In [23]:
collator_example.input_ids  # 50256 is our pad token id

tensor([[   40,   716,   281,  5128],
        [ 2396,   716,   314, 50256]])

In [24]:
tokenizer.pad_token_id

50256

In [25]:
collator_example.attention_mask  # Note the 0 in the attention mask where we have a pad token

tensor([[1, 1, 1, 1],
        [1, 1, 1, 0]])

In [26]:
collator_example.labels  # note the -100 to ignore loss calculation for the padded token
# Labels are shifted inside the GPT model so we don't need to worry about that

tensor([[  40,  716,  281, 5128],
        [2396,  716,  314, -100]])

In [27]:
model = GPT2LMHeadModel.from_pretrained('gpt2')  # load up a GPT2 model

pretrained_generator = pipeline(  # create a generator with built in params
    'text-generation', model=model, tokenizer='gpt2',
    config={'max_length': 200, 'do_sample': True, 'top_p': 0.9, 'temperature': 0.7, 'top_k': 10}
)

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Device set to use cuda:0


In [28]:
print('----------')
for generated_sequence in pretrained_generator('This dataset shows the relationship', num_return_sequences=3):
    print(generated_sequence['generated_text'])
    print('----------')

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


----------
This dataset shows the relationship between obesity, diabetes and the occurrence of metabolic diseases such as myocardial infarction, stroke and hypertension. The associations between body mass index and those diseases have been estimated from the recent case of an elderly Chinese man who
----------
This dataset shows the relationship between the two variables at a single point on a population scale. The correlation for a particular set of covariates is shown in Table 1. For the nonlinear model, the positive correlation is shown, due to the increase in
----------
This dataset shows the relationship between the two functions. First, with a simple set of variables (I, P, G) where N = 100 x 100 = 1000, then the correlation does not matter at all compared to (where N is the number
----------


In [29]:
training_args = TrainingArguments(
    output_dir="./gpt2_pds", #The output directory
    overwrite_output_dir=True, #overwrite the content of the output directory
    num_train_epochs=3, # number of training epochs
    per_device_train_batch_size=32, # batch size for training
    per_device_eval_batch_size=32,  # batch size for evaluation
    logging_steps=10,
    load_best_model_at_end=True,
    evaluation_strategy='epoch',
    save_strategy='epoch'
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=pds_data.examples[:int(len(pds_data.examples)*.8)],
    eval_dataset=pds_data.examples[int(len(pds_data.examples)*.8):]
)

trainer.evaluate()





<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


{'eval_loss': 5.006204128265381,
 'eval_model_preparation_time': 0.0029,
 'eval_runtime': 0.7535,
 'eval_samples_per_second': 180.48,
 'eval_steps_per_second': 6.635}

In [30]:
trainer.train()

Epoch,Training Loss,Validation Loss,Model Preparation Time
1,4.3496,4.047226,0.0029
2,3.6485,3.947193,0.0029
3,3.4523,3.922522,0.0029


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


TrainOutput(global_step=51, training_loss=3.7772733323714314, metrics={'train_runtime': 60.0411, 'train_samples_per_second': 26.981, 'train_steps_per_second': 0.849, 'total_flos': 52911636480000.0, 'train_loss': 3.7772733323714314, 'epoch': 3.0})

In [31]:
trainer.evaluate()  # loss decrease is slowing down so we are hitting our limit

{'eval_loss': 3.9225220680236816,
 'eval_model_preparation_time': 0.0029,
 'eval_runtime': 0.7274,
 'eval_samples_per_second': 186.977,
 'eval_steps_per_second': 6.874,
 'epoch': 3.0}

In [32]:
trainer.save_model()

In [33]:
loaded_model = GPT2LMHeadModel.from_pretrained('./gpt2_pds')

finetuned_generator = pipeline(
    'text-generation', model=loaded_model, tokenizer=tokenizer,
    config={'max_length': 200, 'do_sample': True, 'top_p': 0.9, 'temperature': 0.7, 'top_k': 10}
)

Device set to use cuda:0


In [34]:
# examples are now sustainably about data
print('----------')
for generated_sequence in finetuned_generator('This dataset shows the relationship', num_return_sequences=3):
    print(generated_sequence['generated_text'])
    print('----------')

----------
This dataset shows the relationship between language and the number of words that can be used as a result of each generation, in a highly linear fashion

Figure 3-11 shows the correlation of the top 10, 50, and 1000 words for each generation
----------
This dataset shows the relationship between several domains in each cluster.
The best way to model queries on these questions is the following:

To model a series of questions
model the following query

Topic: Answer [Topic]
Topic:
----------
This dataset shows the relationship between the first three terms.

When we assign a label
to an RNN, the labels
learned on the same task are assigned according to the expected
results for all label pairs. Then, we can
----------
