In [2]:
from transformers import GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling, GPT2LMHeadModel, pipeline, \
                         Trainer, TrainingArguments

In [3]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')  # load up a standard gpt2 model

tokenizer.pad_token = tokenizer.eos_token
# set our pad token to be the eos token. This lets gpt know how to fill space

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

In [4]:
# load up our data into a dataset
pds_data = TextDataset(
    tokenizer=tokenizer,
    file_path='/content/wheelrobo 1.txt',  # Principles of Data Science - Sinan Ozdemir
    block_size=64  # length of each chunk of text to use as a datapoint
)



In [5]:
pds_data[0], pds_data[0].shape  # inspect the first point

(tensor([   52, 10116, 13403, 22826, 46254,   290,  2764,   278, 16071,   198,
          4480,  1766, 46357,  4448,   320,  1262, 11361,   198,    50,    13,
            47,   615,   342,   430,   352,    11,   317, 18270,   449,   371,
            17,    11,   311,  1601,   499, 25619,  3099,   399,    18,   198,
         48902,  8129, 21714,    16,   837, 38778,   362,    11,    18,   198,
            16,    11,    17,    11,    18,  3961,   286, 13851,  5800,   290,
         14044,    11,   569,   695]),
 torch.Size([64]))

In [6]:
print(tokenizer.decode(pds_data[0]))

UAV Based Fruit Detection and Counting Robot
with Coppeliasim using Python
S.Pavithra 1, Aarthy J R2, Suryaprabha N3
Assistant Professor Sr1 ,Student 2,3
1,2,3 School of Computer Science and Engineering, Vell


In [7]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False,
    # MLM is Masked Language Modelling (for BERT + auto-encoding tasks)
)


In [8]:
# example of how collator pads data dynamically
collator_example = data_collator([tokenizer('I am an input'), tokenizer('So am I')])

collator_example

{'input_ids': tensor([[   40,   716,   281,  5128],
        [ 2396,   716,   314, 50256]]), 'attention_mask': tensor([[1, 1, 1, 1],
        [1, 1, 1, 0]]), 'labels': tensor([[  40,  716,  281, 5128],
        [2396,  716,  314, -100]])}

In [9]:
collator_example.input_ids  # 50256 is our pad token id

tensor([[   40,   716,   281,  5128],
        [ 2396,   716,   314, 50256]])

In [10]:
tokenizer.pad_token_id

50256

In [11]:
collator_example.attention_mask  # Note the 0 in the attention mask where we have a pad token

tensor([[1, 1, 1, 1],
        [1, 1, 1, 0]])

In [12]:
collator_example.labels  # note the -100 to ignore loss calculation for the padded token
# Labels are shifted inside the GPT model so we don't need to worry about that

tensor([[  40,  716,  281, 5128],
        [2396,  716,  314, -100]])

In [13]:
model = GPT2LMHeadModel.from_pretrained('gpt2')  # load up a GPT2 model

pretrained_generator = pipeline(  # create a generator with built in params
    'text-generation', model=model, tokenizer='gpt2',
    config={'max_length': 200, 'do_sample': True, 'top_p': 0.9, 'temperature': 0.7, 'top_k': 10}
)

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Device set to use cpu


In [14]:
print('----------')
for generated_sequence in pretrained_generator('This dataset shows the relationship', num_return_sequences=3):
    print(generated_sequence['generated_text'])
    print('----------')

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


----------
This dataset shows the relationship between alcohol and cancer rates among US teenagers, by race/ethnicity, age at first diagnosis and race/ethnicity. In particular, I found that high-risk adults were at greater higher risk for black and Hispanic cancers
----------
This dataset shows the relationship between age, sex, education, family income, health, income, ethnicity, occupation and marital status in the UK (data source provided by the UK). There is no obvious association between this and life expectancy, and only slightly
----------
This dataset shows the relationship between the size of an animal's cage (M) versus body weight in wild specimens collected from the US (3). The mean size of a specimen is shown as a percentage of the original dataset when plotted on a log scale
----------


In [15]:
training_args = TrainingArguments(
    output_dir="./gpt2_pds", #The output directory
    overwrite_output_dir=True, #overwrite the content of the output directory
    num_train_epochs=3, # number of training epochs
    per_device_train_batch_size=32, # batch size for training
    per_device_eval_batch_size=32,  # batch size for evaluation
    logging_steps=10,
    load_best_model_at_end=True,
    evaluation_strategy='epoch',
    save_strategy='epoch'
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=pds_data.examples[:int(len(pds_data.examples)*.8)],
    eval_dataset=pds_data.examples[int(len(pds_data.examples)*.8):]
)

trainer.evaluate()





<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


{'eval_loss': 4.867547512054443,
 'eval_model_preparation_time': 0.0042,
 'eval_runtime': 1.0011,
 'eval_samples_per_second': 2.997,
 'eval_steps_per_second': 0.999}

In [16]:
trainer.train()

Epoch,Training Loss,Validation Loss,Model Preparation Time
1,No log,4.672539,0.0042
2,No log,4.578821,0.0042
3,No log,4.535911,0.0042


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


TrainOutput(global_step=3, training_loss=5.290067036946614, metrics={'train_runtime': 78.0234, 'train_samples_per_second': 0.384, 'train_steps_per_second': 0.038, 'total_flos': 979845120000.0, 'train_loss': 5.290067036946614, 'epoch': 3.0})

In [17]:
trainer.save_model()

In [18]:
loaded_model = GPT2LMHeadModel.from_pretrained('./gpt2_pds')

finetuned_generator = pipeline(
    'text-generation', model=loaded_model, tokenizer=tokenizer,
    config={'max_length': 200, 'do_sample': True, 'top_p': 0.9, 'temperature': 0.7, 'top_k': 10}
)

Device set to use cpu


In [19]:
# examples are now sustainably about data
print('----------')
for generated_sequence in finetuned_generator('This dataset shows the relationship', num_return_sequences=3):
    print(generated_sequence['generated_text'])
    print('----------')

----------
This dataset shows the relationship with frequency and age (Table 2, left). Frequency-specific allele frequencies of the three subgroups with a maximum allele frequency of 1.3 (C) for each subject from the 1.3- and 0.3
----------
This dataset shows the relationship between BMI and educational attainment in a sample of US men and the outcomes reported within the Framingham study, with low-income or white Americans being the most likely to have a high school diploma. While these data are limited by
----------
This dataset shows the relationship between time and distance. Although the two are related by a degree ranging from 0.3–1.8, the data show that the distance is quite similar between the two data points. This suggests that distance is quite close
----------
