In [None]:
from transformers import GPT2Tokenizer,TextDataset,DataCollatorForLanguageModeling,GPT2LMHeadModel, pipeline, Trainer, TrainingArguments

In [None]:
# Load the tokenizer that belongs to the 'gpt2' checkpoint.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

In [None]:
from google.colab import drive

# Mounting asks for authorization interactively.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

# From here on, Google Drive files are reachable under "/content/drive/My Drive/".


In [None]:
# Long listing of the current directory, oldest entries first, before leaving it.
!ls -ltr
import os
# Work from the Drive root so that relative paths used later (e.g. './gpt2_pds')
# resolve inside Google Drive.
os.chdir('/content/drive/My Drive/')

In [None]:
import os

# Show what sits at the top level of the mounted Google Drive.
drive_root = '/content/drive/My Drive/'
drive_entries = os.listdir(drive_root)
print(drive_entries)


In [None]:
# Build the training dataset from the raw text file on Drive, using the
# tokenizer loaded earlier and a block size of 32.
pds_text_path = '/content/drive/My Drive/PDS2.txt'
tokens_per_block = 32
pds_data = TextDataset(tokenizer=tokenizer, file_path=pds_text_path, block_size=tokens_per_block)

In [None]:
# Confirm which dataset class was constructed.
type(pds_data)

In [None]:
# Number of examples the dataset exposes.
len(pds_data)

In [None]:
# Peek at the first example.
pds_data[0]

In [None]:
# Shape of the first example (presumably one block of `block_size` token ids — confirm).
pds_data[0].shape

In [None]:
# Turn the first example's token ids back into readable text.
first_example_text = tokenizer.decode(pds_data[0])
print(first_example_text)

In [None]:
# Decode a second example, far from the start of the dataset.
# NOTE(review): index 4695 is hard-coded; it will fail if the dataset built
# from PDS2.txt has fewer examples — confirm against len(pds_data).
later_example_text = tokenizer.decode(pds_data[4695])
print(later_example_text)

In [None]:
# Batch builder for language modelling with masked-LM mode switched off.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

In [None]:
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# Collate two short sentences to inspect the batch the collator builds.
sample_encodings = [tokenizer(text) for text in ('I am an input', 'So am I')]
collator_examples = data_collator(sample_encodings)
collator_examples

In [None]:
# Token ids of the collated sample batch.
collator_examples.input_ids

In [None]:
# Id of the token the tokenizer uses for padding.
tokenizer.pad_token_id

In [None]:
# Attention mask of the collated sample batch.
collator_examples.attention_mask

In [None]:
# Labels the collator produced for the sample batch.
collator_examples.labels

In [None]:
# Start from the pretrained 'gpt2' checkpoint. This same object is used for the
# baseline generator and is later handed to the Trainer.
model = GPT2LMHeadModel.from_pretrained('gpt2')

In [None]:
# Baseline text generator built on `model` as it is currently loaded.
# Generation settings are passed as keyword arguments: `config=` expects a model
# configuration, so the previous dict (including its misspelled 'max-length'
# key) never influenced generation.
pretrained_generator = pipeline(
    'text-generation',
    model=model,
    tokenizer='gpt2',
    max_length=200,    # upper bound on prompt + generated tokens
    do_sample=True,    # sample instead of decoding greedily
    top_p=0.9,
    temperature=0.7,
    top_k=10,
)

In [None]:
# Print three continuations of one prompt from the baseline generator,
# each followed by a separator line.
print('-------')
prompt = 'A dataset shows the relationships'
baseline_outputs = pretrained_generator(prompt, num_return_sequences=3)
for output in baseline_outputs:
    print(output['generated_text'])
    print('#######')

In [None]:
# Size of the dataset's underlying example list (the train/eval split slices this list).
len(pds_data.examples)

In [None]:
!pip install transformers[torch]
!pip install accelerate -U

In [None]:
# Fine-tuning configuration; outputs are written under ./gpt2_pds.
# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy` — switch if the installed version rejects the old name.
training_args = TrainingArguments(
    output_dir='./gpt2_pds',
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # Warm up over the first 20% of optimizer steps. The previous value,
    # len(pds_data.examples)//5, counted *examples*: with 32 examples per step,
    # 3 epochs and an 80% training split it exceeded the total number of
    # steps, so the learning rate never reached its configured peak.
    warmup_ratio=0.2,
    logging_steps=50,
    # Restoring the best checkpoint needs evaluation and saving on the same schedule.
    load_best_model_at_end=True,
    evaluation_strategy='epoch',
    save_strategy='epoch'
)

In [None]:
# Train on the first 80% of the examples and evaluate on the remaining 20%.
all_examples = pds_data.examples
split_index = int(len(all_examples) * 0.8)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=all_examples[:split_index],
    eval_dataset=all_examples[split_index:],
)

In [None]:
# Baseline evaluation on the held-out split, run before training starts.
trainer.evaluate()

In [None]:
# Run fine-tuning with the arguments configured in `training_args`.
trainer.train()

In [None]:
# Evaluate again after training, for comparison with the pre-training evaluation.
trainer.evaluate()

In [None]:
# Save the trained model. No path is given — presumably it goes to the Trainer's
# output_dir ('./gpt2_pds'), which is where it is reloaded from afterwards.
trainer.save_model()

In [None]:
# Reload the fine-tuned weights from disk; the path is relative to the current
# working directory.
loaded_model = GPT2LMHeadModel.from_pretrained('./gpt2_pds')

In [None]:
# Text generator backed by the fine-tuned model reloaded from disk.
# Generation settings are passed as keyword arguments: `config=` expects a model
# configuration, so the previous dict (including its misspelled 'max-length'
# key) never influenced generation.
finetuned_generator = pipeline(
    'text-generation',
    model=loaded_model,
    tokenizer=tokenizer,
    max_length=200,    # upper bound on prompt + generated tokens
    do_sample=True,    # sample instead of decoding greedily
    top_p=0.9,
    temperature=0.7,
    top_k=10,
)

In [None]:
# Print three continuations of the same prompt from the fine-tuned generator,
# each followed by a separator line.
print('-------')
prompt = 'A dataset shows the relationships'
finetuned_outputs = finetuned_generator(prompt, num_return_sequences=3)
for output in finetuned_outputs:
    print(output['generated_text'])
    print('#######')