In [None]:
!pip install --upgrade jupyter
!pip install --upgrade ipywidgets
!jupyter nbextension enable --py widgetsnbextension

In [None]:
!pip install transformers==4.18.0
!pip install datasets==2.9.0
!pip install pandas==1.4.1
!pip install numpy==1.22.2
!pip install wandb==0.13.9
!pip install torch==1.8.1

In [2]:
from transformers import TrainingArguments
from transformers import GPT2LMHeadModel
from transformers import GPT2Tokenizer
from transformers import Trainer
import transformers 
import numpy as np
import datasets
import logging
import torch
import wandb
import os

In [5]:
# --- Special tokens passed to the tokenizer ---
BOS_TOKEN = '<|startoftext|>'
EOS_TOKEN = '<|endoftext|>'
PAD_TOKEN = '<|pad|>'

# --- Sequence length assigned to tokenizer.model_max_length ---
MAX_LEN = 512

# --- Training schedule ---
TRAIN_EPOCHS = 2
TRAIN_BATCH_SIZE = 4
EVAL_BATCH_SIZE = 4

# --- Logging / checkpointing cadence (in optimizer steps) ---
LOGGING_STEPS = 64
# Reduce SAVE_STEPS to a smaller value (e.g. 512) if you want intermediate
# checkpoints to be written during training.
SAVE_STEPS = 10240
SAVE_TOTAL_LIMIT = 2

In [6]:
# Authenticate with Weights & Biases without writing the API key into the
# notebook: wandb.login() uses the WANDB_API_KEY environment variable or an
# existing netrc entry when available, and otherwise prompts for the key.
wandb.login()

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [7]:
# Absolute path of this notebook file, resolved against the current working
# directory.
# NOTE(review): the output recorded under this cell ends in
# '01-finetune-custom-gpt2.ipynb' — confirm the file name below matches the
# notebook on disk.
notebook_file = 'finetune-gpt2.ipynb'
path = os.path.abspath(notebook_file)
path

'/root/how-to-train-faq-chatbot-from-scratch/02-finetune/01-finetune-custom-gpt2.ipynb'

In [8]:
# Expose the notebook's absolute path through the WANDB_NOTEBOOK_NAME
# environment variable (presumably so wandb can attribute runs to this
# notebook — confirm against the wandb documentation).
os.environ['WANDB_NOTEBOOK_NAME'] = path

In [9]:
# Location of the dataset that was tokenized and saved to disk beforehand.
tokenized_dataset_dir = '/tokenized-custom'

# Reload it and display its splits/features as the cell output.
reloaded_dataset = datasets.load_from_disk(tokenized_dataset_dir)
reloaded_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1428
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 159
    })
})

In [10]:
# Create the logger used below so this cell runs on a fresh kernel
# (`logger` is not defined by any earlier cell). basicConfig is a no-op when
# the root logger already has handlers; setLevel makes sure INFO records from
# this notebook are emitted either way.
logging.basicConfig(level=logging.INFO, format='%(message)s')
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Load the custom vocabulary and register the special tokens.
# NOTE(review): `lower` and `return_tensors` do not look like tokenizer
# constructor options — confirm they have any effect here.
tokenizer = GPT2Tokenizer.from_pretrained('/tokenize/vocab-custom', 
                                          bos_token=BOS_TOKEN,
                                          eos_token=EOS_TOKEN, 
                                          pad_token=PAD_TOKEN, 
                                          lower=True,
                                          return_tensors='pt')
# Pad on the left and cap sequences at MAX_LEN tokens.
tokenizer.padding_side = 'left'
tokenizer.model_max_length = MAX_LEN
logger.info(f'Tokenizer: {tokenizer}')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Tokenizer: PreTrainedTokenizer(name_or_path='.././01-tokenize/vocab-custom', vocab_size=50257, model_max_len=512, is_fast=False, padding_side='left', truncation_side='right', special_tokens={'bos_token': AddedToken("<|startoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'pad_token': '<|pad|>'})


In [11]:
# Use the GPU when one is visible, otherwise fall back to CPU so this cell
# does not raise on CPU-only machines (an unconditional .cuda() would).
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = GPT2LMHeadModel.from_pretrained('arun-shankar/GPT-2-covid-news-articles').to(device)
# Resize the embedding matrix to the tokenizer's current vocabulary size;
# the resulting Embedding module is shown as the cell output.
model.resize_token_embeddings(len(tokenizer))

Embedding(50259, 768)

In [12]:
def custom_data_collator(batch):
    """Stack the per-example tensors of a batch into batched tensors.

    Args:
        batch: sequence of examples, each indexable by 'input_ids',
            'attention_mask' and 'labels'. The values are handed to
            ``torch.stack``, so within one field they must be tensors of
            identical shape.

    Returns:
        dict with the keys 'input_ids', 'attention_mask' and 'labels', each
        holding the stacked tensor (examples along a new leading dimension).
    """
    # Original author's note: batch size for data collation =
    # per_device_train_batch_size * number of GPUs.
    fields = ('input_ids', 'attention_mask', 'labels')
    return {
        field: torch.stack([example[field] for example in batch])
        for field in fields
    }

In [13]:
# Hyperparameters and bookkeeping options handed to the Trainer.
training_args = TrainingArguments(
    # where checkpoints go, and how many are kept
    output_dir='./model/custom-finetuned',
    overwrite_output_dir=True,
    save_strategy='steps',
    save_steps=SAVE_STEPS,
    save_total_limit=SAVE_TOTAL_LIMIT,
    # optimisation
    optim='adamw_torch',
    num_train_epochs=TRAIN_EPOCHS,
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    warmup_steps=10,
    weight_decay=0.1,
    # evaluation
    evaluation_strategy='epoch',
    per_device_eval_batch_size=EVAL_BATCH_SIZE,
    # logging / experiment tracking
    logging_steps=LOGGING_STEPS,
    logging_dir='logs',
    report_to='wandb',
)

In [14]:
# Pick the two splits out of the reloaded DatasetDict.
train_split = reloaded_dataset['train']
validation_split = reloaded_dataset['validation']

# Wire model, arguments, data and the custom collator into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=custom_data_collator,
    train_dataset=train_split,
    eval_dataset=validation_split,
)

In [2]:
# Run the fine-tuning loop configured on `trainer`.
# NOTE(review): this cell's recorded execution count is out of sequence with
# the surrounding cells and no output was kept — re-run the notebook top to
# bottom before sharing.
trainer.train()

#### Save the fine-tuned model locally

In [16]:
# Write the trained model to ./model/finetuned.
# NOTE(review): the output recorded under this cell reports
# ./model/custom-finetuned instead — it looks stale; re-run to confirm.
trainer.save_model('./model/finetuned')

Saving model checkpoint to ./model/custom-finetuned
Configuration saved in ./model/custom-finetuned/config.json
Model weights saved in ./model/custom-finetuned/pytorch_model.bin
