<a href="https://colab.research.google.com/github/AnDDoanf/Biblical_NLP_task/blob/master/textgen/Fine_tune_GPT2_for_Biblical_Commentary_Text_Generation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Import modules

In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.4-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m54.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m45.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.3-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.8/199.8 KB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.3 tokenizers-0.13.3 transformers-4.27.4


In [None]:
import pandas as pd
import torch
import os
from torch.utils.data import Dataset, random_split
from transformers import GPT2Tokenizer, TrainingArguments, Trainer, GPT2LMHeadModel

### Define models

In [None]:
# Load the pretrained 'gpt2' tokenizer with explicit start / end / padding
# markers, then the matching language-model head.
special_tokens = {
    'bos_token': '<|startoftext|>',
    'eos_token': '<|endoftext|>',
    'pad_token': '<|pad|>',
}
tokenizer = GPT2Tokenizer.from_pretrained('gpt2', **special_tokens)
model = GPT2LMHeadModel.from_pretrained('gpt2')
# Resize the embedding matrix to the tokenizer's vocabulary size so the markers
# registered above have rows of their own.
model.resize_token_embeddings(len(tokenizer))

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading pytorch_model.bin:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Embedding(50259, 768)

### Load data

In [None]:
# Colab-only step: mount Google Drive at /content/drive so the dataset folder
# referenced later in the notebook is reachable from the runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
def txt2paragraph(file):
  """Read one commentary text file and return it as a list of sentence fragments.

  Outline markers ("A. ", "B.", "\\n1.", "\\na.", "\\nii.", "\\n·", ...) are
  stripped, line breaks are flattened to single spaces, and the text is split
  on ".".

  Args:
    file: path of the text file to read (decoded as UTF-8).

  Returns:
    The "."-separated pieces of the cleaned text, minus the last three pieces.
    A text with three or fewer pieces yields an empty list.
  """
  import re

  # Explicit encoding (the marker list contains the non-ASCII bullet "·") and a
  # context manager so the handle is always closed.
  with open(file, encoding="utf-8") as handle:
    text = handle.read()

  # Markers are generated rather than typed out: the hand-written list was
  # missing a comma between '\n8.' and '\n9.', which fused them into a single
  # marker so that neither "8." nor "9." was ever removed.
  # NOTE(review): the upper-case markers are not anchored to a line start, so
  # they also match inside running text (e.g. an all-caps word followed by a
  # period) -- confirm against the dataset whether that is intended.
  replace_list = (
      ['A. '] + [letter + '.' for letter in 'BCDEFGHIJKLMNOP']
      + ['\n%d.' % number for number in range(1, 11)]
      + ['\n·']
      + ['\n' + letter + '.' for letter in 'abcdefghijklmnop']
      + ['\n' + numeral + '.'
         for numeral in ('ii', 'iii', 'iv', 'v', 'vi', 'vii', 'viii', 'ix', 'x', 'xi')]
  )
  marker_pattern = '|'.join(map(re.escape, replace_list))
  clean = re.sub(marker_pattern, '', text)

  # Flatten every run of line breaks into one space. The previous pattern
  # '.\n' used an unescaped dot, so it matched *any* character before the
  # newline and deleted the last character of every line.
  clean = re.sub(r'\n+', ' ', clean)

  # Drop the final three pieces; the last one is the empty string whenever the
  # text ends with a period.
  # NOTE(review): presumably this trims a trailing footer -- confirm.
  return clean.split(".")[:-3]

def getText(datasetdir):
  """Collect the sentence fragments of every chapter file under `datasetdir`.

  The expected layout is `datasetdir/<book>/<chapter file>`. Books and the
  chapter files inside each book are visited in sorted name order, and every
  chapter file is passed to `txt2paragraph`.

  Args:
    datasetdir: root directory of the dataset.

  Returns:
    One flat list holding the fragments of all chapters, in visiting order.
  """
  lst = []
  for bookdir in sorted(os.listdir(datasetdir)):
    bookpath = os.path.join(datasetdir, bookdir)
    # Skip stray top-level files instead of crashing in os.listdir.
    if not os.path.isdir(bookpath):
      continue
    for chapterfile in sorted(os.listdir(bookpath)):
      # extend() appends in place; `lst = lst + ...` copied the whole
      # accumulated list again for every chapter.
      lst.extend(txt2paragraph(os.path.join(bookpath, chapterfile)))
  return lst

In [None]:
# Build the corpus: one flat list of sentence fragments gathered from every
# book/chapter file under the dataset folder on the mounted Drive.
data = getText("/content/drive/MyDrive/bible_text_generation/bible_commentary_dataset")

In [None]:
# Number of sentence fragments collected across all chapter files.
len(data)

144729

In [None]:
# Longest sample in tokens. The "+ 2" accounts for the start/end markers that
# PreProcessing wraps around every text; without it the longest sample would be
# truncated and lose its end marker.
longest_sample = max(len(tokenizer.encode(line)) for line in data) + 2
# Never exceed the number of positions the model has embeddings for; a longer
# padded sequence cannot be fed to the model.
max_length = min(longest_sample, model.config.n_positions)


### Preprocess data

In [None]:
class PreProcessing(Dataset):
    """Dataset of tokenized texts, each framed by start and end markers.

    Every text is wrapped as '<|startoftext|>' + text + '<|endoftext|>' and
    passed to `tokenizer` with truncation and padding to `max_length`. The
    resulting ids and attention mask are stored as tensors.
    """

    def __init__(self, txt_list, tokenizer, max_length):
        """Tokenize every entry of `txt_list` up front.

        Args:
            txt_list: iterable of strings to encode.
            tokenizer: callable returning a mapping with 'input_ids' and
                'attention_mask' entries.
            max_length: length passed to the tokenizer for truncation/padding.
        """
        # `labels` is created for parity with the other containers but is never
        # filled by this class.
        self.input_ids, self.attn_masks, self.labels = [], [], []
        for passage in txt_list:
            framed = '<|startoftext|>' + passage + '<|endoftext|>'
            encoded = tokenizer(framed, truncation=True, max_length=max_length,
                                padding="max_length")
            ids_tensor = torch.tensor(encoded['input_ids'])
            self.input_ids.append(ids_tensor)
            mask_tensor = torch.tensor(encoded['attention_mask'])
            self.attn_masks.append(mask_tensor)

    def __len__(self):
        """Return the number of encoded samples."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the `(input_ids, attention_mask)` pair stored at `idx`."""
        sample = (self.input_ids[idx], self.attn_masks[idx])
        return sample

In [None]:
# Tokenize the whole corpus, then hold out 10% of the samples for validation.
preprocessData = PreProcessing(data, tokenizer, max_length=max_length)
train_size = int(0.9 * len(preprocessData))
val_size = len(preprocessData) - train_size
# A seeded generator keeps the train/validation partition identical across
# kernel restarts; the unseeded split changed on every run.
train_dataset, val_dataset = random_split(
    preprocessData, [train_size, val_size],
    generator=torch.Generator().manual_seed(42))

### Fine-tune the model

In [None]:
# Settings for the fine-tuning run: a single epoch with batch size 1,
# checkpoints written under ./results and logs under ./logs, and no external
# reporting integration.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,
    logging_steps=1000,
    save_steps=2000,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    warmup_steps=10,
    weight_decay=0.05,
    logging_dir='./logs',
    report_to='none',
)

In [None]:
def collate_batch(batch):
    """Stack `(input_ids, attention_mask)` pairs into a model-ready batch.

    Args:
        batch: list of `(input_ids, attention_mask)` tensor pairs, as returned
            by `PreProcessing.__getitem__`.

    Returns:
        Dict with stacked 'input_ids', 'attention_mask' and 'labels'. The
        labels are a copy of the input ids in which every position whose
        attention mask is 0 is set to -100.
    """
    input_ids = torch.stack([ids for ids, _ in batch])
    attention_mask = torch.stack([mask for _, mask in batch])
    # -100 is the index the language-model loss ignores. Using the raw input
    # ids as labels made the model learn to predict the padding token, which
    # dominates every sample once it is padded to max_length.
    labels = input_ids.masked_fill(attention_mask == 0, -100)
    return {'input_ids': input_ids, 'attention_mask': attention_mask, 'labels': labels}


# Keep a handle on the trainer so it can be reused after training (evaluation,
# saving) instead of being discarded with the expression.
trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset,
                  eval_dataset=val_dataset, data_collator=collate_batch)
trainer.train()

Step,Training Loss


KeyboardInterrupt: ignored

### Generate Text

In [None]:
# Sample continuations of the prompt "God" from the fine-tuned model.
# Switch off dropout for inference.
model.eval()

# Put the prompt on whatever device the model currently lives on; training may
# have moved the model off the CPU, and a CPU-only prompt would then fail.
encoded_prompt = tokenizer("God", return_tensors="pt").to(model.device)
generated = encoded_prompt.input_ids

# Pass the attention mask and the padding id explicitly so generation does not
# have to guess them.
sample_outputs = model.generate(generated,
                                attention_mask=encoded_prompt.attention_mask,
                                pad_token_id=tokenizer.pad_token_id,
                                do_sample=True, top_k=50,
                                max_length=300, top_p=0.95, temperature=1.9,
                                num_return_sequences=20)

for i, sample_output in enumerate(sample_outputs):
    print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))