# In this notebook we will tune the GPT2 to generate anime character dialog lines based on the keywords.
# Basically it is a conditional text generation problem. 
# Already trained transformer based text generation algorithms generates text based on the casual language masking or masked language masking. So i am going to tune a gpt architecture which will generate new text based on keywords which we will insert in the input and the text will be generated by catering the keywords.

# Installing huggingface transformers

In [1]:
%%time
%%capture
!pip install transformers

CPU times: user 24.5 ms, sys: 8.52 ms, total: 33.1 ms
Wall time: 3.63 s


# Importing Important Libraries

In [2]:
import os
import io
import requests
import numpy as np
import pandas as pd
import re
import zipfile
import random
import time
import csv
import datetime
from itertools import compress
from collections import Counter, defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from transformers import AutoTokenizer, AutoConfig, AutoModelForPreTraining, \
                         AdamW, get_linear_schedule_with_warmup, \
                         TrainingArguments, BeamScorer, Trainer

import torch
from torch.utils.data import Dataset, random_split, DataLoader, \
                             RandomSampler, SequentialSampler

from IPython.display import clear_output

print(f"PyTorch version: {torch.__version__}")

PyTorch version: 1.12.1+cu113


# Hyperparameters Initialization

In [3]:
DEBUG           = False

USE_APEX        = True
APEX_OPT_LEVEL  = 'O1'

# You can use any version of the gpt according to your computation powers
MODEL           = 'gpt2' #{gpt2, gpt2-medium, gpt2-large, gpt2-xl}

# if you want to fine tune only some layers of 
# transformers you can change the number of layers to be 
# tuned in the given variable
#UNFREEZE_LAST_N = 6 #The last N layers to unfreeze for training

# These are the special tokens which will help
# in seperating the keywords during the Dataset loading 
SPECIAL_TOKENS  = { "bos_token": "<|BOS|>",
                    "eos_token": "<|EOS|>",
                    "unk_token": "<|UNK|>",                    
                    "pad_token": "<|PAD|>",
                    "sep_token": "<|SEP|>"}


MAXLEN          = 768  #{768, 1024, 1280, 1600}

# 80% of the data will be used as Training data
# 20% will be used as test data
TRAIN_SIZE      = 0.8

if USE_APEX:
    TRAIN_BATCHSIZE = 4
    BATCH_UPDATE    = 16
else:
    TRAIN_BATCHSIZE = 2
    BATCH_UPDATE    = 32

EPOCHS          = 4
LR              = 5e-4 # Learning rate
EPS             = 1e-8
WARMUP_STEPS    = 1e2

SEED            = 2020

In [4]:
# function will seed every thing so that the results will become 
# constant accross different experiments
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(SEED)

# Data Prepration

In [5]:
df = pd.read_csv('/content/Quotes_with_keyword.csv',usecols=['Quote','Keyword'])

In [6]:
df.head()

Unnamed: 0,Quote,Keyword
0,In the end the shape and form don't matter at ...,matter
1,"I'm still a man too, I wanted to look calm and...",love
2,"Clausewitz, he pointed out that no matter how ...",armchair
3,Because of the existence of love - sacrifice i...,comprehends
4,Courage is a word of justice. It means the qua...,excuse


Making a dictionary of the data

In [7]:
data_list = dict()
for index, row in df.iterrows():
  # data_list will look like
  # {index, [Quote,[keyword]]}
  # casting keyword in the list because if in future 
  # if we have to embedd more keywords then we will use the same code
  # currently only we are giving just on keyword 
  data_list[index] = [row['Quote'],[row['Keyword']]] 

This function will split the training data and test data according to the ratio

In [8]:
def split_data(data, S=TRAIN_SIZE):
    # Shuffle ids
    ids = list(data.keys())
    random.shuffle(ids)

    # Split into training and validation sets    
    train_size = int(S * len(data))

    train_ids = ids[:train_size]
    val_ids = ids[train_size:]

    train_data = dict()
    for id in train_ids:
        train_data[id] = data[id]

    val_data = dict()
    for id in val_ids:
        val_data[id] = data[id]

    return train_data, val_data

# Custom Dataset Class

This dataset class will help in making a dataset which will embedd keywords in to the prompts so that while training the architecture will keep the keywords in computation for learning the sequences

In [9]:
class myDataset(Dataset):
    def __init__(self, data, tokenizer, randomize=True):
        text, keywords = [], []
        for k, v in data.items():
            text.append(v[0])
            keywords.append(v[1])

        self.randomize = randomize
        self.tokenizer = tokenizer
        self.text      = text
        self.keywords  = keywords  

    #---------------------------------------------#

    @staticmethod
    def join_keywords(keywords, randomize=True):
        N = len(keywords)

        #random sampling and shuffle
        if randomize: 
            M = random.choice(range(N+1))
            keywords = keywords[:M]
            random.shuffle(keywords)

        return ','.join(keywords)

    #---------------------------------------------#

    def __len__(self):
        return len(self.text)

    #---------------------------------------------#
    
    def __getitem__(self, i):
        keywords = self.keywords[i].copy()
        kw = self.join_keywords(keywords, self.randomize) 
        
        # input prompt will now contain keyword and text both
        input = SPECIAL_TOKENS['bos_token'] + kw + SPECIAL_TOKENS['sep_token'] + \
                self.text[i] + SPECIAL_TOKENS['eos_token']

        # tokenization of the prompt
        encodings_dict = tokenizer(input,                                   
                                   truncation=True, 
                                   max_length=MAXLEN, 
                                   padding="max_length")   
        
        input_ids = encodings_dict['input_ids']
        attention_mask = encodings_dict['attention_mask']
        
        # returning the tokenized data
        return {'label': torch.tensor(input_ids),
                'input_ids': torch.tensor(input_ids), 
                'attention_mask': torch.tensor(attention_mask)}

#Tokenizer and Model initialization

In [10]:
# Function to get tokenizer
def get_tokenier(special_tokens=None):
    tokenizer = AutoTokenizer.from_pretrained(MODEL) #GPT2Tokenizer

    if special_tokens:
        tokenizer.add_special_tokens(special_tokens)
        print("Special tokens added")
    return tokenizer

# Function to get Model
def get_model(tokenizer, special_tokens=None, load_model_path=None):

    #GPT2LMHeadModel
    if special_tokens:
      # changing the config to induce special token
        config = AutoConfig.from_pretrained(MODEL, 
                                            bos_token_id=tokenizer.bos_token_id,
                                            eos_token_id=tokenizer.eos_token_id,
                                            sep_token_id=tokenizer.sep_token_id,
                                            pad_token_id=tokenizer.pad_token_id,
                                            output_hidden_states=False)
    else: 
        config = AutoConfig.from_pretrained(MODEL,                                     
                                            pad_token_id=tokenizer.eos_token_id,
                                            output_hidden_states=False)    

    #----------------------------------------------------------------#
    model = AutoModelForPreTraining.from_pretrained(MODEL, config=config)

    if special_tokens:
        #Special tokens added, model needs to be resized accordingly
        model.resize_token_embeddings(len(tokenizer))

    if load_model_path:
        model.load_state_dict(torch.load(load_model_path))

    model.cuda()
    return model

In [11]:
%%time
tokenizer = get_tokenier(special_tokens=SPECIAL_TOKENS)
model = get_model(tokenizer, 
                  special_tokens=SPECIAL_TOKENS,
                #   load_model_path='pytorch_model.bin'
                 )

Special tokens added
CPU times: user 4.55 s, sys: 1.67 s, total: 6.22 s
Wall time: 7.69 s


### You can uncomment this block to only fine tune layers of your own choice

In [12]:
# # - Freeze selective layers:
# # - Freeze all layers except last n:
# for parameter in model.parameters():
#     parameter.requires_grad = False

# for i, m in enumerate(model.transformer.h):        
#     #Only un-freeze the last n transformer blocks
#     if i+1 > 12 - UNFREEZE_LAST_N:
#         for parameter in m.parameters():
#             parameter.requires_grad = True 

# for parameter in model.transformer.ln_f.parameters():        
#     parameter.requires_grad = True

# for parameter in model.lm_head.parameters():        
#     parameter.requires_grad = True

# Splitting the dataset and making training and validation/test dataset

In [13]:
data = data_list
train_data, val_data = split_data(data)

train_dataset = myDataset(train_data, tokenizer)
val_dataset = myDataset(val_data, tokenizer, randomize=False)

# Initializing the training arguments and hyperparameters

In [14]:
%%time

training_args = TrainingArguments(
    output_dir="/content/",
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=TRAIN_BATCHSIZE,
    per_device_eval_batch_size=TRAIN_BATCHSIZE,
    gradient_accumulation_steps=BATCH_UPDATE,
    evaluation_strategy="epoch",
    fp16=True,
    fp16_opt_level=APEX_OPT_LEVEL,
    warmup_steps=WARMUP_STEPS,    
    learning_rate=LR,
    adam_epsilon=EPS,
    weight_decay=0.01,        
    save_total_limit=1     
)

#---------------------------------------------------#
trainer = Trainer(
    model=model,
    args=training_args,    
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer
)

# #---------------------------------------------------#
trainer.train() # Starts the traning

Using cuda_amp half precision backend
***** Running training *****
  Num examples = 6684
  Num Epochs = 4
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 16
  Total optimization steps = 416
You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
0,No log,0.154708
1,No log,0.15005
2,No log,0.150614
3,No log,0.153365


***** Running Evaluation *****
  Num examples = 1672
  Batch size = 4
***** Running Evaluation *****
  Num examples = 1672
  Batch size = 4
***** Running Evaluation *****
  Num examples = 1672
  Batch size = 4
***** Running Evaluation *****
  Num examples = 1672
  Batch size = 4


Training completed. Do not forget to share your model on huggingface.co/models =)




CPU times: user 51min 58s, sys: 6.09 s, total: 52min 4s
Wall time: 52min


TrainOutput(global_step=416, training_loss=0.28583033268268293, metrics={'train_runtime': 3120.5837, 'train_samples_per_second': 8.568, 'train_steps_per_second': 0.133, 'total_flos': 1.0467881385984e+16, 'train_loss': 0.28583033268268293, 'epoch': 4.0})

# Saving the model

In [15]:
trainer.save_model('/content/Trained_Model1')

Saving model checkpoint to /content/Trained_Model1
Configuration saved in /content/Trained_Model1/config.json
Model weights saved in /content/Trained_Model1/pytorch_model.bin
tokenizer config file saved in /content/Trained_Model1/tokenizer_config.json
Special tokens file saved in /content/Trained_Model1/special_tokens_map.json


# Loading the tokenizer and model for prediction

In [None]:
tokenizer = get_tokenier(special_tokens=SPECIAL_TOKENS)
model = get_model(tokenizer, 
                  special_tokens=SPECIAL_TOKENS,
                  load_model_path='/content/Trained_Model/pytorch_model.bin')

In [108]:
keywords = ['evil']
kw = myDataset.join_keywords(keywords, randomize=False)

prompt = SPECIAL_TOKENS['bos_token'] + kw + SPECIAL_TOKENS['sep_token']

generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)
device = torch.device("cuda")
generated = generated.to(device)

model.eval();

In [109]:
# Beam-search text generation:
sample_outputs = model.generate(generated, 
                                do_sample=True,   
                                max_length=MAXLEN,                                                      
                                num_beams=5,
                                repetition_penalty=5.0,
                                early_stopping=True,      
                                num_return_sequences=10
                                )

for i, sample_output in enumerate(sample_outputs):
    text = tokenizer.decode(sample_output, skip_special_tokens=True)
    a = len(','.join(keywords))    
    print("{}: {}\n\n".format(i+1,  text[a:]))

1: There is no such thing as evil. It's just that you can't live without it.


2: If you're looking for a good place to live, this is it. It's the only place I can think of that makes me happy.


3: There's nothing wrong with that. I'm just saying, if you don't like it, then go ahead and get rid of it.


4: It's not just me. It's all of you, too.


5: I'm not a bad person, I just don't want to be evil.


6: If you want to know what the hell is going on in this world, I'll show you.


7: I don't want to be the one who says, 'Hey, I'm not going to let you kill me. You're just a coward.'


8: There are many things that can be done to help save the world.


9: If you don't know what the hell I'm talking about, just take a look at me.


10: Humans are the only ones who can truly understand what it means to be human.




In [49]:
!zip -r /content/Trained_Model1.zip /content/Trained_Model1

  adding: content/Trained_Model1/ (stored 0%)
  adding: content/Trained_Model1/merges.txt (deflated 53%)
  adding: content/Trained_Model1/special_tokens_map.json (deflated 48%)
  adding: content/Trained_Model1/config.json (deflated 52%)
  adding: content/Trained_Model1/tokenizer.json (deflated 72%)
  adding: content/Trained_Model1/tokenizer_config.json (deflated 41%)
  adding: content/Trained_Model1/pytorch_model.bin (deflated 9%)
  adding: content/Trained_Model1/training_args.bin (deflated 48%)
  adding: content/Trained_Model1/vocab.json (deflated 59%)
  adding: content/Trained_Model1/added_tokens.json (deflated 46%)
