# Generator fine tuning

### set up

In [1]:
%%time
%%capture
!pip install transformers

CPU times: user 22.2 ms, sys: 4.07 ms, total: 26.2 ms
Wall time: 2.63 s


In [2]:
!nvidia-smi

Mon Jan 17 23:34:05 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.46       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   69C    P8    12W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
import os
import io
import requests
import numpy as np
import pandas as pd
import re
import zipfile
import random
import time
import csv
import datetime
from itertools import compress
from collections import Counter, defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from transformers import AutoTokenizer, AutoConfig, AutoModelForPreTraining, \
                         AdamW, get_linear_schedule_with_warmup, \
                         TrainingArguments, BeamScorer, Trainer

import torch
from torch.utils.data import Dataset, random_split, DataLoader, \
                             RandomSampler, SequentialSampler

from IPython.display import clear_output

from google.colab import drive
drive.mount('/content/drive')
import json

print(f"PyTorch version: {torch.__version__}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
PyTorch version: 1.10.0+cu111


### config

In [4]:
DEBUG           = False

USE_APEX        = True
APEX_OPT_LEVEL  = 'O1'

MODEL           = 'gpt2' #{gpt2, gpt2-medium, gpt2-large, gpt2-xl}

UNFREEZE_LAST_N = 6 #The last N layers to unfreeze for training

SPECIAL_TOKENS  = { "bos_token": "<|BOS|>",
                    "eos_token": "<|EOS|>",
                    "unk_token": "<|UNK|>",                    
                    "pad_token": "<|PAD|>",
                    "sep_token": "<|SEP|>"}
                    
MAXLEN          = 768  #{768, 1024, 1280, 1600}

TRAIN_SIZE      = 0.8

if USE_APEX:
    TRAIN_BATCHSIZE = 4
    BATCH_UPDATE    = 16
else:
    TRAIN_BATCHSIZE = 2
    BATCH_UPDATE    = 32

EPOCHS          = 7
LR              = 5e-4
EPS             = 1e-8
WARMUP_STEPS    = 1e2

SEED            = 2020

In [5]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(SEED)

### auxiliary functions

In [6]:
class mailsDataset(Dataset):

    def __init__(self, mails, tokenizer, randomize=True):
        self.randomize = randomize
        self.tokenizer = tokenizer 
        self.mails     = mails

    def __len__(self):
        return len(self.mails)
    
    def __getitem__(self, i):
        
        input = SPECIAL_TOKENS['bos_token'] + self.mails[i]['subject'] + \
                SPECIAL_TOKENS['sep_token'] + self.mails[i]['abstractive_summary'] + SPECIAL_TOKENS['sep_token'] + \
                self.mails[i]['content'] + SPECIAL_TOKENS['eos_token']

        encodings_dict = tokenizer(input,                                   
                                   truncation=True, 
                                   max_length=MAXLEN, 
                                   padding="max_length")   
        
        input_ids = encodings_dict['input_ids']
        attention_mask = encodings_dict['attention_mask']
        
        return {'label': torch.tensor(input_ids),
                'input_ids': torch.tensor(input_ids), 
                'attention_mask': torch.tensor(attention_mask)}

In [7]:
def split_data(mails, S=TRAIN_SIZE):
    # shuffle list
    random.shuffle(mails)

    # split into training and validation sets    
    train_size = int(S * len(mails))

    train_mails = mails[:train_size]
    val_mails = mails[train_size:]

    return train_mails, val_mails

In [8]:
def get_tokenier(special_tokens=None):
    tokenizer = AutoTokenizer.from_pretrained(MODEL) #GPT2Tokenizer

    if special_tokens:
        tokenizer.add_special_tokens(special_tokens)
        print("Special tokens added")
    return tokenizer

def get_model(tokenizer, special_tokens=None, load_model_path=None):

    #GPT2LMHeadModel
    if special_tokens:
        config = AutoConfig.from_pretrained(MODEL, 
                                            bos_token_id=tokenizer.bos_token_id,
                                            eos_token_id=tokenizer.eos_token_id,
                                            sep_token_id=tokenizer.sep_token_id,
                                            pad_token_id=tokenizer.pad_token_id,
                                            output_hidden_states=False)
    else: 
        config = AutoConfig.from_pretrained(MODEL,                                     
                                            pad_token_id=tokenizer.eos_token_id,
                                            output_hidden_states=False)    

    #----------------------------------------------------------------#
    model = AutoModelForPreTraining.from_pretrained(MODEL, config=config)

    if special_tokens:
        #Special tokens added, model needs to be resized accordingly
        model.resize_token_embeddings(len(tokenizer))

    if load_model_path:
        model.load_state_dict(torch.load(load_model_path))

    model.cuda()
    return model

### load model and tokenizer

In [9]:
%%time

tokenizer = get_tokenier(special_tokens=SPECIAL_TOKENS)
model = get_model(tokenizer, 
                  special_tokens=SPECIAL_TOKENS,
                #   load_model_path='pytorch_model.bin'
                 )

Special tokens added
CPU times: user 4.85 s, sys: 2.21 s, total: 7.06 s
Wall time: 11 s


In [10]:
# - Freeze selective layers:
# - Freeze all layers except last n:
for parameter in model.parameters():
    parameter.requires_grad = False

for i, m in enumerate(model.transformer.h):        
    #Only un-freeze the last n transformer blocks
    if i+1 > 12 - UNFREEZE_LAST_N:
        for parameter in m.parameters():
            parameter.requires_grad = True 

for parameter in model.transformer.ln_f.parameters():        
    parameter.requires_grad = True

for parameter in model.lm_head.parameters():        
    parameter.requires_grad = True

### load data

In [11]:
with open('/content/drive/My Drive/master/isp/summarized_bc3_email_corpus_dataset_t5_large.json', 'r') as file:
    json_data_1 = json.load(file)
with open('/content/drive/My Drive/master/isp/summarized_bc3_email_corpus_dataset_pegasus.json', 'r') as file:
    json_data_2 = json.load(file)
json_data = json_data_1
json_data['mails'] += json_data_2['mails'] 

In [12]:
train_data, val_data = split_data(json_data['mails'])

train_dataset = mailsDataset(train_data, tokenizer)
val_dataset = mailsDataset(val_data, tokenizer, randomize=False)

f'There are {len(train_dataset) :,} samples for training, and {len(val_dataset) :,} samples for validation testing'

'There are 416 samples for training, and 104 samples for validation testing'

### fine tune model

In [13]:
%%time

training_args = TrainingArguments(
    output_dir="/content/",
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=TRAIN_BATCHSIZE,
    per_device_eval_batch_size=TRAIN_BATCHSIZE,
    gradient_accumulation_steps=BATCH_UPDATE,
    evaluation_strategy="epoch",
    fp16=True,
    fp16_opt_level=APEX_OPT_LEVEL,
    warmup_steps=WARMUP_STEPS,    
    learning_rate=LR,
    adam_epsilon=EPS,
    weight_decay=0.01,        
    save_total_limit=1   
)

#---------------------------------------------------#
trainer = Trainer(
    model=model,
    args=training_args,    
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer
)

#---------------------------------------------------#
trainer.train()
trainer.save_model()    

Using amp half precision backend
***** Running training *****
  Num examples = 416
  Num Epochs = 7
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 16
  Total optimization steps = 42


Epoch,Training Loss,Validation Loss
0,No log,29.730238
1,No log,18.907234
2,No log,4.335934
3,No log,1.866698
4,No log,1.440816
5,No log,1.227566
6,No log,1.087682


***** Running Evaluation *****
  Num examples = 104
  Batch size = 4
***** Running Evaluation *****
  Num examples = 104
  Batch size = 4
***** Running Evaluation *****
  Num examples = 104
  Batch size = 4
***** Running Evaluation *****
  Num examples = 104
  Batch size = 4
***** Running Evaluation *****
  Num examples = 104
  Batch size = 4
***** Running Evaluation *****
  Num examples = 104
  Batch size = 4
***** Running Evaluation *****
  Num examples = 104
  Batch size = 4


Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to /content/
Configuration saved in /content/config.json
Model weights saved in /content/pytorch_model.bin
tokenizer config file saved in /content/tokenizer_config.json
Special tokens file saved in /content/special_tokens_map.json


CPU times: user 5min 7s, sys: 33.6 s, total: 5min 41s
Wall time: 5min 42s


In [21]:
!cp -r 'pytorch_model.bin' '/content/drive/My Drive/master/isp/fine_tuned_gpt2_768.bin'

### generation

In [15]:
tokenizer = get_tokenier(special_tokens=SPECIAL_TOKENS)
model = get_model(tokenizer, 
                  special_tokens=SPECIAL_TOKENS,
                  load_model_path='pytorch_model.bin')

Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file https://huggingface.co/gpt2/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/fc674cd6907b4c9e933cb42d67662436b89fa9540a1f40d7c919d0109289ad01.7d2e0efa5ca20cef4fb199382111e9d3ad96fd77b849e1d4bed13a66e1336f51
Model config GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": 

Special tokens added


loading configuration file https://huggingface.co/gpt2/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/fc674cd6907b4c9e933cb42d67662436b89fa9540a1f40d7c919d0109289ad01.7d2e0efa5ca20cef4fb199382111e9d3ad96fd77b849e1d4bed13a66e1336f51
Model config GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50257,
  "embd_pdrop": 0.1,
  "eos_token_id": 50258,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "pad_token_id": 50260,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "sep_token_id": 50261,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "sum

In [16]:
mail = json_data['mails'][0]

prompt = SPECIAL_TOKENS['bos_token'] + mail['subject'] + \
         SPECIAL_TOKENS['sep_token'] + mail['abstractive_summary'] + SPECIAL_TOKENS['sep_token']
         
generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)
device = torch.device("cuda")
generated = generated.to(device)

model.eval();

In [19]:
# Beam-search text generation:
sample_outputs = model.generate(generated, 
                                do_sample=True,   
                                max_length=MAXLEN,                                                      
                                num_beams=5,
                                repetition_penalty=5.0,
                                early_stopping=True,      
                                num_return_sequences=1
                                )
for i, sample_output in enumerate(sample_outputs):
    text = tokenizer.decode(sample_output, skip_special_tokens=True)
    a = len(mail['subject']) + len(mail['abstractive_summary'])   
    print("{}: {}\n\n".format(i+1,  text[a:]))

  next_indices = next_tokens // vocab_size


1: I believe that there is a need for more information about the accessibility of these sites and how they can be used by people with disabilities. If you have any questions or comments, please feel free to email me at: info@w3c.org




In [20]:
mail

{'abstractive_summary': 'The World Wide Web Consortium (W3C) has published its guidelines for making web pages accessible to people with disabilities.',
 'content': 'Hello Education and Outreach Colleagues, I was talking to a friend today -- an amateur web developer and professional disability rights advocate -- who complained that the W3C guidelines are overly technical for her needs. She wants a plain language version of the guidelines. As she is fairly technologically savvy, she expressed frustration at having to work so hard to understand what must be done to make accessible web pages. To illustrate her point, she read me the Quick Tip card description of Image map. I agree with her, the tone is definitely geeky. But not everyone who develops web pages speaks the language of client-side servers and hotspots. I would guess that most people who develop web pages are amateurs (in the original sense of the word: from amore or amour: an activity done out of love.) Will these people free