In [1]:
import json

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter

from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup

In [2]:
# Root Path
root_path='/root/research/Graph-To-Text/'

# Device: (Single) GPU
device=torch.device('cuda:0')

# Hyperparams
batch_size=3
accumulation_steps=2
epochs=3
lr=1e-5

In [3]:
# Pre-Trained Tokenizer, LM
tokenizer=GPT2Tokenizer.from_pretrained('gpt2-large')
model=GPT2LMHeadModel.from_pretrained('gpt2-large')

# Add PAD Token: [PAD]
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
model.resize_token_embeddings(len(tokenizer))

Embedding(50258, 1280)

In [4]:
# Special Tokens
print("bos_token:", tokenizer.bos_token)
print("eos_token:", tokenizer.eos_token)
print("pad_token_id:", tokenizer.pad_token_id)

bos_token: <|endoftext|>
eos_token: <|endoftext|>
pad_token_id: 50257


In [5]:
def process_webnlg(dicts):
    """
    Process WebNLG Dataset
    """
    triples=[]
    texts=[]
    
    for index, dict_ in enumerate(dicts['entries']):
        
        data=dict_[str(index+1)]
        
        # Triple Data
        triple_proc=''
        for triple in data['modifiedtripleset']:
            subj, prop, obj=triple['subject'], triple['property'], triple['object']
            triple_proc+='| {} : {} : {} '.format(subj, prop, obj)
            
        # Text Data
        for text in data['lexicalisations']:
            if text['comment']!='good': continue
                
            triples.append(triple_proc)
            texts.append(text['lex'])
            
    print(len(triples), "data")
    
    return triples, texts

In [6]:
with open(root_path+'dataset/webnlg/train.json', 'r') as f:
    dict_train=json.load(f)
    f.close()
    
# Process Train Set
triples_train, texts_train=process_webnlg(dict_train)

with open(root_path+'dataset/webnlg/dev.json', 'r') as f:
    dict_dev=json.load(f)
    f.close()
    
# Process Dev Set
triples_dev, texts_dev=process_webnlg(dict_dev)

18025 data
2258 data


In [7]:
class WebNLGDataset(Dataset):
    """
    PyTorch Dataset Class: WebNLG Dataset
    """
    def __init__(self, tokenizer, triples, texts):
        self.data=[]
        self.label=[]
        
        for index, triple in enumerate(triples):
            data=tokenizer.encode(triple+tokenizer.bos_token+texts[index]+tokenizer.eos_token)
            self.data.append(data)
            
            label=tokenizer.encode(triple+tokenizer.bos_token+texts[index]+tokenizer.eos_token)
            sep=label.index(tokenizer.bos_token_id)+1
            label[:sep]=[-100]*sep
            self.label.append(label)
            
        print(len(self.data), "data")
    
    def __getitem__(self, idx):
        return self.data[idx], self.label[idx]
    
    def __len__(self):
        return len(self.data)

In [8]:
def collate_fn(batch):
    """
    For Same Sequence Length on Same Batch: Padding
    """
    max_len=0
    for data, _ in batch:
        if len(data)>max_len: max_len=len(data)
            
    datas=[]
    labels=[]
    for data, label in batch:
        data.extend([tokenizer.pad_token_id]*(max_len-len(data)))
        datas.append(data)
        
        label.extend([tokenizer.pad_token_id]*(max_len-len(label)))
        labels.append(label)
        
    return torch.tensor(datas), torch.tensor(labels)

In [9]:
# Train Set
dataset_train=WebNLGDataset(tokenizer=tokenizer, triples=triples_train, texts=texts_train)
dataloader_train=DataLoader(dataset_train, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

# Dev Set
dataset_dev=WebNLGDataset(tokenizer=tokenizer, triples=triples_dev, texts=texts_dev)
dataloader_dev=DataLoader(dataset_dev, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

18025 data
2258 data


In [10]:
# Optim, Scheduler
optimizer=AdamW(model.parameters(), lr=lr)
scheduler=get_linear_schedule_with_warmup(
    optimizer=optimizer,
    # 3% of Total Steps
    num_warmup_steps=int(0.03*epochs*len(dataset_train)/batch_size),
    num_training_steps=int(epochs*len(dataset_train)/(accumulation_steps*batch_size))
)

# TensorBoard: Logging
writer=SummaryWriter()
step_global=0

for epoch in range(epochs):
    # Train Phase
    model.train()
    model.to(device)
    
    loss_train=0
    optimizer.zero_grad()
    
    for step, (data, label) in enumerate(dataloader_train):
        data=data.to(device)
        label=label.to(device)
        
        outputs=model(data, labels=label)
        
        loss=outputs[0]/accumulation_steps
        loss.backward()
        
        loss_train+=loss.item()
        
        if (step+1)%accumulation_steps==0:
            step_global+=1
            
            # TensorBoard
            writer.add_scalar(
                f'Loss_Train (finetuned_batch{int(accumulation_steps*batch_size)}_epoch{epochs}_lr{lr})',
                loss_train,
                step_global
            )
            # Console
            if step_global%500==0:
                print(f'epoch {epoch+1} step {step_global} loss_train {loss_train:.4f}')
            # Set Loss to 0
            loss_train=0
            
            optimizer.step()
            scheduler.step()
            
            optimizer.zero_grad()
            
    # Eval Phase
    model.eval()
    
    loss_eval=0
    
    with torch.no_grad():
        for step, (data, label) in enumerate(dataloader_dev):
            data=data.to(device)
            label=label.to(device)
            
            outputs=model(data, labels=label)

            loss=outputs[0]
            loss_eval+=loss.item()
        loss_eval=loss_eval/(step+1)
        
        # TensorBoard
        writer.add_scalar(
            f'Loss_Eval (finetuned_batch{int(accumulation_steps*batch_size)}_epoch{epochs}_lr{lr})',
            loss_eval,
            epoch+1
        )
        # Console
        print("=====")
        print(f'epoch {epoch+1} loss_eval {loss_eval:.4f}')
        print("=====")
        
    # Save Model
    model.to(torch.device('cpu'))
    torch.save(model, root_path+f'model/finetuned_batch{int(accumulation_steps*batch_size)}_epoch{epoch+1}of{epochs}_lr{lr}.pt')

epoch 1 step 500 loss_train 0.5126
epoch 1 step 1000 loss_train 0.3445
epoch 1 step 1500 loss_train 0.6000
epoch 1 step 2000 loss_train 0.2809
epoch 1 step 2500 loss_train 0.3732
epoch 1 step 3000 loss_train 0.3763
=====
epoch 1 loss_eval 0.3171
=====
epoch 2 step 3500 loss_train 0.2349
epoch 2 step 4000 loss_train 0.2884
epoch 2 step 4500 loss_train 0.1846
epoch 2 step 5000 loss_train 0.3344
epoch 2 step 5500 loss_train 0.2260
epoch 2 step 6000 loss_train 0.2515
=====
epoch 2 loss_eval 0.1982
=====
epoch 3 step 6500 loss_train 0.0845
epoch 3 step 7000 loss_train 0.1369
epoch 3 step 7500 loss_train 0.1944
epoch 3 step 8000 loss_train 0.1050
epoch 3 step 8500 loss_train 0.2375
epoch 3 step 9000 loss_train 0.2122
=====
epoch 3 loss_eval 0.1408
=====
