In [1]:
import json

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter

from transformers import T5Tokenizer, T5ForConditionalGeneration, AdamW, get_linear_schedule_with_warmup

In [2]:
# Root Path
# NOTE: absolute, machine-specific location; the dataset/ and model/ paths used
# later in the notebook are built by string concatenation on top of it, so the
# trailing slash is required.
root_path='/root/research/Graph-To-Text/'

# Device: (Single) GPU when CUDA is available, otherwise CPU so that the
# notebook fails late-and-slow instead of crashing at the first .to(device).
device=torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Hyperparams
batch_size=1            # samples per forward/backward pass
accumulation_steps=4    # mini-batches accumulated per optimizer step
epochs=3                # full passes over the training set
lr=3e-5                 # peak learning rate handed to the optimizer

In [3]:
# Pre-Trained Tokenizer, LM
# Both objects are loaded from the same checkpoint identifier.
checkpoint='t5-large'
tokenizer=T5Tokenizer.from_pretrained(checkpoint)
model=T5ForConditionalGeneration.from_pretrained(checkpoint)

In [4]:
def process_webnlg(dicts):
    """
    Process WebNLG Dataset

    Flattens the parsed WebNLG JSON into two parallel lists: one linearized
    triple string and one reference sentence per kept lexicalisation.

    Args:
        dicts: Parsed JSON object with an 'entries' list. Each entry is a dict
            that wraps its payload under a key; the payload provides
            'modifiedtripleset' (dicts with 'subject', 'property', 'object')
            and 'lexicalisations' (dicts with 'comment' and 'lex').

    Returns:
        (triples, texts): lists of equal length. triples[i] is the string
        '| subj : prop : obj ' repeated for every triple of the entry, and
        texts[i] is a lexicalisation of that entry whose comment is 'good'.

    Raises:
        KeyError: if an entry has neither the 1-based positional key nor a
            single unambiguous payload, or if an expected field is missing.
    """
    triples=[]
    texts=[]

    for index, dict_ in enumerate(dicts['entries']):
        # Entries are normally keyed by their 1-based position ("1", "2", ...).
        # Filtered or renumbered files break that assumption, so fall back to
        # the entry's only payload when the positional key is absent.
        key=str(index+1)
        if key in dict_:
            data=dict_[key]
        elif len(dict_)==1:
            data=next(iter(dict_.values()))
        else:
            raise KeyError(key)

        # Triple Data: linearize every triple of the entry into one string
        triple_proc=''.join(
            '| {} : {} : {} '.format(triple['subject'], triple['property'], triple['object'])
            for triple in data['modifiedtripleset']
        )

        # Text Data: keep only lexicalisations marked 'good'; the same
        # linearized input is repeated once per kept reference text
        for text in data['lexicalisations']:
            if text['comment']!='good': continue

            triples.append(triple_proc)
            texts.append(text['lex'])

    print(len(triples), "data")

    return triples, texts

In [5]:
# Load the raw train split. The encoding is pinned to UTF-8 so parsing does not
# depend on the platform's default locale; the `with` block closes the file on
# exit, so no explicit close() is needed.
with open(root_path+'dataset/webnlg/train.json', 'r', encoding='utf-8') as f:
    dict_train=json.load(f)

# Process Train Set
triples_train, texts_train=process_webnlg(dict_train)

# Load the raw dev split the same way.
with open(root_path+'dataset/webnlg/dev.json', 'r', encoding='utf-8') as f:
    dict_dev=json.load(f)

# Process Dev Set
triples_dev, texts_dev=process_webnlg(dict_dev)

18025 data
2258 data


In [6]:
class WebNLGDataset(Dataset):
    """
    PyTorch Dataset Class: WebNLG Dataset

    Encodes every (triple string, reference text) pair once at construction
    time with the supplied tokenizer and serves the resulting id lists by
    index.
    """
    def __init__(self, tokenizer, triples, texts):
        encoded_inputs, encoded_targets = [], []

        # Walk the inputs in order; the target at the same position is its pair.
        for position, linearized in enumerate(triples):
            encoded_inputs.append(tokenizer.encode(linearized))
            encoded_targets.append(tokenizer.encode(texts[position]))

        self.data = encoded_inputs
        self.label = encoded_targets

        # Report how many examples were encoded.
        print(len(self.data), "data")

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        source_ids = self.data[idx]
        target_ids = self.label[idx]
        return source_ids, target_ids

In [7]:
def collate_fn(batch, pad_token_id=0, label_pad_id=-100):
    """
    For Same Sequence Length on Same Batch: Padding

    Right-pads every example in the batch to the longest input / longest label
    of that batch and builds the matching attention mask.

    Args:
        batch: iterable of (input_ids, label_ids) pairs, each a list of ints.
        pad_token_id: id used to pad the inputs. It must be a valid vocabulary
            index because padded positions are still looked up in the
            embedding table. Defaults to 0 (the T5 pad token id).
        label_pad_id: id used to pad the labels. Defaults to -100, the value
            excluded from the loss.

    Returns:
        (input_ids, attention_mask, labels) tensors, one row per example.
        attention_mask is 1 on real tokens and 0 on padding.
    """
    max_len_data=max((len(data) for data, _ in batch), default=0)
    max_len_label=max((len(label) for _, label in batch), default=0)

    datas=[]
    attn_masks=[]
    labels=[]
    for data, label in batch:
        n_pad_data=max_len_data-len(data)

        # Build new lists instead of extending in place: the incoming lists are
        # the ones stored in the dataset, and mutating them would leave the
        # padding in place for every later epoch.
        datas.append(list(data)+[pad_token_id]*n_pad_data)

        # Mask is derived from lengths, so it stays correct for any pad id.
        attn_masks.append([1]*len(data)+[0]*n_pad_data)

        labels.append(list(label)+[label_pad_id]*(max_len_label-len(label)))

    return torch.tensor(datas), torch.tensor(attn_masks), torch.tensor(labels)

In [8]:
# Train Set: tokenize once, then serve shuffled mini-batches padded by collate_fn
dataset_train=WebNLGDataset(tokenizer, triples_train, texts_train)
dataloader_train=DataLoader(
    dataset=dataset_train,
    batch_size=batch_size,
    collate_fn=collate_fn,
    shuffle=True,
)

# Dev Set: built with the same settings as the train loader
dataset_dev=WebNLGDataset(tokenizer, triples_dev, texts_dev)
dataloader_dev=DataLoader(
    dataset=dataset_dev,
    batch_size=batch_size,
    collate_fn=collate_fn,
    shuffle=True,
)

18025 data
2258 data


In [9]:
# Optim, Scheduler
# The schedule is expressed in optimizer steps (not mini-batches): one optimizer
# step happens every `accumulation_steps` mini-batches of size `batch_size`.
optimizer=AdamW(model.parameters(), lr=lr)
scheduler=get_linear_schedule_with_warmup(
    optimizer=optimizer,
    # Warm-up over the first 3% of the total optimizer steps
    num_warmup_steps=int(0.03*epochs*len(dataset_train)/(accumulation_steps*batch_size)),
    num_training_steps=int(epochs*len(dataset_train)/(accumulation_steps*batch_size))
)

# TensorBoard: Logging
writer=SummaryWriter()
# Counts optimizer steps across all epochs; used as the x-axis for the train loss
step_global=0

for epoch in range(epochs):
    # Train Phase
    model.train()
    # The model is moved to CPU before saving at the end of each epoch, so it
    # has to be moved back to the training device here.
    model.to(device)
    
    # Running sum of the scaled losses inside the current accumulation window
    loss_train=0
    # NOTE(review): if the number of mini-batches per epoch is not a multiple of
    # accumulation_steps, gradients of the trailing mini-batches never reach an
    # optimizer step; they are cleared here at the start of the next epoch.
    optimizer.zero_grad()
    
    for step, (data, attn_mask, label) in enumerate(dataloader_train):
        data=data.to(device)
        attn_mask=attn_mask.to(device)
        label=label.to(device)
        
        outputs=model(input_ids=data, attention_mask=attn_mask, labels=label)
        
        # outputs[0] is used as the loss. Dividing by accumulation_steps makes
        # the gradients summed over one window equal to the window average.
        loss=outputs[0]/accumulation_steps
        loss.backward()
        
        loss_train+=loss.item()
        
        # End of an accumulation window: log, then apply the update
        if (step+1)%accumulation_steps==0:
            step_global+=1
            
            # TensorBoard: window-averaged train loss per optimizer step
            writer.add_scalar(
                f'loss_train/T5_finetuned_batch{int(accumulation_steps*batch_size)}_epoch{epochs}_lr{lr}',
                loss_train,
                step_global
            )
            # Console: print every 500th optimizer step only
            if step_global%500==0:
                print(f'epoch {epoch+1} step {step_global} loss_train {loss_train:.4f}')
            # Set Loss to 0 for the next accumulation window
            loss_train=0
            
            # Parameter update first, then advance the learning-rate schedule
            optimizer.step()
            scheduler.step()
            
            # Clear the accumulated gradients before the next window
            optimizer.zero_grad()
            
    # Eval Phase
    model.eval()
    
    # Sum of per-batch dev losses; turned into a mean after the loop
    loss_eval=0
    
    with torch.no_grad():
        for step, (data, attn_mask, label) in enumerate(dataloader_dev):
            data=data.to(device)
            attn_mask=attn_mask.to(device)
            label=label.to(device)
            
            outputs=model(input_ids=data, attention_mask=attn_mask, labels=label)

            loss=outputs[0]
            loss_eval+=loss.item()
        # `step` is the index of the last dev batch, so step+1 is the number of
        # dev batches. NOTE(review): with an empty dev loader `step` would still
        # hold the last value from the train loop.
        loss_eval=loss_eval/(step+1)
        
        # TensorBoard: one dev-loss point per epoch
        writer.add_scalar(
            f'loss_eval/T5_finetuned_batch{int(accumulation_steps*batch_size)}_epoch{epochs}_lr{lr}',
            loss_eval,
            epoch+1
        )
        # Console
        print("=====")
        print(f'epoch {epoch+1} loss_eval {loss_eval:.4f}')
        print("=====")
        
    # Save Model
    # One checkpoint per epoch, written from CPU. torch.save is given the model
    # object itself rather than a state_dict.
    # NOTE(review): loading such a file presumably needs the same class
    # definitions importable; confirm this is intended over saving a state_dict.
    model.to(torch.device('cpu'))
    torch.save(model, root_path+f'model/T5_finetuned_batch{int(accumulation_steps*batch_size)}_epoch{epoch+1}of{epochs}_lr{lr}.pt')

epoch 1 step 500 loss_train 1.4016
epoch 1 step 1000 loss_train 0.9987
epoch 1 step 1500 loss_train 0.8015
epoch 1 step 2000 loss_train 0.6552
epoch 1 step 2500 loss_train 0.8780
epoch 1 step 3000 loss_train 0.7992
epoch 1 step 3500 loss_train 0.6083
epoch 1 step 4000 loss_train 0.7251
epoch 1 step 4500 loss_train 0.6962
=====
epoch 1 loss_eval 0.5887
=====
epoch 2 step 5000 loss_train 0.5652
epoch 2 step 5500 loss_train 0.5714
epoch 2 step 6000 loss_train 0.4592
epoch 2 step 6500 loss_train 0.6563
epoch 2 step 7000 loss_train 0.6346
epoch 2 step 7500 loss_train 0.7409
epoch 2 step 8000 loss_train 0.5618
epoch 2 step 8500 loss_train 0.6158
epoch 2 step 9000 loss_train 0.8697
=====
epoch 2 loss_eval 0.5499
=====
epoch 3 step 9500 loss_train 0.9482
epoch 3 step 10000 loss_train 0.6019
epoch 3 step 10500 loss_train 0.5093
epoch 3 step 11000 loss_train 0.9502
epoch 3 step 11500 loss_train 0.5280
epoch 3 step 12000 loss_train 0.6950
epoch 3 step 12500 loss_train 0.3766
epoch 3 step 13000 lo