In [1]:
import json

import torch
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter

from transformers import T5Tokenizer, T5ForConditionalGeneration, AdamW, get_linear_schedule_with_warmup

# Google's official preprocessing code
# https://github.com/google-research/language/blob/master/language/totto/baseline_preprocessing/preprocess_utils.py
from preprocess_utils import get_highlighted_subtable, linearize_subtable

In [2]:
# Train Config
# GPU index 2 is the preferred device for this run. On machines that expose
# fewer CUDA devices (or none), fall back to the first GPU or to the CPU so
# the notebook still runs instead of failing when tensors are moved.
if torch.cuda.is_available():
    device=torch.device('cuda:2' if torch.cuda.device_count()>2 else 'cuda:0')
else:
    device=torch.device('cpu')
batch_size=3  # samples per forward/backward pass
accumulation_steps=8  # batches accumulated before each optimizer step
epochs=3
lr=1e-4

In [3]:
# Load the pre-trained T5 tokenizer.
tokenizer=T5Tokenizer.from_pretrained('t5-large')

# Table markup tags, registered as additional special tokens.
# The order of this list is kept exactly as-is because it is the order in
# which the tags are handed to the tokenizer.
table_tags=[
    '<page_title>', '</page_title>',
    '<section_title>', '</section_title>',
    '<table>', '</table>',
    '<cell>', '</cell>',
    '<col_header>', '</col_header>',
    '<row_header>', '</row_header>'
]
tokenizer.add_special_tokens({'additional_special_tokens': table_tags})

# Load the pre-trained T5 model and place it on the training device.
model=T5ForConditionalGeneration.from_pretrained('t5-large')
model=model.to(device)

# The tokenizer grew by the tags above, so the embedding layer is resized to
# the new vocabulary size. Left as the last expression so the cell displays
# the resulting embedding module.
model.resize_token_embeddings(len(tokenizer))

Embedding(32112, 1024)

In [4]:
class ToTToDataset(Dataset):
    """
    ToTTo examples, linearized and tokenized once at construction time.

    Each item is a pair ``(input_ids, label_ids)`` of plain Python lists:
    the encoded linearized highlighted subtable and the encoded
    ``final_sentence`` of the example's first sentence annotation.

    Args:
        path_data: path to a JSON-lines file, one example per line.
        tokenizer: tokenizer providing ``encode`` and ``eos_token_id``.
        max_len: maximum number of input token ids; longer inputs are cut
            and re-terminated with the EOS id. Defaults to 512.
    """
    def __init__(self, path_data, tokenizer, max_len=512):
        self.data=[]   # encoded inputs, one list of token ids per example
        self.label=[]  # encoded target sentences, aligned with self.data

        # Load Dataset
        # Iterate the file line by line: this splits on newlines only
        # (str.splitlines would also split on characters such as U+2028 that
        # may legally appear inside a JSON string) and avoids holding the raw
        # text of the whole file in memory. The encoding is pinned so the
        # result does not depend on the platform's locale.
        with open(path_data, 'r', encoding='utf-8') as f:
            for line in f:
                line=line.strip()
                if not line:
                    # Tolerate blank lines (e.g. a trailing newline).
                    continue
                data=json.loads(line)

                # Preprocess
                subtable=get_highlighted_subtable(
                    table=data['table'],
                    cell_indices=data['highlighted_cells'],
                    with_heuristic_headers=True
                )
                cells_linearized=linearize_subtable(
                    subtable=subtable,
                    table_page_title=data['table_page_title'],
                    table_section_title=data['table_section_title']
                )

                # Encode
                encoded=tokenizer.encode(cells_linearized)
                if len(encoded)>max_len:
                    # Truncate, keeping EOS as the final token
                    encoded=encoded[:max_len-1]+[tokenizer.eos_token_id]
                self.data.append(encoded)
                self.label.append(tokenizer.encode(data['sentence_annotations'][0]['final_sentence']))

        print(len(self.data), 'datas')
        print(len(self.label), 'labels')

    def __getitem__(self, idx):
        """Return copies of the ``(input_ids, label_ids)`` lists at ``idx``."""
        # Copies are returned so that in-place padding done by a collate
        # function can never alter the stored examples.
        return list(self.data[idx]), list(self.label[idx])

    def __len__(self):
        """Return the number of examples."""
        return len(self.data)

In [5]:
def collate_fn(batch, pad_token_id=None, label_pad_id=-100):
    """
    Same Sequence Length on Same Batch

    Pads every input and every label in the batch to the longest one in that
    batch and stacks them into tensors. The incoming lists are never modified,
    so padding added for one batch cannot leak into the dataset and inflate
    sequence lengths on later epochs.

    Args:
        batch: iterable of ``(input_ids, label_ids)`` pairs of int lists.
        pad_token_id: id used to pad inputs; defaults to the module-level
            ``tokenizer.pad_token_id``.
        label_pad_id: value used to pad labels. Defaults to -100, which is
            expected to be skipped by the model's loss (Hugging Face
            convention -- confirm for other models).

    Returns:
        Tuple ``(input_ids, attention_mask, labels)`` of tensors, one row per
        example.
    """
    if pad_token_id is None:
        pad_token_id=tokenizer.pad_token_id

    # default=0 keeps an empty batch from raising.
    max_len_data=max((len(data) for data, _ in batch), default=0)
    max_len_label=max((len(label) for _, label in batch), default=0)

    datas=[]
    attn_masks=[]
    labels=[]
    for data, label in batch:
        # Build new lists rather than extending the caller's lists in place.
        padded_data=list(data)+[pad_token_id]*(max_len_data-len(data))
        datas.append(padded_data)

        # 1 for real tokens, 0 for padding positions.
        attn_masks.append([int(e!=pad_token_id) for e in padded_data])

        labels.append(list(label)+[label_pad_id]*(max_len_label-len(label)))

    return torch.tensor(datas), torch.tensor(attn_masks), torch.tensor(labels)

In [6]:
# Build the training set (tokenizes every example up front) and wrap it in a
# shuffling loader that pads each mini-batch through collate_fn.
dataset_train=ToTToDataset('../totto_data/totto_train_data.jsonl', tokenizer)
dataloader_train=DataLoader(
    dataset_train,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collate_fn
)

Token indices sequence length is longer than the specified maximum sequence length for this model (578 > 512). Running this sequence through the model will result in indexing errors


120761 datas
120761 labels


In [7]:
# Optim, Scheduler
optimizer=AdamW(model.parameters(), lr=lr)

# Number of optimizer steps actually taken per epoch: one per full group of
# `accumulation_steps` batches, plus one for a trailing partial group if the
# number of batches is not a multiple of `accumulation_steps`.
num_batches=len(dataloader_train)
tail_batches=num_batches%accumulation_steps
steps_per_epoch=num_batches//accumulation_steps+(1 if tail_batches else 0)

scheduler=get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=500,
    # Must equal the number of scheduler.step() calls made below so the
    # linear decay ends exactly at the last update.
    num_training_steps=epochs*steps_per_epoch
)

# TensorBoard: Logging
writer=SummaryWriter()
step_global=0

try:
    for epoch in range(epochs):
        # Train Phase
        model.train()
        # The model is moved to the CPU for saving at the end of each epoch,
        # so it is moved back to the training device here.
        model.to(device)

        loss_train=0
        optimizer.zero_grad()

        for step, (data, attn_mask, label) in enumerate(dataloader_train):
            data=data.to(device)
            attn_mask=attn_mask.to(device)
            label=label.to(device)

            outputs=model(input_ids=data, attention_mask=attn_mask, labels=label)

            # Average the loss over the batches that share one optimizer
            # step. The trailing partial group of an epoch is averaged over
            # its real size rather than over `accumulation_steps`.
            in_tail_group=step>=num_batches-tail_batches
            group_size=tail_batches if in_tail_group else accumulation_steps
            loss=outputs[0]/group_size
            loss.backward()

            loss_train+=loss.item()

            # Step after every full group and also after the last batch of
            # the epoch; otherwise the gradients of the trailing batches
            # would be thrown away by the next epoch's zero_grad().
            if (step+1)%accumulation_steps==0 or step+1==num_batches:
                step_global+=1

                # TensorBoard
                writer.add_scalar(
                    f'loss_train/T5_Finetuned_on_ToTTo(Subtable)_batch{int(accumulation_steps*batch_size)}_epoch{epochs}_lr{lr}',
                    loss_train,
                    step_global
                )
                # Console
                if step_global%1000==0:
                    print(f'epoch {epoch+1} step {step_global} loss_train {loss_train:.4f}')
                # Set Loss to 0
                loss_train=0

                optimizer.step()
                scheduler.step()

                optimizer.zero_grad()

        # Save Model
        # NOTE(review): torch.save(model) pickles the whole module object, so
        # loading requires the same class definitions; saving a state_dict is
        # more portable. Kept as-is so existing loaders of these files work.
        model.to(torch.device('cpu'))
        torch.save(model, f'../model/T5_Finetuned_on_ToTTo(Subtable)_batch{int(accumulation_steps*batch_size)}_epoch{epoch+1}of{epochs}_lr{lr}.pt')
finally:
    # Flush pending events and release the event file even if training is
    # interrupted.
    writer.close()



epoch 1 step 1000 loss_train 0.9203
epoch 1 step 2000 loss_train 0.9574
epoch 1 step 3000 loss_train 1.0080
epoch 1 step 4000 loss_train 0.9594
epoch 1 step 5000 loss_train 1.0454
epoch 2 step 6000 loss_train 0.9070
epoch 2 step 7000 loss_train 0.8418
epoch 2 step 8000 loss_train 0.7949
epoch 2 step 9000 loss_train 0.8890
epoch 2 step 10000 loss_train 0.7176
epoch 3 step 11000 loss_train 0.7717
epoch 3 step 12000 loss_train 0.7495
epoch 3 step 13000 loss_train 0.7708
epoch 3 step 14000 loss_train 0.7367
epoch 3 step 15000 loss_train 0.6968
