In [1]:
# Load the training split (JSON Lines: one JSON-encoded example per line).
# - The encoding is pinned to UTF-8 so decoding does not depend on the
#   platform's default locale encoding.
# - The file is iterated line by line instead of using read().splitlines():
#   str.splitlines() also breaks on characters such as U+2028 / U+0085, which
#   may appear unescaped inside a JSON string and would cut a record in two.
# - Blank lines are skipped so that every entry can be handed to json.loads.
# - The `with` statement closes the file; no explicit close() is required.
with open('../totto_data/totto_train_data.jsonl', 'r', encoding='utf-8') as f:
    data_train = [line.rstrip('\n') for line in f if line.strip()]

# Number of training examples
len(data_train)

120761

In [2]:
import json

# Decode the last training record so its structure can be inspected
data_sample = json.loads(data_train[-1])

# Print each field as "$ <key>" followed by its value on the next line;
# the 'table' field is left out of the listing
for key, value in data_sample.items():
    if key != 'table':
        print('$', key, '\n', value)

$ table_webpage_url 
 http://en.wikipedia.org/wiki/Swimming_at_the_2012_Summer_Olympics_%E2%80%93_Men's_100_metre_backstroke
$ table_page_title 
 Swimming at the 2012 Summer Olympics – Men's 100 metre backstroke
$ table_section_title 
 Final
$ table_section_text 
 
$ highlighted_cells 
 [[4, 0], [4, 2], [4, 4]]
$ example_id 
 -2235792344822110317
$ sentence_annotations 
 [{'original_sentence': 'Leading the race early on the initial length, Lacourt dropped off the podium to a fourth-place time in 53.08.', 'sentence_after_deletion': 'Lacourt dropped to a fourth-place time in 53.08.', 'sentence_after_ambiguity': 'Lacourt was dropped to a fourth-place time in 53.08.', 'final_sentence': 'Lacourt was dropped to a fourth-place time in 53.08.'}]


In [3]:
# Google's official ToTTo preprocessing code
# https://github.com/google-research/language/blob/master/language/totto/baseline_preprocessing/preprocess_utils.py
from preprocess_utils import get_highlighted_subtable, linearize_subtable

# 1) Raw cell dictionaries found at each highlighted (row, column) position
print('$', 'Highlighted Cells')
for index_row, index_col in data_sample['highlighted_cells']:
    print(data_sample['table'][index_row][index_col])

# 2) Build the highlighted sub-table (heuristic headers enabled) and flatten it,
#    together with the page and section titles, into a single tagged string
print('\n$', 'Linearized (Preprocessed) Cells')
subtable = get_highlighted_subtable(
    table=data_sample['table'],
    cell_indices=data_sample['highlighted_cells'],
    with_heuristic_headers=True,
)
cells_linearized = linearize_subtable(
    subtable=subtable,
    table_page_title=data_sample['table_page_title'],
    table_section_title=data_sample['table_section_title'],
)
print(cells_linearized)

# 3) Reference sentence(s): the 'final_sentence' of every annotation
print('\n$', 'Final (Label) Sentence')
for sentence in data_sample['sentence_annotations']:
    print(sentence['final_sentence'])

$ Highlighted Cells
{'value': '4', 'is_header': False, 'column_span': 1, 'row_span': 1}
{'value': 'Camille Lacourt', 'is_header': False, 'column_span': 1, 'row_span': 1}
{'value': '53.08', 'is_header': False, 'column_span': 1, 'row_span': 1}

$ Linearized (Preprocessed) Cells
<page_title> Swimming at the 2012 Summer Olympics – Men's 100 metre backstroke </page_title> <section_title> Final </section_title> <table> <cell> 4 <col_header> Rank </col_header> </cell> <cell> Camille Lacourt <col_header> Name </col_header> </cell> <cell> 53.08 <col_header> Time </col_header> </cell> </table>

$ Final (Label) Sentence
Lacourt was dropped to a fourth-place time in 53.08.


In [4]:
# Prepare for Training
from transformers import T5Tokenizer

# Instantiate the tokenizer from the pretrained 't5-large' checkpoint
tokenizer = T5Tokenizer.from_pretrained('t5-large')

# Vocabulary size of the tokenizer as loaded (before any tokens are added)
len(tokenizer)

32100

In [5]:
# Register the table markup tags used in the linearized input as special tokens.
# Every tag name yields an opening and a closing tag, in that order:
# '<page_title>', '</page_title>', '<section_title>', '</section_title>', ...
table_tag_names = [
    'page_title',
    'section_title',
    'table',
    'cell',
    'col_header',
    'row_header',
]
table_tags = [
    tag
    for name in table_tag_names
    for tag in (f'<{name}>', f'</{name}>')
]
tokenizer.add_special_tokens({'additional_special_tokens': table_tags})

# Reminder for training time: the pretrained model's embedding layer has to be
# resized to the enlarged vocabulary.
#model.resize_token_embeddings(len(tokenizer))

# Vocabulary size after the tags have been added
len(tokenizer)

32112

In [6]:
# Split the linearized table string into tokens and show the resulting list
tokens_linearized = tokenizer.tokenize(cells_linearized)
print(tokens_linearized)

['<page_title>', '▁Swimming', '▁at', '▁the', '▁2012', '▁Summer', '▁Olympics', '▁', '–', '▁Men', "'", 's', '▁100', '▁', 'metre', '▁back', 'stroke', '</page_title>', '<section_title>', '▁Final', '</section_title>', '<table>', '<cell>', '▁4', '<col_header>', '▁', 'Rank', '</col_header>', '</cell>', '<cell>', '▁Camill', 'e', '▁La', 'court', '<col_header>', '▁Name', '</col_header>', '</cell>', '<cell>', '▁53', '.', '08', '<col_header>', '▁Time', '</col_header>', '</cell>', '</table>']
