In [1]:
import json

from transformers import GPT2Tokenizer

### Load Dataset

In [2]:
# Load the WebNLG training split into a dict. The `with` block closes the
# file on exit, so no explicit close() is needed. The encoding is pinned to
# UTF-8 so the read does not depend on the platform's default locale encoding.
with open('../dataset/webnlg/train.json', 'r', encoding='utf-8') as f:
    dict_train = json.load(f)

In [3]:
# Inspect the top-level keys of the loaded JSON object.
dict_train.keys()

dict_keys(['entries'])

In [4]:
# Look at the first record to see the per-entry schema.
dict_train['entries'][0]

{'1': {'category': 'Airport',
  'dbpedialinks': [],
  'lexicalisations': [{'comment': 'good',
    'lang': '',
    'lex': 'The Aarhus is the airport of Aarhus, Denmark.',
    'xml_id': 'Id1'},
   {'comment': 'good',
    'lang': '',
    'lex': 'Aarhus Airport serves the city of Aarhus, Denmark.',
    'xml_id': 'Id2'}],
  'links': [],
  'modifiedtripleset': [{'object': '"Aarhus, Denmark"',
    'property': 'cityServed',
    'subject': 'Aarhus_Airport'}],
  'originaltriplesets': {'originaltripleset': [[{'object': '"Aarhus, Denmark"@en',
      'property': 'cityServed',
      'subject': 'Aarhus_Airport'}]]},
  'shape': 'NA',
  'shape_type': 'NA',
  'size': '1',
  'xml_id': 'Id1'}}

### Process Data

In [5]:
# Accumulators filled by the processing loop in the next cell.
# categories / properties: one item per entry / per triple.
categories, properties = [], []

# Parallel lists: linearised triple string and its reference text.
data_triple, data_text = [], []

In [6]:
# Flatten every entry into parallel lists: one linearised triple string per
# accepted reference text. The accumulators are (re)created here so that
# re-running this cell does not append a second copy of the dataset.
categories, properties = [], []
data_triple, data_text = [], []

for index, data in enumerate(dict_train['entries']):
    # Each entry is a one-key wrapper keyed by its 1-based position as a
    # string (e.g. {'1': {...}}); look the payload up once.
    entry = data[str(index + 1)]
    categories.append(entry['category'])

    # Linearise the triple set as consecutive "| subj : prop : obj " segments.
    triples = entry['modifiedtripleset']
    segments = []
    for triple in triples:
        subj, prop, obj = triple['subject'], triple['property'], triple['object']
        properties.append(prop)
        segments.append("| {} : {} : {} ".format(subj, prop, obj))
    triple_ = "".join(segments)

    # Keep only lexicalisations whose comment is "good"; every kept text is
    # paired with the same linearised triple string.
    texts = entry['lexicalisations']
    for text in texts:
        if text['comment'] != "good":
            continue

        data_triple.append(triple_)
        data_text.append(text['lex'])

In [7]:
# Collapse to the distinct category names. sorted() gives a stable,
# reproducible order; the order of a bare list(set(...)) of strings can vary
# between kernel sessions because string hashing is randomised per process.
categories = sorted(set(categories))
print(len(categories), "Categories")
print(categories)

10 Categories
['ComicsCharacter', 'City', 'Building', 'Food', 'Airport', 'Monument', 'WrittenWork', 'Astronaut', 'SportsTeam', 'University']


In [8]:
# Summary counts: distinct property names, distinct linearised triple
# strings, and the total number of (triple, text) pairs.
print(f"{len(set(properties))} Properties")
print(f"{len(set(data_triple))} Unique Triples")
print(f"{len(data_triple)} Triples")
print(f"{len(data_text)} Texts")

246 Properties
6888 Unique Triples
18025 Triples
18025 Texts


In [9]:
# First linearised triple string produced by the processing loop.
data_triple[0]

'| Aarhus_Airport : cityServed : "Aarhus, Denmark" '

In [10]:
# Reference text paired with data_triple[0].
data_text[0]

'The Aarhus is the airport of Aarhus, Denmark.'

### Prepare for Training (GPT2-Medium)

In [11]:
# Load the GPT-2 tokenizer for the 'gpt2-medium' checkpoint.
tokenizer=GPT2Tokenizer.from_pretrained('gpt2-medium')

In [12]:
# Show the special tokens the tokenizer uses to mark sequence start and end.
for token_name in ("bos_token", "eos_token"):
    print(f"{token_name}:", getattr(tokenizer, token_name))

bos_token: <|endoftext|>
eos_token: <|endoftext|>


In [13]:
# Preview how one (triple, text) pair becomes model input ids and labels.
# Sequence layout: <linearised triples><bos><reference text><eos>.
for index, triple in enumerate(data_triple):
    sequence = triple + tokenizer.bos_token + data_text[index] + tokenizer.eos_token

    # Encode once and reuse the ids for both the inputs and the labels.
    data = tokenizer.encode(sequence)
    print(sequence)
    print("-----")
    print(data)

    print("=====")

    # Mask a *copy* of the ids. A chained assignment (label = data = ...)
    # would bind both names to the same list, so masking the labels would
    # also overwrite the input ids.
    label = list(data)
    # Everything up to and including the first bos token is the prompt part.
    sep = label.index(tokenizer.bos_token_id) + 1
    # -100 is presumably used because it is the default ignore_index of
    # PyTorch's CrossEntropyLoss, so prompt positions do not contribute to
    # the loss. TODO confirm against the training code.
    label[:sep] = [-100] * sep
    print(label)

    break  # preview only the first example

| Aarhus_Airport : cityServed : "Aarhus, Denmark" <|endoftext|>The Aarhus is the airport of Aarhus, Denmark.<|endoftext|>
-----
[91, 317, 283, 7537, 62, 16170, 634, 1058, 1748, 50, 8520, 1058, 366, 32, 283, 7537, 11, 16490, 1, 220, 50256, 464, 317, 283, 7537, 318, 262, 9003, 286, 317, 283, 7537, 11, 16490, 13, 50256]
=====
[-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 464, 317, 283, 7537, 318, 262, 9003, 286, 317, 283, 7537, 11, 16490, 13, 50256]
