In [1]:
import logging
import json

import torch
import torch.nn as nn
from transformers import T5Tokenizer

from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

In [2]:
# --- Evaluation configuration ---------------------------------------------
# Project root; the dataset, checkpoint and generation paths are built from it
# by plain string concatenation.
root_path = "/root/research/Graph-To-Text/"
# Fine-tuned checkpoint, given relative to root_path.
model_path = "./model/T5_finetuned_batch4_epoch3of3_lr3e-05.pt"

# Beam width handed to model.generate().
num_beams = 5
# Single GPU: CUDA device with index 1.
device = torch.device("cuda:1")

# Suppress every log record of severity CRITICAL and below.
logging.disable(logging.CRITICAL)

In [3]:
# Collect the distinct categories that occur in the WebNLG training split.
train_json_path = root_path + 'dataset/webnlg/train.json'
with open(train_json_path, 'r') as train_file:
    dict_train = json.load(train_file)

# Every element of 'entries' is indexed by its own 1-based position rendered
# as a string (entry[str(position)]), so the position doubles as the key.
categories_seen = list({
    entry[str(position)]['category']
    for position, entry in enumerate(dict_train['entries'], start=1)
})

print(len(categories_seen), "Categories in Train Set")
print(categories_seen)

10 Categories in Train Set
['City', 'ComicsCharacter', 'Monument', 'WrittenWork', 'Food', 'Astronaut', 'SportsTeam', 'Airport', 'Building', 'University']


In [4]:
# Read the WebNLG test split and partition it by whether each entry's category
# also appears in categories_seen (the categories found in the training split).
test_json_path = root_path + 'dataset/webnlg/test.json'
with open(test_json_path, 'r') as test_file:
    dict_test = json.load(test_file)

# Parallel lists: the i-th linearised triple set goes with the i-th reference list.
triples_seen, refs_seen = [], []
triples_unseen, refs_unseen = [], []
categories_unseen = []

for position, entry in enumerate(dict_test['entries'], start=1):
    # Each entry is indexed by its own 1-based position rendered as a string.
    record = entry[str(position)]

    # Flatten the triple set into one string, "| subj : prop : obj " per triple
    # (the trailing space after every triple is kept).
    linearized = "".join(
        "| {} : {} : {} ".format(item['subject'], item['property'], item['object'])
        for item in record['modifiedtripleset']
    )
    # All reference verbalisations for this triple set.
    references = [lexicalisation['lex'] for lexicalisation in record['lexicalisations']]

    if record['category'] in categories_seen:
        triples_seen.append(linearized)
        refs_seen.append(references)
    else:
        categories_unseen.append(record['category'])
        triples_unseen.append(linearized)
        refs_unseen.append(references)

# Deduplicate the unseen category names before reporting them.
categories_unseen = list(set(categories_unseen))
print(len(categories_unseen), "Unseen Categories")
print(categories_unseen)
print("=====")

print(len(triples_seen), "Seen Data")
print(len(triples_unseen), "Unseen Data")

5 Unseen Categories
['CelestialBody', 'Athlete', 'Politician', 'MeanOfTransportation', 'Artist']
=====
971 Seen Data
891 Unseen Data


In [5]:
# Tokenizer with the stock 't5-large' vocabulary.
tokenizer=T5Tokenizer.from_pretrained('t5-large')

# Load the fine-tuned checkpoint for evaluation.
# - SECURITY: torch.load unpickles the file, which can execute arbitrary code;
#   only load checkpoints from a trusted source.
# - map_location=device remaps the stored tensors onto the evaluation device at
#   load time, so the load does not depend on the device the checkpoint was
#   saved from still being available.
# - .eval() switches the module to evaluation mode (dropout disabled); without
#   it the model keeps whatever train/eval flag was pickled with it. It is
#   chained (it returns the module) so the cell does not echo the model repr.
# NOTE(review): presumably the file holds a whole pickled nn.Module (it is used
# via .to() and .generate() below). Recent PyTorch releases default torch.load
# to weights_only=True, which rejects such files -- confirm against the
# installed version and pass weights_only=False if required.
model=torch.load(root_path+model_path, map_location=device).to(device).eval()

In [6]:
# Generate one verbalisation per seen-category test input with beam search,
# score it with smoothed sentence-level BLEU against its references, report the
# mean score (scaled by 100) and save the generations to disk.
smoothing = SmoothingFunction().method4
scores_seen = []
seen_candidates = []

print("Seen Categories")
for position, linearized in enumerate(triples_seen, start=1):
    # Progress indicator: one line for every 100 inputs.
    if position % 100 == 0:
        print(position)

    # Batch of one: encode the linearised triples and move them to the device.
    encoded = torch.tensor([tokenizer.encode(linearized)]).to(device)
    beams = model.generate(encoded, max_length=500, num_beams=num_beams)
    hypothesis = tokenizer.decode(beams[0], skip_special_tokens=True)
    seen_candidates.append(hypothesis)

    # BLEU on whitespace tokens; refs_seen is aligned with triples_seen.
    reference_tokens = [ref.split() for ref in refs_seen[position - 1]]
    scores_seen.append(
        sentence_bleu(
            reference_tokens,
            hypothesis.split(),
            smoothing_function=smoothing,
        )
    )

mean_bleu_seen = 100 * sum(scores_seen) / len(scores_seen)
print("BLEU Score: {:.2f}".format(mean_bleu_seen))

# One generation per line, each terminated by a newline.
generations = "".join(text + "\n" for text in seen_candidates)

# Output name: checkpoint file name minus its last three characters, plus '_Seen'.
seen_output_path = root_path + 'generation/' + model_path.split("/")[-1][:-3] + '_Seen'
with open(seen_output_path, 'w') as out_file:
    out_file.write(generations)

Seen Categories
100
200
300
400
500
600
700
800
900
BLEU Score: 57.95


In [7]:
# Generate one verbalisation per unseen-category test input with beam search,
# score it with smoothed sentence-level BLEU against its references, report the
# mean score (scaled by 100) and save the generations to disk.
smoothing = SmoothingFunction().method4
scores_unseen = []
unseen_candidates = []

print("Unseen Categories")
for position, linearized in enumerate(triples_unseen, start=1):
    # Progress indicator: one line for every 100 inputs.
    if position % 100 == 0:
        print(position)

    # Batch of one: encode the linearised triples and move them to the device.
    encoded = torch.tensor([tokenizer.encode(linearized)]).to(device)
    beams = model.generate(encoded, max_length=500, num_beams=num_beams)
    hypothesis = tokenizer.decode(beams[0], skip_special_tokens=True)
    unseen_candidates.append(hypothesis)

    # BLEU on whitespace tokens; refs_unseen is aligned with triples_unseen.
    reference_tokens = [ref.split() for ref in refs_unseen[position - 1]]
    scores_unseen.append(
        sentence_bleu(
            reference_tokens,
            hypothesis.split(),
            smoothing_function=smoothing,
        )
    )

mean_bleu_unseen = 100 * sum(scores_unseen) / len(scores_unseen)
print("BLEU Score: {:.2f}".format(mean_bleu_unseen))

# One generation per line, each terminated by a newline.
generations = "".join(text + "\n" for text in unseen_candidates)

# Output name: checkpoint file name minus its last three characters, plus '_Unseen'.
unseen_output_path = root_path + 'generation/' + model_path.split("/")[-1][:-3] + '_Unseen'
with open(unseen_output_path, 'w') as out_file:
    out_file.write(generations)

Unseen Categories
100
200
300
400
500
600
700
800
BLEU Score: 45.99


In [8]:
# Re-score the saved seen-category generations from disk, without running the
# model again.
scores_seen = []

seen_generation_path = root_path + 'generation/' + model_path.split("/")[-1][:-3] + '_Seen'
with open(seen_generation_path, 'r') as generation_file:
    # Split on newlines into one candidate per line.
    cands_seen = generation_file.read().split("\n")

print("Seen Categories")

smoothing = SmoothingFunction().method4
# Candidates are looked up by the position of their reference list in refs_seen.
scores_seen.extend(
    sentence_bleu(
        [ref.split() for ref in references],
        cands_seen[position].split(),
        smoothing_function=smoothing,
    )
    for position, references in enumerate(refs_seen)
)

mean_bleu_seen = 100 * sum(scores_seen) / len(scores_seen)
print("BLEU Score: {:.2f}".format(mean_bleu_seen))

Seen Categories
BLEU Score: 57.95


In [9]:
# Re-score the saved unseen-category generations from disk, without running the
# model again.
scores_unseen = []

unseen_generation_path = root_path + 'generation/' + model_path.split("/")[-1][:-3] + '_Unseen'
with open(unseen_generation_path, 'r') as generation_file:
    # Split on newlines into one candidate per line.
    cands_unseen = generation_file.read().split("\n")

print("Unseen Categories")

smoothing = SmoothingFunction().method4
# Candidates are looked up by the position of their reference list in refs_unseen.
scores_unseen.extend(
    sentence_bleu(
        [ref.split() for ref in references],
        cands_unseen[position].split(),
        smoothing_function=smoothing,
    )
    for position, references in enumerate(refs_unseen)
)

mean_bleu_unseen = 100 * sum(scores_unseen) / len(scores_unseen)
print("BLEU Score: {:.2f}".format(mean_bleu_unseen))

Unseen Categories
BLEU Score: 45.99
