### Imports

Libraries required to run this notebook.

In [1]:
import json
import os
from pprint import pprint
from time import time

import sng_parser

### Check parser

First, we check how the parser works and what the structure of its output looks like.

In [2]:
# Parse a short caption, then show the result twice: the raw parser output via
# pprint, and the parser's own tprint rendering of the same graph.
graph = sng_parser.parse("A dog and a cat in the sidewalk.")
pprint(graph)
sng_parser.tprint(graph)

{'entities': [{'head': 'dog',
               'lemma_head': 'dog',
               'lemma_span': 'a dog',
               'modifiers': [{'dep': 'det', 'lemma_span': 'a', 'span': 'A'}],
               'span': 'A dog',
               'span_bounds': (0, 2),
               'type': 'unknown'},
              {'head': 'cat',
               'lemma_head': 'cat',
               'lemma_span': 'a cat',
               'modifiers': [{'dep': 'det', 'lemma_span': 'a', 'span': 'a'}],
               'span': 'a cat',
               'span_bounds': (3, 5),
               'type': 'unknown'},
              {'head': 'sidewalk',
               'lemma_head': 'sidewalk',
               'lemma_span': 'the sidewalk',
               'modifiers': [{'dep': 'det',
                              'lemma_span': 'the',
                              'span': 'the'}],
               'span': 'the sidewalk',
               'span_bounds': (6, 8),
               'type': 'unknown'}],
 'relations': [{'lemma_relation': 'in',
          

In [3]:
# Same check on a longer caption that mentions several entities.
graph = sng_parser.parse("A tiger cat with a collar and a gold tab sitting on a bed.")
pprint(graph)
sng_parser.tprint(graph)

{'entities': [{'head': 'tiger cat',
               'lemma_head': 'tiger cat',
               'lemma_span': 'a tiger cat',
               'modifiers': [{'dep': 'det', 'lemma_span': 'a', 'span': 'A'}],
               'span': 'A tiger cat',
               'span_bounds': (0, 3),
               'type': 'unknown'},
              {'head': 'collar',
               'lemma_head': 'collar',
               'lemma_span': 'a collar',
               'modifiers': [{'dep': 'det', 'lemma_span': 'a', 'span': 'a'}],
               'span': 'a collar',
               'span_bounds': (4, 6),
               'type': 'unknown'},
              {'head': 'gold tab',
               'lemma_head': 'gold tab',
               'lemma_span': 'a gold tab',
               'modifiers': [{'dep': 'det', 'lemma_span': 'a', 'span': 'a'}],
               'span': 'a gold tab',
               'span_bounds': (7, 10),
               'type': 'unknown'},
              {'head': 'bed',
               'lemma_head': 'bed',
               '

### Captions files

Paths to the JSON files that contain the captions (and their existing graphs) for the train, dev and test splits.

In [5]:
# Input JSON files, one per dataset split. The paths are relative, so they are
# resolved against the directory the notebook is run from.
PATH_TRAIN = "../RNN2LY/data/datasets/AMR2014train-dev-test/GraphTrain.json"
PATH_DEV = "../RNN2LY/data/datasets/AMR2014train-dev-test/GraphDev.json"
# NOTE(review): despite its name, PATH_VAL points at GraphTest.json — confirm
# that the val/test naming mismatch is intended.
PATH_VAL = "../RNN2LY/data/datasets/AMR2014train-dev-test/GraphTest.json"

In [6]:
# generate_graphs prints one progress line every DISPLAY_STEP processed entries.
DISPLAY_STEP = 1000

### Check json files

Load one of the files and inspect its structure.

In [7]:
# Load the file behind PATH_VAL so one record can be inspected below.
# JSON text is UTF-8, so the encoding is stated explicitly instead of relying on
# the platform default.
with open(PATH_VAL, "r", encoding="utf-8") as json_file:
    data = json.load(json_file)

In [9]:
# Display the record stored under a single key to see the per-entry layout.
data['391895']

{'image_filename': 'COCO_val2014_000000391895.jpg',
 'width': 640,
 'height': 360,
 'valid_captions': 5,
 'graphs': [{'caption_n': 1,
   'caption': 'a man with a red helmet on a small moped on a dirt road',
   'objects': [['have-03', 0],
    ['man', 1],
    ['helmet', 1],
    ['red', 1],
    ['road', 0],
    ['dirt', 0]],
   'relations': [':mod', ':location', ':ARG1', ':ARG0'],
   'triples': [[['have-03', 0], ':ARG0', ['man', 1]],
    [['have-03', 0], ':ARG1', ['helmet', 2]],
    [['red', 3], ':ARG1', ['helmet', 2]],
    [['man', 1], ':location', ['road', 4]],
    [['road', 4], ':mod', ['dirt', 5]]]},
  {'caption_n': 2,
   'caption': 'man riding a motor bike on a dirt road on the countryside',
   'objects': [['ride-01', 0],
    ['man', 1],
    ['bike', 1],
    ['motor', 0],
    ['road', 0],
    ['countryside', 0],
    ['dirt', 0]],
   'relations': [':mod', ':location', ':ARG1', ':ARG0'],
   'triples': [[['ride-01', 0], ':ARG0', ['man', 1]],
    [['ride-01', 0], ':ARG1', ['bike', 2]],
 

### Generator

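Generate a scene graph for every caption with the parser and store the results using the same JSON layout as the input files.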

In [27]:
def _extract_graph_parts(graph):
    """Flatten one parsed scene graph into ``(objects, relations, triples)``.

    Args:
        graph: Parser output with an ``'entities'`` list (each item exposing a
            ``'head'``) and a ``'relations'`` list whose items carry
            ``'subject'``, ``'relation'`` and ``'object'``; subject and object
            are used as indices into ``'entities'``.

    Returns:
        A tuple ``(objects, relations, triples)``:
            * objects: ``[head, 1]`` for every entity that takes part in at
              least one relation, in order of first appearance.
            * relations: the distinct relation labels, in order of first
              appearance.
            * triples: ``[[sub_head, sub_id], label, [obj_head, obj_id]]`` per
              relation, where the ids are assigned in order of first appearance.
    """
    entities = graph['entities']
    objects, triples = [], []
    # A dict is used as an insertion-ordered set so the serialized list of
    # labels is identical on every run (iterating a set of strings is not).
    relations = {}
    # Entity index -> compact id, so that distinct entities sharing the same
    # head word remain distinguishable inside the triples.
    obj_map = {}

    def register(entity_idx):
        head = entities[entity_idx]['head']
        if entity_idx not in obj_map:
            obj_map[entity_idx] = len(obj_map)
            # The second element is always 1 here — presumably it mirrors the
            # flag stored in the input files' 'objects' entries; TODO confirm.
            objects.append([head, 1])
        return [head, obj_map[entity_idx]]

    for relation in graph['relations']:
        sub, rel, obj = relation['subject'], relation['relation'], relation['object']
        relations[rel] = None
        # The subject is registered before the object so ids follow reading order.
        sub_node = register(sub)
        obj_node = register(obj)
        triples.append([sub_node, rel, obj_node])

    return objects, list(relations), triples


def generate_graphs(input_filename, output_filename):
    """Parse every caption of a dataset file and write the resulting graphs.

    The input JSON maps a key to a record holding ``'image_filename'``,
    ``'width'``, ``'height'`` and a ``'graphs'`` list whose items provide a
    ``'caption'``. Each caption is parsed with ``sng_parser``; captions whose
    parse has no relations are skipped. The output JSON keeps the same keys and
    image metadata, with ``'valid_captions'`` counting the captions kept and
    ``'graphs'`` listing, per kept caption, ``'caption_n'``, ``'caption'``,
    ``'objects'``, ``'relations'`` and ``'triples'``.

    Args:
        input_filename: Path of the JSON file to read.
        output_filename: Path of the JSON file to write. Its parent directory
            is created when it does not exist yet.

    Progress is printed every ``DISPLAY_STEP`` entries.
    """
    with open(input_filename, "r", encoding="utf-8") as json_file:
        data = json.load(json_file)

    # Create the destination folder before the (long) parsing loop so that a
    # missing directory cannot waste a complete run.
    output_dir = os.path.dirname(output_filename)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    output_dict = {}
    for step, (key, value) in enumerate(data.items()):
        if step % DISPLAY_STEP == 0:
            print("Steps:", step, "/", len(data))

        entry = {
            'image_filename': value['image_filename'],
            'width': value['width'],
            'height': value['height'],
            'valid_captions': 0,
            'graphs': [],
        }
        output_dict[key] = entry

        for g in value['graphs']:
            graph = sng_parser.parse(g['caption'])

            # Only captions that yield at least one relation are kept.
            if not graph['relations']:
                continue

            objects, relations, triples = _extract_graph_parts(graph)

            # caption_n numbers the kept captions only, starting at 1.
            entry['valid_captions'] += 1
            entry['graphs'].append({
                'caption_n': entry['valid_captions'],
                'caption': g['caption'],
                'objects': objects,
                'relations': relations,
                'triples': triples,
            })

    with open(output_filename, "w", encoding="utf-8") as json_file:
        json.dump(output_dict, json_file)


Run the function that generates the graphs, given an input file and the name of the output file.

In [28]:
# Generate the graphs for the train split and write them to the SGP-train-dev-test folder.
generate_graphs(PATH_TRAIN, "../RNN2LY/data/datasets/SGP-train-dev-test/GraphTrain.json")

Steps: 0 / 74504
Steps: 1000 / 74504
Steps: 2000 / 74504
Steps: 3000 / 74504
Steps: 4000 / 74504
Steps: 5000 / 74504
Steps: 6000 / 74504
Steps: 7000 / 74504
Steps: 8000 / 74504
Steps: 9000 / 74504
Steps: 10000 / 74504
Steps: 11000 / 74504
Steps: 12000 / 74504
Steps: 13000 / 74504
Steps: 14000 / 74504
Steps: 15000 / 74504
Steps: 16000 / 74504
Steps: 17000 / 74504
Steps: 18000 / 74504
Steps: 19000 / 74504
Steps: 20000 / 74504
Steps: 21000 / 74504
Steps: 22000 / 74504
Steps: 23000 / 74504
Steps: 24000 / 74504
Steps: 25000 / 74504
Steps: 26000 / 74504
Steps: 27000 / 74504
Steps: 28000 / 74504
Steps: 29000 / 74504
Steps: 30000 / 74504
Steps: 31000 / 74504
Steps: 32000 / 74504
Steps: 33000 / 74504
Steps: 34000 / 74504
Steps: 35000 / 74504
Steps: 36000 / 74504
Steps: 37000 / 74504
Steps: 38000 / 74504
Steps: 39000 / 74504
Steps: 40000 / 74504
Steps: 41000 / 74504
Steps: 42000 / 74504
Steps: 43000 / 74504
Steps: 44000 / 74504
Steps: 45000 / 74504
Steps: 46000 / 74504
Steps: 47000 / 74504
Steps

In [29]:
# Generate the graphs for the dev split and write them to the SGP-train-dev-test folder.
generate_graphs(PATH_DEV, "../RNN2LY/data/datasets/SGP-train-dev-test/GraphDev.json")

Steps: 0 / 8279
Steps: 1000 / 8279
Steps: 2000 / 8279
Steps: 3000 / 8279
Steps: 4000 / 8279
Steps: 5000 / 8279
Steps: 6000 / 8279
Steps: 7000 / 8279
Steps: 8000 / 8279


In [30]:
# Generate the graphs for the file behind PATH_VAL (GraphTest.json) and write
# them to GraphTest.json in the SGP-train-dev-test folder.
generate_graphs(PATH_VAL, "../RNN2LY/data/datasets/SGP-train-dev-test/GraphTest.json")

Steps: 0 / 40504
Steps: 1000 / 40504
Steps: 2000 / 40504
Steps: 3000 / 40504
Steps: 4000 / 40504
Steps: 5000 / 40504
Steps: 6000 / 40504
Steps: 7000 / 40504
Steps: 8000 / 40504
Steps: 9000 / 40504
Steps: 10000 / 40504
Steps: 11000 / 40504
Steps: 12000 / 40504
Steps: 13000 / 40504
Steps: 14000 / 40504
Steps: 15000 / 40504
Steps: 16000 / 40504
Steps: 17000 / 40504
Steps: 18000 / 40504
Steps: 19000 / 40504
Steps: 20000 / 40504
Steps: 21000 / 40504
Steps: 22000 / 40504
Steps: 23000 / 40504
Steps: 24000 / 40504
Steps: 25000 / 40504
Steps: 26000 / 40504
Steps: 27000 / 40504
Steps: 28000 / 40504
Steps: 29000 / 40504
Steps: 30000 / 40504
Steps: 31000 / 40504
Steps: 32000 / 40504
Steps: 33000 / 40504
Steps: 34000 / 40504
Steps: 35000 / 40504
Steps: 36000 / 40504
Steps: 37000 / 40504
Steps: 38000 / 40504
Steps: 39000 / 40504
Steps: 40000 / 40504
