In [43]:
import pickle
import os
import json

In [2]:
from rdflib import Graph, RDF, URIRef
from rdflib.namespace import SKOS


In [5]:
# Namespace prefix of the NIST SM-KBP interchange ontology; local names such
# as 'Entity' or 'hasName' are appended to it to build predicate/class URIs.
nist_ont_pref = 'https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/InterchangeOntology#'
# Directory containing the manifest 'kbfiles.txt' and the files it lists.
# NOTE(review): machine-specific absolute path -- adjust before running elsewhere.
turtle_dir = '/home/alireza/aida/hypotheses/m9_sample'
with open(os.path.join(turtle_dir, 'kbfiles.txt'), 'r') as fin:
    # Skip blank manifest lines: joining an empty entry would yield turtle_dir
    # itself, which later fails when it is opened as a Turtle file.
    turtle_files = [os.path.join(turtle_dir, item.strip()) for item in fin if item.strip()]

In [6]:
# Show the first ten resolved paths as a quick sanity check of the manifest.
turtle_files[:10]

['/home/alireza/aida/hypotheses/m9_sample/./T103/T103_Q001_H003.hypothesis.feedback.ttl',
 '/home/alireza/aida/hypotheses/m9_sample/./T103/IC00120VX.parent.ttl',
 '/home/alireza/aida/hypotheses/m9_sample/./T103/IC0011X4T.parent.ttl',
 '/home/alireza/aida/hypotheses/m9_sample/./T103/IC00120UN.parent.ttl',
 '/home/alireza/aida/hypotheses/m9_sample/./T103/IC00150IB.parent.ttl',
 '/home/alireza/aida/hypotheses/m9_sample/./T103/IC0011YF6.parent.ttl',
 '/home/alireza/aida/hypotheses/m9_sample/./T103/T103_Q001_H006.partial.ttl',
 '/home/alireza/aida/hypotheses/m9_sample/./T103/IC0011XCW.parent.ttl',
 '/home/alireza/aida/hypotheses/m9_sample/./T103/IC0018KDK.parent.ttl',
 '/home/alireza/aida/hypotheses/m9_sample/./T103/T103.kb.ttl']

In [50]:
def _ont(local_name):
    """Return the URIRef for ``local_name`` inside the ``nist_ont_pref`` namespace."""
    return URIRef(nist_ont_pref + local_name)


def _first_object(graph, subject, predicate, required=False):
    """Return the first object of ``(subject, predicate, ?)`` or ``None``.

    When ``required`` is true and no such triple exists, raise ``IndexError``
    (the same exception type an out-of-range ``[0]`` lookup would produce).
    """
    found = next(iter(graph.objects(subject=subject, predicate=predicate)), None)
    if found is None and required:
        raise IndexError('no %s value found for %s' % (predicate, subject))
    return found


def _bbox_coords(graph, justification):
    """Return ``(ul_x, ul_y, lr_x, lr_y)`` of the justification's bounding box."""
    bbox = _first_object(graph, justification, _ont('boundingBox'), required=True)
    corner_names = ('boundingBoxUpperLeftX', 'boundingBoxUpperLeftY',
                    'boundingBoxLowerRightX', 'boundingBoxLowerRightY')
    return tuple(
        _first_object(graph, bbox, _ont(corner), required=True).toPython()
        for corner in corner_names
    )


def get_entities(turtle_path):
    """Parse one Turtle file and collect its entities and their justifications.

    Parameters
    ----------
    turtle_path : str
        Path of the Turtle/N3 file to parse.

    Returns
    -------
    dict
        Maps each entity id (``entity.toPython()``) to a dict with keys
        ``'name'`` (first ``hasName`` value or ``None``), ``'type_rdf'``
        (``rdf:object`` of the first statement whose ``rdf:subject`` is the
        entity, or ``None``), ``'sys_rdf'`` (first ``system`` value or
        ``None``) and ``'mentions'``. ``'mentions'`` maps each justification
        id to a dict with ``'type'``, ``'name'``, ``'source'``,
        ``'private_data'`` and, depending on the justification type,
        ``'offset'`` or ``'bbox'`` (plus ``'keyframe_id'`` for key frames).

    Raises
    ------
    IndexError
        If a justification lacks a required property (source, rdf:type,
        offsets, bounding box, key frame, jsonContent).
    NotImplementedError
        If a justification is not a Text, Image or KeyFrameVideo justification.
    """
    entity2mention = {}

    has_name_ = _ont('hasName')
    sys_ = _ont('system')
    justified_by_ = _ont('justifiedBy')

    # Close the file deterministically; Turtle documents are UTF-8 by spec.
    with open(turtle_path, encoding='utf-8') as fin:
        turtle_content = fin.read()
    graph = Graph()
    graph.parse(data=turtle_content, format='n3')

    for entity in graph.subjects(predicate=RDF.type, object=_ont('Entity')):
        entity_id = entity.toPython()

        name_node = _first_object(graph, entity, has_name_)
        entity_name = name_node.toPython() if name_node is not None else None
        en_sys_rdf = _first_object(graph, entity, sys_)

        # Only the first reified statement about the entity is consulted.
        en_asser_node = next(
            iter(graph.subjects(predicate=RDF.subject, object=entity)), None)
        if en_asser_node is not None:
            en_type_rdf = _first_object(graph, en_asser_node, RDF.object)
        else:
            en_type_rdf = None

        mentions = {}
        entity2mention[entity_id] = {'mentions': mentions,
                                     'name': entity_name,
                                     'type_rdf': en_type_rdf,
                                     'sys_rdf': en_sys_rdf}

        if en_asser_node is None:
            # rdflib treats subject=None as a wildcard; querying justifiedBy
            # with it would attach every justification in the graph to this
            # entity, so an entity without an assertion gets no mentions.
            continue

        for just in graph.objects(subject=en_asser_node, predicate=justified_by_):
            mention_id = just.toPython()
            label = _first_object(graph, just, SKOS.prefLabel)
            mention_name = label.toPython() if label is not None else None
            source = _first_object(graph, just, _ont('source'), required=True).toPython()

            mention_type = _first_object(graph, just, RDF.type, required=True)
            mtype = mention_type.split('#')[-1]

            # Merge the JSON payloads of all privateData nodes into one dict.
            pv_data_dict = {}
            for pv_rdf in graph.objects(subject=just, predicate=_ont('privateData')):
                dict_str = _first_object(
                    graph, pv_rdf, _ont('jsonContent'), required=True).toPython()
                pv_data_dict.update(json.loads(dict_str))

            mention = {
                'type': mention_type,
                'name': mention_name,
                'source': source,
                'private_data': pv_data_dict,
            }
            mentions[mention_id] = mention

            if mtype == 'TextJustification':
                off_beg = _first_object(
                    graph, just, _ont('startOffset'), required=True).toPython()
                off_end = _first_object(
                    graph, just, _ont('endOffsetInclusive'), required=True).toPython()
                mention['offset'] = (off_beg, off_end)
            elif mtype == 'ImageJustification':
                mention['bbox'] = _bbox_coords(graph, just)
            elif mtype == 'KeyFrameVideoJustification':
                bbox_coords = _bbox_coords(graph, just)
                keyframe = _first_object(
                    graph, just, _ont('keyFrame'), required=True).toPython()
                mention['bbox'] = bbox_coords
                mention['keyframe_id'] = keyframe
            else:
                raise NotImplementedError(
                    'unsupported justification type: %s' % mtype)

    return entity2mention

In [51]:
# Merge the per-file entity tables into one dict keyed by entity id.
# An id that appears in several files keeps the record from the last file.
entities = {}
for turtle_path in turtle_files:
    for entity_id, entity_record in get_entities(turtle_path).items():
        entities[entity_id] = entity_record

In [52]:
# Persist the merged entity table so later cells can reload it without re-parsing.
with open('../../temp/hypo_sample_1.pkl', mode='wb') as fout:
    pickle.dump(entities, file=fout)

In [54]:
# Reload the entity table written by this notebook (a locally produced pickle).
with open('../../temp/hypo_sample_1.pkl', mode='rb') as fin:
    entities = pickle.loads(fin.read())

In [70]:
# Group entities by image source: source -> [(entity id, entity type), ...],
# with one tuple appended per ImageJustification mention that has a source.
image_hypo = {}
for eid, entity_record in entities.items():
    for mention in entity_record['mentions'].values():
        if mention['type'].split('#')[-1] != 'ImageJustification':
            continue
        image_source = mention['source']
        if image_source is None:
            continue
        image_hypo.setdefault(image_source, []).append((eid, entity_record['type_rdf']))
            

In [71]:
# Number of distinct image sources that have at least one entity attached.
len(image_hypo)

109

In [72]:
# Group entities by video source: source -> [(entity id, entity type), ...],
# with one tuple appended per KeyFrameVideoJustification mention that has a source.
keyframe_hypo = {}
for eid in entities:
    for mention in entities[eid]['mentions'].values():
        if mention['type'].split('#')[-1] == 'KeyFrameVideoJustification' and mention['source'] is not None:
            # The bucket must be looked up in keyframe_hypo itself. Testing
            # membership against image_hypo reset the list on every mention
            # (keeping only the last entity) or raised KeyError when the
            # source also occurred as an image source.
            keyframe_hypo.setdefault(mention['source'], []).append((eid, entities[eid]['type_rdf']))


In [73]:
# Number of distinct key-frame video sources that have at least one entity attached.
len(keyframe_hypo)

98

In [74]:
# Save both source -> entities tables together as one (image, keyframe) tuple.
leaf2ents = (image_hypo, keyframe_hypo)
with open('../../temp/hypo_sample_1_leaf2ents.pkl', mode='wb') as fout:
    pickle.dump(leaf2ents, file=fout)