# RDF Summarization and Versioning

In [None]:
 !pip install rdflib

Collecting rdflib
[?25l  Downloading https://files.pythonhosted.org/packages/d0/6b/6454aa1db753c0f8bc265a5bd5c10b5721a4bb24160fb4faf758cf6be8a1/rdflib-5.0.0-py3-none-any.whl (231kB)
[K     |████████████████████████████████| 235kB 7.6MB/s 
Collecting isodate
[?25l  Downloading https://files.pythonhosted.org/packages/9b/9f/b36f7774ff5ea8e428fdcfc4bb332c39ee5b9362ddd3d40d9516a55221b2/isodate-0.6.0-py2.py3-none-any.whl (45kB)
[K     |████████████████████████████████| 51kB 5.5MB/s 
Installing collected packages: isodate, rdflib
Successfully installed isodate-0.6.0 rdflib-5.0.0


In [None]:
import rdflib
import copy
import urllib.request
import difflib

In [None]:
# All input files sit in the same folder of the course repository.
DATA_BASE_URL = 'https://raw.githubusercontent.com/Chraebe/SummarizationCourse/main/data'

# 2015-10 files: "_tw" file, original file, and mapping file.
ontology1 = f'{DATA_BASE_URL}/dbpedia_2015-10_tw.nt'
ontology1_original = f'{DATA_BASE_URL}/dbpedia_2015-10.nt'
ontology1_mapping = f'{DATA_BASE_URL}/dbpedia_2015-10_tw.nt.mapping.nt'

# 2016-10 files, same layout.
ontology2 = f'{DATA_BASE_URL}/dbpedia_2016-10_tw.nt'
ontology2_original = f'{DATA_BASE_URL}/dbpedia_2016-10.nt'
ontology2_mapping = f'{DATA_BASE_URL}/dbpedia_2016-10_tw.nt.mapping.nt'

# Triples added / deleted between the two versions.
additions = f'{DATA_BASE_URL}/add.nt'
deletions = f'{DATA_BASE_URL}/del.nt'

In [None]:
# Load data into rdflib graphs

def _parse_nt(url):
  """Return a fresh rdflib graph filled from the N-Triples document at `url`."""
  graph = rdflib.Graph()
  graph.parse(url, format='nt')
  return graph


graph_ont1 = _parse_nt(ontology1)
graph_ont1_original = _parse_nt(ontology1_original)
graph_ont1_mapping = _parse_nt(ontology1_mapping)

graph_ont2 = _parse_nt(ontology2)
graph_ont2_mapping = _parse_nt(ontology2_mapping)

graph_add = _parse_nt(additions)
graph_del = _parse_nt(deletions)

# Echo the last graph so the cell still ends by displaying it.
graph_del

<Graph identifier=N4236e775b68e42adb5da0e9a6e242a80 (<class 'rdflib.graph.Graph'>)>

In [None]:
# Example querying: list the subjects linked to <http://dbpedia.org/ontology/Cape>
# through <http://rq.org/nodeMapping> in the 2016-10 mapping graph.

qr = 'SELECT ?s WHERE {?s <http://rq.org/nodeMapping> <http://dbpedia.org/ontology/Cape> } LIMIT 10'
res = graph_ont2_mapping.query(qr)

# Each result row is printed as a tuple of rdflib terms.
for row in res:
  print(row)

(rdflib.term.URIRef('http://rq.org/tw18616'),)


In the following code, we remove the deleted triples from the summary:

In [None]:
# Apply the deleted triples to a working copy of the 2015-10 summary.
#
# For every deleted triple (s, p, o):
#   1. look up the summary nodes of s and o through <http://rq.org/nodeMapping>;
#   2. decrement the support of each matching reified edge, or remove the edge
#      together with the original triple once its support would drop to 0;
#   3. decrement the support of both endpoint nodes, never going below 1.
#
# The result is left in `graph_tmp`; `graph_ont1` itself is not modified.
EDGE_SUBJECT = rdflib.term.URIRef('http://rq.org/reifiedEdgeSubject')
EDGE_PROPERTY = rdflib.term.URIRef('http://rq.org/reifiedEdgeProperty')
EDGE_OBJECT = rdflib.term.URIRef('http://rq.org/reifiedEdgeObject')
EDGE_SUPPORT = rdflib.term.URIRef('http://rq.org/edgeSupport')
NODE_SUPPORT = rdflib.term.URIRef('http://rq.org/nodeSupport')

graph_tmp = copy.deepcopy(graph_ont1)
num = 0
for num, triple in enumerate(graph_del, start=1):
  print(f'\rTriple {num}', end = '')
  # N3 text is used to build SPARQL queries; the rdflib terms themselves
  # (triple[i], map_*_rdf) are used for every add/remove on the graph.
  s = triple[0].n3()
  p = triple[1].n3()
  o = triple[2].n3()

  # Find subject mapping. Reset per triple so that an unmapped subject is
  # skipped instead of silently reusing the previous triple's node.
  map_s = ""
  map_s_rdf = None
  s_mapping_qr = f'SELECT ?s WHERE {{ ?s <http://rq.org/nodeMapping> {s} }}'
  for r in graph_ont1_mapping.query(s_mapping_qr):
    map_s = r[0].n3()
    map_s_rdf = r[0]
  if map_s == "":
    continue

  # Find object mapping
  map_o = ""
  map_o_rdf = None
  o_mapping_qr = f'SELECT ?s WHERE {{ ?s <http://rq.org/nodeMapping> {o} }}'
  for r in graph_ont1_mapping.query(o_mapping_qr):
    map_o = r[0].n3()
    map_o_rdf = r[0]
  if map_o == "":
    # No mapping for the object: fall back to whatever object the working
    # graph holds for (s, p). If several exist, the last one returned wins.
    non_o_qr = f'SELECT ?o WHERE {{ {s} {p} ?o }}'
    for r in graph_tmp.query(non_o_qr):
      map_o = r[0].n3()
      map_o_rdf = r[0]

  if map_o == "":
    continue

  # Find all corresponding edges in the graph.
  # If the edge support is larger than 1, decrement
  # If the edge support is 1, remove the edge AND the original triple
  edge_qr = f'''SELECT ?e ?es WHERE {{
                  ?e <http://rq.org/reifiedEdgeSubject> {map_s};
                    <http://rq.org/reifiedEdgeProperty> {p};
                    <http://rq.org/reifiedEdgeObject> {map_o};
                    <http://rq.org/edgeSupport> ?es .
                }}'''
  # Materialize the rows first: the loop body mutates the graph being queried.
  for r in list(graph_tmp.query(edge_qr)):
    e = r[0]
    support_rdf = r[1]
    support = int(support_rdf)

    graph_tmp.remove((e, EDGE_SUPPORT, support_rdf))
    if support > 1:
      # Decrement support value
      new_support = rdflib.term.Literal(f'{support - 1}')
      graph_tmp.add((e, EDGE_SUPPORT, new_support))
    else:
      # Remove edge and triple(s)
      graph_tmp.remove((e, EDGE_SUBJECT, map_s_rdf))
      graph_tmp.remove((e, EDGE_PROPERTY, triple[1]))
      graph_tmp.remove((e, EDGE_OBJECT, map_o_rdf))
      # Remove every original/mapped combination of the triple. These must be
      # rdflib terms: a pattern built from N3 strings matches no stored triple.
      for subj in (triple[0], map_s_rdf):
        for obj in (triple[2], map_o_rdf):
          graph_tmp.remove((subj, triple[1], obj))

  # Find/update node support.
  # If the node support is larger than 1, decrement
  # If the node support is 1, do nothing (the node might still be there)
  for node_n3, node_rdf in ((map_s, map_s_rdf), (map_o, map_o_rdf)):
    node_qr = f'SELECT * WHERE {{ {node_n3} <http://rq.org/nodeSupport> ?o }}'
    for r in list(graph_tmp.query(node_qr)):
      support_rdf = r[0]
      support = int(support_rdf)
      if support > 1:
        new_support = rdflib.term.Literal(f'{support - 1}')
        graph_tmp.remove((node_rdf, NODE_SUPPORT, support_rdf))
        graph_tmp.add((node_rdf, NODE_SUPPORT, new_support))

Triple 3908

Now that we have removed the deleted triples, we can add the added triples to the summary in a similar way as the deletions:

In [None]:
# Find the next free edge ID: one past the highest N among the lines of the
# 2015-10 summary file that start with <http://rq.org/edgetwN>.
EDGE_PREFIX = '<http://rq.org/edgetw'

max_edge_id = None
# The context manager closes the HTTP response once the scan is done.
with urllib.request.urlopen(ontology1) as lines:
  for l in lines:
    line = l.decode('utf-8')
    if not line.startswith(EDGE_PREFIX):
      continue
    edge_id = line[len(EDGE_PREFIX):].split(">")[0]
    # Scan every edge line rather than trusting the last line of the file to
    # carry the highest ID (or to be an edge line at all).
    if edge_id.isdigit() and (max_edge_id is None or int(edge_id) > max_edge_id):
      max_edge_id = int(edge_id)

if max_edge_id is None:
  raise ValueError(f'no {EDGE_PREFIX}N> subject found in {ontology1}')

curr_id = max_edge_id + 1

print(curr_id)

21535


In [None]:
# Apply the added triples to the working summary `graph_tmp`.
#
# For every added triple (s, p, o):
#   1. look up the summary nodes of s and o through <http://rq.org/nodeMapping>
#      in the 2016-10 mapping (with fallbacks for unmapped objects);
#   2. increment the support of the matching reified edge, or create a new
#      edge with support 1 using the next free ID `curr_id`;
#   3. increment the support of both endpoint nodes, creating the support
#      statement with value 1 when the node has none yet.
EDGE_SUBJECT = rdflib.term.URIRef('http://rq.org/reifiedEdgeSubject')
EDGE_PROPERTY = rdflib.term.URIRef('http://rq.org/reifiedEdgeProperty')
EDGE_OBJECT = rdflib.term.URIRef('http://rq.org/reifiedEdgeObject')
EDGE_SUPPORT = rdflib.term.URIRef('http://rq.org/edgeSupport')
NODE_SUPPORT = rdflib.term.URIRef('http://rq.org/nodeSupport')

num = 0
for num, triple in enumerate(graph_add, start=1):
  print(f'\rTriple {num}', end = '')
  # First, we need to find the mappings for subject and object.
  s = triple[0].n3()
  p = triple[1].n3()
  o = triple[2].n3()

  # Find subject mapping; triples whose subject has no summary node are skipped.
  map_s = ""
  map_s_rdf = None
  s_mapping_qr = f'SELECT ?s WHERE {{ ?s <http://rq.org/nodeMapping> {s} }}'
  for r in graph_ont2_mapping.query(s_mapping_qr):
    map_s = r[0].n3()
    map_s_rdf = r[0]
  if map_s == "":
    continue

  # Find object mapping
  map_o = ""
  map_o_rdf = None
  o_mapping_qr = f'SELECT ?s WHERE {{ ?s <http://rq.org/nodeMapping> {o} }}'
  for r in graph_ont2_mapping.query(o_mapping_qr):
    map_o = r[0].n3()
    map_o_rdf = r[0]
  if map_o == "":
    # Fallback 1: an object already stored for (s, p) in the working graph.
    non_o_qr = f'SELECT ?o WHERE {{ {s} {p} ?o }}'
    for r in graph_tmp.query(non_o_qr):
      map_o = r[0].n3()
      map_o_rdf = r[0]
  if map_o == "":
    # Fallback 2: the first object found for property p on any subject.
    non_o_qr = f'SELECT ?o WHERE {{ ?s {p} ?o }}'
    for r in graph_tmp.query(non_o_qr):
      map_o = r[0].n3()
      map_o_rdf = r[0]
      break

  if map_o == "":
    continue

  # Find all corresponding edges in the graph.
  # If the edge exists increment support
  # If not, create edge
  edge_qr = f'''SELECT ?e ?es WHERE {{
                  ?e <http://rq.org/reifiedEdgeSubject> {map_s};
                    <http://rq.org/reifiedEdgeProperty> {p};
                    <http://rq.org/reifiedEdgeObject> {map_o};
                    <http://rq.org/edgeSupport> ?es .
                }}'''
  # Materialize the rows first: the loop body mutates the graph being queried.
  edges = list(graph_tmp.query(edge_qr))
  for r in edges:
    e = r[0]
    support_rdf = r[1]
    new_support = rdflib.term.Literal(f'{int(support_rdf) + 1}')
    graph_tmp.remove((e, EDGE_SUPPORT, support_rdf))
    graph_tmp.add((e, EDGE_SUPPORT, new_support))

  if not edges:
    e = rdflib.term.URIRef(f'http://rq.org/edgetw{curr_id}')
    curr_id = curr_id + 1
    graph_tmp.add((e, EDGE_SUBJECT, map_s_rdf))
    graph_tmp.add((e, EDGE_PROPERTY, triple[1]))
    graph_tmp.add((e, EDGE_OBJECT, map_o_rdf))
    graph_tmp.add((e, EDGE_SUPPORT, rdflib.term.Literal('1')))
    graph_tmp.add((triple[0], triple[1], map_o_rdf))

  # Find all corresponding nodes in the graph.
  # If the node exists, increment support
  # Otherwise create the node.
  for node_n3, node_rdf in ((map_s, map_s_rdf), (map_o, map_o_rdf)):
    node_qr = f'SELECT * WHERE {{ {node_n3} <http://rq.org/nodeSupport> ?o }}'
    supports = list(graph_tmp.query(node_qr))
    for r in supports:
      support_rdf = r[0]
      # Increment unconditionally: a node with support 1 must grow as well.
      new_support = rdflib.term.Literal(f'{int(support_rdf) + 1}')
      graph_tmp.remove((node_rdf, NODE_SUPPORT, support_rdf))
      graph_tmp.add((node_rdf, NODE_SUPPORT, new_support))

    if not supports:
      graph_tmp.add((node_rdf, NODE_SUPPORT, rdflib.term.Literal('1')))

Triple 4640

In [None]:
# Write the generated summary to 'output.nt' in N-Triples format.
graph_tmp.serialize(destination='output.nt', format='nt')

## Evaluation

In [None]:
# Build both summaries as sorted lists of non-empty N-Triples lines so they can
# be compared line by line.

# Reference 2016-10 summary; the context manager closes the HTTP response.
with urllib.request.urlopen(ontology2) as response:
  original_lines = [raw.decode('utf-8').rstrip() for raw in response]
original_summary = sorted(line for line in original_lines if line)

# Our incrementally updated summary. rdflib 5 returns bytes from serialize();
# later releases return str, so only decode when needed.
our_nt = graph_tmp.serialize(format='nt')
if isinstance(our_nt, bytes):
  our_nt = our_nt.decode(encoding='utf-8')
# Drop blank entries: the serialization ends with a newline, which would
# otherwise add an empty "line" to the comparison and to the line count.
our_lines = [raw.rstrip() for raw in our_nt.split('\n')]
our_summary = sorted(line for line in our_lines if line)

In [None]:
# ndiff is lazy; turning it into a list does the actual comparison,
# which can take more than 5 minutes to compute!
diff = list(difflib.ndiff(original_summary, our_summary))

In [None]:
class LinesBuffer:
  """Bounded FIFO of text lines: only the newest `size` entries are kept."""

  def __init__(self, size: int):
    self.__capacity = size
    self.__lines = []

  def put(self, l: str):
    """Append `l`, evicting the oldest entry once the capacity is exceeded."""
    self.__lines.append(l)
    if len(self.__lines) > self.__capacity:
      self.__lines.pop(0)

  def get_buffer(self):
    """Return a copy of the buffered lines, oldest first."""
    return list(self.__lines)

  def clear(self):
    """Forget every buffered line."""
    self.__lines = []

  def count_qm(self):
    """Return how many buffered lines have '?' as their first character."""
    return sum(1 for line in self.__lines if line[0] == '?')


# Classify the ndiff output.
#
# `d` collects every '+'/'-' line. Whenever the last four diff lines contain
# two '?' hint lines, the most recent '-' and '+' lines in that window are
# treated as a pair and compared character-wise:
#   * similarity below the threshold -> counted in `cd`, both lines stay in `d`;
#   * otherwise -> counted in `cs`, and both lines are dropped from `d`.
# NOTE(review): the window can pair a '-' line with a '+' line left over from
# the preceding change group; confirm this is acceptable for the metric.
SIMILARITY_THRESHOLD = 0.95

# Keep only changed/hint lines; unchanged lines start with a space.
diff = [l.rstrip() for l in diff if len(l) > 1 and l[0] != ' ']

d = set()

cd = 0
cs = 0
previous_lines = LinesBuffer(4)
for entry in diff:
  if entry[0] in ('+', '-'):
    d.add(entry)
  previous_lines.put(entry)
  if previous_lines.count_qm() != 2:
    continue

  lp = ''
  lm = ''
  for buffered in previous_lines.get_buffer():
    if buffered[0] == '-':
      lm = buffered
    elif buffered[0] == '+':
      lp = buffered
  similarity = difflib.SequenceMatcher(None, lm, lp).ratio()
  if similarity < SIMILARITY_THRESHOLD:
    cd += 1
  else:
    cs += 1
    # discard() instead of remove(): lm/lp stay '' when the window holds no
    # '-' or '+' line, and that must not abort the whole evaluation.
    d.discard(lm)
    d.discard(lp)
  previous_lines.clear()

total_lines = len(original_summary) + len(our_summary)

print(cd)
print(cs)
print(len(original_summary))
print(len(our_summary))
print(len(d))
# Share of differing lines over all lines; guarded against two empty inputs.
print(len(d) / total_lines if total_lines else 0.0)

35442
33993
112882
120784
94774
0.40559602167195913
51337


In [None]:
# Line-level similarity of the two sorted summaries; quick_ratio() is the
# cheaper upper-bound estimate of ratio().
s = difflib.SequenceMatcher(isjunk=None, a=original_summary, b=our_summary)
s.quick_ratio()