# Shape Prediction

An experiment in evolutionary software using *reinforcement learning* to make something interesting from a given set of graph data.

In [1]:
import kglab

# Prefix -> IRI map for the vocabularies used in the recipe graph.
namespaces = dict(
    nom="http://example.org/#",
    wtm="http://purl.org/heals/food/",
    ind="http://purl.org/heals/ingredient/",
    skos="http://www.w3.org/2004/02/skos/core#",
)

# Create the knowledge graph, then populate it from `dat/recipes.ttl`.
kg = kglab.KnowledgeGraph(
    name="A recipe KG example based on Food.com",
    base_uri="https://www.food.com/recipe/",
    language="en",
    namespaces=namespaces,
)
kg.load_rdf("dat/recipes.ttl")

In [2]:
import sys
import inspect

# List every class exposed at the top level of the `kglab` package.
# The already-imported module object is handed straight to
# `inspect.getmembers` instead of rebinding the notebook's `__name__` global
# to look the module up in `sys.modules`; overwriting `__name__` would leak
# into every later cell of the session.
clsmembers = inspect.getmembers(kglab, inspect.isclass)
clsmembers

[('EvoShape', kglab.kglab.EvoShape),
 ('EvoShapeEdge', kglab.kglab.EvoShapeEdge),
 ('EvoShapeNode', kglab.kglab.EvoShapeNode),
 ('KnowledgeGraph', kglab.kglab.KnowledgeGraph),
 ('Leaderboard', kglab.kglab.Leaderboard),
 ('Measure', kglab.kglab.Measure),
 ('ShapeFactory', kglab.kglab.ShapeFactory),
 ('Simplex0', kglab.kglab.Simplex0),
 ('Simplex1', kglab.kglab.Simplex1),
 ('Subgraph', kglab.kglab.Subgraph)]

## Graph measures and topological analysis

Let's measure this graph, to develop some estimators that we'll use later...

In [3]:
import pandas as pd

# Show every row when displaying DataFrames. Use the fully qualified key:
# the bare "max_rows" pattern is ambiguous on newer pandas releases (it also
# matches "styler.render.max_rows") and raises OptionError there.
pd.set_option("display.max_rows", None)

# Build a Measure and run it over the loaded graph; the cells below read its
# counters (`edge_count`, `node_count`) and its `*_gen` tallies.
measure = kglab.Measure()
measure.measure_graph(kg)

In [4]:
# Report the overall size of the measured graph.
print(f"edges {measure.edge_count}")
print(f"nodes {measure.node_count}")

edges 1891
nodes 286


In [5]:
# Tally from `s_gen`; judging by the output these are counts per subject URI — TODO confirm.
measure.s_gen.get_tally()

Unnamed: 0,count
https://www.food.com/recipe/103964,11
https://www.food.com/recipe/430777,10
https://www.food.com/recipe/137158,10
https://www.food.com/recipe/268209,10
https://www.food.com/recipe/135405,10
https://www.food.com/recipe/76907,10
https://www.food.com/recipe/123656,10
http://example.org/#Pancake,10
https://www.food.com/recipe/220361,10
https://www.food.com/recipe/12055,10


In [6]:
# Tally from `p_gen`; judging by the output these are counts per predicate URI — TODO confirm.
measure.p_gen.get_tally()

Unnamed: 0,count
http://purl.org/heals/food/hasIngredient,1078
http://www.w3.org/1999/02/22-rdf-syntax-ns#type,286
http://www.w3.org/2004/02/skos/core#definition,249
http://purl.org/heals/food/hasCookTime,240
http://www.w3.org/2004/02/skos/core#prefLabel,9
http://purl.org/dc/terms/identifier,7
http://www.w3.org/2004/02/skos/core#narrower,5
http://www.w3.org/2004/02/skos/core#altLabel,5
http://www.w3.org/2004/02/skos/core#closeMatch,3
http://example.org/#usesProcess,2


In [7]:
# Tally from `o_gen`; judging by the output these are counts per object URI — TODO confirm.
measure.o_gen.get_tally()

Unnamed: 0,count
http://purl.org/heals/food/Recipe,240
http://purl.org/heals/ingredient/AllPurposeFlour,190
http://purl.org/heals/ingredient/ChickenEgg,170
http://purl.org/heals/ingredient/Salt,158
http://purl.org/heals/ingredient/Butter,147
http://purl.org/heals/ingredient/CowMilk,132
http://purl.org/heals/ingredient/WhiteSugar,99
http://purl.org/heals/ingredient/Water,70
http://purl.org/heals/ingredient/VanillaExtract,54
http://example.org/#Noodle,21


In [8]:
# Tally from `l_gen`; judging by the output these are counts per literal value (e.g. "PT30M") — TODO confirm.
measure.l_gen.get_tally()

Unnamed: 0,count
PT30M,29
PT20M,21
PT25M,20
PT35M,18
PT40M,18
PT10M,12
PT15M,11
PT1H,10
PT5M,8
PT1H30M,7


In [9]:
# `get_tally_map()` returns two values: a tally table and a link map.
# Display the table; the output is keyed by (subject, predicate) pairs.
df, link_map = measure.n_gen.get_tally_map()
df

Unnamed: 0,count
"(https://www.food.com/recipe/430777, http://purl.org/heals/food/hasIngredient)",7
"(https://www.food.com/recipe/76907, http://purl.org/heals/food/hasIngredient)",7
"(https://www.food.com/recipe/135405, http://purl.org/heals/food/hasIngredient)",7
"(https://www.food.com/recipe/123656, http://purl.org/heals/food/hasIngredient)",7
"(https://www.food.com/recipe/137158, http://purl.org/heals/food/hasIngredient)",7
"(https://www.food.com/recipe/103964, http://purl.org/heals/food/hasIngredient)",7
"(https://www.food.com/recipe/151617, http://purl.org/heals/food/hasIngredient)",7
"(https://www.food.com/recipe/268209, http://purl.org/heals/food/hasIngredient)",7
"(https://www.food.com/recipe/100230, http://purl.org/heals/food/hasIngredient)",6
"(https://www.food.com/recipe/9037, http://purl.org/heals/food/hasIngredient)",6


In [10]:
# Same call on `e_gen`. Note: this rebinds both `df` and `link_map` from the previous cell.
df, link_map = measure.e_gen.get_tally_map()

In [11]:
# Dump the `e_gen` link map: a defaultdict of sets that, per the output, maps each predicate to the objects seen with it.
print(link_map)

defaultdict(<class 'set'>, {rdflib.term.URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'): {rdflib.term.URIRef('http://example.org/#Noodle'), rdflib.term.URIRef('http://example.org/#Pancake'), rdflib.term.URIRef('http://www.w3.org/2004/02/skos/core#ConceptScheme'), rdflib.term.URIRef('http://www.w3.org/2004/02/skos/core#Concept'), rdflib.term.URIRef('http://example.org/#Component'), rdflib.term.URIRef('http://purl.org/heals/food/Recipe')}, rdflib.term.URIRef('http://purl.org/heals/food/hasIngredient'): {rdflib.term.URIRef('http://purl.org/heals/ingredient/Salt'), rdflib.term.URIRef('http://purl.org/heals/ingredient/ChickenEgg'), rdflib.term.URIRef('http://purl.org/heals/ingredient/AllPurposeFlour'), rdflib.term.URIRef('http://purl.org/heals/ingredient/WholeWheatFlour'), rdflib.term.URIRef('http://purl.org/heals/ingredient/Bacon'), rdflib.term.URIRef('http://purl.org/heals/ingredient/Honey'), rdflib.term.URIRef('http://purl.org/heals/ingredient/AppleCiderVinegar'), rdflib.term.U

## ShapeFactory and evolved shapes

In [12]:
# Create a ShapeFactory from the graph and its measures, and keep a handle on
# the factory's subgraph, which later cells pass to serialize()/deserialize().
factory = kglab.ShapeFactory(kg, measure)
subgraph = factory.subgraph

In [13]:
# Ask the factory for a new shape.
es0 = factory.new_shape()

# Show its ordinal serialization, then its RDF statements, one per line.
print(es0.serialize(subgraph))

# A plain loop instead of a list comprehension used only for its side
# effect: no throwaway list of `None`s, and no trailing `;` needed to
# suppress it in the cell output.
for r in es0.get_rdf():
    print(r)

[240, (-1, [(299, 298)]), (298, [])]
_:N05fd2a6a4e8a455f9b6dd0ea418df326 rdf:type wtm:Recipe .


Now we can use this `ShapeFactory` object to evolve a *shape* within the graph, then generate a SPARQL query to test its cardinality:

In [14]:
# Turn the shape into a SPARQL query string plus the variable bindings it needs,
# and print them on separate lines.
sparql, bindings = es0.get_sparql()

print(sparql, bindings, sep="\n")

SELECT DISTINCT ?v1 WHERE { ?v1 ?pred1_0 ?node0 }
{'?node0': rdflib.term.URIRef('http://purl.org/heals/food/Recipe'), '?pred1_0': rdflib.term.URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type')}


In [15]:
# The generated query text contains only placeholder variables; the actual
# predicate/object constraints live in `bindings`. They must be passed along
# with the query -- without them the triple pattern is unconstrained and
# matches every subject in the graph rather than the instances of the shape.
for row in kg.query(sparql, bindings=bindings):
    print(row)

(rdflib.term.URIRef('https://www.food.com/recipe/7536'),)
(rdflib.term.URIRef('https://www.food.com/recipe/157638'),)
(rdflib.term.URIRef('https://www.food.com/recipe/504246'),)
(rdflib.term.URIRef('https://www.food.com/recipe/220141'),)
(rdflib.term.URIRef('https://www.food.com/recipe/284620'),)
(rdflib.term.URIRef('https://www.food.com/recipe/511497'),)
(rdflib.term.URIRef('https://www.food.com/recipe/467335'),)
(rdflib.term.URIRef('https://www.food.com/recipe/350593'),)
(rdflib.term.URIRef('https://www.food.com/recipe/438738'),)
(rdflib.term.URIRef('https://www.food.com/recipe/148900'),)
(rdflib.term.URIRef('https://www.food.com/recipe/21301'),)
(rdflib.term.URIRef('https://www.food.com/recipe/160402'),)
(rdflib.term.URIRef('https://www.food.com/recipe/62799'),)
(rdflib.term.URIRef('https://www.food.com/recipe/458'),)
(rdflib.term.URIRef('https://www.food.com/recipe/274637'),)
(rdflib.term.URIRef('https://www.food.com/recipe/103073'),)
(rdflib.term.URIRef('https://www.food.com/recip

We can also use this library to construct a specific shape programmatically, e.g., a recipe:

In [16]:
# Hand-build a shape: a recipe that uses four specific ingredients.
es1 = kglab.EvoShape(kg, measure)

# rdf:type constraint on the root node.
type_uri = "http://purl.org/heals/food/Recipe"
type_node = kglab.EvoShapeNode(type_uri, terminal=True)
es1.add_link(es1.root, kg.get_ns("rdf").type, type_node)

# One `hasIngredient` link from the root per ingredient. The links are added
# in the listed order, and the loop leaves `edge_node_uri` / `edge_node`
# bound to the last ingredient, exactly as the unrolled version did.
edge_uri = "http://purl.org/heals/food/hasIngredient"

for edge_node_uri in (
    "http://purl.org/heals/ingredient/VanillaExtract",
    "http://purl.org/heals/ingredient/AllPurposeFlour",
    "http://purl.org/heals/ingredient/Salt",
    "http://purl.org/heals/ingredient/ChickenEgg",
):
    edge_node = kglab.EvoShapeNode(edge_node_uri)
    es1.add_link(es1.root, edge_uri, edge_node)

In [17]:
# Print the shape as RDF statements, one per line. A plain loop replaces the
# list comprehension that was used only for its `print` side effect.
for r in es1.get_rdf():
    print(r)

# Display the shape's ordinal serialization as the cell result.
es1.serialize(subgraph)

_:Nea79f2a4dcb24450ad5e6ac6ad2f4e0f rdf:type wtm:Recipe .
_:Nea79f2a4dcb24450ad5e6ac6ad2f4e0f wtm:hasIngredient ind:VanillaExtract .
_:Nea79f2a4dcb24450ad5e6ac6ad2f4e0f wtm:hasIngredient ind:AllPurposeFlour .
_:Nea79f2a4dcb24450ad5e6ac6ad2f4e0f wtm:hasIngredient ind:Salt .
_:Nea79f2a4dcb24450ad5e6ac6ad2f4e0f wtm:hasIngredient ind:ChickenEgg .


[14,
 (-1, [(15, 16), (15, 22), (15, 27), (15, 28), (299, 13)]),
 (13, []),
 (16, []),
 (22, []),
 (27, []),
 (28, [])]

In [18]:
# Generate the SPARQL query and bindings for the hand-built shape,
# and print them on separate lines.
sparql, bindings = es1.get_sparql()

print(sparql, bindings, sep="\n")

SELECT DISTINCT ?v0 WHERE { ?v0 ?pred0_0 ?obj0_0 . ?v0 ?pred0_1 ?obj0_1 . ?v0 ?pred0_1 ?obj0_2 . ?v0 ?pred0_1 ?obj0_3 . ?v0 ?pred0_1 ?obj0_4 }
{'?pred0_0': rdflib.term.URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'), '?obj0_0': rdflib.term.URIRef('http://purl.org/heals/food/Recipe'), '?pred0_1': rdflib.term.URIRef('http://purl.org/heals/food/hasIngredient'), '?obj0_1': rdflib.term.URIRef('http://purl.org/heals/ingredient/VanillaExtract'), '?obj0_2': rdflib.term.URIRef('http://purl.org/heals/ingredient/AllPurposeFlour'), '?obj0_3': rdflib.term.URIRef('http://purl.org/heals/ingredient/Salt'), '?obj0_4': rdflib.term.URIRef('http://purl.org/heals/ingredient/ChickenEgg')}


Query to find matching instances for this shape `es1` within the graph:

In [19]:
# Run the generated query with its bindings applied and print each matching row.
for row in kg.query(sparql, bindings=bindings):
    print(row)

(rdflib.term.URIRef('https://www.food.com/recipe/135405'),)
(rdflib.term.URIRef('https://www.food.com/recipe/272433'),)
(rdflib.term.URIRef('https://www.food.com/recipe/268242'),)
(rdflib.term.URIRef('https://www.food.com/recipe/371414'),)
(rdflib.term.URIRef('https://www.food.com/recipe/137158'),)
(rdflib.term.URIRef('https://www.food.com/recipe/62799'),)
(rdflib.term.URIRef('https://www.food.com/recipe/144415'),)
(rdflib.term.URIRef('https://www.food.com/recipe/430777'),)
(rdflib.term.URIRef('https://www.food.com/recipe/362055'),)
(rdflib.term.URIRef('https://www.food.com/recipe/151617'),)
(rdflib.term.URIRef('https://www.food.com/recipe/20191'),)
(rdflib.term.URIRef('https://www.food.com/recipe/73828'),)
(rdflib.term.URIRef('https://www.food.com/recipe/123656'),)
(rdflib.term.URIRef('https://www.food.com/recipe/76907'),)


## Leaderboard which can be distributed across a cluster

We can calculate metrics to describe how these shapes `es0` and `es1` might rank on a *leaderboard*:

In [20]:
# Cardinality of the factory-generated shape `es0`.
es0.get_cardinality()

240

In [21]:
# Cardinality of the hand-built shape `es1`; it equals the number of rows the `es1` query returned above.
es1.get_cardinality()

14

Then calculate a vector distance between `es1` and `es0` which we'd generated earlier:

In [22]:
# Distance between the two shapes, as computed by `es0` against `es1`.
es0.calc_distance(es1)

0.16666666666666666

Now we can generate a compact, ordinal representation for the `es1` shape, which can be serialized as a string, transferred across a network to an actor, then deserialized as the same shape -- *as long as we use a similarly structured subgraph*.

In [23]:
import json

# Serialize the shape against the subgraph, then encode that structure as a JSON string.
ser = es1.serialize(subgraph)
j_ser = json.dumps(ser)

print(j_ser)

[14, [-1, [[15, 16], [15, 22], [15, 27], [15, 28], [299, 13]]], [13, []], [16, []], [22, []], [27, []], [28, []]]


In [24]:
# Decode the JSON string again. This rebinds `ser`; the tuples of the
# original serialization come back as lists.
ser = json.loads(j_ser)
ser

[14,
 [-1, [[15, 16], [15, 22], [15, 27], [15, 28], [299, 13]]],
 [13, []],
 [16, []],
 [22, []],
 [27, []],
 [28, []]]

In [25]:
# The subgraph's list of URIs. The integers in a serialized shape line up with
# positions in this list (e.g. 13, 15, 16 -> Recipe, hasIngredient, AllPurposeFlour).
print(subgraph.id_list)

['http://example.org/#Batter', 'http://example.org/#Component', 'http://example.org/#Dough', 'http://example.org/#Kneading', 'http://example.org/#Mixing', 'http://example.org/#NOM_Vocab', 'http://example.org/#Noodle', 'http://example.org/#Pancake', 'http://example.org/#Process', 'http://example.org/#madeFrom', 'http://example.org/#usesProcess', 'http://purl.org/dc/terms/identifier', 'http://purl.org/dc/terms/publisher', 'http://purl.org/heals/food/Recipe', 'http://purl.org/heals/food/hasCookTime', 'http://purl.org/heals/food/hasIngredient', 'http://purl.org/heals/ingredient/AllPurposeFlour', 'http://purl.org/heals/ingredient/AppleCiderVinegar', 'http://purl.org/heals/ingredient/Bacon', 'http://purl.org/heals/ingredient/BlackPepper', 'http://purl.org/heals/ingredient/BrownSugar', 'http://purl.org/heals/ingredient/Butter', 'http://purl.org/heals/ingredient/ChickenEgg', 'http://purl.org/heals/ingredient/CowMilk', 'http://purl.org/heals/ingredient/Garlic', 'http://purl.org/heals/ingredient

Test the deserialization:

In [26]:
# Rebuild the shape from its JSON-decoded form into a fresh EvoShape,
# keeping the map returned by deserialize() for inspection below.
es2 = kglab.EvoShape(kg, measure)
uri_map = es2.deserialize(ser, subgraph)

In [27]:
# The root node's URI (printed as None in the output below).
print(es2.root.uri)

# Each entry of the map returned by deserialize(): URI -> EvoShapeNode.
for k, v in uri_map.items():
    print(k, v)

None
None <kglab.kglab.EvoShapeNode object at 0x11e249a50>
http://purl.org/heals/ingredient/AllPurposeFlour <kglab.kglab.EvoShapeNode object at 0x11e254550>
http://purl.org/heals/ingredient/ChickenEgg <kglab.kglab.EvoShapeNode object at 0x11e254dd0>
http://purl.org/heals/ingredient/Salt <kglab.kglab.EvoShapeNode object at 0x11e2543d0>
http://purl.org/heals/ingredient/VanillaExtract <kglab.kglab.EvoShapeNode object at 0x11e254790>
http://purl.org/heals/food/Recipe <kglab.kglab.EvoShapeNode object at 0x11e254190>


In [28]:
# Walk the edges hanging off the root: show the target node object, then the predicate and the target's URI.
for e in es2.root.edges:
    print("obj", e.obj)
    print("edge", e.pred, e.obj.uri)

obj <kglab.kglab.EvoShapeNode object at 0x11e254550>
edge http://purl.org/heals/food/hasIngredient http://purl.org/heals/ingredient/AllPurposeFlour
obj <kglab.kglab.EvoShapeNode object at 0x11e254dd0>
edge http://purl.org/heals/food/hasIngredient http://purl.org/heals/ingredient/ChickenEgg
obj <kglab.kglab.EvoShapeNode object at 0x11e2543d0>
edge http://purl.org/heals/food/hasIngredient http://purl.org/heals/ingredient/Salt
obj <kglab.kglab.EvoShapeNode object at 0x11e254790>
edge http://purl.org/heals/food/hasIngredient http://purl.org/heals/ingredient/VanillaExtract
obj <kglab.kglab.EvoShapeNode object at 0x11e254190>
edge http://www.w3.org/1999/02/22-rdf-syntax-ns#type http://purl.org/heals/food/Recipe


In [29]:
# List every node of the rebuilt shape with its URI (the root shows up with URI None).
for n in es2.nodes:
    print(n)
    print(n.uri)

<kglab.kglab.EvoShapeNode object at 0x11e254790>
http://purl.org/heals/ingredient/VanillaExtract
<kglab.kglab.EvoShapeNode object at 0x11e249a50>
None
<kglab.kglab.EvoShapeNode object at 0x11e2543d0>
http://purl.org/heals/ingredient/Salt
<kglab.kglab.EvoShapeNode object at 0x11e254550>
http://purl.org/heals/ingredient/AllPurposeFlour
<kglab.kglab.EvoShapeNode object at 0x11e254190>
http://purl.org/heals/food/Recipe
<kglab.kglab.EvoShapeNode object at 0x11e254dd0>
http://purl.org/heals/ingredient/ChickenEgg


In [30]:
# Print the rebuilt shape as RDF statements, one per line. A plain loop
# replaces the list comprehension that was used only for its `print` side effect.
for r in es2.get_rdf():
    print(r)

# Display its serialization, for comparison with the serialization of `es1`.
es2.serialize(subgraph)

_:N0c0722c59d9946a2887ebc13abec8436 wtm:hasIngredient ind:AllPurposeFlour .
_:N0c0722c59d9946a2887ebc13abec8436 wtm:hasIngredient ind:ChickenEgg .
_:N0c0722c59d9946a2887ebc13abec8436 wtm:hasIngredient ind:Salt .
_:N0c0722c59d9946a2887ebc13abec8436 wtm:hasIngredient ind:VanillaExtract .
_:N0c0722c59d9946a2887ebc13abec8436 rdf:type wtm:Recipe .


[14,
 (-1, [(15, 16), (15, 22), (15, 27), (15, 28), (299, 13)]),
 (13, []),
 (16, []),
 (22, []),
 (27, []),
 (28, [])]

In [31]:
# Display the (query, bindings) pair generated for the deserialized shape.
es2.get_sparql()

('SELECT DISTINCT ?v1 WHERE { ?v1 ?pred1_0 ?obj1_0 . ?v1 ?pred1_0 ?obj1_1 . ?v1 ?pred1_0 ?obj1_2 . ?v1 ?pred1_0 ?node0 . ?v1 ?pred1_4 ?obj1_4 }',
 {'?node0': rdflib.term.URIRef('http://purl.org/heals/ingredient/VanillaExtract'),
  '?pred1_0': rdflib.term.URIRef('http://purl.org/heals/food/hasIngredient'),
  '?obj1_0': rdflib.term.URIRef('http://purl.org/heals/ingredient/AllPurposeFlour'),
  '?obj1_1': rdflib.term.URIRef('http://purl.org/heals/ingredient/ChickenEgg'),
  '?obj1_2': rdflib.term.URIRef('http://purl.org/heals/ingredient/Salt'),
  '?pred1_4': rdflib.term.URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'),
  '?obj1_4': rdflib.term.URIRef('http://purl.org/heals/food/Recipe')})

Prototype a leaderboard:

In [32]:
# Start with a new leaderboard and display its `df` table (no rows yet).
leaderboard = kglab.Leaderboard()
leaderboard.df

Unnamed: 0,instances,nodes,distance,rank,shape


In [33]:
# Add the serialized `es0` shape and show the updated table.
# NOTE(review): what the value returned by add_shape() represents is not
# evident from this notebook — confirm against kglab.Leaderboard.
dist = leaderboard.add_shape(es0.serialize(subgraph))
print(dist)
leaderboard.df

0


Unnamed: 0,instances,nodes,distance,rank,shape
0,240,2,0.0,0.0,"[240, (-1, [(299, 298)]), (298, [])]"


In [34]:
# Add the serialized `es1` shape; the table now has two rows.
# NOTE(review): what the value returned by add_shape() represents is not
# evident from this notebook — confirm against kglab.Leaderboard.
dist = leaderboard.add_shape(es1.serialize(subgraph))
print(dist)
leaderboard.df

0


Unnamed: 0,instances,nodes,distance,rank,shape
0,240,2,0.166667,1.732051,"[240, (-1, [(299, 298)]), (298, [])]"
1,14,6,0.166667,1.732051,"[14, (-1, [(15, 16), (15, 22), (15, 27), (15, ..."


In [35]:
# Hand-build a third shape: a recipe that uses butter.
es3 = kglab.EvoShape(kg, measure)

# rdf:type constraint on the root node.
type_uri = "http://purl.org/heals/food/Recipe"
type_node = kglab.EvoShapeNode(type_uri, terminal=True)
es3.add_link(es3.root, kg.get_ns("rdf").type, type_node)

# A single `hasIngredient` link from the root to Butter.
edge_uri = "http://purl.org/heals/food/hasIngredient"
edge_node_uri = "http://purl.org/heals/ingredient/Butter"
edge_node = kglab.EvoShapeNode(edge_node_uri)
es3.add_link(es3.root, edge_uri, edge_node)

In [36]:
# Serialize `es3` against the subgraph and display the result.
shape = es3.serialize(subgraph)
shape

[147, (-1, [(15, 21), (299, 13)]), (13, []), (21, [])]

In [37]:
# Add the serialized `es3` shape (serialized again here rather than reusing `shape`)
# and show the updated table.
# NOTE(review): what the value returned by add_shape() represents is not
# evident from this notebook — confirm against kglab.Leaderboard.
dist = leaderboard.add_shape(es3.serialize(subgraph))
print(dist)

leaderboard.df

2


Unnamed: 0,instances,nodes,distance,rank,shape
2,147,3,0.333333,2.380476,"[147, (-1, [(15, 21), (299, 13)]), (13, []), (..."
0,240,2,0.166667,2.081666,"[240, (-1, [(299, 298)]), (298, [])]"
1,14,6,0.166667,2.081666,"[14, (-1, [(15, 16), (15, 22), (15, 27), (15, ..."


## Generating triads from co-occurrence