<a href="https://colab.research.google.com/github/CyrineG/lfx-2022/blob/main/KG_script.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install rdflib neo4j

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import rdflib, time

In [None]:
#utility function to get the local part of a URI (stripping out the namespace)

def getLocalPart(uri):
  """Return the part of `uri` that follows its namespace.

  The namespace ends at the last '#' if the URI contains one, otherwise at
  the last '/'. When neither character is present the whole string is
  returned unchanged.
  """
  separator = '#' if '#' in uri else '/'
  # rpartition yields ('', '', uri) when the separator is missing, so the
  # third element is always the local part.
  return uri.rpartition(separator)[2]

def getNamespacePart(uri):
  """Return the namespace prefix of `uri`, including its trailing separator.

  The namespace ends at the last '#' if the URI contains one, otherwise at
  the last '/'. When neither character is present an empty string is
  returned.
  """
  separator = '#' if '#' in uri else '/'
  # rpartition yields ('', '', uri) when the separator is missing, which
  # makes head + found collapse to the empty string.
  head, found, _ = uri.rpartition(separator)
  return head + found

# quick test: split one sample URI into its local name and its namespace
print(
    getLocalPart("http://onto.neo4j.com/rail#Station"),
    getNamespacePart("http://onto.neo4j.com/rail#Station"),
    sep="\n",
)


Station
http://onto.neo4j.com/rail#


In [None]:
# processing the ontology...

# Parse the Turtle ontology published in the project repository.
g = rdflib.Graph()
g.parse(
    "https://raw.githubusercontent.com/CyrineG/lfx-2022/main/onto/kubernetes-resources2.ttl",
    format='turtle',
)

# SPARQL: every subject that is explicitly typed as an owl:Class.
simple_query = """
prefix owl: <http://www.w3.org/2002/07/owl#> 
prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> 

SELECT DISTINCT ?c
  WHERE {
    ?c rdf:type owl:Class .    
  } """

# For each class print: full URI, local name, namespace (space separated).
for row in g.query(simple_query):
    print(*[view(str(row.c)) for view in (str, getLocalPart, getNamespacePart)])

http://www.semanticweb.org/cyrine/ontologies/2022/8/kubernetes-resources#pod pod http://www.semanticweb.org/cyrine/ontologies/2022/8/kubernetes-resources#
http://www.semanticweb.org/cyrine/ontologies/2022/8/kubernetes-resources#replicaset replicaset http://www.semanticweb.org/cyrine/ontologies/2022/8/kubernetes-resources#


In [None]:
# get json data and convert to csv

import json
import pandas
import requests

def load_json(url: str, timeout: float = 30.0) -> dict:
  """Download `url` and return its body parsed as JSON.

  Args:
    url: Address of the JSON document to fetch.
    timeout: Seconds to wait for the server before giving up, so a stalled
      connection cannot hang the notebook indefinitely.

  Returns:
    The decoded JSON payload.

  Raises:
    requests.HTTPError: If the server answers with a 4xx/5xx status.
    requests.RequestException: On connection problems or timeout.
    json.JSONDecodeError: If the body is not valid JSON.
  """
  response = requests.get(url, timeout=timeout)

  # Fail on HTTP errors here; otherwise an HTML error page would only show up
  # later as a confusing JSON decoding error.
  response.raise_for_status()

  return json.loads(response.content)


def create_dataframe(data: list) -> pandas.DataFrame:
  """Flatten a list of (possibly nested) records into a single DataFrame.

  Each record is passed through `pandas.json_normalize`, so nested keys
  become dot-separated column names. Records with different keys are
  combined on the union of their columns, in order of first appearance.

  Args:
    data: Iterable of JSON-like records (typically dicts).

  Returns:
    A DataFrame with one row per normalized record and a fresh 0..n-1 index;
    an empty DataFrame when `data` is empty.
  """
  # Normalize every record first and concatenate once: DataFrame.append was
  # removed in pandas 2.0, and growing a frame inside a loop is quadratic.
  records = [pandas.json_normalize(d) for d in data]

  # pandas.concat refuses an empty list, so keep returning an empty frame.
  if not records:
    return pandas.DataFrame()

  return pandas.concat(records, ignore_index=True, sort=False)


# Download the raw resource dumps; each one is indexed by its 'items' key below.
pods_data = load_json('https://raw.githubusercontent.com/CyrineG/lfx-2022/main/data/pods.json')
rs_data = load_json('https://raw.githubusercontent.com/CyrineG/lfx-2022/main/data/replicasets.json')

# Flatten the entries under 'items' into one row per resource.
pods_dataframe = create_dataframe(pods_data['items'])
rs_dataframe = create_dataframe(rs_data['items'])

# Show the flattened column names; these are the names the mappings refer to.
print("Normalized PODS Columns:", list(pods_dataframe.columns))
print("Normalized REPLICASETS Columns:", list(rs_dataframe.columns))

# Persist both frames as CSV (without the index column) for the import step.
pods_dataframe.to_csv("pods.csv", index=False)
rs_dataframe.to_csv("rs.csv", index=False)



Normalized PODS Columns: ['apiVersion', 'kind', 'metadata.creationTimestamp', 'metadata.generateName', 'metadata.labels.app', 'metadata.labels.pod-template-hash', 'metadata.name', 'metadata.namespace', 'metadata.ownerReferences', 'metadata.resourceVersion', 'metadata.uid', 'spec.containers', 'spec.dnsPolicy', 'spec.enableServiceLinks', 'spec.nodeName', 'spec.preemptionPolicy', 'spec.priority', 'spec.restartPolicy', 'spec.schedulerName', 'spec.serviceAccount', 'spec.serviceAccountName', 'spec.terminationGracePeriodSeconds', 'spec.tolerations', 'spec.volumes', 'status.conditions', 'status.containerStatuses', 'status.hostIP', 'status.phase', 'status.qosClass', 'status.startTime', 'status.podIP', 'status.podIPs']
Normalized REPLICASETS Columns: ['apiVersion', 'kind', 'metadata.annotations.deployment.kubernetes.io/desired-replicas', 'metadata.annotations.deployment.kubernetes.io/max-replicas', 'metadata.annotations.deployment.kubernetes.io/revision', 'metadata.creationTimestamp', 'metadata.

In [None]:
# Per-class mapping from ontology names to CSV columns.
#   "@fileName": CSV file holding the records of that class
#   "@uniqueId": ontology property used as the node's merge key
#   any other key: ontology property name -> CSV column name
podMapping = {
  "@fileName": "pods.csv",
  "@uniqueId": "uid",
  "uid": "metadata.uid",
}

replicasetMapping = {
  "@fileName": "rs.csv",
  "@uniqueId": "uid",
  "uid": "metadata.uid",
}

# Keyed by the local name of the ontology class.
resourcesMappings = {
  "pod": podMapping,
  "replicaset": replicasetMapping,
}

# show it?
resourcesMappings


{'pod': {'@fileName': 'pods.csv', '@uniqueId': 'uid', 'uid': 'metadata.uid'},
 'replicaset': {'@fileName': 'rs.csv',
  '@uniqueId': 'uid',
  'uid': 'metadata.uid'}}

In [None]:
# read the ontology and generate cypher with mapping

def getLoadersFromOnto(onto, rdf_format, mappings):
  """Build Cypher loaders and n10s mapping statements from an ontology.

  Args:
    onto: Source of the ontology, handed to `rdflib.Graph.parse`.
    rdf_format: RDF serialization of `onto` (passed as `format=`).
    mappings: Dict keyed by the local name of each ontology class. Each value
      is a dict with "@uniqueId" (the property used as merge key) plus one
      entry per mapped property: property local name -> source column name.

  Returns:
    A tuple `(cypher_import, mapping_export_cypher)`:
      * cypher_import: dict of class local name -> Cypher statement that
        merges one node per entry of the `$records` parameter.
      * mapping_export_cypher: list of `call n10s.nsprefixes.add(...)` and
        `call n10s.mapping.add(...)` statements for every namespace, class
        and datatype property found.

  Raises:
    KeyError: If the ontology declares a class that has no entry in
      `mappings`, or an entry lacks "@uniqueId" or its id column.
  """
  g = rdflib.Graph()
  g.parse(onto, format=rdf_format)

  # rdfs: is declared explicitly so the query does not depend on the graph's
  # default namespace bindings.
  classes_and_props_query = """
  prefix owl: <http://www.w3.org/2002/07/owl#> 
  prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> 
  prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> 

  SELECT DISTINCT ?curi (GROUP_CONCAT( ?propTypePair ; SEPARATOR=",") AS ?props)
  WHERE {
      ?curi rdf:type owl:Class .
      optional { 
        ?prop rdfs:domain ?curi ;
          a owl:DatatypeProperty ;
          rdfs:range ?range .
        BIND (concat(str(?prop),';',str(?range)) AS ?propTypePair)
      }
    } GROUP BY ?curi  """

  cypher_import = {}
  export_ns = set()
  export_mappings = {}

  for row in g.query(classes_and_props_query):
    class_name = getLocalPart(row.curi)
    class_mapping = mappings[class_name]
    id_prop = class_mapping["@uniqueId"]

    export_ns.add(getNamespacePart(row.curi))
    export_mappings[class_name] = str(row.curi)

    cypher = [
      "unwind $records AS record",
      "merge (n:" + class_name + " { `" + id_prop + "`: record.`" + class_mapping[id_prop] + "`} )",
    ]

    # A class without datatype properties produces an empty (or unbound)
    # GROUP_CONCAT; treat both as "no properties".
    props = str(row.props) if row.props is not None else ""
    for pair in props.split(","):
      # Each pair is "<property URI>;<range URI>"; only the property is used.
      propName = pair.split(";")[0]
      if not propName:
        # Skip empty entries so no blank namespace or blank mapping is
        # registered for export.
        continue
      prop_local = getLocalPart(propName)
      export_ns.add(getNamespacePart(propName))
      export_mappings[prop_local] = propName
      # Only set properties that have a source column and are not the merge
      # key (the key is already written by the merge clause).
      if prop_local in class_mapping and prop_local != id_prop:
        cypher.append("set n." + prop_local + " = record.`" + class_mapping[prop_local] + "`")

    cypher.append("return count(*) as total")
    cypher_import[class_name] = ' \n'.join(cypher)


  # NOTE: relationship loaders are not generated yet. The query and the
  # disabled loop below are kept as the starting point for that work.
  rels_query = """
  prefix owl: <http://www.w3.org/2002/07/owl#> 
  prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> 
  prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> 

  SELECT DISTINCT ?rel ?dom ?ran #(GROUP_CONCAT(DISTINCT ?relTriplet ; SEPARATOR=",") AS ?rels)
  WHERE {
      ?rel a ?propertyClass .
      filter(?propertyClass in (rdf:Property, owl:ObjectProperty, owl:FunctionalProperty, owl:AsymmetricProperty, 
            owl:InverseFunctionalProperty, owl:IrreflexiveProperty, owl:ReflexiveProperty, owl:SymmetricProperty, owl:TransitiveProperty))
      
      ?rel rdfs:domain ?dom ;
        rdfs:range ?ran .
      
      #BIND (concat(str(?rel),';',str(?dom),';',str(?range)) AS ?relTriplet)
      
    }"""

  #for row in g.query(rels_query):
   # export_ns.add(getNamespacePart(row.rel))
  #  export_mappings[getLocalPart(row.rel)] = str(row.rel)
    #cypher = []
    #cypher.append("unwind $records AS record")
    #cypher.append("match (source:" + getLocalPart(row.dom) + " { `" + mappings[getLocalPart(row.dom)]["@uniqueId"] + "`: record.`" + mappings[getLocalPart(row.rel)]["@from"] + "`} )")
    #cypher.append("match (target:" + getLocalPart(row.ran) + " { `" + mappings[getLocalPart(row.ran)]["@uniqueId"] + "`: record.`" + mappings[getLocalPart(row.rel)]["@to"] + "`} )")
    #cypher.append("merge (source)-[r:`"+ getLocalPart(row.rel) +"`]->(target)")
    #cypher.append("return count(*) as total") 
    #cypher_import[getLocalPart(row.rel)] = ' \n'.join(cypher)


  mapping_export_cypher = []

  # Sort the namespaces so the ns0, ns1, ... numbering is the same on every
  # run (set iteration order is not).
  for nscount, ns in enumerate(sorted(export_ns)):
    mapping_export_cypher.append("call n10s.nsprefixes.add('ns" + str(nscount) + "','" + ns + "');")

  for local_name, full_uri in export_mappings.items():
    mapping_export_cypher.append("call n10s.mapping.add('" + full_uri + "','" + local_name + "');")

  return cypher_import ,  mapping_export_cypher

In [None]:
# Generate the loaders and the export mappings for the Kubernetes ontology.
cypher_import , mapping_defs = getLoadersFromOnto(
  "https://raw.githubusercontent.com/CyrineG/lfx-2022/main/onto/kubernetes-resources2.ttl",
  "turtle",
  resourcesMappings,
)

# One section per class: its name, its source file and the generated Cypher.
print("#LOADERS:\n\n")
for q in cypher_import:
  print(q, ": \n\nfile: ", resourcesMappings[q]["@fileName"], "\n\n", cypher_import[q], "\n\n", sep="")

# The n10s statements that register namespaces and mappings.
print("#EXPORT MAPPINGS (for RDF API):\n\n")
for md in mapping_defs:
  print(md)

---row---
(rdflib.term.URIRef('http://www.semanticweb.org/cyrine/ontologies/2022/8/kubernetes-resources#pod'), rdflib.term.Literal('http://www.semanticweb.org/cyrine/ontologies/2022/8/kubernetes-resources#uid;http://www.w3.org/2001/XMLSchema#string'))
pod
{'@fileName': 'pods.csv', '@uniqueId': 'uid', 'uid': 'metadata.uid'}
uid
metadata.uid
--------
----row.props---
['http://www.semanticweb.org/cyrine/ontologies/2022/8/kubernetes-resources#uid;http://www.w3.org/2001/XMLSchema#string']
http://www.semanticweb.org/cyrine/ontologies/2022/8/kubernetes-resources#uid
-----
---row---
(rdflib.term.URIRef('http://www.semanticweb.org/cyrine/ontologies/2022/8/kubernetes-resources#replicaset'), rdflib.term.Literal('http://www.semanticweb.org/cyrine/ontologies/2022/8/kubernetes-resources#uid;http://www.w3.org/2001/XMLSchema#string'))
replicaset
{'@fileName': 'rs.csv', '@uniqueId': 'uid', 'uid': 'metadata.uid'}
uid
metadata.uid
--------
----row.props---
['http://www.semanticweb.org/cyrine/ontologies/2

In [None]:
# Utility function to write to Neo4j in batch mode.

def insert_data(session, query, frame, batch_size = 500):
    """Run `query` against Neo4j once per slice of `frame`.

    Each slice of at most `batch_size` rows is converted to a list of
    dicts and passed as the `records` parameter inside a write transaction.
    The first row returned by the query must expose a `total` field, which
    is summed across batches. Progress is printed after every batch.

    Returns a dict with the running `total`, the number of `batches` and the
    elapsed `time` in seconds, or None when `frame` has no rows.
    """
    started_at = time.time()
    rows_written = 0
    batches_done = 0
    offset = 0
    summary = None

    while offset < len(frame):
        chunk = frame[offset:offset + batch_size]

        def run_batch(tx):
            # Executed by the driver inside the managed write transaction.
            return tx.run(query,
                          parameters = {'records': chunk.to_dict('records')}).data()

        rows = session.write_transaction(run_batch)

        rows_written += rows[0]['total']
        batches_done += 1
        offset += batch_size
        summary = {"total":rows_written,
                   "batches":batches_done,
                   "time":time.time()-started_at}
        print(summary)

    return summary

In [None]:
import getpass
import os

import pandas as pd
from neo4j import GraphDatabase, basic_auth

# Connection settings come from the environment so that no secret is stored
# in the notebook. NEO4J_URI and NEO4J_USER fall back to the previous values;
# the password is prompted for when NEO4J_PASSWORD is not set.
driver = GraphDatabase.driver(
  os.environ.get("NEO4J_URI", "bolt://52.3.236.85:7687"),
  auth=basic_auth(
    os.environ.get("NEO4J_USER", "neo4j"),
    os.environ.get("NEO4J_PASSWORD") or getpass.getpass("Neo4j password: ")))

session = driver.session(database="neo4j")

# Regenerate the loaders so this cell does not depend on the preview cell.
cypher_import , mapping_defs = getLoadersFromOnto("https://raw.githubusercontent.com/CyrineG/lfx-2022/main/onto/kubernetes-resources2.ttl","turtle",resourcesMappings)

# Load every class from its CSV file in batches.
for q in cypher_import.keys():
  print("about to import " + q + " from file " + resourcesMappings[q]["@fileName"])
  df = pd.read_csv(resourcesMappings[q]["@fileName"])
  result = insert_data(session, cypher_import[q], df, batch_size = 300) 
  print(result)

# Register the namespace prefixes and mappings. consume() forces each
# auto-commit statement to complete, so a failure is raised here rather than
# being deferred or missed.
for md in mapping_defs:
  session.run(md).consume()

---row---
(rdflib.term.URIRef('http://www.semanticweb.org/cyrine/ontologies/2022/8/kubernetes-resources#pod'), rdflib.term.Literal('http://www.semanticweb.org/cyrine/ontologies/2022/8/kubernetes-resources#uid;http://www.w3.org/2001/XMLSchema#string'))
pod
{'@fileName': 'pods.csv', '@uniqueId': 'uid', 'uid': 'metadata.uid'}
uid
metadata.uid
--------
----row.props---
['http://www.semanticweb.org/cyrine/ontologies/2022/8/kubernetes-resources#uid;http://www.w3.org/2001/XMLSchema#string']
http://www.semanticweb.org/cyrine/ontologies/2022/8/kubernetes-resources#uid
-----
---row---
(rdflib.term.URIRef('http://www.semanticweb.org/cyrine/ontologies/2022/8/kubernetes-resources#replicaset'), rdflib.term.Literal('http://www.semanticweb.org/cyrine/ontologies/2022/8/kubernetes-resources#uid;http://www.w3.org/2001/XMLSchema#string'))
replicaset
{'@fileName': 'rs.csv', '@uniqueId': 'uid', 'uid': 'metadata.uid'}
uid
metadata.uid
--------
----row.props---
['http://www.semanticweb.org/cyrine/ontologies/2

  # This is added back by InteractiveShellApp.init_path()


{'total': 2, 'batches': 1, 'time': 0.8499026298522949}
{'total': 2, 'batches': 1, 'time': 0.8499026298522949}
about to import replicaset from file rs.csv
{'total': 2, 'batches': 1, 'time': 0.325575590133667}
{'total': 2, 'batches': 1, 'time': 0.325575590133667}
