# Ontology Taxonomy and Relationship Map Extractor

In [None]:
import itertools
import os
import string

import numpy as np
import pandas as pd
from neo4j import GraphDatabase
from tqdm import tqdm

### FIBO Ontology in Neo4j Setup
1. Install APOC and neosemantics on your instance of Neo4j.<br />
2. Ingest FIBO into a Neo4j graph database by running the following two Cypher queries:<br />
`CALL n10s.graphconfig.init();`<br />
`CALL n10s.onto.import.fetch("https://spec.edmcouncil.org/fibo/ontology/master/2020Q2/prod.fibo-quickstart.ttl","Turtle");`

## Setup Neo4j Python Driver
Use the host on which you completed the FIBO Neo4j setup above.

In [12]:
# Connection settings are read from the environment so that hosts and credentials
# do not have to be edited (or committed) in the notebook itself. The fallbacks
# reproduce the previously hard-coded values; set NEO4J_URI to e.g.
# "neo4j://localhost:7687" to target a local instance.
uri = os.environ.get("NEO4J_URI", "neo4j://10.20.1.21:7687")
driver = GraphDatabase.driver(
    uri,
    auth=(
        os.environ.get("NEO4J_USER", "neo4j"),
        os.environ.get("NEO4J_PASSWORD", "dt"),
    ),
)

## Determine the maximum length of relationship chain

In [39]:
# Placeholder for the chain length; the loop later in this cell computes the real value.
max_tax = 0

# `addition` is the fragment that lengthens the pattern by one more SCO hop;
# `match_statement` starts out as a pattern with a single hop.
addition = "-[:SCO]->()"
match_statement = "MATCH (n)-[:SCO]->()"

# Counts the distinct names of nodes `n` that start a path matching the current pattern.
size_query = match_statement + " RETURN count(distinct n.name) as node_count"

def get_taxonomy_size(tx):
    """Run the module-level ``size_query`` and collect its ``node_count`` values.

    ``tx`` is the transaction object the query is executed on. The query text is
    not a parameter: it is read from the global ``size_query`` at call time.
    Returns a list with one ``node_count`` entry per returned record.
    """
    counts = []
    for row in tx.run(size_query):
        counts.append(row['node_count'])
    return counts

# Lengthen the SCO pattern one hop at a time until no node starts a path that long.
# `max_tax` ends up as the number of hops in the first pattern that matched nothing.
#
# The driver is intentionally not closed here: later cells open new sessions on the
# same `driver`, which must not happen on a driver that has already been closed.
# Close it once, after the last cell that talks to Neo4j.
with driver.session() as session:
    nodes = session.read_transaction(get_taxonomy_size)
    max_tax = 1
    while nodes[0] > 0:
        max_tax += 1
        match_statement += addition
        # get_taxonomy_size reads the global `size_query`, so it has to be rebuilt
        # before every call.
        size_query = match_statement + " RETURN count(distinct n.name) as node_count"
        nodes = session.read_transaction(get_taxonomy_size)

print("Maximum length of relationship chain: " + str(max_tax))

Maximum length of relationship chain: 12


## Get tier 1 classifications

In [51]:
def get_top_tier(tx):
    """Return the distinct names of nodes that are the target of an SCO relationship
    but have no outgoing SCO relationship of their own.

    ``tx`` is the transaction object the query is executed on.
    """
    query = (
        "MATCH ()-[:SCO]->(a) "
        "WHERE NOT (a)-[:SCO]->() "
        "RETURN distinct a.name as name")
    names = []
    for row in tx.run(query):
        names.append(row['name'])
    return names

# Fetch the tier-1 names. The driver is left open on purpose: the following cells
# open further sessions on the same `driver`, so closing it here would leave them
# working with an already-closed driver.
with driver.session() as session:
    segments = session.read_transaction(get_top_tier)

print(segments)

['CommercialActivity', 'Policy', 'Convention', 'Location', 'StatisticalProgram', 'CodeElement', 'Capital', 'Account', 'ThingInRole', 'Jurisdiction', 'Good', 'Situation', 'Currency', 'Collection', 'AutonomousAgent', 'Objective', 'Service', 'CollectionConstituent', 'Aspect', 'OccurrenceKind', 'TimeInterval', 'Language', 'Opinion', 'ControllingCapacity', 'ContractualElement', 'Exposure', 'Product', 'Constitution', 'TimeInstant', 'Law', 'LegalConstruct', 'Commitment', 'Strategy', 'Document', 'Equity', 'Agreement', 'Reference', 'Offering', 'Arrangement', 'QuantityValue', 'LegalSystem', 'Occurrence']


## Functions for building taxonomy

In [14]:
def get_tier(tx, name, query):
    """Execute ``query`` on ``tx`` with ``name`` passed as the ``name`` query parameter
    and return every resulting record in a list."""
    return list(tx.run(query, name=name))

def build_tax_df(result):
    """Turn one list of query records into a DataFrame with one row per record.

    Relies on two module-level names: ``columns`` (the DataFrame column names) and
    ``tiers`` (the record keys, matched to the columns by position).
    NOTE(review): ``tiers`` is not assigned in any of the cells shown in this
    notebook - confirm where it is expected to come from.
    """
    frame = pd.DataFrame(columns=columns)
    for row_idx, record in enumerate(result):
        # The n-th column is filled from the record field named by the n-th entry of `tiers`.
        for col_idx, col_name in enumerate(list(frame.columns)):
            frame.loc[row_idx, col_name] = record[tiers[col_idx]]
    return frame

def build_full_taxonomy(tiers):
    """Combine every entry of the module-level ``results_list`` into one DataFrame.

    Each entry of ``results_list`` is converted with ``build_tax_df`` and the
    resulting frames are stacked in order under a fresh 0..n-1 index.

    ``tiers`` is accepted for backward compatibility but is not used by this
    function. The column names come from the module-level ``columns``.

    Returns an empty DataFrame with those columns when ``results_list`` is empty.
    """
    # DataFrame.append was removed in pandas 2.0 and re-copied the whole frame on
    # every iteration; build all pieces first and concatenate them once instead.
    frames = [build_tax_df(result) for result in results_list]
    if not frames:
        return pd.DataFrame(columns=columns)
    return pd.concat(frames, ignore_index=True)

In [101]:
# Lowercase letters double as Cypher variable names: (a) is the node matched by
# name, (b) a node with an SCO relationship to (a), (c) one with an SCO relationship
# to (b), and so on. (Previously `letters` was used without ever being defined.)
letters = string.ascii_lowercase

# Number of tiers to extract. This overrides the chain length computed earlier and
# is set *before* the column list is built, so that the columns and the generated
# queries always describe the same number of tiers. Must not exceed len(letters).
max_tax = 8

# build out column names list based on taxonomy maximum length
columns = ["tier_" + str(i) for i in range(1, max_tax + 1)]
# The queries below alias their return values tier_1 .. tier_<max_tax>, so the
# record keys that build_tax_df looks up via `tiers` are the column names themselves.
tiers = list(columns)

variables = list(letters[:max_tax])
# Every tier returns the literal null until the pattern is deep enough to bind it.
name_dict = {variable: "null" for variable in variables}

path_prefix = ""
query_dict = {}

for i in range(1, max_tax + 1):
    # Query number i matches a path of i nodes ending in (a) and is stored under
    # the letter of its deepest bound variable ('a' for the first query).
    return_items = ["a.name as tier_1"] + [
        name_dict[variables[t]] + " as tier_" + str(t + 1) for t in range(1, max_tax)
    ]
    query = ("MATCH " + path_prefix + "(a) WHERE a.name = $name "
             "RETURN distinct " + ", ".join(return_items))
    query_dict[letters[i - 1]] = query

    if i < max_tax:
        # Prepare the next, one-hop-longer query: prepend a new variable to the
        # pattern and let its tier return that variable's name instead of null.
        letter = letters[i]
        path_prefix = "(" + letter + ")-[:SCO]->" + path_prefix
        name_dict[letter] = letter + ".name"

In [114]:
# Run every depth-specific query for every tier-1 name and keep the raw record
# lists; build_full_taxonomy reads them from the module-level `results_list`.
results_list = []
with driver.session() as session:
    for segment_name in segments:
        # query_dict preserves insertion order, i.e. shallowest query first.
        for tier_query in query_dict.values():
            results_list.append(
                session.read_transaction(get_tier, segment_name, tier_query)
            )

# The driver is intentionally left open: later cells open new sessions on the same
# `driver`, which must not be done once it has been closed.

taxonomy = build_full_taxonomy(columns)

In [123]:
# Preview the first ten rows of the assembled taxonomy.
taxonomy.head(10)

Unnamed: 0,Tier 1,Tier 2,Tier 3,Tier 4,Tier 5,Tier 6,Tier 7,Tier 8
0,CommercialActivity,,,,,,,
1,CommercialActivity,Commerce,,,,,,
2,Policy,,,,,,,
3,Policy,ReportingPolicy,,,,,,
4,Convention,,,,,,,
5,Convention,DividendDistributionMethod,,,,,,
6,Convention,BusinessDayConvention,,,,,,
7,Convention,RateBasisConvention,,,,,,
8,Convention,BusinessDayAdjustment,,,,,,
9,Convention,BusinessRecurrenceIntervalConvention,,,,,,


## Functions for building relationship map

In [117]:
def get_unique_elements(taxonomy):
    """Return the distinct non-missing values found anywhere in ``taxonomy``.

    Columns are scanned left to right and each value is reported once, in the
    order it is first seen, so the result is reproducible between runs.
    Missing cells (None / NaN, as produced by tiers that are not populated) are
    skipped so that they are not treated as elements.

    Returns a list; empty when the frame has no non-missing values.
    """
    first_seen = {}
    for column in taxonomy.columns:
        for element in taxonomy[column].dropna().unique():
            # A dict is used as an insertion-ordered set.
            first_seen.setdefault(element, None)
    return list(first_seen)

def get_relationship(tx, source_name, subject_name):
    """Return the names of the Relationship nodes that have a DOMAIN edge to the
    Class named ``source_name`` and a RANGE edge to the Class named ``subject_name``.

    ``tx`` is the transaction object the query is executed on. Both names are
    passed as query parameters. The result is a list with one name per match.
    """
    query = (
        "MATCH (c:Class {name:$source_name})<-[:DOMAIN]-(r:Relationship)-[:RANGE]->(q:Class {name:$subject_name}) "
        "RETURN r.name as relationship")
    records = tx.run(query, source_name=source_name, subject_name=subject_name)
    return [row['relationship'] for row in records]

def get_all_direct_relationships(tx):
    """Return one record per (Class)<-DOMAIN-(Relationship)-RANGE->(Class) match.

    ``tx`` is the transaction object the query is executed on. Each returned
    record exposes the fields ``source``, ``relation`` and ``subject``.
    """
    query = (
        "MATCH p=(src:Class)<-[:DOMAIN]-(r:Relationship)-[:RANGE]->(sub:Class) "
        "return src.name as source, r.name as relation, sub.name as subject")
    return list(tx.run(query))

In [119]:
# Collect every distinct taxonomy element, then list each unordered pair of them
# once; every pair becomes one Source/Subject row of `map_df`.
node_list = get_unique_elements(taxonomy)
possibilities = [pair for pair in itertools.combinations(node_list, 2)]
map_df = pd.DataFrame(data=possibilities, columns=['Source','Subject'])

In [120]:
# Fetch every direct DOMAIN/RANGE relationship in a single query.
# The driver is not closed here so that this cell can be re-run and any further
# cells can keep using the same `driver`; close it once when the notebook no
# longer needs the database.
with driver.session() as session:
    all_relations = session.read_transaction(get_all_direct_relationships)

In [121]:
# Build the frame in one step from the fetched records instead of enlarging it
# cell by cell with .loc, which re-allocates the frame for every new row.
relations_df = pd.DataFrame(
    [(record['source'], record['relation'], record['subject']) for record in all_relations],
    columns=['source', 'relation', 'subject'],
)

In [124]:
# Preview the first ten source / relation / subject rows.
relations_df.head(10)

Unnamed: 0,source,relation,subject
0,Ownership,hasOwningParty,Owner
1,RatingParty,producesRatingsFor,RatingIssuer
2,Contract,hasContractParty,ContractParty
3,Option,hasExerciseTerms,ExerciseTerms
4,Corporation,isConstitutedBy,Constitution
5,Exchange,operatesInCountry,Country
6,Payment,fulfillsObligation,PaymentObligation
7,Security,mayBeTradedIn,Exchange
8,Buyer,buysFrom,Seller
9,Security,isRegisteredWith,RegistrationAuthority
