## Summary
This Graph ML notebook will utilise Neo4j Native Algorithm to build upon the existing Graph Network,
in this follows steps:
### 1. Weakly Connected Components GraphML
- This is used to identify graph networks that are definitive isolated.

### 2. Similarity Algorithm
- Look for Similarity between 2 Stored Procedures networks amongst Tables

### 3. Community Algorithm (Label Propagation Algorithm)
- Using the Similarity Algorithm, to determine the Groupings of Stored Procedures

#### 1. Import Packages & Set Neo4j DB connection

In [2]:
from py2neo import Graph
import pandas as pd
from IPython import display

graph = Graph("bolt://localhost:7687", auth=("neo4j", "123"))

The nodes we have:

In [2]:
print("Nodes:")

result = {"label": [], "count": []}
for label in graph.run("CALL db.labels()").to_series():
    query = f"MATCH (:`{label}`) RETURN count(*) as count"
    count = graph.run(query).to_data_frame().iloc[0]['count']
    result["label"].append(label)
    result["count"].append(count)
nodes_df = pd.DataFrame(data=result)
nodes_df.sort_values("count")

Nodes:


Unnamed: 0,label,count
2,VIEW,557
0,SP,1199
1,TABLE,3296


The relationships we have:

In [3]:
print("Relationships:")

result = {"relType": [], "count": []}
for relationship_type in graph.run("CALL db.relationshipTypes()").to_series():
    query = f"MATCH ()-[:`{relationship_type}`]->() RETURN count(*) as count"
    count = graph.run(query).to_data_frame().iloc[0]['count']
    result["relType"].append(relationship_type)
    result["count"].append(count)
rels_df = pd.DataFrame(data=result)
rels_df.sort_values("count")

Relationships:


Unnamed: 0,relType,count
3,CALL_SP,67
1,SP_INSERT_UPDATE,1300
2,SP_SELECT_FROM,2598
0,RELY_ON,4394


## Graph ML

### 1. WCC : Looking for disjointed paths across segments using GraphML
WCC will find all isolated islands, these will form natural community

In [4]:
query = """
CALL gds.graph.drop('SP_ML_Graph', false)
"""
graph.run(query).to_data_frame()

query = """
CALL gds.graph.drop('SP_ML_Graph_v2', false)
"""
graph.run(query).to_data_frame()


query = """
CALL gds.graph.project(
    'SP_ML_Graph',
    {
        SP: {properties: ['EXCLUSION']}, 
        TABLE: {properties: ['EXCLUSION']}
    },
    {
        REL:{type: 'RELY_ON', orientation:'UNDIRECTED'}
    }
);
"""
graph.run(query).to_data_frame()

query = """
    CALL gds.beta.graph.project.subgraph(
      'SP_ML_Graph_v2', 
      'SP_ML_Graph', 
      'n.EXCLUSION=0 OR (n:SP AND n.EXCLUSION=0)', 
      '*' 
    )
    """
graph.run(query).to_data_frame()

query = """
    CALL gds.wcc.write('SP_ML_Graph_v2',
                      {
                          writeProperty: 'wcc_group'
                      }) YIELD nodePropertiesWritten, componentCount, componentDistribution
    """
print("Running WCC to the main graph---")
graph.run(query).to_data_frame()



Running WCC to the main graph---


Unnamed: 0,nodePropertiesWritten,componentCount,componentDistribution
0,4455,2043,"{'p99': 3, 'min': 1, 'max': 1771, 'mean': 2.18..."


### The WCC results
Each grouping represents one isolated islands
(i.e. countsize = 1 -> the SP utilise TABLES that no other SP uses)

In [5]:
query = """
match (s:SP)
where s.wcc_group is null
SET s.wcc_group_count = 0
"""

graph.run(query)

query = """
match (s:SP)
with s.wcc_group as wcc_group, size(collect(s.SP_ID)) as countsize
return wcc_group, countsize 
order by countsize desc
"""

df =graph.run(query).to_data_frame()
df

Unnamed: 0,wcc_group,countsize
0,1.0,784
1,,39
2,150.0,35
3,93.0,8
4,55.0,6
...,...,...
303,1150.0,1
304,1151.0,1
305,1152.0,1
306,1153.0,1


In [6]:
#Putting the wcc_group_count and wcc_group_wordcount back into the Node for easier manipulation

query = """
match (s:SP)
with s.wcc_group as wcc_group_select, size(collect(s.SP_ID)) as countsize, SUM(s.SYNTAX_WORDCOUNT) as syntax_wordcount

match (s2:SP)
where s2.wcc_group = wcc_group_select
SET s2.wcc_group_count = countsize,
    s2.wcc_group_wordcount = syntax_wordcount
return s2.wcc_group, countsize as wcc_group_count, syntax_wordcount as wcc_group_wordcount
order by countsize desc
"""

df =graph.run(query).to_data_frame()
df



Unnamed: 0,s2.wcc_group,wcc_group_count,wcc_group_wordcount
0,1,784,730901
1,1,784,730901
2,1,784,730901
3,1,784,730901
4,1,784,730901
...,...,...,...
1155,1150,1,0
1156,1151,1,0
1157,1152,1,0
1158,1153,1,0


## EDA on the Community built
- Community is based on the Weakly Connectyd Components
- 99991 : Singleton Group
- 99992 : Cluster with 2-9 Nodes
- 99993 : Cluster with 11-19 Nodes
- 99994 : Cluster with 20-29 Nodes
- 99995 : Cluster with 30-39 Nodes
- 99999 : SP in the Exclusion List
- Anything above 30, they will be assigned their own group

In [7]:
#Exclusion Groups

query = """
match (s:SP)
where s.EXCLUSION = 1
SET s.final_group = 99999
"""
graph.run(query).to_data_frame()

In [8]:
#First let group all singleton groups into the same subgroup first

query = """
match (s:SP)
where s.sp_community_group is null
with s.wcc_group as wcc_group_select, size(collect(s.SP_ID)) as countsize
where countsize = 1

with wcc_group_select
match (s2:SP)
where s2.wcc_group = wcc_group_select
SET s2.final_group = 99991
"""
graph.run(query).to_data_frame()

In [9]:
#If a wcc group between 2 - 9 counts, then put it in another supgroup

query = """
match (s:SP)
where s.sp_community_group is null
with s.wcc_group as wcc_group_select, size(collect(s.SP_ID)) as countsize
where countsize >= 2 and countsize <= 10 

with wcc_group_select
match (s2:SP)
where s2.wcc_group = wcc_group_select
SET s2.final_group = 99992
"""
graph.run(query).to_data_frame()


In [10]:
#If a wcc group between 11 - 19 counts, then put it in another supgroup

query = """
match (s:SP)
where s.sp_community_group is null
with s.wcc_group as wcc_group_select, size(collect(s.SP_ID)) as countsize
where countsize > 10 and countsize <= 19

with wcc_group_select
match (s2:SP)
where s2.wcc_group = wcc_group_select
SET s2.final_group = 99993
"""
graph.run(query).to_data_frame()

In [11]:
#If a wcc group between 20 - 29 counts, then put it in another supgroup

query = """
match (s:SP)
where s.sp_community_group is null
with s.wcc_group as wcc_group_select, size(collect(s.SP_ID)) as countsize
where countsize > 20 and countsize <= 29

with wcc_group_select
match (s2:SP)
where s2.wcc_group = wcc_group_select
SET s2.final_group = 99994
"""
graph.run(query).to_data_frame()

In [12]:
#If a wcc group between 30 - 39 counts, then put it in another supgroup

query = """
match (s:SP)
where s.sp_community_group is null
with s.wcc_group as wcc_group_select, size(collect(s.SP_ID)) as countsize
where countsize > 30 and countsize <= 39

with wcc_group_select
match (s2:SP)
where s2.wcc_group = wcc_group_select
SET s2.final_group = 99995
"""
graph.run(query).to_data_frame()

In [13]:
#Any WCC greater than 30 should be treated as single final group

query = """
match (s:SP)
with s.wcc_group as wcc_group_select, size(collect(s.SP_ID)) as countsize, s.final_group as final_group
where countsize > 30 and final_group is null

with wcc_group_select
match (s2:SP)
where s2.wcc_group = wcc_group_select
SET s2.final_group = wcc_group_select
"""
graph.run(query).to_data_frame()

In [14]:
query = """
match (s:SP)
return s.final_group, size(collect(s.SP_ID))
"""
graph.run(query).to_data_frame()

Unnamed: 0,s.final_group,size(collect(s.SP_ID))
0,99991,287
1,1,784
2,99992,54
3,99999,39
4,99995,35


In [15]:
query = """
match (s:SP)
return s.wcc_group, size(collect(s.SP_ID))
"""
graph.run(query).to_data_frame()

Unnamed: 0,s.wcc_group,size(collect(s.SP_ID))
0,0.0,1
1,1.0,784
2,2.0,1
3,4.0,1
4,6.0,5
...,...,...
303,1150.0,1
304,1151.0,1
305,1152.0,1
306,1153.0,1


## Community Detection 
We will utilise 2 community algorithms (Louvain and LPA) and comparing the results

Purpose : To see if these big wcc groups can be break down into smaller sup-group with minimal overlapping

Default we only running for WCC Groups with more than 40 SPs

In [16]:
query = """
CALL gds.graph.drop('SP_ML_Graph', false)
"""
graph.run(query).to_data_frame()

query = """
CALL gds.graph.drop('SP_ML_Graph_sub', false)
"""
graph.run(query).to_data_frame()


query = """
CALL gds.graph.project(
    'SP_ML_Graph',
    {
        SP: {properties: ['wcc_group_count', 'EXCLUSION']}, 
        TABLE: {properties: ['EXCLUSION']}
    },
    {
        REL:{type: 'RELY_ON', orientation:'UNDIRECTED'}
    }
);
"""
graph.run(query).to_data_frame()


Unnamed: 0,nodeProjection,relationshipProjection,graphName,nodeCount,relationshipCount,projectMillis
0,"{'TABLE': {'label': 'TABLE', 'properties': {'E...","{'REL': {'orientation': 'UNDIRECTED', 'aggrega...",SP_ML_Graph,4495,7594,5


In [17]:
%%time

#You can charge it here n.wcc_group_count>=40 to any other numbers
query = """
    CALL gds.beta.graph.project.subgraph(
      'SP_ML_Graph_sub', 
      'SP_ML_Graph', 
      '(n:SP AND n.wcc_group_count>=40 AND n.EXCLUSION=0) OR (n:TABLE AND n.EXCLUSION=0)', 
      '*' 
    )
    """
graph.run(query).to_data_frame()

#print("This piece of GraphML applies to only the main group wcc_group=0")
print("    Creating Sub-Graphs for the main WCC group only")


query = """
    CALL gds.nodeSimilarity.mutate(
        'SP_ML_Graph_sub',
        {
        mutateRelationshipType: 'ML_SP_SIMILAR',
        mutateProperty: 'score',
        similarityCutoff: 0.0001,
        topK: 50000
        }
    ) YIELD nodesCompared, relationshipsWritten;
    """
print("            Running node Similarity : SimilarityCutOff=0.0001, topK=5000")
graph.run(query).to_data_frame()


#Use this to Plug the relationship back into the DB
query = """
    CALL gds.graph.writeRelationship(
        'SP_ML_Graph_sub',
        'ML_SP_SIMILAR',
        'score'
    )
    """
graph.run(query).to_data_frame()
    
    
print("            Running Community Detection (louvain)")
query = """
    CALL gds.louvain.write(
        'SP_ML_Graph_sub',
        {
        nodeLabels: ['SP'],
        relationshipTypes: ['ML_SP_SIMILAR'],
        writeProperty: 'louvain_community_group',
        relationshipWeightProperty: 'score'
        }
    ) YIELD modularity, ranLevels, communityCount;
    """

graph.run(query).to_data_frame()
    

print("            Running Community Detection (Label Propagation)")
query = """
CALL gds.labelPropagation.write('SP_ML_Graph_sub', { writeProperty: 'lpa_community_group' })
YIELD communityCount, ranIterations, didConverge
    """

graph.run(query).to_data_frame()

print("completed!")


This piece of GraphML applies to only the main group wcc_group=0
    Creating Sub-Graphs for the main WCC group only
            Running node Similarity : SimilarityCutOff=0.0001, topK=5000
            Running Community Detection (louvain)
            Running Community Detection (Label Propagation)
completed!
Wall time: 299 ms


In [18]:
#Use this to delete all the Machine Learning SIMILAR relationships amongst SP, since it is redundant

query = """
match (s1:SP)-[t:ML_SP_SIMILAR]-(s2:SP)
where s1<>s2
delete t
"""

df = graph.run(query).to_data_frame()


query = """
match (t1:TABLE)-[k:ML_SP_SIMILAR]-(t2:TABLE)
where t1<>t2
delete k
"""

df = graph.run(query).to_data_frame()

In [19]:
#lpa Results
print("lpa results")

query = """
    match(s:SP)
    where s.lpa_community_group is not null
    return  s.lpa_community_group, size(collect(s.SP_ID)) as lpa_count
    order by lpa_count desc
    """
df =graph.run(query).to_data_frame()
df

lpa results


Unnamed: 0,s.lpa_community_group,lpa_count
0,5,336
1,30,177
2,98,46
3,156,39
4,398,35
5,501,18
6,171,14
7,174,14
8,811,11
9,672,10


In [20]:
#louvain Results
print("louvain results")

query = """
    match(s:SP)
    where s.louvain_community_group is not null
    return s.louvain_community_group, size(collect(s.SP_ID)) as louvain_count
    order by louvain_count desc
    """
df =graph.run(query).to_data_frame()
df

louvain results


Unnamed: 0,s.louvain_community_group,louvain_count
0,655,143
1,642,133
2,478,120
3,636,63
4,230,44
5,454,44
6,439,42
7,97,37
8,339,21
9,143,21


In [21]:
exploratory_query = """
//pageRank algorithms

MATCH (s:SP)
where s.sp_community_group is not null
WITH s.sp_community_group as community, count(*) as communitySize, sum(s.SYNTAX_WORDCOUNT) as syntax_wordcount
ORDER BY communitySize DESC 

CALL gds.pageRank.stream(
    {nodeQuery: 'MATCH (s:SP) where s.sp_community_group=$community
    RETURN id(s) as id', 
    relationshipQuery: 'MATCH (s1:SP)-[s:ML_SP_SIMILAR]-(s2:SP)
    WHERE s1.purchase_community_louvain = $community and s2.purchase_community_louvain = $community
    RETURN id(s1) as source, id(s2) as target, s.score as weight',
    relationshipWeightProperty: 'weight',
    parameters:{community: community}}
) YIELD nodeId, score

WITH community, communitySize, nodeId, score, syntax_wordcount
ORDER BY score DESC
RETURN community, communitySize, syntax_wordcount, collect(gds.util.asNode(nodeId).SP_ID)[..10] as community_Name ORDER BY communitySize DESC
"""

print("Return top 10 SP per community")
#df=graph.run(exploratory_query).to_data_frame()
#df
#df.to_csv("abc.csv")

Return top 10 SP per community


In [22]:
#Downloading the inter-cluster OVERLAPPING results out
exploratory_query = """
MATCH (s1:SP)-[:RELY_ON]->(t:TABLE)<-[:RELY_ON]-(s2:SP)
where s1.lpa_community_group<>s2.lpa_community_group and s1.wcc_group = s2.wcc_group
return s1.lpa_community_group, s1.SP_ID,  t.TABLE_ID
order by t.TABLE_ID
"""

df_overlap=graph.run(exploratory_query).to_data_frame()
df_overlap
df_overlap.to_excel("lpa_overlapping.xlsx")

#Downloading the inter-cluster OVERLAPPING results out
exploratory_query = """
MATCH (s1:SP)-[:RELY_ON]->(t:TABLE)<-[:RELY_ON]-(s2:SP)
where s1.louvain_community_group<>s2.louvain_community_group and s1.wcc_group = s2.wcc_group
return s1.louvain_community_group, s1.SP_ID,  t.TABLE_ID
order by t.TABLE_ID
"""

df_overlap=graph.run(exploratory_query).to_data_frame()
df_overlap
df_overlap.to_excel("louvain_overlapping.xlsx")

#Downloading all the communities results
exploratory_query = """
MATCH (s:SP)-[:RELY_ON]->(t:TABLE)
return s.louvain_community_group, s.lpa_community_group, s.wcc_group, s.SP_ID,  t.TABLE_ID
order by t.TABLE_ID
"""

df_overlap=graph.run(exploratory_query).to_data_frame()
df_overlap
df_overlap.to_excel("sp_grouping.xlsx")

In [23]:
#Downloading all the communities results
exploratory_query = """
MATCH (s:SP)-[:RELY_ON]->(t:TABLE)
return s.final_group, s.wcc_group, s.louvain_community_group, s.lpa_community_group, s.SP_ID,  t.TABLE_ID
order by t.TABLE_ID
"""

df_overlap=graph.run(exploratory_query).to_data_frame()
df_overlap
df_overlap.to_excel("sp_grouping.xlsx")

In [24]:
df_overlap

Unnamed: 0,s.final_group,s.wcc_group,s.louvain_community_group,s.lpa_community_group,s.SP_ID,t.TABLE_ID
0,99991,454.0,,,PREPARE.SP_459,ANALYSE.TABLE_1001
1,1,1.0,642.0,5.0,PREPARE.SP_461,ANALYSE.TABLE_1004
2,1,1.0,642.0,5.0,PUBLISH.SP_958,ANALYSE.TABLE_1004
3,1,1.0,642.0,5.0,PUBLISH.SP_959,ANALYSE.TABLE_1004
4,1,1.0,642.0,5.0,PUBLISH.SP_960,ANALYSE.TABLE_1004
...,...,...,...,...,...,...
3792,1,1.0,439.0,98.0,PUBLISH.SP_819,UTIL.TABLE_3319
3793,1,1.0,478.0,30.0,PUBLISH.SP_821,UTIL.TABLE_3319
3794,1,1.0,478.0,30.0,PUBLISH.SP_822,UTIL.TABLE_3319
3795,1,1.0,478.0,30.0,PUBLISH.SP_823,UTIL.TABLE_3319


In [25]:
query = """
match (s:SP)
where s.sp_community_group is null
return s.sp_community_group as sp_community_group, size(collect(s.SP_ID)) as countsize, sum(s.SYNTAX_WORDCOUNT) as wordcounts
    """
df =graph.run(query).to_data_frame()
df

Unnamed: 0,sp_community_group,countsize,wordcounts
0,,1199,850072


In [26]:
query = """
match r1=(s1:SP)-[:CALL_SP]->(s2:SP)-[:CALL_SP]->(s3:SP)
where s1<>s2 and s2<>s3 and s3<>s1
return r1
"""
df =graph.run(query).to_data_frame()

print("Check if have nested SP:")

print("SP Calling SP (x1) :")
query = """
match r1=(s1:SP)-[:CALL_SP]->(s2:SP)
where s1<>s2
return count(*)
"""
df =graph.run(query).to_data_frame
print(df)

print("SP Calling SP Calling SP (x2) :")
query = """
match r1=(s1:SP)-[:CALL_SP]->(s2:SP)-[:CALL_SP]->(s3:SP)
where s1<>s2 and s1<>s3 and s2<>s3
return count(*)  as countsize
"""
df =graph.run(query).to_data_frame()
print(df)

print("Nested SP should have lower priority")

query = """
    MATCH r1=(s1:SP)-[:CALL_SP]->(s2:SP)
    SET s2.priority = 1
    """
print("            If an SP calls another SP, then the SP with dependencies should be worked on first")
graph.run(query).to_data_frame()


Check if have nested SP:
SP Calling SP (x1) :
<bound method Cursor.to_data_frame of  count(*) 
----------
        0 
>
SP Calling SP Calling SP (x2) :
   countsize
0          0
Nested SP should have lower priority
            If an SP calls another SP, then the SP with dependencies should be worked on first


In [27]:
query = """
match(s:SP) 
where s.lpa_community_group is not null
SET s.final_group = s.lpa_community_group
"""
graph.run(query).to_data_frame()


In [28]:
#If a final group =1 counts, then put it in group=99991

query = """
match (s:SP)
with s.final_group as final_group_select, size(collect(s.SP_ID)) as countsize
where countsize=1

with final_group_select
match (s2:SP)
where s2.final_group = final_group_select
SET s2.final_group = 99991
"""
graph.run(query).to_data_frame()

#If a final group between 2 - 10 counts, then put it in group=99993

query = """
match (s:SP)
with s.final_group as final_group_select, size(collect(s.SP_ID)) as countsize
where countsize >= 2 and countsize <= 10 

with final_group_select
match (s2:SP)
where s2.final_group = final_group_select
SET s2.final_group = 99992
"""
graph.run(query).to_data_frame()


#If a final group between 11 - 19 counts, then put it in group=99993

query = """
match (s:SP)
with s.final_group as final_group_select, size(collect(s.SP_ID)) as countsize
where countsize > 10 and countsize <= 19

with final_group_select
match (s2:SP)
where s2.final_group = final_group_select
SET s2.final_group = 99993
"""
graph.run(query).to_data_frame()


#If a final group between 21 - 29 counts, then put it in group=99993

query = """
match (s:SP)
with s.final_group as final_group_select, size(collect(s.SP_ID)) as countsize
where countsize > 20 and countsize <= 29

with final_group_select
match (s2:SP)
where s2.final_group = final_group_select
SET s2.final_group = 99994
"""
graph.run(query).to_data_frame()

In [29]:
query = """
match (s:SP)
return s.final_group, size(collect(s.SP_ID)) as countsize
order by countsize desc
"""
graph.run(query).to_data_frame()

Unnamed: 0,s.final_group,countsize
0,5,336
1,99991,288
2,30,177
3,99992,147
4,99993,57
5,98,46
6,99999,39
7,156,39
8,99995,35
9,398,35


In [30]:
exploratory_query = """
MATCH (n:SP)
WITH n, n.final_group as final_group
CALL apoc.create.addLabels(n, ['G_' + toString(final_group) ]) YIELD node
RETURN count(n) as count
"""
graph.run(exploratory_query).to_data_frame()


Unnamed: 0,count
0,1199


In [20]:
exploratory_query = """
MATCH (s:SP)-[r:RELY_ON]-(t:TABLE)
RETURN s.SP_ID as SP_NAME, 'TABLE' as TYPE, t.TABLE_ID AS TABLE_NAME, s.final_group, s.lpa_community_group, s.louvain_community_group 
UNION 
OPTIONAL MATCH (s)-[r:RELY_ON]-(v:VIEW)
RETURN s.SP_ID as SP_NAME, 'VIEW' as TYPE, v.VIEW_ID AS TABLE_NAME, s.final_group, s.lpa_community_group, s.louvain_community_group 
"""
df =graph.run(exploratory_query).to_data_frame()


In [21]:
df

Unnamed: 0,SP_NAME,TYPE,TABLE_NAME,s.final_group,s.lpa_community_group,s.louvain_community_group
0,ANALYSE.ANFIELD_TO_INGESTION_SUMMARY,TABLE,ANALYSE.ANFIELDALL_MD_SHAREPOINT_COST_OF_POINTS,1.0,2.0,480.0
1,ANALYSE.ANFIELD_TO_INGESTION_SUMMARY,TABLE,ANALYSE.ANFIELD_INGESTION_SUMMARY,1.0,2.0,480.0
2,ANALYSE.ANFIELD_TO_INGESTION_SUMMARY,TABLE,ANALYSE.ANFIELD_MD_STANDARD_REWARD_TYPE,1.0,2.0,480.0
3,ANALYSE.ANFIELD_TO_INGESTION_SUMMARY,TABLE,ANALYSE.ANFIELD_TX_TRANSACTION,1.0,2.0,480.0
4,ANALYSE.ANFIELD_TO_INGESTION_SUMMARY,TABLE,ANALYSE.ANFIELD_TX_REWARD_CANCELLED,1.0,2.0,480.0
...,...,...,...,...,...,...
6346,,VIEW,,,1488.0,
6347,,VIEW,,,1503.0,
6348,,VIEW,,,1326.0,
6349,,VIEW,,,1461.0,


In [15]:
exploratory_query = """
MATCH (s:SP)-[r:RELY_ON]-(k:TABLE|VIEW)
return s.SP_ID as SP_NAME, k.TYPE, k.TABLE_ID AS TABLE_NAME, s.final_group, s.lpa_community_group, s.louvain_community_group
"""
df =graph.run(exploratory_query).to_data_frame()

df.to_excel("abc.xlsx")

ClientError: [Statement.SyntaxError] Invalid input '|': expected ")", "WHERE", "{" or a parameter (line 2, column 34 (offset: 34))
"MATCH (s:SP)-[r:RELY_ON]-(k:TABLE|VIEW)"
                                  ^