# Detection Module

The main goal of the detection module is to use the gazetteers derived from the ontologies that enrich PropaPhen into PropaPhen+ in order to discover, from text, relationships between network nodes/systems and gufo:Entities.

In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are reloaded
# before each cell executes and edits to library code do not require a kernel restart.
%load_ext autoreload
%autoreload 2

## Libraries

### Installing

In [2]:
#!pip install pandas
#!pip install tqdm
#!pip install nltk
#!pip install gatenlp
#!pip install py4j
#!pip install pyodide
#!pip install ipywidgets
#!pip install neo4j

### Standard

In [3]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import nltk
import glob

In [4]:
from gatenlp import Document
from gatenlp.gateworker import GateWorker

### Custom libraries

In [5]:
import sys
sys.path.append('lib/')

In [6]:
from detection.relationshipextraction import RelationshipDiscovery, GateExtractor, CleanDicts, rmToRelationCSV
from detection.schema import Term, Concept, df_to_concepts, cleaningPlaceStr, conceptsToGazetteer
from detection.worldumls import umlsConceptCleanner, isEnglish, worldConceptCleanner
from detection.worldumls import ClearnWorldKGGazetteer
#import detection.observationclustering

## Globals

In [7]:
# --- Input locations -------------------------------------------------------
# Folder with the COVID newspaper texts.
path_to_covid_journals = "data/textual/covid/newspaper/"
# CSV gazetteer for the knowledge base (read into df_kb).
path_to_kb_gazetteer = "../data/gazetteers/kbgazetteer.csv"
# CSV gazetteer for the network/places (read into df_network).
# The misspelling "netwoork" is kept on purpose: later cells refer to this exact name.
path_to_netwoork_gazetteer = "../data/gazetteers/world_gazetteer_en.csv"

# --- Output locations ------------------------------------------------------
# Folder that receives the generated .lst gazetteer files.
path_to_lsts = "data/lst/"
# Folder that receives the relation CSV files.
path_to_relation_folder = "../data/neo4j/"
# Observation CSV file.
path_to_observationcsv = "../data/neo4j/observations.csv"

## Relationship Discovery

### KB Gazetteers

In [8]:
# Start with two independent empty lists; both are filled by later cells.
kb_concept_list, network_concept_list = [], []

In [9]:
# Load the knowledge-base gazetteer CSV into a dataframe.
# NOTE(review): the recorded preview of this frame shows an "Unnamed: 0" column next to
# ID and Name, which looks like an index that was written into the CSV — confirm whether
# df_to_concepts relies on it before dropping it.
df_kb = pd.read_csv(path_to_kb_gazetteer)

In [10]:
# Show the first rows as a quick sanity check of the loaded columns.
df_kb.head()

Unnamed: 0.1,Unnamed: 0,ID,Name
0,0,C0026106,Mild mental retardation
1,1,C0026351,Moderate mental retardation
2,2,C0036857,Severe mental retardation
3,3,C0020796,Profound mental retardation
4,4,C0025362,Unspecified mental retardation


In [11]:
# Build the knowledge-base concept list from the gazetteer dataframe
# (df_to_concepts comes from detection.schema).
# Slow step: the recorded run of this cell took roughly 13 minutes.
kb_concept_list = df_to_concepts(df_kb)

Finding Terms


12620098it [12:56, 16258.97it/s]


Creating Term list


In [12]:
# Clean every knowledge-base concept in place, with a progress bar.
# NOTE(review): the cleaner is applied twice to each concept, exactly as in the
# original cell. If umlsConceptCleanner is idempotent the second pass is redundant
# and can be dropped — confirm before removing it.
for idx, concept in enumerate(tqdm(kb_concept_list)):
    once_cleaned = umlsConceptCleanner(concept)
    kb_concept_list[idx] = umlsConceptCleanner(once_cleaned)

100%|█████████████████████████████████████████████████| 7892473/7892473 [00:34<00:00, 231919.08it/s]


In [13]:
# Turn the cleaned concepts into the UMLS gazetteer, targeting data/lst/umls.lst
# (presumably the helper writes the .lst file at that path — verify in detection.schema).
# The returned value is later passed to CleanDicts and GateExtractor.
# NOTE(review): the string cleaner used here is cleaningPlaceStr, the same one used for
# the place gazetteer — confirm that it is also the intended cleaner for UMLS terms.
umlsdict = conceptsToGazetteer(kb_concept_list,path_to_lsts+"umls.lst",cleaningPlaceStr)

100%|██████████████████████████████████████████████████| 7892473/7892473 [02:13<00:00, 59149.04it/s]


### Place Gazetteers

In [None]:
# Load the world/network gazetteer CSV into a dataframe.
df_network = pd.read_csv(path_to_netwoork_gazetteer)

In [None]:
# Names handed to ClearnWorldKGGazetteer in the next cell. Each entry is wrapped in
# literal double quotes (presumably matching how names are stored in the gazetteer —
# verify against the CSV).
clear_net_list = ['"{}"'.format(place_name) for place_name in ("Nga", "Centre", "Kou", "San")]

In [None]:
# Clean the network gazetteer using the names listed in clear_net_list.
# Note: df_network is rebound to the cleaned result, so re-running this cell alone
# applies the cleaning to already-cleaned data; re-run the read_csv cell first.
df_network = ClearnWorldKGGazetteer(df_network,clear_net_list)

In [None]:
# Show the first rows of the cleaned network gazetteer as a sanity check.
df_network.head()

In [None]:
# Build the network concept list from the cleaned gazetteer dataframe,
# using the same helper as for the knowledge-base gazetteer.
network_concept_list = df_to_concepts(df_network)

In [None]:
# Pre-processing network
#for i in tqdm(range(len(network_concept_list))):
#    network_concept_list[i] = worldConceptCleanner(network_concept_list[i])

In [None]:
# Place gazetteer built from the names as they are (after cleaningPlaceStr),
# targeting data/lst/places.lst. The returned value is later passed to CleanDicts
# and GateExtractor.
print("Usual name")
normalplacesdict = conceptsToGazetteer(network_concept_list,path_to_lsts+"places.lst",cleaningPlaceStr)
# Disabled variant: capitalised place names (capPlaceStr is not imported in this notebook).
#print("Cap name")
#capdict = conceptsToGazetteer(network_concept_list,path_to_lsts+"places_cap.lst",capPlaceStr)
# Disabled variant: lower-cased place names (lowerPlaceStr is not imported in this notebook).
#print("Lower name")
#lowerdict = conceptsToGazetteer(network_concept_list,path_to_lsts+"places_lower.lst",lowerPlaceStr)

### GATE

In [None]:
import os

# Connect to a GATE worker without starting a new one (start=False), so a worker
# must already be running before this cell is executed.
# The auth token is read from the GATE_WORKER_AUTH_TOKEN environment variable so it
# does not have to be edited into the notebook; the fallback is the value this
# notebook has always used, so behaviour is unchanged when the variable is not set.
gs = GateWorker(start=False, auth_token=os.environ.get("GATE_WORKER_AUTH_TOKEN", "1234"))

In [None]:
# Run both gazetteer dictionaries through CleanDicts and keep the cleaned pair.
# Note: both names are rebound, so re-running this cell alone feeds already-cleaned
# dictionaries back into CleanDicts.
normalplacesdict, umlsdict = CleanDicts(normalplacesdict, umlsdict)

In [None]:
# Create the extractor from the two gazetteer dictionaries
# (UMLS dictionary first, place dictionary second).
gateExtractor = GateExtractor(umlsdict,normalplacesdict)

In [None]:
# Load the ANNIE plugin (version 8.6) into the GATE worker via Maven coordinates.
gs.worker.loadMavenPlugin("uk.ac.gate.plugins", "annie", "8.6")
# Load the prepared ANNIE pipeline (ANNIE_with_defaults.gapp) from that plugin.
pipeline = gs.worker.loadPipelineFromPlugin("uk.ac.gate.plugins","annie", "/resources/ANNIE_with_defaults.gapp")
# Display the pipeline name to confirm it loaded.
pipeline.getName()

In [None]:
# Register the ANNIE pipeline on the extractor under the key 'annie'.
gateExtractor.extra_pr['annie'] = pipeline

## Medical Articles Relationship Discovery

In [None]:
# Fetch the medical-articles corpus from the GATE worker by name.
corpus = gs.getCorpus4Name('PreDiViD-CORD19-2019-12')

In [None]:
# Set up relationship discovery over the current corpus with the configured extractor
# and GATE worker.
rd = RelationshipDiscovery(corpus, gateExtractor,gs)

In [None]:
# Term matching via directTermMatching (named "Doc" here); the argument is the
# identifier given to the resulting matrix (it is read back later as rmDoc.matrix_id —
# presumably the same value, verify in RelationshipDiscovery).
rmDoc = rd.rmGen.directTermMatching('PreDiViD-CORD19-Abstract-2019-12-Doc')

In [None]:
# Term matching via paragraphTermMatching, labelled with the "Paragraph" identifier.
rmParagraph = rd.rmGen.paragraphTermMatching('PreDiViD-CORD19-Abstract-2019-12-Paragraph')

In [None]:
# Term matching via sentenceTermMatching, labelled with the "Sentence" identifier.
rmSentence = rd.rmGen.sentenceTermMatching('PreDiViD-CORD19-Abstract-2019-12-Sentence')

In [None]:
# Convert each of the three matrices to a relation dataframe tagged with the source
# 'Medical' and the relation name 'hasPresence', then write each one to
# <path_to_relation_folder>/<matrix_id>.csv without the dataframe index.
# NOTE(review): the meaning of the third argument (1) is not visible here — confirm
# against rmToRelationCSV.
df_rm = rmToRelationCSV(rmDoc, 'Medical', 1, 'hasPresence') 
df_rm.to_csv(path_to_relation_folder+rmDoc.matrix_id+".csv", index=False)
df_rmParagraph = rmToRelationCSV(rmParagraph, 'Medical', 1, 'hasPresence') 
df_rmParagraph.to_csv(path_to_relation_folder+rmParagraph.matrix_id+".csv", index=False)
df_rmSentence = rmToRelationCSV(rmSentence, 'Medical', 1, 'hasPresence') 
df_rmSentence.to_csv(path_to_relation_folder+rmSentence.matrix_id+".csv", index=False)

## Online Newspaper Relationship Discovery

In [None]:
# Same steps as the medical-articles section, applied to the newspaper corpus.
# This rebinds corpus, rd, rmDoc, rmParagraph, rmSentence and the df_rm* frames, so the
# medical results are no longer reachable from these names after this cell runs.

# Fetch the newspaper corpus and set up relationship discovery over it.
corpus = gs.getCorpus4Name('PreDiViD')
rd = RelationshipDiscovery(corpus, gateExtractor,gs)
# Term matching with the three matching methods (direct, paragraph, sentence).
rmDoc = rd.rmGen.directTermMatching('PreDiViD-Aylien-2019-11-Doc')
rmParagraph = rd.rmGen.paragraphTermMatching('PreDiViD-Aylien-2019-11-Paragraph')
rmSentence = rd.rmGen.sentenceTermMatching('PreDiViD-Aylien-2019-11-Sentence')
# Export each matrix as a relation CSV tagged with the source 'Journal' and the relation
# name 'hasPresence', written to <path_to_relation_folder>/<matrix_id>.csv.
df_rm = rmToRelationCSV(rmDoc, 'Journal', 1, 'hasPresence') 
df_rm.to_csv(path_to_relation_folder+rmDoc.matrix_id+".csv", index=False)
df_rmParagraph = rmToRelationCSV(rmParagraph, 'Journal', 1, 'hasPresence') 
df_rmParagraph.to_csv(path_to_relation_folder+rmParagraph.matrix_id+".csv", index=False)
df_rmSentence = rmToRelationCSV(rmSentence, 'Journal', 1, 'hasPresence') 
df_rmSentence.to_csv(path_to_relation_folder+rmSentence.matrix_id+".csv", index=False)

### Observation Mining

In [12]:
from lib.kgce.schema.semantic.neo4jclasses import Neo4jRelation
from lib.kgce.neo4j.handler import Neo4jWrapper

In [13]:
from neo4j import GraphDatabase
from tqdm import tqdm


class Neo4jWrapper:
    """Small helper that owns a Neo4j driver and runs batches of Cypher queries.

    Can be used as a context manager so the driver is closed automatically::

        with Neo4jWrapper(uri, user, password) as wrapper:
            frames = wrapper.sendQuery([query])

    NOTE(review): a ``Neo4jWrapper`` is also imported from
    ``lib.kgce.neo4j.handler`` in the preceding cell; this definition shadows it.
    Confirm which of the two is meant to be authoritative.
    """

    def __init__(self, uri, userName, password):
        """Store the connection settings and create the driver.

        Args:
            uri: Connection URI passed to ``GraphDatabase.driver``.
            userName: User name used for authentication.
            password: Password used for authentication.
        """
        self.uri = uri
        self.userName = userName
        self.password = password
        # Connect to the neo4j database server
        self.graphDB_Driver = GraphDatabase.driver(uri, auth=(userName, password))

    def __enter__(self):
        """Return the wrapper itself for use in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the driver on leaving the ``with`` block; never suppresses exceptions."""
        self.closeConnection()
        return False

    def sendQuery(self, cql_commands):
        """Run each Cypher query in one session and collect the outcomes.

        Args:
            cql_commands: An iterable of Cypher query strings. A single string is
                also accepted and treated as one query (it used to be iterated
                character by character).

        Returns:
            A list with one entry per query, in order: the query result converted
            with ``to_df()`` on success, or the error message (``str``) when the
            query raised. Failures are reported through ``tqdm.write`` and do not
            stop the remaining queries.
        """
        if isinstance(cql_commands, str):
            # A bare string is iterable; wrap it so it runs as exactly one query.
            cql_commands = [cql_commands]

        result = []
        with self.graphDB_Driver.session() as graphDB_Session:
            for cqlCreate in tqdm(cql_commands):
                try:
                    result.append(graphDB_Session.run(cqlCreate).to_df())
                except Exception as e:
                    # Best effort by design: log the error and the offending query,
                    # record the message in the result slot, and keep going.
                    tqdm.write(str(e))
                    tqdm.write(cqlCreate)
                    result.append(str(e))
        return result

    def closeConnection(self):
        """Close the underlying driver."""
        self.graphDB_Driver.close()

In [104]:
import os

# Connection settings can be overridden through environment variables so credentials
# do not have to be edited into the notebook. The fallbacks are the local development
# values this notebook has always used, so behaviour is unchanged when nothing is set.
neowrapper = Neo4jWrapper(
    uri=os.environ.get("NEO4J_URI", "bolt://localhost:7687"),
    userName=os.environ.get("NEO4J_USER", "neo4j"),
    password=os.environ.get("NEO4J_PASSWORD", "test"),
)

In [182]:
# Fetch every hasPresence relation that points at a Country node, has source "Journal"
# and an intensity of at least 1000 (intensity is cast with toInteger in the query).
# sendQuery returns one entry per query, so the rows for this query are in result[0].
result = neowrapper.sendQuery([
    """MATCH (n:Country)<-[r:hasPresence]-(c) 
    WHERE toInteger(r.intensity) >= 1000 AND r.source = "Journal"
    RETURN n.wkgs_nameEn as System_Name, n.id, c.name, c.id, r.intensity as intensity;"""
])

100%|█████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  2.46it/s]


In [183]:
# One row per country (System_Name, n.id); every remaining column is gathered into a
# list holding the values of that country's rows.
# Note: result[0] is an error-message string rather than a dataframe when the query
# failed, in which case this line raises.
df_result = (
    result[0]
    .groupby(["System_Name", "n.id"], as_index=False)
    .agg(list)
)

In [184]:
# Display the per-country aggregation for the "Journal" source.
df_result

Unnamed: 0,System_Name,n.id,c.name,c.id,intensity
0,"""Angola""",wkg:424310875,"[code, MK, II:23, MK, MK, MK, MK, II:23, MK, c...","[A18625219, A20722030, A34717172, A24370628, A...","[1104, 1016, 1040, 1016, 1016, 1016, 1016, 104..."
1,"""Belgium""",wkg:1684793666,"[MK, MK, MK, II:23, MK, code, code, MK, MK, MK...","[A24367753, A32664340, A24370628, A34717172, A...","[1016, 1016, 1016, 1040, 1016, 1104, 1104, 101..."
2,"""Bulgaria""",wkg:424315709,[stress test electrocardiogram: 2:1 atrioventr...,"[A17248370, A20722030, A7565400, A32664340, C1...","[4592, 3556, 1456, 3556, 1456, 3864, 1456, 355..."
3,"""China""",wkg:424313582,[stress test electrocardiogram: 2:1 atrioventr...,[A17248370],[1200]
4,"""Denmark""",wkg:432424968,"[II:23, MK, MK, MK, MK, II:23, MK, MK, code, s...","[C5761581, A16758859, A32664337, A20759385, C5...","[1040, 1016, 1016, 1016, 1016, 1040, 1016, 101..."
5,"""France""",wkg:1363947712,"[O/E: E.M. micr.: virus, Description, 6:2 FTAB...","[A22865475, A32797912, A33643234, C3639183, A1...","[1447, 1344, 1456, 1324, 8832, 1024, 1136, 146..."
6,"""Germany""",wkg:1683325355,"[MK, MK, MK, MK, code, MK, stress test electro...","[A32664340, A16758859, A24583083, A24370628, A...","[1524, 1524, 1524, 1524, 1656, 1524, 1968, 152..."
7,"""Grenada""",wkg:424316074,"[Description, electrocardiogram: 1:1 atriovent...","[A8317986, A17276557, C5761581, A8317350, A227...","[1680, 4160, 10400, 2080, 1040, 1120, 1000, 17..."
8,"""Haiti""",wkg:424297281,"[MK, code, MK, code, MK, MK, MK, MK, II:23, st...","[A32664337, A18625219, A24367753, A18553518, C...","[1016, 1104, 1016, 1104, 1016, 1016, 1016, 101..."
9,"""Kazakhstan""",wkg:424311521,"[virus, virus, virus, virus, virus, O/E: E.M. ...","[C0319157, A18650525, A9333675, A4387104, A186...","[1088, 1088, 1088, 1088, 1088, 1088, 1088, 108..."


In [179]:
# Same query as for the "Journal" source, but for source "Medical" and with a lower
# intensity threshold (at least 100).
# sendQuery returns one entry per query, so the rows for this query are in result[0].
result = neowrapper.sendQuery([
    """MATCH (n:Country)<-[r:hasPresence]-(c) 
    WHERE toInteger(r.intensity) >= 100 AND r.source = "Medical"
    RETURN n.wkgs_nameEn as System_Name, n.id, c.name, c.id, r.intensity as intensity;"""
])

100%|█████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  2.52it/s]


In [180]:
# One row per country (System_Name, n.id); every remaining column is gathered into a
# list holding the values of that country's rows.
# Note: result[0] is an error-message string rather than a dataframe when the query
# failed, in which case this line raises.
df_result = (
    result[0]
    .groupby(["System_Name", "n.id"], as_index=False)
    .agg(list)
)

In [181]:
# Display the per-country aggregation for the "Medical" source.
df_result

Unnamed: 0,System_Name,n.id,c.name,c.id,intensity
0,"""Angola""",wkg:424310875,"[0: Eye problem(s) had no effect on my work, 0...","[A33693090, C5570791]","[144, 144]"
1,"""Belgium""",wkg:1684793666,"[study, virus, electrocardiogram: 1:1 atrioven...","[A18569647, A4387104, A17276557, A17263687, A2...","[101, 109, 139, 139, 109, 109, 109, 117, 109, ..."
2,"""Canada""",wkg:424313760,[electrocardiogram: 1:1 atrioventricular condu...,"[A17276557, A17248370, A17263687]","[133, 114, 133]"
3,"""China""",wkg:424313582,"[influenza, stress test electrocardiogram: 2:1...","[A14148709, A17248370, A18637092, A17276557, A...","[168, 114, 115, 127, 115, 115, 126, 115, 126, ..."
4,"""France""",wkg:1363947712,"[infection, A, A, infection, A, virus, associa...","[A4386825, A15562671, A15566736, A18647936, A1...","[171, 159, 159, 171, 159, 159, 114, 104, 152, ..."
5,"""Iran""",wkg:424298311,"[HIV: PrEP and PEP, HIV: PrEP and PEP, HIV: Pr...","[A32651247, C5400798, A32453499]","[112, 112, 112]"
6,"""Kazakhstan""",wkg:424311521,"[RP, FA, FA, RP, FA, FA, RP, RSV, RP, FA, FA, ...","[A21114029, A20737010, A12003596, A12031273, A...","[108, 108, 108, 108, 108, 108, 108, 108, 108, ..."
7,"""Russia""",wkg:424314830,"[B27, refill, FA, CI, years, hypocapnia, 2.5, ...","[A20723042, A18568488, A10776600, C1705941, C0...","[111, 114, 999, 343, 377, 148, 111, 148, 299, ..."
8,"""Saudi Arabia""",wkg:249399419,"[MERS-CoV, MERS, virus, O/E: E.M. micr.: virus...","[A26632894, A26716642, A18650525, A24082341, A...","[114, 136, 108, 108, 108, 108, 108, 114, 108, ..."
9,"""Spain""",wkg:148332300,[electrocardiogram: 1:1 atrioventricular condu...,"[A17276557, A18556155, A33693090, C5570791, A1...","[177, 138, 131, 131, 177]"
