# Detection Module

The detection module uses the gazetteers built from the ontologies that enrich PropaPhen into PropaPhen+ to discover, from text, relationships between network nodes/systems and gufo:Entities.

In [1]:
%load_ext autoreload
%autoreload 2

## Libraries

### Installing

In [2]:
#!pip install pandas
#!pip install tqdm
#!pip install nltk
#!pip install gatenlp
#!pip install py4j
#!pip install pyodide
#!pip install ipywidgets
#!pip install neo4j

### Standard

In [3]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import nltk
import glob

In [4]:
from gatenlp import Document
from gatenlp.gateworker import GateWorker

### Custom libraries

In [5]:
import sys
sys.path.append('lib/')

In [6]:
from detection.relationshipextraction import RelationshipDiscovery, GateExtractor, CleanDicts, rmToRelationCSV
from detection.schema import Term, Concept, df_to_concepts, cleaningPlaceStr, conceptsToGazetteer
from detection.worldumls import umlsConceptCleanner, isEnglish, worldConceptCleanner
from detection.worldumls import ClearnWorldKGGazetteer
#import detection.observationclustering

## Globals

In [7]:
# Input and output locations used by the cells below.
# NOTE(review): some paths are rooted at "data/" and others at "../data/" — confirm
# that both resolve from the directory the kernel is started in.
# Not referenced by any other cell visible in this notebook.
path_to_covid_journals = "data/textual/covid/newspaper/"
# CSV of knowledge-base concepts (ID / Name columns), loaded into df_kb.
path_to_kb_gazetteer = '../data/gazetteers/kbgazetteer.csv'
# CSV of place names (ID / Name columns), loaded into df_network.
# NOTE(review): "netwoork" is a typo in the variable name; it is used under this spelling below.
path_to_netwoork_gazetteer = '../data/gazetteers/world_gazetteer_en.csv'
# Folder prefix for the .lst paths passed to conceptsToGazetteer ("umls.lst", "places.lst").
path_to_lsts = "data/lst/"
# Folder receiving one relation CSV per relation matrix (file name = matrix id).
path_to_relation_folder = "../data/neo4j/"
# Destination files for the grouped Neo4j query results, one per source.
path_to_journalobservationcsv = "../data/neo4j/observations_journal.csv"
path_to_medicalobservationcsv = "../data/neo4j/observations_medical.csv"
path_to_socialobservationcsv = "../data/neo4j/observations_social.csv"

## Relationship Discovery

### KB Gazetteers

In [8]:
kb_concept_list = []
network_concept_list = []

In [9]:
df_kb = pd.read_csv(path_to_kb_gazetteer)

In [10]:
df_kb.head()

Unnamed: 0.1,Unnamed: 0,ID,Name
0,0,C0026106,Mild mental retardation
1,1,C0026351,Moderate mental retardation
2,2,C0036857,Severe mental retardation
3,3,C0020796,Profound mental retardation
4,4,C0025362,Unspecified mental retardation


In [11]:
kb_concept_list = df_to_concepts(df_kb)

Finding Terms


12620098it [13:09, 15993.15it/s]


Creating Term list


In [12]:
# Replace every KB concept by its cleaned version.
# NOTE(review): umlsConceptCleanner is applied twice to each concept; this looks like
# a copy-paste duplicate — confirm whether a second pass is really needed.
for i in tqdm(range(len(kb_concept_list))):
    kb_concept_list[i] = umlsConceptCleanner(kb_concept_list[i])
    kb_concept_list[i] = umlsConceptCleanner(kb_concept_list[i])

100%|█████████████████████████████████████████████████| 7892473/7892473 [00:35<00:00, 220861.90it/s]


In [13]:
umlsdict = conceptsToGazetteer(kb_concept_list,path_to_lsts+"umls.lst",cleaningPlaceStr)

100%|██████████████████████████████████████████████████| 7892473/7892473 [02:19<00:00, 56774.96it/s]


### Place Gazetteers

In [14]:
df_network = pd.read_csv(path_to_netwoork_gazetteer)

In [15]:
washingtonRemoveDoubles = ('wkg:158368533', "Washington")
bradFord = ("wkg:26701367","Bradford")
def removeDoublesInNet(df_network,tupleList):
    """Drop gazetteer entries that compete with a chosen canonical place.

    For each ``(canonical_id, name)`` tuple in ``tupleList``, every row whose
    ``Name`` contains ``name`` while its ``ID`` differs from ``canonical_id``
    marks that ``ID`` for removal. All rows carrying a marked ID are then
    dropped, including rows of that ID whose own name did not match.

    Parameters
    ----------
    df_network : pandas.DataFrame
        Gazetteer with at least the columns ``ID`` and ``Name``.
    tupleList : iterable of tuple
        Tuples whose element 0 is the ID to keep and element 1 the name
        fragment to search for.

    Returns
    -------
    pandas.DataFrame
        A new frame without the marked rows; the input frame is not modified.
    """
    ids_to_remove = set()
    # Plain column iteration avoids building one Series per row (iterrows).
    for row_id, row_name in zip(df_network['ID'].tolist(), df_network['Name'].tolist()):
        # A missing name (NaN) cannot contain any fragment; skip it instead of
        # failing on the substring test.
        if not isinstance(row_name, str):
            continue
        for keep in tupleList:
            if keep[1] in row_name and row_id != keep[0]:
                ids_to_remove.add(row_id)
                break  # already marked, no need to test the other tuples

    marked_rows = df_network.index[df_network['ID'].isin(ids_to_remove)]
    return df_network.drop(marked_rows)

In [16]:
df_network = removeDoublesInNet(df_network, [washingtonRemoveDoubles,bradFord])

In [17]:
# Place names handed to ClearnWorldKGGazetteer in the next cell. Every entry
# carries literal double quotes, the same form the Name column shows in the
# df_network.head() preview.
# NOTE(review): '"North"' and '"Bush"' appear twice — presumably harmless; confirm.
clear_net_list = ['"Nga"', '"Centre"', '"Kou"', '"San"','"Real"',
                 '"Vincent"', '"Lille"','"North"', '"Barr"', '"North"'
                 ,'"South"','"West"','"East"','"Brito"', '"Utrecht"', '"Bush"',
                 '"Bush"', '"Republic"','"Union"', '"Time"',
                 '"Institute"','"Carbon"','"Center"','"Delhi"','"Mendenhall"']

In [18]:
df_network = ClearnWorldKGGazetteer(df_network,clear_net_list)

In [19]:
df_network.head()

Unnamed: 0.1,Unnamed: 0,ID,Name
0,0,wkg:10,"""Mamassita"""
1,1,wkg:10,"""Mamacita"""
2,2,wkg:1000709658,"""Boulzazen"""
3,3,wkg:1000709658,"""Boulzazen"""
4,4,wkg:1000709660,"""Tizi El Oued"""


In [20]:
network_concept_list = df_to_concepts(df_network)

Finding Terms


1692247it [01:41, 16624.01it/s]


Creating Term list


In [21]:
# Pre-processing network
#for i in tqdm(range(len(network_concept_list))):
#    network_concept_list[i] = worldConceptCleanner(network_concept_list[i])

In [22]:
# Normal
print("Usual name")
normalplacesdict = conceptsToGazetteer(network_concept_list,path_to_lsts+"places.lst",cleaningPlaceStr)
# Cap
#print("Cap name")
#capdict = conceptsToGazetteer(network_concept_list,path_to_lsts+"places_cap.lst",capPlaceStr)
# Lower
#print("Lower name")
#lowerdict = conceptsToGazetteer(network_concept_list,path_to_lsts+"places_lower.lst",lowerPlaceStr)

Usual name


100%|███████████████████████████████████████████████████| 948962/948962 [00:03<00:00, 308038.79it/s]


### GATE

In [23]:
# Attach to the GATE worker used by all extraction cells below.
# NOTE(review): with start=False the Java worker presumably has to be running already;
# the recorded output of this cell is a connection-refused error on 127.0.0.1:25333 —
# confirm against the gatenlp documentation and start the worker before running this cell.
# NOTE(review): the auth token is hard-coded; consider reading it from the environment.
gs = GateWorker(start=False, auth_token="1234")

2024-04-05 01:24:04,601|ERROR|py4j.java_gateway|An error occurred while trying to connect to the Java server (127.0.0.1:25333)
Traceback (most recent call last):
  File "/data/dataRapide/gabriel/git/DDPF/Detection/dtvenv/lib/python3.8/site-packages/py4j/java_gateway.py", line 982, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/data/dataRapide/gabriel/git/DDPF/Detection/dtvenv/lib/python3.8/site-packages/py4j/java_gateway.py", line 1132, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused


Py4JNetworkError: An error occurred while trying to connect to the Java server (127.0.0.1:25333)

In [None]:
from nltk.corpus import stopwords
import string

def cleanKeys(dictionary, clean_list):
    """Remove every key listed in ``clean_list`` from ``dictionary``.

    The dictionary is modified in place and is also returned. Entries of
    ``clean_list`` that are not keys of the dictionary are ignored.
    """
    for unwanted in clean_list:
        dictionary.pop(unwanted, None)
    return dictionary

def CleanDicts(netdict,kbdict):
    """Strip noise keys from the network and KB gazetteer dictionaries.

    The removed keys are: English NLTK stopwords (as given and title-cased),
    punctuation characters, single ASCII letters (both cases) and English
    month names (capitalised and lower-case). ``kbdict`` additionally loses
    every key that is still present in the cleaned ``netdict``.

    Both dictionaries are modified in place and returned as
    ``(netdict, kbdict)``. The NLTK stopword corpus is downloaded on each call.

    NOTE(review): this definition shadows the ``CleanDicts`` imported from
    ``detection.relationshipextraction`` earlier in the notebook.
    """
    nltk.download('stopwords')
    english_stopwords = stopwords.words('english')
    month_names = ["January", "February", "March", "April", "May",
                   "June", "July", "August", "September", "October", "November", "December"]
    unwanted = (
        english_stopwords
        + list(string.punctuation)
        + list(string.ascii_lowercase)
        + list(string.ascii_uppercase)
        + [word.title() for word in english_stopwords]
        + month_names
        + [month.lower() for month in month_names]
    )
    netdict = cleanKeys(netdict, unwanted)
    # Keys kept as places must not also be matched as KB terms.
    kbdict = cleanKeys(kbdict, unwanted + list(netdict.keys()))
    return netdict, kbdict

In [None]:
normalplacesdict, umlsdict = CleanDicts(normalplacesdict, umlsdict)

In [None]:
gateExtractor = GateExtractor(umlsdict,normalplacesdict)

In [None]:
# Annie
gs.worker.loadMavenPlugin("uk.ac.gate.plugins", "annie", "8.6")
# now load the prepared ANNIE pipeline from the plugin
pipeline = gs.worker.loadPipelineFromPlugin("uk.ac.gate.plugins","annie", "/resources/ANNIE_with_defaults.gapp")
pipeline.getName()

In [None]:
gateExtractor.extra_pr['annie'] = pipeline

In [None]:
from detection.relationshipextraction import RelationMatrix

class RMGenerator():
    """Build relation matrices from co-occurring gazetteer matches in a corpus.

    Every public method walks ``corpus``, annotates each document through
    ``gateExtractor`` and counts, in a fresh ``RelationMatrix``, how often a
    'kb' annotation and a 'network' annotation occur together. The methods
    differ only in the scope of "together" (whole document, paragraph or
    sentence) and in whether transitive links are added.

    Parameters
    ----------
    corpus : iterable
        GATE documents; each one is released with ``gs.del_resource`` once
        it has been processed.
    gateExtractor : object
        Provides ``tokenizer``, ``tok_gaz``, ``extra_pr`` and the lookup
        dictionaries ``dict_kb`` / ``dict_network`` that map an annotation's
        ``features['key']`` to the matrix row / column identifier.
    gs : object
        GATE worker wrapper exposing ``gdoc2pdoc``, ``del_resource`` and
        ``worker.run4Document``.
    """

    def __init__(self, corpus,gateExtractor, gs):
        self.corpus = corpus
        self.gateExtractor = gateExtractor
        self.gs = gs

    def _linkPairs(self, rm, kb_annotations, network_annotations):
        """Add 1 to the matrix cell of every (kb, network) annotation pair."""
        # Materialise once: the network annotations are re-walked for every kb annotation.
        network_annotations = list(network_annotations)
        for kb_annotation in kb_annotations:
            for network_annotation in network_annotations:
                rm.increaseBy(self.gateExtractor.dict_kb[kb_annotation.features['key']],
                              self.gateExtractor.dict_network[network_annotation.features['key']], 1)

    def _scopedMatching(self, matrix_id, annset_name, scope_type, transitivity=False):
        """Shared implementation of the paragraph/sentence matching methods.

        ``annset_name`` and ``scope_type`` select the annotations that
        delimit a scope (e.g. set 'Original markups', type "paragraph").
        Documents with empty text are released and skipped; the others are
        first run through the 'annie' pipeline stored in
        ``gateExtractor.extra_pr`` and then through ``gateExtractor.tok_gaz``.

        With ``transitivity=True`` a kb key that co-occurred (in any scope of
        the same document) with a kb annotation sharing a scope with a
        network annotation is linked to that network entry as well, but only
        when the matrix holds no value yet for that pair.
        """
        assert 'annie' in self.gateExtractor.extra_pr.keys()
        rm = RelationMatrix(matrix_id)
        # Per document
        for doc in tqdm(self.corpus):
            # Nothing to annotate in an empty document
            if len(self.gs.gdoc2pdoc(doc).text) <= 0:
                self.gs.del_resource(doc)
                continue
            # Run annie
            self.gs.worker.run4Document(self.gateExtractor.extra_pr['annie'], doc)
            pdoc = self.gs.gdoc2pdoc(doc)
            # Get network and kb
            pdoc = self.gateExtractor.tok_gaz(pdoc)
            # Collect, once per scope, the kb and network annotations it contains
            scoped = []
            for scope in pdoc.annset(annset_name).with_type(scope_type):
                inside = pdoc.annset().within(scope)
                scoped.append((list(inside.with_type('kb')), list(inside.with_type('network'))))

            kbToKb = {}
            if transitivity:
                # kb key -> keys of the other kb annotations sharing a scope with it
                for kb_annotations, _ in scoped:
                    for first in kb_annotations:
                        for second in kb_annotations:
                            # if same annotation continue
                            if first == second:
                                continue
                            kbToKb.setdefault(first.features['key'], []).append(second.features['key'])

            # Direct links; all of them are added before any transitive link so
            # that the "no value yet" test below sees the whole document.
            for kb_annotations, network_annotations in scoped:
                self._linkPairs(rm, kb_annotations, network_annotations)

            if transitivity:
                for kb_annotations, network_annotations in scoped:
                    for network_annotation in network_annotations:
                        column = self.gateExtractor.dict_network[network_annotation.features['key']] \
                            if kb_annotations else None
                        for kb_annotation in kb_annotations:
                            if kb_annotation.features['key'] not in kbToKb:
                                continue
                            for transitivityKey in kbToKb[kb_annotation.features['key']]:
                                row = self.gateExtractor.dict_kb[transitivityKey]
                                # If link does not exist, then create one
                                if rm.getValue(row, column) is None:
                                    rm.increaseBy(row, column, 1)
            self.gs.del_resource(doc)
        return rm

    def directTermMatching(self, matrix_id):
        """Link every kb annotation to every network annotation of the same document.

        Each document is tokenised and gazetteer-annotated through
        ``gateExtractor``; the 'annie' pipeline is not used here.
        Returns the filled ``RelationMatrix`` named ``matrix_id``.
        """
        rm = RelationMatrix(matrix_id)
        # Per document
        for doc in tqdm(self.corpus):
            pdoc = self.gs.gdoc2pdoc(doc)
            pdoc = self.gateExtractor.tokenizer(pdoc)
            pdoc = self.gateExtractor.tok_gaz(pdoc)
            # Making the rm links
            self._linkPairs(rm, pdoc.annset().with_type("kb"), pdoc.annset().with_type("network"))
            self.gs.del_resource(doc)
        return rm

    def paragraphTermMatching(self, matrix_id):
        """Link kb and network annotations found in the same "paragraph" annotation
        of the 'Original markups' set. Requires ``extra_pr['annie']``."""
        return self._scopedMatching(matrix_id, 'Original markups', "paragraph")

    def paragraphTermMatchingTransitivity(self, matrix_id):
        """Paragraph matching plus transitive kb links (see ``_scopedMatching``)."""
        return self._scopedMatching(matrix_id, 'Original markups', "paragraph", transitivity=True)

    def sentenceTermMatching(self, matrix_id):
        """Link kb and network annotations found in the same "Sentence" annotation
        of the default annotation set. Requires ``extra_pr['annie']``."""
        return self._scopedMatching(matrix_id, '', "Sentence")

    def sentenceTermMatchingTransitivity(self, matrix_id):
        """Sentence matching plus transitive kb links (see ``_scopedMatching``)."""
        return self._scopedMatching(matrix_id, '', "Sentence", transitivity=True)

## Relationship Discovery Runs

In [None]:
corpusJournal = gs.getCorpus4Name('PreDiViD-Journal-11-19')
rd = RelationshipDiscovery(corpusJournal, gateExtractor,gs)
rmSentence = rd.rmGen.sentenceTermMatching('KES-PreDiViD-Journal-2019-11-Sentence')
df_rmSentence = rmToRelationCSV(rmSentence, 'Journal', 1, 'hasPresence',cluster_date='2019-11') 
df_rmSentence.to_csv(path_to_relation_folder+rmSentence.matrix_id+".csv", index=False)

In [None]:
corpusMedical = gs.getCorpus4Name('PreDiViD-Medical-12-19')
rd = RelationshipDiscovery(corpusMedical, gateExtractor,gs)
rmSentence = rd.rmGen.sentenceTermMatching('KES-PreDiViD-Medical-2019-12-Sentence')
df_rmSentence = rmToRelationCSV(rmSentence, 'Medical', 1, 'hasPresence',cluster_date='2019-12') 
df_rmSentence.to_csv(path_to_relation_folder+rmSentence.matrix_id+".csv", index=False)

In [None]:
# Sentence-level co-occurrence matrix for the Social corpus, exported as
# 'hasPresence' relations; the matrix id is also the name of the CSV file written.
# NOTE(review): the matrix id ends in '2020-20' while cluster_date is '2020-02' and the
# corpus name ends in '2-22' — these look inconsistent; confirm the intended dates.
corpusSocial = gs.getCorpus4Name('PreDiViD-Social-2-22') # CORRECT corpus dates
rd = RelationshipDiscovery(corpusSocial, gateExtractor,gs)
rmSentence = rd.rmGen.sentenceTermMatching('KES-PreDiViD-Social-2020-20-Sentence')
df_rmSentence = rmToRelationCSV(rmSentence, 'Social', 1, 'hasPresence',cluster_date='2020-02') 
df_rmSentence.to_csv(path_to_relation_folder+rmSentence.matrix_id+".csv", index=False)

## Transitivity

In [None]:
corpus = gs.getCorpus4Name('PreDiViD')
rmGen = RMGenerator(corpus, gateExtractor, gs)
rd = RelationshipDiscovery(corpus, gateExtractor,gs,rmGen)

In [None]:
rmSentence = rd.rmGen.sentenceTermMatchingTransitivity('PreDiViD-COVID-Journal-2019-11-Sentence-Transitivity')

In [None]:
# Paragraph-level matrix with transitive KB links, in line with the matrix id
# and with the sentence-level run of this section.
rmParagraph = rd.rmGen.paragraphTermMatchingTransitivity('PreDiViD-COVID-Journal-2019-11-Paragraph-Transitivity')

In [None]:
df_rmParagraph = rmToRelationCSV(rmParagraph, 'Journal', 1, 'hasPresence',cluster_date='2019-11') 
df_rmParagraph.to_csv(path_to_relation_folder+rmParagraph.matrix_id+".csv", index=False)

In [None]:
df_rmSentence = rmToRelationCSV(rmSentence, 'Journal', 1, 'hasPresence',cluster_date='2019-11') 
df_rmSentence.to_csv(path_to_relation_folder+rmSentence.matrix_id+".csv", index=False)

### Observation Mining

In [None]:
from lib.kgce.schema.semantic.neo4jclasses import Neo4jRelation
from lib.kgce.neo4j.handler import Neo4jWrapper

In [None]:
from neo4j import GraphDatabase
from tqdm import tqdm


class Neo4jWrapper:
    """Small helper that owns a neo4j driver and runs batches of Cypher queries.

    NOTE(review): this class shadows the ``Neo4jWrapper`` imported from
    ``lib.kgce.neo4j.handler`` in the previous cell.

    Parameters
    ----------
    uri : str
        Address of the Neo4j server, e.g. ``"bolt://localhost:7687"``.
    userName : str
        Login used for authentication.
    password : str
        Password used for authentication (also kept on the instance).
    """

    def __init__(self, uri, userName, password):
        self.uri = uri
        self.userName = userName
        self.password = password
        # Connect to the neo4j database server
        self.graphDB_Driver  = GraphDatabase.driver(uri, auth=(userName, password)) 
        
    def sendQuery(self, cql_commands):
        """Run every query of ``cql_commands`` in one session.

        Returns a list with one entry per query, in order: the query result
        converted with ``to_df()`` on success, or the error message (``str``)
        when the query raised. A failing query is reported through
        ``tqdm.write`` and does not stop the remaining queries, so callers
        must check the entry type before using it as a DataFrame.
        """
        result = []
        with self.graphDB_Driver.session() as graphDB_Session:
            for cqlCreate in tqdm(cql_commands):
                try:
                    result.append(graphDB_Session.run(cqlCreate).to_df())
                except Exception as e:
                    # Best effort by design: report the failure, keep its
                    # message at the query's position and carry on.
                    tqdm.write(str(e))
                    tqdm.write(cqlCreate)
                    result.append(str(e))
        return result
    
    def closeConnection(self):
        """Close the underlying driver."""
        self.graphDB_Driver.close()

In [None]:
import os

# Connection settings are read from the environment when present, so real
# credentials need not be stored in the notebook; the defaults keep the
# previous local-development values.
neowrapper = Neo4jWrapper(
    uri=os.environ.get("NEO4J_URI", "bolt://localhost:7687"),
    userName=os.environ.get("NEO4J_USER", "neo4j"),
    password=os.environ.get("NEO4J_PASSWORD", "test"),
)

In [None]:
result = neowrapper.sendQuery([
    """MATCH (n:Country)<-[r:hasPresence]-(c) 
    WHERE toInteger(r.intensity) >= 5 AND r.source = "Journal"
    RETURN n.wkgs_nameEn as System_Name, n.id, c.name, c.id, r.intensity as intensity;"""
])

In [None]:
# Collapse the query rows to one row per country (System_Name, n.id), gathering
# the remaining columns into lists.
# NOTE(review): sendQuery stores the error message (a str) instead of a DataFrame when
# a query fails, in which case result[0].groupby raises AttributeError.
df_result_journal = result[0].groupby(['System_Name','n.id'],as_index=False).agg(list)

In [None]:
df_result_journal

In [None]:
result = neowrapper.sendQuery([
    """MATCH (n:Country)<-[r:hasPresence]-(c) 
    WHERE toInteger(r.intensity) >= 6 AND r.source = "Medical"
    RETURN n.wkgs_nameEn as System_Name, n.id, c.name, c.id, r.intensity as intensity;"""
])

In [None]:
df_result_medical = result[0].groupby(['System_Name','n.id'],as_index=False).agg(list)

In [None]:
df_result_medical

In [None]:
result = neowrapper.sendQuery([
    """MATCH (n:Country)<-[r:hasPresence]-(c) 
    WHERE toInteger(r.intensity) >= 20 AND r.source = "Social"
    RETURN n.wkgs_nameEn as System_Name, n.id, c.name, c.id, r.intensity as intensity;"""
])

In [None]:
df_result_social = result[0].groupby(['System_Name','n.id'],as_index=False).agg(list)

In [None]:
df_result_social

In [None]:
# Saving
# One CSV per source with the grouped observations.
# NOTE(review): unlike the relation CSVs written earlier with index=False, these calls
# also write the DataFrame index as an unnamed first column — confirm consumers expect it.
df_result_journal.to_csv(path_to_journalobservationcsv)
df_result_medical.to_csv(path_to_medicalobservationcsv)
df_result_social.to_csv(path_to_socialobservationcsv)