# Detection Module

    The main goal of the detection module is to use gazetteers built from the ontologies that enrich PropaPhen into PropaPhen+ in order to discover, from text, relationships between network nodes/systems and gufo:Entities.

In [1]:
%load_ext autoreload
%autoreload 2

## Libraries

### Installing

In [293]:
#!pip install pandas
#!pip install tqdm
#!pip install nltk
#!pip install gatenlp
#!pip install py4j
#!pip install pyodide
#!pip install ipywidgets

### Standard

In [294]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import nltk
import glob

### Custom libraries

In [295]:
import detection.relationdiscovery
import detection.observationclustering

## Globals

In [317]:
# Input/output locations, given as relative paths.
path_to_covid_journals = "data/textual/covid/newspaper/"
# CSV with "ID" and "Name" columns, loaded into df_kb.
path_to_kb_gazetteer = "data/gazetteers/kbgazetteer.csv"
# CSV with "ID" and "Name" columns, loaded into df_network.
# NOTE(review): "netwoork" is a typo; the name is kept because a later cell references it.
path_to_netwoork_gazetteer = "data/gazetteers/world_gazetteer_en.csv"
# Directory prefix for the generated .lst gazetteer files.
path_to_lsts = "data/lst/"
# Destination of the relationship table built by rmToRelationCSV.
path_to_relationcsv = "data/csv/discoveredRelationships.csv"
path_to_observationcsv = "data/csv/observations.csv"

## Utils

In [297]:
class Term:
    """A single-word or multi-word string that names one unit of knowledge.

    Terms are the words of interest that are looked up in the corpus.
    """

    def __init__(self, label: str) -> None:
        # The representation stays unset until termRepresentationFunction is called.
        self.termRepresentation = None
        self.label = label

    def termRepresentationFunction(self, representationFunction) -> None:
        """Compute and store the representation of this term.

        Parameters
        ----------
        representationFunction : Function
            Callable applied to the label; its result becomes ``termRepresentation``
        """
        representation = representationFunction(self.label)
        self.termRepresentation = representation

    def similarityValue(self, similarityFunction, otherTerm) -> float:
        """Compare this term with another one through their representations.

        Parameters
        ----------
        similarityFunction : Function
            Callable taking (own representation, other representation)
        otherTerm : Term
            Term to compare against
        Returns
        ----------
        Whatever ``similarityFunction`` returns for the two representations

        Both terms must already have a representation; otherwise an
        AssertionError is raised.
        """
        own = self.termRepresentation
        assert own is not None
        other = otherTerm.termRepresentation
        assert other is not None
        return similarityFunction(own, other)

In [7]:
class Concept:
    """A conceptualization of a unit of knowledge that may have
    several Terms (labels) and several identifiers associated with it.
    """
    def __init__(self, list_of_terms=None, list_of_ids=None):
        # None defaults avoid sharing one mutable list between instances.
        self.list_of_terms = [] if list_of_terms is None else list_of_terms
        self.list_of_ids = [] if list_of_ids is None else list_of_ids

    def setOfTermStrings(self, cleaningFunction=None):
        """Returns the distinct (optionally cleaned) labels of all terms

        Parameters
        ----------
        cleaningFunction : Function
            Function for normalizing and cleaning every label if necessary.
            Its results must be hashable (e.g. strings).
        Returns
        ----------
        List of labels without duplicates, in order of first appearance.
        Duplicates are removed AFTER cleaning, so two labels that normalise
        to the same string are reported only once.
        """
        labels = (term.label for term in self.list_of_terms)
        if cleaningFunction is not None:
            labels = (cleaningFunction(label) for label in labels)
        # dict.fromkeys de-duplicates while keeping a deterministic order,
        # unlike set(), whose iteration order is arbitrary.
        return list(dict.fromkeys(labels))

In [8]:
def df_to_concepts(df):
    """Group the (ID, Name) rows of a gazetteer table into Concepts.

    Rows are merged into one Concept when they share an ID, or when their
    Name is already attached to an existing Concept.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with an "ID" and a "Name" column; other columns are ignored
    Returns
    ----------
    List of distinct Concept objects, in order of first creation
    """
    dict_id = {}       # ID   -> Concept
    dict_concept = {}  # Name -> Concept
    print("Finding Terms")
    # Iterating the two columns directly avoids building a Series per row,
    # which is what makes DataFrame.iterrows() slow on large tables.
    rows = zip(df["ID"], df["Name"])
    for concept_id, name in tqdm(rows, total=len(df)):
        if concept_id in dict_id:
            concept = dict_id[concept_id]
            # Only add the name if this concept does not already carry it.
            if not any(name == term.label for term in concept.list_of_terms):
                concept.list_of_terms.append(Term(name))
                # NOTE(review): this re-points the name even if another
                # concept already owned it; the two concepts are not merged.
                dict_concept[name] = concept
        elif name in dict_concept:
            # Known name under a new ID: attach the ID to the existing concept.
            concept = dict_concept[name]
            concept.list_of_ids.append(concept_id)
            dict_id[concept_id] = concept
        else:
            concept = Concept([Term(name)], [concept_id])
            dict_concept[name] = concept
            dict_id[concept_id] = concept
    print("Creating Term list")
    # De-duplicate by object identity while keeping a reproducible order.
    unique_concepts = dict.fromkeys(dict_id.values())
    unique_concepts.update(dict.fromkeys(dict_concept.values()))
    return list(unique_concepts)

In [9]:
def cleaningPlaceStr(string):
    """Return ``string`` as text with every double-quote character removed."""
    text = str(string)
    return "".join(text.split('"'))

def capPlaceStr(string):
    """Return the quote-free form of ``string`` in upper case."""
    cleaned = cleaningPlaceStr(string)
    return cleaned.upper()

def lowerPlaceStr(string):
    """Return the quote-free form of ``string`` in lower case."""
    cleaned = cleaningPlaceStr(string)
    return cleaned.lower()

In [10]:
def conceptsToGazetteer(concept_list,path_to_list, cleanningFunction=None):
    """Save the list of concepts into a gazetteer file and return the dict{term:concept}

        Parameters
        ----------
        concept_list : List of Concepts
            List of all concepts that are going to make part of the gazetteer
        path_to_list : str
            Path of the gazetteer file to write (one term per line); an
            existing file is overwritten
        cleanningFunction : Function
            Function for normalizing and cleaning every string if necessary
        Returns
        ----------
        Dictionary mapping every written term to its concept. When the same
        term belongs to several concepts, the last concept processed wins.
    """
    lines = []
    dict_term_concept = {}
    for concept in tqdm(concept_list):
        for term in concept.setOfTermStrings(cleanningFunction):
            lines.append(term + "\n")
            dict_term_concept[term] = concept
    # An explicit encoding keeps non-ASCII terms from depending on the
    # platform's default locale encoding.
    with open(path_to_list, "w", encoding="utf-8") as text_file:
        text_file.writelines(lines)
    return dict_term_concept

## Relationship Discovery

### KB Gazetteers

In [11]:
# Start from empty lists; both names are reassigned later with the output of df_to_concepts.
kb_concept_list = []
network_concept_list = []

In [12]:
# Load the knowledge-base gazetteer table; df_to_concepts reads its "ID" and "Name" columns.
df_kb = pd.read_csv(path_to_kb_gazetteer)

In [13]:
df_kb.head()

Unnamed: 0.1,Unnamed: 0,ID,Name
0,0,C0026106,Mild mental retardation
1,1,C0026351,Moderate mental retardation
2,2,C0036857,Severe mental retardation
3,3,C0020796,Profound mental retardation
4,4,C0025362,Unspecified mental retardation


In [14]:
kb_concept_list = df_to_concepts(df_kb)

Finding Terms


12620098it [12:57, 16235.63it/s]


Creating Term list


In [15]:
len(kb_concept_list)

7892473

In [16]:
def umlsConceptCleanner(concept : Concept):
    """Normalise the terms of a KB concept in place.

    * terms whose label is empty are removed;
    * terms whose label contains ':' are replaced by one term per
      non-empty ':'-separated fragment.

    Parameters
    ----------
    concept : Concept
        Concept whose ``list_of_terms`` is edited in place
    Returns
    ----------
    The same concept object. One call is enough: empty fragments are no
    longer turned into empty-label terms that a second pass would have to
    remove.
    """
    terms_to_remove = []
    terms_to_add = []
    for term in concept.list_of_terms:
        # Labels may be non-string values, hence the str().
        label = str(term.label)
        if not label:
            terms_to_remove.append(term)
            continue
        fragments = label.split(':')
        if len(fragments) > 1:
            terms_to_remove.append(term)
            # Skip empty fragments such as those produced by "a:" or "::".
            terms_to_add.extend(Term(fragment) for fragment in fragments if fragment)
    # Removal is deferred so the list is not modified while it is iterated.
    for term in terms_to_remove:
        concept.list_of_terms.remove(term)
    concept.list_of_terms += terms_to_add
    return concept

In [17]:
def isEnglish(s):
    """Tell whether ``s`` consists of ASCII characters only.

    This is a character-set check used as a stand-in for "is English";
    no language detection takes place.
    """
    ascii_only = s.isascii()
    return ascii_only

In [18]:
def worldConceptCleanner(concept : Concept):
    """Normalise the terms of a place concept in place.

    * terms whose label is not pure ASCII (see isEnglish) are removed;
    * remaining terms whose label contains ':' are replaced by one term
      per non-empty ':'-separated fragment.

    Parameters
    ----------
    concept : Concept
        Concept whose ``list_of_terms`` is edited in place
    Returns
    ----------
    The same concept object
    """
    terms_to_remove = []
    terms_to_add = []
    for term in concept.list_of_terms:
        # Labels may be non-string values, hence the str().
        label = str(term.label)
        if not isEnglish(label):
            terms_to_remove.append(term)
            continue
        fragments = label.split(':')
        if len(fragments) > 1:
            terms_to_remove.append(term)
            # Skip empty fragments (e.g. from "a:" or "::") so no
            # empty-label term ends up in the gazetteer.
            terms_to_add.extend(Term(fragment) for fragment in fragments if fragment)
    # Removal is deferred so the list is not modified while it is iterated.
    for term in terms_to_remove:
        concept.list_of_terms.remove(term)
    concept.list_of_terms += terms_to_add
    return concept

In [19]:
# Apply the cleaner twice per concept.
# NOTE(review): the second pass only matters when the first one leaves empty-label
# terms behind (e.g. after splitting a label that ends in ':'); otherwise it is a no-op.
for i in tqdm(range(len(kb_concept_list))):
    kb_concept_list[i] = umlsConceptCleanner(kb_concept_list[i])
    kb_concept_list[i] = umlsConceptCleanner(kb_concept_list[i])

100%|█████████████████████████████| 7892473/7892473 [00:33<00:00, 234677.68it/s]


In [20]:
# Write the KB terms (double quotes stripped) to umls.lst and keep the term -> Concept lookup.
umlsdict = conceptsToGazetteer(kb_concept_list,path_to_lsts+"umls.lst",cleaningPlaceStr)

100%|██████████████████████████████| 7892473/7892473 [02:16<00:00, 57729.56it/s]


### Place Gazetteers

In [273]:
def ClearnWorldKGGazetteer(df_network,clear_net_list):
    """Filter unwanted rows out of the place gazetteer table.

    Two filters are applied, and a new frame is returned:
    1. rows named '"China"' are kept only for the ID "wkg:424313582";
    2. rows whose Name appears in ``clear_net_list`` are removed.
    """
    named_china = df_network["Name"] == '"China"'
    unexpected_id = df_network["ID"] != "wkg:424313582"
    stale_labels = df_network[named_china & unexpected_id].index
    remaining = df_network.drop(stale_labels)
    blacklisted = remaining['Name'].isin(clear_net_list)
    return remaining[~blacklisted]

In [318]:
df_network = pd.read_csv(path_to_netwoork_gazetteer)

In [321]:
# Place names to drop from the network gazetteer; entries keep the double quotes that surround values in the Name column.
clear_net_list = ['"Nga"']

In [322]:
# Drop the listed names and the '"China"' rows that do not carry the expected ID.
# NOTE: this rebinds df_network, so re-running the cell works on already filtered data.
df_network = ClearnWorldKGGazetteer(df_network,clear_net_list)

In [348]:
df_network[df_network['Name'] == '"Nga"']

Unnamed: 0.1,Unnamed: 0,ID,Name


In [323]:
df_network.head()

Unnamed: 0.1,Unnamed: 0,ID,Name
0,0,wkg:10,"""Mamassita"""
1,1,wkg:10,"""Mamacita"""
2,2,wkg:1000709658,"""Boulzazen"""
3,3,wkg:1000709658,"""Boulzazen"""
4,4,wkg:1000709660,"""Tizi El Oued"""


In [324]:
network_concept_list = df_to_concepts(df_network)

Finding Terms


1692476it [03:36, 7830.04it/s] 


Creating Term list


In [325]:
print(len(network_concept_list))

948998


In [326]:
# Pre-processing network
#for i in tqdm(range(len(network_concept_list))):
#    network_concept_list[i] = worldConceptCleanner(network_concept_list[i])

In [327]:
# Normal
print("Usual name")
# Place names as given, with only the double quotes stripped; written to places.lst.
normalplacesdict = conceptsToGazetteer(network_concept_list,path_to_lsts+"places.lst",cleaningPlaceStr)
# Upper-cased variant (currently disabled)
#print("Cap name")
#capdict = conceptsToGazetteer(network_concept_list,path_to_lsts+"places_cap.lst",capPlaceStr)
# Lower-cased variant (currently disabled)
#print("Lower name")
#lowerdict = conceptsToGazetteer(network_concept_list,path_to_lsts+"places_lower.lst",lowerPlaceStr)

Usual name


100%|███████████████████████████████| 948998/948998 [00:03<00:00, 306281.96it/s]


### GATE

In [30]:
from gatenlp import Document
from gatenlp.gateworker import GateWorker

In [31]:
# Handle to the GATE worker used by every later cell (corpus access, document conversion, ANNIE).
# NOTE(review): start=False presumably attaches to an already-running worker instead of launching one — confirm in the gatenlp docs.
# NOTE(review): the auth token is hard-coded; consider reading it from the environment.
gs = GateWorker(start=False, auth_token="1234")

### Gate Gazetteer

In [33]:
import os
from gatenlp import Document
from gatenlp.processing.gazetteer import TokenGazetteer, StringGazetteer

In [34]:
# create a tokenizer based on the NLTK WordPunctTokenizer. 
from gatenlp.processing.tokenizer import NLTKTokenizer
from nltk.tokenize.regexp import WordPunctTokenizer

In [208]:
# Fetch the corpus named 'PreDiViD' from the GATE worker; RMGenerator iterates over its documents.
corpus = gs.getCorpus4Name('PreDiViD')

In [36]:
class RelationMatrix():
    """Sparse matrix of values keyed by an (i, j) pair, backed by a dict."""

    def __init__(self, matrix_id):
        self.matrix_id = matrix_id
        # Only cells that were explicitly set are stored.
        self.matrix_dict = {}

    def getValue(self,i,j):
        """Return the value stored at (i, j), or None when the cell is unset."""
        return self.matrix_dict.get((i, j))

    def setValue(self,i,j,v):
        """Store ``v`` at (i, j), replacing any previous value."""
        cell = (i, j)
        self.matrix_dict[cell] = v

    def increaseBy(self,i,j,v):
        """Add ``v`` to the value at (i, j); an unset (or None) cell counts as 0."""
        current = self.getValue(i, j)
        base = 0 if current is None else current
        self.setValue(i, j, v + base)

In [427]:
class RMGenerator():
    """Builds RelationMatrix objects counting co-occurrences of KB terms and
    network (place) terms in the documents of a corpus.

    Every method relies on the module-level GateWorker ``gs`` to convert
    documents and releases each document from the worker
    (``gs.del_resource``) once it has been processed.
    """

    def __init__(self, corpus,gateExtractor):
        self.corpus = corpus
        self.gateExtractor = gateExtractor

    def _accumulateLinks(self, rm, kb_annotations, network_annotations):
        """Add 1 to ``rm`` for every (kb annotation, network annotation) pair."""
        # Materialise once: the same network annotations are paired with
        # every kb annotation.
        network_annotations = list(network_annotations)
        for kb_annotation in kb_annotations:
            for network_annotation in network_annotations:
                rm.increaseBy(self.gateExtractor.dict_kb[kb_annotation.features['key']],
                              self.gateExtractor.dict_network[network_annotation.features['key']],1)

    def _scopedTermMatching(self, matrix_id, annset_name, scope_type):
        """Count co-occurrences inside each ``scope_type`` annotation found in
        the annotation set ``annset_name`` of every document."""
        assert 'annie' in self.gateExtractor.extra_pr.keys()
        rm = RelationMatrix(matrix_id)
        # Per document
        for doc in tqdm(self.corpus):
            # Run annie
            # NOTE(review): the NLTK tokenizer is not run here, so the gazetteer
            # presumably relies on annotations produced by ANNIE — confirm.
            gs.worker.run4Document(self.gateExtractor.extra_pr['annie'], doc)
            pdoc = gs.gdoc2pdoc(doc)
            # Get network and kb
            pdoc = self.gateExtractor.tok_gaz(pdoc)
            # Scopes are read from THIS document. An earlier version used an
            # undefined name `pdoc1`, which only worked if a stale global
            # happened to exist, and then took scopes from another document.
            scope_annotations = pdoc.annset(annset_name).with_type(scope_type)
            lookups = pdoc.annset()
            for scope in scope_annotations:
                # Making the rm links between terms inside the same scope
                in_scope = lookups.within(scope)
                self._accumulateLinks(rm, in_scope.with_type('kb'),
                                      in_scope.with_type('network'))
            gs.del_resource(doc)
        return rm

    def directTermMatching(self, matrix_id):
        """Count co-occurrences at document level: every kb term of a document
        is linked with every network term of the same document."""
        rm = RelationMatrix(matrix_id)
        # Per document
        for doc in tqdm(self.corpus):
            pdoc = gs.gdoc2pdoc(doc)
            pdoc = self.gateExtractor.tokenizer(pdoc)
            pdoc = self.gateExtractor.tok_gaz(pdoc)
            # Making the rm links
            lookups = pdoc.annset()
            self._accumulateLinks(rm, lookups.with_type("kb"),
                                  lookups.with_type("network"))
            gs.del_resource(doc)
        return rm

    def paragraphTermMatching(self, matrix_id):
        """Count co-occurrences of kb and network terms within the same
        "paragraph" annotation of the 'Original markups' set.

        Requires an 'annie' entry in ``gateExtractor.extra_pr``.
        """
        return self._scopedTermMatching(matrix_id, 'Original markups', "paragraph")

    def sentenceTermMatching(self, matrix_id):
        """Count co-occurrences of kb and network terms within the same
        "Sentence" annotation of the default annotation set.

        Requires an 'annie' entry in ``gateExtractor.extra_pr``.
        """
        return self._scopedTermMatching(matrix_id, '', "Sentence")

In [38]:
class RelationshipDiscovery():
    """Bundles a corpus, a gate extractor and the relation-matrix generator
    that works on them."""

    def __init__(self,corpus, gateExtractor, rmGen=None):
        self.corpus = corpus
        self.gateExtractor = gateExtractor
        if rmGen is None:
            # No generator supplied: build one for this corpus/extractor pair.
            self.rmGen = RMGenerator(self.corpus, self.gateExtractor)
            return
        # A supplied generator must have been built for the same corpus and extractor.
        self.rmGen = rmGen
        assert self.rmGen.corpus == self.corpus
        assert self.rmGen.gateExtractor == self.gateExtractor

In [349]:
class GateExtractor():
    """Holds the tokenizer and the combined token gazetteer built from the
    keys of the KB and network term dictionaries.

    Gazetteer entries carry their original string in the 'key' feature and
    are appended with list type "kb" or "network" respectively.
    """

    def __init__(self, dict_kb, dict_network, extra_pr=None):
        self.tokenizer = NLTKTokenizer(
            nltk_tokenizer=WordPunctTokenizer(),
            token_type="Token",
            outset_name="",
        )
        self.dict_kb, self.dict_network = dict_kb, dict_network

        print('Creating KB gazetteer...')
        self.kb_gazetteer = self.gazetteer_creator(self.dict_kb.keys())
        print('Creating Network gazetteer...')
        self.network_gazetteer = self.gazetteer_creator(self.dict_network.keys())

        print('Creating Merging gazetteer...')
        merged = TokenGazetteer(longest_only=False,
                                skip_longest=False, outset_name="", ann_type="Lookup",
                                annset_name="", token_type="Token")
        self.tok_gaz = merged
        for entries, list_type in ((self.kb_gazetteer, "kb"),
                                   (self.network_gazetteer, "network")):
            merged.append(source=entries, source_fmt="gazlist", list_type=list_type)

        # A dict passed by the caller is stored as-is and extended in place.
        self.extra_pr = {} if extra_pr is None else extra_pr
        self.extra_pr['tokenizer'] = self.tokenizer
        self.extra_pr['tok_gaz'] = self.tok_gaz

    def _text2tokenstrings(self, text):
        """Tokenize ``text`` and return the text covered by each Token annotation."""
        scratch = Document(text)
        self.tokenizer(scratch)
        token_annotations = scratch.annset().with_type("Token")
        return [scratch[token] for token in list(token_annotations)]

    def gazetteer_creator(self, list_of_entries):
        """Build gazetteer entries of the form (token strings, {'key': entry})."""
        entries = []
        for txt in tqdm(list_of_entries):
            entries.append((self._text2tokenstrings(txt), {'key' : txt}))
        return entries

In [40]:
import nltk
from nltk.corpus import stopwords
 
# Make sure the NLTK stop-word corpus is available locally, then load the English list.
nltk.download('stopwords')
stopwords_list = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/gabriel/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [41]:
# import string library function  
import string  
    
# Individual punctuation characters from string.punctuation, one list entry per character
punctuation = list(string.punctuation)

In [42]:
def cleanKeys(dictionary, clean_list):
    """Remove every key listed in ``clean_list`` from ``dictionary``.

    The dictionary is modified in place and also returned; listed keys
    that are not present are ignored.
    """
    for unwanted in clean_list:
        dictionary.pop(unwanted, None)
    return dictionary

In [329]:
# Drop stop words and punctuation from the place lookup (cleanKeys edits the dict in place).
clean_list = stopwords_list + punctuation
normalplacesdict = cleanKeys(normalplacesdict,clean_list)
# Also drop from the KB lookup every term that is a place name, so no string is a key in both lookups.
umlsdict = cleanKeys(umlsdict,clean_list+list(normalplacesdict.keys()))

In [332]:
# Tokenizes every dictionary key and builds the combined token gazetteer (slow: see the progress bars below).
gateExtractor = GateExtractor(umlsdict,normalplacesdict)

Creating KB gazetteer...


100%|███████████████████████████████| 7657627/7657627 [17:03<00:00, 7484.15it/s]


Creating Network gazetteer...


100%|██████████████████████████████| 1217468/1217468 [01:11<00:00, 17025.77it/s]


Creating Merging gazetteer...


In [425]:
# Load the ANNIE plugin (version 8.6) into the GATE worker
gs.worker.loadMavenPlugin("uk.ac.gate.plugins", "annie", "8.6")
# Now load the prepared ANNIE pipeline shipped with the plugin
pipeline = gs.worker.loadPipelineFromPlugin("uk.ac.gate.plugins","annie", "/resources/ANNIE_with_defaults.gapp")
# Display the pipeline name as a quick check that loading worked
pipeline.getName()

'ANNIE'

In [426]:
# Register ANNIE under the key that paragraphTermMatching and sentenceTermMatching assert on.
gateExtractor.extra_pr['annie'] = pipeline

In [428]:
rd = RelationshipDiscovery(corpus, gateExtractor)

In [429]:
rm = rd.rmGen.directTermMatching('PreDiViD-11-2019')

100%|█████████████████████████████████████████████████████████████████| 8/8 [00:25<00:00,  3.13s/it]


In [407]:
rm = rd.rmGen.paragraphTermMatching('PreDiViD-11-2019')

100%|█████████████████████████████████████████████| 8/8 [00:02<00:00,  2.80it/s]


In [430]:
# NOTE: each matching cell reassigns `rm`; the Neo4J export below uses whichever one ran last.
rm = rd.rmGen.sentenceTermMatching('PreDiViD-11-2019')

100%|█████████████████████████████████████████████████████████████████| 8/8 [00:03<00:00,  2.59it/s]


### Neo4J

In [47]:
def rmToRelationCSV(rm, source_value, trustworthiness_value, typeof_value, cluster_date=None):
    """Flatten a relation matrix into an edge table.

    Each matrix cell (start concept, end concept) -> intensity yields one
    row per combination of a start-concept id and an end-concept id. The
    ``date``, ``source``, ``trustworthiness`` and ``:TYPE`` columns hold the
    same value on every row.

    Parameters
    ----------
    rm : RelationMatrix
        Matrix whose keys are pairs of objects exposing ``list_of_ids``
    source_value, trustworthiness_value, typeof_value
        Constant values for the ``source``, ``trustworthiness`` and ``:TYPE`` columns
    cluster_date : optional
        Value of the ``date`` column; defaults to the part of
        ``rm.matrix_id`` that follows its first '-'
    Returns
    ----------
    pandas.DataFrame with columns :START_ID, :END_ID, :TYPE, date, source,
    trustworthiness, intensity
    """
    if cluster_date is None:
        _, *date_parts = rm.matrix_id.split('-')
        cluster_date = '-'.join(date_parts)

    edges = [
        (start_id, end_id, weight)
        for pair, weight in rm.matrix_dict.items()
        for start_id in pair[0].list_of_ids
        for end_id in pair[1].list_of_ids
    ]
    edge_count = len(edges)

    columns = {
        ":START_ID" : [edge[0] for edge in edges],
        ":END_ID" : [edge[1] for edge in edges],
        ":TYPE" : [typeof_value] * edge_count,
        "date" : [cluster_date] * edge_count,
        "source" : [source_value] * edge_count,
        "trustworthiness" : [trustworthiness_value] * edge_count,
        "intensity" : [edge[2] for edge in edges],
    }
    return pd.DataFrame(data=columns)

In [432]:
# One 'hasPresence' row per (KB id, place id) pair; the date column defaults to the part of rm.matrix_id after its first '-'.
df_rm = rmToRelationCSV(rm, 'Journal', 1, 'hasPresence') 

In [433]:
# Persist the edge table without the pandas index column.
df_rm.to_csv(path_to_relationcsv, index=False)

### Observation Mining

In [337]:
from lib.kgce.schema.semantic.neo4jclasses import Neo4jRelation
from lib.kgce.neo4j.handler import Neo4jWrapper



In [338]:
from neo4j import GraphDatabase
from tqdm import tqdm


class Neo4jWrapper:
    """Thin convenience layer around a neo4j driver connection."""

    def __init__(self, uri, userName, password):
        self.uri, self.userName, self.password = uri, userName, password
        # Connect to the neo4j database server
        self.graphDB_Driver  = GraphDatabase.driver(uri, auth=(userName, password))

    def sendQuery(self, cql_commands):
        """Run each query in ``cql_commands`` within one session.

        Returns a list with one entry per query, in order: the result
        converted with ``to_df()`` on success, or the error message string
        when the query raised. Failures are reported through ``tqdm.write``
        and do not stop the remaining queries.
        """
        outcomes = []
        with self.graphDB_Driver.session() as session:
            for statement in tqdm(cql_commands):
                try:
                    frame = session.run(statement).to_df()
                except Exception as error:
                    message = str(error)
                    tqdm.write(message)
                    tqdm.write(statement)
                    outcomes.append(message)
                else:
                    outcomes.append(frame)
        return outcomes

    def closeConnection(self):
        """Close the underlying driver."""
        driver = self.graphDB_Driver
        driver.close()

In [434]:
# NOTE(review): connection URI and credentials are hard-coded; consider reading them from the environment.
neowrapper = Neo4jWrapper(uri="bolt://localhost:7687",userName="neo4j",password="test")

In [438]:
# Single query: every hasPresence relationship pointing at a Country node with intensity >= 1.
# sendQuery returns one entry per query, so the outcome of this one is result[0].
result = neowrapper.sendQuery([
    """MATCH (n:Country)<-[r:hasPresence]-(c) 
    WHERE toInteger(r.intensity) >= 1
    RETURN n.wkgs_nameEn as System_Name, n.id, c.name, c.id, r.intensity as intensity;"""
])

100%|█████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 17.41it/s]


In [439]:
# Collapse to one row per (System_Name, n.id), gathering the remaining columns into lists.
# NOTE: result[0] is a DataFrame only if the query succeeded; on failure sendQuery stores the error text instead.
df_result = result[0].groupby(['System_Name','n.id'],as_index=False).agg(list)

In [440]:
df_result

Unnamed: 0,System_Name,n.id,c.name,c.id,intensity
0,"""China""",wkg:424313582,"[Province, Province, Province]","[C1514578, A7659903, A7850354]","[1, 1, 1]"
1,"""France""",wkg:1363947712,"[S, E, S, S, K, K, A, K, E, E, Additional, res...","[A32853699, A3122833, A20945140, A3196982, A29...","[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, ..."
2,"""Germany""",wkg:1683325355,"[research, activity, ulcer Braden scale: activ...","[A19722860, A18564727, C2171311, A19723452, A0...","[4, 2, 2, 4, 2, 2, 2, 2, 4, 2, 8, 4, 4, 3, 2]"
3,"""India""",wkg:424314145,"[Study, Viruses, Study, Presence of, Presence ...","[A18323259, A0132816, A16461341, A4791697, A30...","[2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, ..."
4,"""Kazakhstan""",wkg:424311521,"[PG, p, 31, G, f, DN, Filed, RS, OS, URL, PG, ...","[A26709416, A23010387, A20944698, A15569347, A...","[4, 4, 8, 4, 4, 8, 4, 4, 4, 4, 4, 4, 4, 4, 8, ..."
5,"""Netherlands""",wkg:424297217,"[various, Medicine, various, kidney, news, kid...","[C3540765, A10759069, A22723323, A29398775, A1...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
6,"""Portugal""",wkg:2377028247,"[31, RSV, Parainfluenza virus 1, Parainfluenza...","[A34719971, A21144343, A29378504, A32816437, C...","[6, 6, 3, 3, 6, 3, 3, 6, 3, 3, 6, 6, 6, 12, 3,..."
7,"""Russia""",wkg:424314830,"[RS, PG, URL, u, 31, DN, G, 31, RS, PG, HLA-C*...","[A24583098, A32660236, C1710546, A12814951, A2...","[37, 37, 37, 37, 74, 74, 37, 74, 37, 37, 37, 3..."
8,"""United States""",wkg:424317935,"[Researchers, European, Department, D, related...","[A26601571, A32734221, A10825217, A12798025, A...","[1, 120, 1, 1, 60, 1, 6, 1, 1, 1, 1, 1, 1, 1, ..."


In [441]:
# NOTE(review): writes to a literal path; path_to_observationcsv from the Globals section is not used here.
df_result.to_csv("data/csv/observations_phrase.csv", index=False)  