# Detection Module

    The main goal of the detection module is to use the gazetteers out of the ontologies used to enrich PropaPhen into PropaPhen+ to discover relationships between network nodes/systems and the gufo:Entities by text.

In [9]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Libraries

### Installing

In [168]:
#!pip install pandas
#!pip install tqdm
#!pip install nltk
#!pip install gatenlp
#!pip install py4j
#!pip install pyodide
#!pip install ipywidgets

### Standard

In [77]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import nltk
import glob

### Custom libraries

In [12]:
import detection.relationdiscovery
import detection.observationclustering

## Globals

In [13]:
path_to_covid_journals = "data/textual/covid/newspaper/"
path_to_kb_gazetteer = "data/gazetteers/kbgazetteer.csv"
path_to_netwoork_gazetteer = "data/gazetteers/world_gazetteer.csv"

## Utils

In [190]:
class Term:
    """A Term is a singleword or a multiword 
    string that refers to a single unit of knowledge.
    They represent the words of interest in the corpus.
    """
    def __init__(self, label : str) -> None:
        self.label = label
        self.termRepresentation = None
        
    def termRepresentationFunction(self, representationFunction) -> None:
        """Updates de termRepresentation variable

        Parameters
        ----------
        representationFunction : Function
            Function that extracts the representation of the term
        """
        self.termRepresentation = representationFunction(self.label)
        
    def similarityValue(self,similarityFunction, otherTerm : Term) -> float:
        """Retrieves the similarity value out of two terms

        Parameters
        ----------
        similarityFunction : Function
            Function for similarity retrieval
        otherTerm : Term
            Second term for the similarity function
        Returns
        ----------
        Value of similarity between terms
        """
        assert self.termRepresentation is not None
        assert otherTerm.termRepresentation is not None
        return similarityFunction(self.termRepresentation,otherTerm.termRepresentation)

In [211]:
class Concept:
    """It is a conceptualization of an unit of 
    knowledge that may have multiple Terms associated with.
    """
    def __init__(self, list_of_terms=[],list_of_ids=[]):
            self.list_of_terms = list_of_terms
            self.list_of_ids = list_of_ids
    
    def setOfTermStrings(self,cleaningFunction=None):
        """Returns a clean list of all term's strings

        Parameters
        ----------
        cleaningFunction : Function
            Function for normalizing and cleaning every string if necessary
        Returns
        ----------
        Cleanned string list
        """
        termList = list(set([term.label for term in self.list_of_terms]))
        if cleaningFunction is not None:
            for i in range(len(termList)):
                termList[i] = cleaningFunction(termList[i])
        return termList

In [212]:
def df_to_concepts(df):
    dict_id = {}
    dict_concept = {}
    print("Finding Terms")
    for index, row in tqdm(df.iterrows()):
        if row["ID"] in dict_id:
            inDict = False
            # Check for duplicatas
            for t in dict_id[row["ID"]].list_of_terms:
                if row["Name"] == t.label:
                    inDict = True
                    break
            # If no duplicatas
            if inDict == False:
                newTerm = Term(row["Name"])
                dict_id[row["ID"]].list_of_terms.append(newTerm)
                dict_concept[row["Name"]] = dict_id[row["ID"]]
        elif row["Name"] in dict_concept:
            dict_concept[row["Name"]].list_of_ids.append(row["ID"])
            dict_id[row["ID"]] = dict_concept[row["Name"]]
        else:
            newterm = Term(row["Name"])
            newconcept = Concept([newterm])
            dict_concept[row["Name"]] = newconcept
            dict_id[row["ID"]] = newconcept
    print("Creating Term list")
    listset = set()
    for key in dict_id:
        listset.add(dict_id[key])
    for key in dict_concept:
        listset.add(dict_concept[key])
    return list(listset)

## Relationship Discovery

### KB Gazetteers

In [232]:
kb_concept_list = []
network_concept_list = []

In [230]:
df_kb = pd.read_csv(path_to_kb_gazetteer)

In [231]:
df_kb.head()

Unnamed: 0.1,Unnamed: 0,ID,Name
0,0,C0026106,Mild mental retardation
1,1,C0026351,Moderate mental retardation
2,2,C0036857,Severe mental retardation
3,3,C0020796,Profound mental retardation
4,4,C0025362,Unspecified mental retardation


In [233]:
kb_term_list = df_to_terms(df_kb)

Finding Terms


0it [00:01, ?it/s]


TypeError: __init__() takes 2 positional arguments but 3 were given

In [51]:
len(kb_term_list)

7892473

### Place Gazetteers

In [60]:
df_network = pd.read_csv(path_to_netwoork_gazetteer)

In [61]:
df_network.head()

Unnamed: 0.1,Unnamed: 0,ID,Name
0,0,wkg:10,"""Mamassita"""
1,1,wkg:10,"""Mamacita"""
2,2,wkg:1000709658,"""Boulzazen"""
3,3,wkg:1000709658,"""Boulzazen"""
4,4,wkg:1000709658,"""بولزازن"""


In [62]:
network_term_list = df_to_terms(df_network)

Finding Terms


2151469it [02:15, 15860.14it/s]


Creating Term list


In [64]:
len(network_term_list)

1379736

### Newspaper Corpus

In [69]:
# absolute path to search all text files inside a specific folder
path = path_to_covid_journals+"/2019-11-01/*.txt"
files = glob.glob(path)

### GATE

In [83]:
from gatenlp import Document
from gatenlp.gateworker import GateWorker

In [86]:
gs = GateWorker(start=False, auth_token="1234")

In [180]:
doc = gs.getDocument4Name('40837738.txt_00011')

2024-01-15 17:00:56,548|INFO|py4j.java_gateway|Error while receiving.
Traceback (most recent call last):
  File "/data/dataRapide/gabriel/git/DDPF/Detection/dtvenv/lib/python3.8/site-packages/py4j/java_gateway.py", line 1224, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty
2024-01-15 17:00:56,550|ERROR|root|Exception while sending command.
Traceback (most recent call last):
  File "/data/dataRapide/gabriel/git/DDPF/Detection/dtvenv/lib/python3.8/site-packages/py4j/java_gateway.py", line 1224, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/data/dataRapide/gabriel/git/DDPF/Detection/dtvenv/lib/python3.8/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.se

Py4JError: An error occurred while calling t.getDocument4Name

In [100]:
pdoc1 = gs.gdoc2pdoc(doc)

In [146]:
print(pdoc1.annset().type_names)

dict_keys(['Lookup', 'Token', 'SpaceToken', 'Split', 'Sentence', 'Percent', 'Location', 'Date', 'Person', 'Organization', 'Unknown', 'MultiWord', 'Possessor', 'Possessee', 'MOD', 'Head', 'EmbeddedHead1', 'EmbeddedHead2', 'EmbeddedHead3', 'TermCandidate', 'NamedEntity', 'Noun', 'SimpleNoun', 'Verb', 'CompoundNoun', 'deleted_NE_MultiWord'])


In [151]:
annset = pdoc1.annset().with_type("Location")

In [152]:
ann
for annc in annset.fast_iter():
    ann = annc

In [153]:
#print(annset.type_names)

In [159]:
pdoc1.text[3644:3662]

'Guangdong Province'