# Natural Language Project (Rule Based)

Álvaro Sáenz-Torre, Alejandra Reinares, Luis Domene and Joan Bayona

In [1]:
import spacy
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re
import json
import spacy_udpipe
from langdetect import detect
import spacy_udpipe
from spacy import displacy
import nltk
from nltk.corpus import cess_esp, cess_cat

In [2]:
# Fetch the NLTK resources this notebook relies on: the CESS Spanish and
# Catalan tagged corpora (imported above as cess_esp / cess_cat) and the
# Punkt models (presumably required by the NLTK tokenizers used further
# down -- TODO confirm).
nltk.download('cess_esp')
nltk.download('cess_cat')
nltk.download('punkt')

[nltk_data] Downloading package cess_esp to
[nltk_data]     C:\Users\alvar\AppData\Roaming\nltk_data...
[nltk_data]   Package cess_esp is already up-to-date!
[nltk_data] Downloading package cess_cat to
[nltk_data]     C:\Users\alvar\AppData\Roaming\nltk_data...
[nltk_data]   Package cess_cat is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\alvar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Load the annotated training and test sets.
# The files are opened through context managers so the handles are closed as
# soon as parsing finishes, and they are decoded explicitly as UTF-8 (the JSON
# interchange encoding) so that accented characters -- and therefore the
# character offsets stored in the annotations -- do not depend on the
# platform's locale encoding.
with open("training.json", encoding="utf-8") as f:
    training_set = json.load(f)

with open("test.json", encoding="utf-8") as f2:
    test_set = json.load(f2)

In [4]:
# Reduce every annotated document to the two fields used downstream: its raw
# text and the result list of its first prediction.
l_training = [
    {"text": item["data"]["text"], "results": item["predictions"][0]["result"]}
    for item in training_set
]

l_test = [
    {"text": item["data"]["text"], "results": item["predictions"][0]["result"]}
    for item in test_set
]

In [5]:
# One list of annotated text spans per label found in the training set.
l_neg, l_unc, l_nsco, l_usco = [], [], [], []

# Dispatch table: first label of an annotation -> list that collects its span.
_spans_by_label = {"NEG": l_neg, "UNC": l_unc, "NSCO": l_nsco, "USCO": l_usco}

for record in l_training:
    for annotation in record["results"]:
        value = annotation["value"]
        bucket = _spans_by_label.get(value["labels"][0])
        # Annotations carrying any other label are ignored.
        if bucket is not None:
            bucket.append(record["text"][value["start"]:value["end"]])

In [6]:
def _trim_edges(span):
    """Drop at most one non-alphabetic character from each end of ``span``.

    Only a single character is removed per side. The emptiness checks keep
    the function safe on empty spans and on spans made of one non-alphabetic
    character, which previously raised ``IndexError``.
    """
    if span and not span[0].isalpha():
        span = span[1:]
    if span and not span[-1].isalpha():
        span = span[:-1]
    return span


# Clean the four span lists with the same helper. Spans that end up empty are
# discarded: an empty cue would match everywhere once turned into a pattern.
l_neg = [span for span in map(_trim_edges, l_neg) if span]
l_unc = [span for span in map(_trim_edges, l_unc) if span]
l_nsco = [span for span in map(_trim_edges, l_nsco) if span]
l_usco = [span for span in map(_trim_edges, l_usco) if span]

In [7]:
# Remove duplicates. The result is sorted because the iteration order of a set
# of strings changes from one interpreter run to the next, and these lists are
# later joined into a regex alternation whose order decides which cue wins
# when several could match at the same position.
l_neg = sorted(set(l_neg))
l_unc = sorted(set(l_unc))
l_nsco = sorted(set(l_nsco))
l_usco = sorted(set(l_usco))

In [8]:
# Load the list of conjunctions (passed to rule 1 as termination terms).
# The file is read as one entry per line.
# NOTE(review): the file is opened with the platform default encoding, as
# before; confirm the encoding of conjunctions.txt and pass it explicitly.
N_HEADER_LINES = 3  # leading lines that are skipped; presumably a header -- TODO confirm

with open("conjunctions.txt") as t:
    conjunctions = t.read()

# Splitting on "\n" keeps a last line that has no trailing newline (it used to
# be dropped). Blank lines are discarded because an empty termination term
# would match at every position. Sorting makes the order reproducible.
l_conj = sorted(
    {line for line in conjunctions.split("\n")[N_HEADER_LINES:] if line}
)

In [9]:
def search_words(list_words, text):
    """
    Find every whole-word occurrence of the given words in a text.

    Parameters: {
    list_words: iterable of words or multi-word expressions to look for.
    text: string that is searched.
    }

    Output{
    (coincidencias, end, ini): three parallel lists with the matched strings,
    their end offsets and their start offsets, in order of appearance.
    Note that the end offsets come before the start offsets.
    }

    """
    # Empty entries are dropped and duplicates merged. Longer expressions are
    # tried first so that e.g. "no se" is preferred over its prefix "no";
    # ties are broken alphabetically to keep the pattern deterministic.
    words = sorted({word for word in list_words if word}, key=lambda word: (-len(word), word))

    # Without any word the pattern would be r"\b()\b", which matches the empty
    # string at every word boundary.
    if not words:
        return [], [], []

    pattern = r"\b(" + "|".join(re.escape(word) for word in words) + r")\b"

    coincidencias = []
    ini = []
    end = []

    # Finding all occurrences of any word in the text
    for match in re.finditer(pattern, text):
        coincidencias.append(match.group(0))
        ini.append(match.start())
        end.append(match.end())
    return coincidencias, end, ini

Rule 1: If the sentence contains a termination term (one of the conjunctions loaded above), the scope of the cue ends just before that term.

In [10]:
def _starts_with_term(text, idx, term):
    """True when ``term`` occurs at ``text[idx:]`` as a whole word, strictly before the end of ``text``."""
    stop = idx + len(term)
    # stop < len(text) keeps the original requirement that something follows
    # the term, and guarantees that text[stop] exists for the boundary check.
    return (
        stop < len(text)
        and text.startswith(term, idx)
        and not text[stop].isalnum()
    )


def rule1(start_keyword, end_keyword, text, l_term):
    """
    Rule 1: close the scope of each cue at the first termination term that
    follows it.

    Parameters: {
    start_keyword: list of starting points of the cues.
    end_keyword: list of ending points of the cues.
    text: sentence
    l_term:  list of termination terms.
    }

    Output{
    l_pos_scope: one entry per cue, e.g. [[12,32],[]]. A found scope is
    [end_of_cue + 1, start_of_term - 1]; [] means no termination term was found.
    }

    For every cue except the last one, the search stops at the start of the
    next cue; for the last cue it runs to the end of the sentence.
    """
    # An empty term would match at every position.
    terms = [term for term in l_term if term]

    l_pos_scope = []
    last = len(end_keyword) - 1
    for i, cue_end in enumerate(end_keyword):
        limit = len(text) if i == last else start_keyword[i + 1]
        scope = []
        idx = cue_end
        while not scope and idx < limit:
            if text[idx] == " ":
                # A candidate term can only start right after a space.
                idx += 1
                # any() stops at the first matching term, so exactly one scope
                # is produced per cue even when several terms match here.
                if any(_starts_with_term(text, idx, term) for term in terms):
                    scope = [cue_end + 1, idx - 1]
            else:
                idx += 1
        l_pos_scope.append(scope)
    return l_pos_scope

Rule 2: If a sentence contains contiguous cues, the scope of cue $C_i$ ends at the position where the next cue, $C_{i+1}$, starts.

In [11]:
def rule2(keywords,ini_keyword,final_keyword,sentence,comma_window=4):
    """
    Rule 2: when a comma appears just before the next cue, the scope of the
    current cue runs from its end to the start of that next cue.

    Parameters{
    keywords: list of keywords (negations and uncertainty)
    ini_keyword: list of starting points of the keywords
    final_keyword: list of endpoints of the keywords
    sentence: sentence where you're applying rule 2.
    comma_window: number of characters before the next cue that are inspected
                  for a comma (4 by default).
    }

    Output{
    l_scopes: one entry per keyword, e.g. [[12,32],[]]. The last keyword has
    no following cue, so its entry is always []. No keywords gives [].
    }

    The input lists are not modified.
    """
    l_scopes = []

    for i in range(len(keywords) - 1):
        next_start = ini_keyword[i + 1]
        # max(0, ...) prevents a negative slice start, which would wrap around
        # to the end of the sentence when the next cue begins before index 4.
        window = sentence[max(0, next_start - comma_window):next_start]
        if ',' in window:
            l_scopes.append([final_keyword[i], next_start])
        else:
            l_scopes.append([])

    if keywords:
        l_scopes.append([])

    return l_scopes

In [12]:
def rule3(keywords,final_keyword,sentence):
    """
    Rule 3: when a period shows up within the three whitespace-separated
    tokens that follow a cue, the scope goes from the end of the cue up to
    the first period found after it.

    Parameters{
    keywords: list of keywords (negations and uncertainty)
    sentence: sentence where you're applying rule 3.
    final_keyword: list of end points.
    }

    Output{
    l_scopes: one entry per keyword, e.g. [[12,32],[]]; [] when no period is
    found in those three tokens.
    }

    """
    scopes = []
    if not keywords:
        return scopes

    for position in range(len(keywords)):
        cue_end = final_keyword[position]
        next_tokens = str(sentence[cue_end:]).split()[:3]

        if any("." in token for token in next_tokens):
            # A period is guaranteed to exist from cue_end onwards here.
            scopes.append([cue_end, sentence.index(".", cue_end)])
        else:
            scopes.append([])
    return scopes

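Rule 4: The sentence is POS-tagged, and the scope of a cue ends just before the first conjunction or verb (participles and gerunds excluded) that follows it.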


In [13]:
# Tagged sentences of the Spanish and Catalan CESS corpora.
corpus_esp = cess_esp.tagged_sents()
corpus_cat = cess_cat.tagged_sents()

# A single TnT tagger is trained on both corpora, Spanish first.
tnt_tagger = nltk.tag.tnt.TnT()
for tagged_corpus in (corpus_esp, corpus_cat):
    tnt_tagger.train(tagged_corpus)

In [14]:
# Maximal runs of letters; digits, punctuation, whitespace and "_" split words.
_WORD_RE = re.compile(r"[^\W\d_]+")


def _ends_scope(tag):
    """True for a tag starting with "c", or starting with "v" but not with "vmp"/"vmg".

    In the tagset of the CESS corpora these prefixes presumably stand for
    conjunctions, verbs, participles and gerunds -- confirm against the
    corpus documentation.
    """
    if tag.startswith("c"):
        return True
    return tag.startswith("v") and tag[:3] not in ("vmp", "vmg")


def rule4(start_keyword, end_keyword, text, tnt_tagger):
    """
    Rule 4: close the scope of each cue just before the first conjunction or
    verb (participles and gerunds excluded) that follows it.

    Parameters: {
    start_keyword: list of starting points.
    end_keyword: list of ending points.
    text: sentence
    tnt_tagger:  model trained in catalan/spanish to POS tag; only its
                 tag(list_of_tokens) method is used.
    }

    Output{
    l_pos_scope: one entry per cue, e.g. [[12,32],[]]. A found scope is
    [end_of_cue + 1, start_of_terminating_word - 1]; [] when nothing is found.
    }

    The words and their character offsets are taken from the same regex scan,
    so every tag is guaranteed to belong to the word found at its offset.
    The first word after a cue is never treated as a terminator. For every
    cue except the last one, the search stops at the start of the next cue.
    """
    words = [(match.start(), match.group()) for match in _WORD_RE.finditer(text)]
    starts = [start for start, _ in words]
    tokens = [word for _, word in words]
    tags = tnt_tagger.tag(tokens) if tokens else []

    l_pos_scope = []
    last = len(end_keyword) - 1
    for i, cue_end in enumerate(end_keyword):
        limit = len(text) if i == last else start_keyword[i + 1]
        # Indices of the words that start inside the search window.
        candidates = [k for k, start in enumerate(starts) if cue_end <= start < limit]

        scope = []
        # candidates[1:] skips the word immediately following the cue.
        for k in candidates[1:]:
            tag = tags[k][1] or ""
            if _ends_scope(tag):
                scope = [cue_end + 1, starts[k] - 1]
                break
        l_pos_scope.append(scope)
    return l_pos_scope

Rule 5: If the sentence does not match any of the previous rules, the algorithm generates a parse tree of the sentence and derives the scope from it.

In [15]:
# Auxiliary function for rule 5
def find_scope(sentence,word,nlp):
    """
    Estimate the scope of a cue from the dependency parse of its sentence.

    The words that form the scope are collected from the parse tree in two
    passes (the second pass overrides the first whenever it assigns a scope),
    and are then mapped back to character offsets with str.find on the
    lower-cased sentence.

    Parameters: {    
    sentence: Sentence to be parsed searching the scope
    word: That you are interested in finding the scope
    nlp: Object that is used to extract the sentence information it can be either spanish_nlp or catalan_nlp
    }
    
    Output{
    scope : example [12,39]; [] when neither offset could be determined.
    }

    Raises IndexError when the parse contains no token labelled "ROOT".
    """
    doc = nlp(sentence)
    
    # First token whose dependency label is "ROOT", and its direct children.
    root = [token for token in doc if token.dep_ == "ROOT"][0]
    childs = [child for child in root.children]

    # Pass 1: the cue is a direct child of the root.
    # `start` counts the children visited since the cue: 0 at the cue itself,
    # 1 at the child right after it. Starting from 4 means "cue not seen yet",
    # because incrementing 4 can never produce 1.
    start =  4
    scope = []
    for child in childs: # e.g. Sugestivos -> nódulos -> fiebre
      if child.text == word:
        start = 0
      else:
        start += 1

      if start == 1:
          grandchildren = [grandchild.text for grandchild in child.children]
          # scope is reassigned on every iteration, so it ends up holding the
          # children of the LAST grandchild; it stays untouched when `child`
          # has no children at all.
          for grandchild in child.children:
            greatgrandchildren = [greatgrandchild.text for greatgrandchild in grandchild.children]
        
            scope = [child.text] + grandchildren + greatgrandchildren  
    
    # Pass 2: the cue is a grandchild of the root.
    # `start` is set to 1 once the cue is found and is never reset, so every
    # grandchild visited afterwards (also under later children) is collected.
    start =  4
    grandchildren_list = []
    for child in childs: 
      for grandchildren in child.children:

        if grandchildren.text == word:
          start = 1
          # Text of the cue's syntactic head.
          head = [grandchildren.head.text]

        elif start == 1: # It needs to enter here every time after grandchildren.text == word
            grandchildren_list.append(grandchildren.text)
            greatgrandchildren_list = [greatgrandchildren.text for greatgrandchildren in grandchildren.children]

            # As in pass 1, scope is only (re)assigned when this node has children.
            for greatgrandchildren in grandchildren.children:
              greatgreatgrandchildren_list = [greatgreatgrandchildren.text for greatgreatgrandchildren in greatgrandchildren.children ]

              scope = head + grandchildren_list + greatgrandchildren_list + greatgreatgrandchildren_list

        
    # Offset right after the first occurrence of the cue in the lower-cased
    # sentence (find returns -1 when the cue is absent).
    start_index = sentence.lower().find(word) + len(word)
    # Sentinels meaning "not found yet". NOTE(review): offsets of 1000 or more
    # can never replace old_index because of the `< old_index` test below.
    old_index = 1000
    end_index = -1
    distance = start_index

    # NOTE: the loop variable rebinds the `word` parameter; from here on
    # `word` is a scope word, not the cue.
    for word in scope: 

        begin_index = sentence.lower().find(word)

        # Keep the smallest offset of a scope word located after the cue.
        if begin_index > start_index and begin_index < old_index:
            old_index = begin_index

        # First len(scope) whitespace-separated tokens after the cue.
        after_cue = sentence[start_index:].split()[:len(scope)] 

        # `distance` is not reset between iterations of the outer loop, so it
        # keeps growing with every scope word processed.
        # NOTE(review): confirm this accumulation is intended.
        for i in range(len(after_cue)):
           distance += len(after_cue[i])
 
        last_index = sentence.lower().find(word)
       
        # Push the end to the end of this scope word when it starts before
        # `distance` and after the current end.
        if last_index < distance and last_index > end_index:
            end_index = last_index + len(word)

    result = [old_index, end_index]
    # Only report "no scope" when both offsets still hold their sentinels.
    if old_index == 1000 and end_index==-1:
      return []
    
    else: 
       return result

In [16]:
# Parsers already loaded, keyed by "ca" / "es". Loading a model is expensive,
# so each one is created at most once instead of on every call to rule5.
_NLP_MODELS = {}


def _get_nlp(language):
    """Return the parser for ``language``: UDPipe Catalan for 'ca', spaCy Spanish otherwise."""
    key = "ca" if language == "ca" else "es"
    if key not in _NLP_MODELS:
        if key == "ca":  # Catalan
            _NLP_MODELS[key] = spacy_udpipe.load("ca")
        else:  # Spanish
            _NLP_MODELS[key] = spacy.load("es_core_news_md")
    return _NLP_MODELS[key]


def rule5(keywords,sentence):
    """
    Rule 5: derive the scope of each keyword from the parse tree of the
    sentence (see find_scope).

    Parameters{
    keywords: list of keywords (negations and uncertainty)
    sentence: sentence where you're applying rule 5.
    }

    Output{
    l_scopes: one entry per keyword, e.g. [[12,32],[]]
    }

    The language is detected with langdetect: 'ca' selects the Catalan
    parser, anything else the Spanish one. Spanish is also used when the
    language cannot be detected.
    """
    from langdetect.lang_detect_exception import LangDetectException

    # Nothing to do: skip language detection and model loading altogether.
    if not keywords:
        return []

    try:
        language = detect(sentence)
    except LangDetectException:
        # Raised e.g. for sentences without any usable letters.
        language = "es"

    nlp = _get_nlp(language)
    return [find_scope(sentence, word, nlp) for word in keywords]

In [17]:
# Cut the string so that it starts at the word "informe"
def eliminate_patient_information(string):
    """Return ``string`` from its first "informe" onwards; unchanged when the word is absent."""
    _, marker, tail = string.partition("informe")
    return marker + tail if marker else string

def eliminate_final_information(string):
    """Return the part of ``string`` that precedes its first "destinacio a l'alta"; unchanged when absent."""
    head, _, _ = string.partition("destinacio a l'alta")
    return head

def check_complete(list):
    """Return True when every sub-list of ``list`` holds at least one element."""
    return all(len(sublist) >= 1 for sublist in list)

def update_lists(a, b):
    """Merge two aligned lists of scopes, filling the empty entries of ``a`` from ``b``.

    An entry of ``b`` is taken only where the entry of ``a`` is empty and the
    one of ``b`` is not. The result is as long as the shorter input.
    """
    return [
        scope_b if (not scope_a and scope_b) else scope_a
        for scope_a, scope_b in zip(a, b)
    ]

In [18]:
def _cascade_scopes(keywords_appear, start_keyword, end_keyword, sentence):
    """Apply rules 1 to 5 in order until every cue of ``sentence`` has a scope.

    Each rule only fills the cues left without scope by the previous ones,
    and later rules are not evaluated once all cues are covered. The result
    always has exactly one entry per cue ([] when no rule found a scope).
    """
    n_cues = len(keywords_appear)
    # Rules are wrapped in callables so each one runs only if still needed.
    rules = (
        lambda: rule1(start_keyword, end_keyword, sentence, l_conj),
        lambda: rule2(keywords_appear.copy(), start_keyword.copy(), end_keyword.copy(), sentence),
        lambda: rule3(keywords_appear, end_keyword, sentence),
        lambda: rule4(start_keyword, end_keyword, sentence, tnt_tagger),
        lambda: rule5(keywords_appear, sentence),
    )

    scopes = [[] for _ in range(n_cues)]
    for rule in rules:
        scopes = update_lists(scopes, rule())
        # update_lists truncates to the shorter input; pad so that indexing
        # by cue position stays valid.
        scopes += [[] for _ in range(n_cues - len(scopes))]
        if check_complete(scopes):
            break
    return scopes


def model(input, l_neg, l_unc):
    """
    Detect negation / uncertainty cues and their scopes in a list of documents.

    Parameters{
    input: list of dictionaries with at least the key 'text'.
    l_neg: list of negation cues.
    l_unc: list of uncertainty cues.
    }

    Output{
    l_res: one dictionary per document, {'text': ..., 'result': [...]}, where
    every result is {"value": {"start", "end", "labels"}} with offsets
    relative to the complete document text. Cues are labelled NEG / UNC and
    their scopes NSCO / USCO. A cue present in both lists counts as NEG.
    }

    """
    l_res = [] # List to store the final results.

    keywords_voc = l_neg + l_unc
    for element in input:
        full_text = element['text']
        text = eliminate_patient_information(full_text)
        # Number of characters removed from the beginning of the document.
        base_offset = len(full_text) - len(text)
        # Position in `text` from which the next sentence is looked up.
        cursor = 0
        dictionary_res_list = []

        for sentence in nltk.sent_tokenize(text):
            # The tokenizer drops the whitespace between sentences, so adding
            # up sentence lengths drifts. Each sentence is located in the text
            # to obtain its real offset.
            located = text.find(sentence, cursor)
            if located != -1:
                cursor = located
            counter = base_offset + cursor
            cursor += len(sentence)

            keywords_appear, end_keyword, start_keyword = search_words(keywords_voc, sentence)
            if not keywords_appear:
                continue

            new_scope = _cascade_scopes(keywords_appear, start_keyword, end_keyword, sentence)

            for i, keyword in enumerate(keywords_appear):
                if keyword in l_neg:
                    cue_label, scope_label = "NEG", "NSCO"
                else:
                    cue_label, scope_label = "UNC", "USCO"

                res = {"value":{"start": start_keyword[i] + counter, "end": end_keyword[i] + counter, "labels": [cue_label]}}
                dictionary_res_list.append(res)

                scope = new_scope[i]
                if scope:
                    res = {"value":{"start": scope[0] + counter, "end": scope[1] + counter, "labels": [scope_label]}}
                    dictionary_res_list.append(res)

        l_res.append({'text': full_text, 'result': dictionary_res_list})
    return l_res

In [None]:
# Run the rule-based model over both splits, training set first.
l_res_training, l_res_test = [
    model(dataset, l_neg, l_unc) for dataset in (l_training, l_test)
]

In [None]:
# Serialise each list of predictions to its own JSON file.
for path, payload in (
    ("res_training_Rules.json", l_res_training),
    ("res_test_Rules.json", l_res_test),
):
    with open(path, "w") as f:
        json.dump(payload, f)