In [2]:
import spacy
from spacy import displacy
import pandas as pd
import json



In [3]:
# Load the spaCy pipeline package "pt_core_news_lg"; every later cell parses text through `nlp`.
nlp = spacy.load("pt_core_news_lg")

## 1. Tokens e funções específicas.

In [4]:
# Parse the example question that the token-level cells below inspect.
doc = nlp("Quantos quilos de milho já foram entregues pela fazenda Santa Rita no contrato atual?")

In [5]:
# Index of every token inside `doc` (Token.i is the token's position in its parent Doc).
seq_tokens_id = list(range(len(doc)))
seq_tokens_id

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]

In [6]:
# Token.children: the immediate syntactic children of each token in the dependency parse.
for token in doc:
    child_texts = [child.text for child in token.children]
    print(f'token: {token.text:>10} | pos_: {token.pos_:>7} | chil: {child_texts}')

token:    Quantos | pos_:     DET | chil: []
token:     quilos | pos_:    NOUN | chil: ['Quantos', 'milho']
token:         de | pos_:     ADP | chil: []
token:      milho | pos_:    NOUN | chil: ['de']
token:         já | pos_:     ADV | chil: []
token:      foram | pos_:     AUX | chil: []
token:  entregues | pos_:    VERB | chil: ['quilos', 'já', 'foram', 'fazenda', 'contrato', '?']
token:       pela | pos_:     ADP | chil: []
token:    fazenda | pos_:    NOUN | chil: ['pela', 'Santa']
token:      Santa | pos_:   PROPN | chil: ['Rita']
token:       Rita | pos_:   PROPN | chil: []
token:         no | pos_:     ADP | chil: []
token:   contrato | pos_:    NOUN | chil: ['no', 'atual']
token:      atual | pos_:     ADJ | chil: []
token:          ? | pos_:   PUNCT | chil: []


In [7]:
# Token.subtree: the token itself plus all of its syntactic descendants.
for token in doc:
    subtree_texts = [node.text for node in token.subtree]
    # The label says "subtree" (it used to be a copy-pasted "chil") so the printout names the attribute shown.
    print(f'token: {token.text:>10} | pos_: {token.pos_:>7} | subtree: {subtree_texts}')

token:    Quantos | pos_:     DET | chil: ['Quantos']
token:     quilos | pos_:    NOUN | chil: ['Quantos', 'quilos', 'de', 'milho']
token:         de | pos_:     ADP | chil: ['de']
token:      milho | pos_:    NOUN | chil: ['de', 'milho']
token:         já | pos_:     ADV | chil: ['já']
token:      foram | pos_:     AUX | chil: ['foram']
token:  entregues | pos_:    VERB | chil: ['Quantos', 'quilos', 'de', 'milho', 'já', 'foram', 'entregues', 'pela', 'fazenda', 'Santa', 'Rita', 'no', 'contrato', 'atual', '?']
token:       pela | pos_:     ADP | chil: ['pela']
token:    fazenda | pos_:    NOUN | chil: ['pela', 'fazenda', 'Santa', 'Rita']
token:      Santa | pos_:   PROPN | chil: ['Santa', 'Rita']
token:       Rita | pos_:   PROPN | chil: ['Rita']
token:         no | pos_:     ADP | chil: ['no']
token:   contrato | pos_:    NOUN | chil: ['no', 'contrato', 'atual']
token:      atual | pos_:     ADJ | chil: ['atual']
token:          ? | pos_:   PUNCT | chil: ['?']


In [8]:
# Token.n_rights: the number of immediate children to the right of the token in the dependency parse.
for token in doc:
    right_count = token.n_rights
    print(f'token: {token.text:>10} | pos_: {token.pos_:>7} | n_rights: {right_count}')

token:    Quantos | pos_:     DET | n_rights: 0
token:     quilos | pos_:    NOUN | n_rights: 1
token:         de | pos_:     ADP | n_rights: 0
token:      milho | pos_:    NOUN | n_rights: 0
token:         já | pos_:     ADV | n_rights: 0
token:      foram | pos_:     AUX | n_rights: 0
token:  entregues | pos_:    VERB | n_rights: 3
token:       pela | pos_:     ADP | n_rights: 0
token:    fazenda | pos_:    NOUN | n_rights: 1
token:      Santa | pos_:   PROPN | n_rights: 1
token:       Rita | pos_:   PROPN | n_rights: 0
token:         no | pos_:     ADP | n_rights: 0
token:   contrato | pos_:    NOUN | n_rights: 1
token:      atual | pos_:     ADJ | n_rights: 0
token:          ? | pos_:   PUNCT | n_rights: 0


In [9]:
# Token.rights: the immediate children to the right of the token in the dependency parse.
for token in doc:
    right_texts = [child.text for child in token.rights]
    print(f'token: {token.text:>10} | pos_: {token.pos_:>7} | rights: {right_texts}')

token:    Quantos | pos_:     DET | rights: []
token:     quilos | pos_:    NOUN | rights: ['milho']
token:         de | pos_:     ADP | rights: []
token:      milho | pos_:    NOUN | rights: []
token:         já | pos_:     ADV | rights: []
token:      foram | pos_:     AUX | rights: []
token:  entregues | pos_:    VERB | rights: ['fazenda', 'contrato', '?']
token:       pela | pos_:     ADP | rights: []
token:    fazenda | pos_:    NOUN | rights: ['Santa']
token:      Santa | pos_:   PROPN | rights: ['Rita']
token:       Rita | pos_:   PROPN | rights: []
token:         no | pos_:     ADP | rights: []
token:   contrato | pos_:    NOUN | rights: ['atual']
token:      atual | pos_:     ADJ | rights: []
token:          ? | pos_:   PUNCT | rights: []


In [10]:
# Token.n_lefts: the number of immediate children to the left of the token in the dependency parse.
for token in doc:
    left_count = token.n_lefts
    # The label now reads "n_lefts"; it previously printed "n_rights" (copy-paste slip) next to the n_lefts value.
    print(f'token: {token.text:>10} | pos_: {token.pos_:>7} | n_lefts: {left_count}')

token:    Quantos | pos_:     DET | n_rights: 0
token:     quilos | pos_:    NOUN | n_rights: 1
token:         de | pos_:     ADP | n_rights: 0
token:      milho | pos_:    NOUN | n_rights: 1
token:         já | pos_:     ADV | n_rights: 0
token:      foram | pos_:     AUX | n_rights: 0
token:  entregues | pos_:    VERB | n_rights: 3
token:       pela | pos_:     ADP | n_rights: 0
token:    fazenda | pos_:    NOUN | n_rights: 1
token:      Santa | pos_:   PROPN | n_rights: 0
token:       Rita | pos_:   PROPN | n_rights: 0
token:         no | pos_:     ADP | n_rights: 0
token:   contrato | pos_:    NOUN | n_rights: 1
token:      atual | pos_:     ADJ | n_rights: 0
token:          ? | pos_:   PUNCT | n_rights: 0


In [11]:
# Token.lefts: the immediate children to the left of the token in the dependency parse.
for token in doc:
    left_texts = [child.text for child in token.lefts]
    print(f'token: {token.text:>10} | pos_: {token.pos_:>7} | lefts: {left_texts}')

token:    Quantos | pos_:     DET | lefts: []
token:     quilos | pos_:    NOUN | lefts: ['Quantos']
token:         de | pos_:     ADP | lefts: []
token:      milho | pos_:    NOUN | lefts: ['de']
token:         já | pos_:     ADV | lefts: []
token:      foram | pos_:     AUX | lefts: []
token:  entregues | pos_:    VERB | lefts: ['quilos', 'já', 'foram']
token:       pela | pos_:     ADP | lefts: []
token:    fazenda | pos_:    NOUN | lefts: ['pela']
token:      Santa | pos_:   PROPN | lefts: []
token:       Rita | pos_:   PROPN | lefts: []
token:         no | pos_:     ADP | lefts: []
token:   contrato | pos_:    NOUN | lefts: ['no']
token:      atual | pos_:     ADJ | lefts: []
token:          ? | pos_:   PUNCT | lefts: []


In [12]:
# Token.ancestors: the sequence of the token's syntactic ancestors (parent, grandparent, and so on).
for token in doc:
    ancestor_texts = [ancestor.text for ancestor in token.ancestors]
    print(f'token: {token.text:>10} | pos_: {token.pos_:>7} | ancestors: {ancestor_texts}')

token:    Quantos | pos_:     DET | ancestors: ['quilos', 'entregues']
token:     quilos | pos_:    NOUN | ancestors: ['entregues']
token:         de | pos_:     ADP | ancestors: ['milho', 'quilos', 'entregues']
token:      milho | pos_:    NOUN | ancestors: ['quilos', 'entregues']
token:         já | pos_:     ADV | ancestors: ['entregues']
token:      foram | pos_:     AUX | ancestors: ['entregues']
token:  entregues | pos_:    VERB | ancestors: []
token:       pela | pos_:     ADP | ancestors: ['fazenda', 'entregues']
token:    fazenda | pos_:    NOUN | ancestors: ['entregues']
token:      Santa | pos_:   PROPN | ancestors: ['fazenda', 'entregues']
token:       Rita | pos_:   PROPN | ancestors: ['Santa', 'fazenda', 'entregues']
token:         no | pos_:     ADP | ancestors: ['contrato', 'entregues']
token:   contrato | pos_:    NOUN | ancestors: ['entregues']
token:      atual | pos_:     ADJ | ancestors: ['contrato', 'entregues']
token:          ? | pos_:   PUNCT | ancestors: ['entregues']

In [None]:
# Token.is_ancestor: check whether this token is a parent, grandparent, etc. of another token
# in the dependency tree.
quantos, quilos = doc[0], doc[1]
quilos.is_ancestor(quantos)

In [None]:
# Token.nbor: the neighbouring token at a relative offset (called here with the default offset).
print(doc)
print()

for token in doc:
    # nbor() looks one position to the right and raises IndexError when that position is past
    # the end of the Doc, so the final token is reported as having no neighbour instead of
    # crashing the cell.
    has_right_neighbour = token.i + 1 < len(doc)
    nbor_text = token.nbor().text if has_right_neighbour else None
    print(f'token: {token.text:>10} | pos_: {token.pos_:>7} | nbor: {nbor_text}')

In [None]:
# Token.conjuncts: a tuple of coordinated tokens, not including the token itself.
# Note: this cell rebinds `doc` to a new example sentence.
doc = nlp("Eu gosto de maças e laranjas")

for token in doc:
    conjunct_texts = [other.text for other in token.conjuncts]
    print(f'token: {token.text:>10} | pos_: {token.pos_:>7} | conjuncts: {conjunct_texts}')

In [None]:
from spacy.morphology import Morphology

In [None]:
from spacy.strings import StringStore

In [None]:
# Build a StringStore seeded with two example strings.
stringstore = StringStore(["apple", "orange"])

In [None]:
# Create a Morphology object from the `stringstore` defined in the previous cell.
morphology = Morphology(stringstore)

In [None]:
# feats_to_dict converts a FEATS string into a {feature: value} dictionary. It has to be *called*
# with such a string; the bare attribute access only bound the function object to `d`.
# The FEATS value below is an illustrative example.
d = morphology.feats_to_dict("Case=Nom|Number=Sing")

In [None]:
# Show the current value of `d` as the cell output.
d

## 2. noun_chunks e similaridade 

In [None]:
# Rebind `doc` to the original example question (an earlier cell replaced it with another sentence).
doc = nlp("Quantos quilos de milho já foram entregues pela fazenda Santa Rita no contrato atual?")

In [None]:
# Index of every token inside the current `doc` (Token.i is the token's position in its parent Doc).
seq_tokens_id = list(range(len(doc)))
seq_tokens_id

In [None]:
# Tabulate the main attributes of every token in `doc`, one row per token.
# All rows are collected first and the DataFrame is built in a single call, instead of growing
# an empty frame one cell at a time with `.loc` and a manual row counter.
seq_tokens_id = [token.i for token in doc]

token_columns = [
    "idx", "T_texto", "T_shape_", "T_ent_type_", "T_ent_id_",
    "T_ent_iob_", "T_pos_", "T_lemma_", "T_dep_", "T_head",
]

token_rows = [
    {
        "idx": token.i,
        "T_texto": token.text,
        "T_shape_": token.shape_,
        "T_ent_type_": token.ent_type_,
        "T_ent_id_": token.ent_id_,
        "T_ent_iob_": token.ent_iob_,
        "T_pos_": token.pos_,
        "T_lemma_": token.lemma_,
        "T_dep_": token.dep_,
        "T_head": token.head,  # stored as the spaCy Token object itself, as before
    }
    for token in doc
]

# Passing `columns` keeps the column order and yields an empty, correctly-labelled frame
# when `doc` has no tokens.
significative_tokens = pd.DataFrame(token_rows, columns=token_columns)

significative_tokens

In [None]:
# Rebind `doc` to the user request analysed by the noun-chunk cells below.
doc = nlp("Gostaria de saber o saldo do meu contrato")

In [None]:
def relevant_chunk(doc, param):
    """Return the text of the first matching noun chunk of ``doc``, or ``None``.

    A chunk matches when the lower-cased lemma of its root's head is
    "gostaria" or "qual" and its root's dependency label equals ``param``.
    """
    trigger_lemmas = ("gostaria", "qual")
    matching_texts = (
        noun_chunk.text
        for noun_chunk in doc.noun_chunks
        if noun_chunk.root.head.lemma_.lower() in trigger_lemmas
        and noun_chunk.root.dep_ == param
    )
    # First hit wins; None when nothing matches.
    return next(matching_texts, None)

# chunk.text, chunk.start, chunk.end, chunk.root.head.lemma_, chunk.root.dep_, chunk.doc
def load_json(filename):
    """Read the UTF-8 encoded JSON file at ``filename`` and return the decoded content."""
    with open(filename, mode="r", encoding="utf-8") as handle:
        return json.load(handle)
            

In [None]:
# NOTE(review): absolute, machine-specific path — the notebook only runs where this file exists;
# consider a path relative to a configurable data directory.
filename = "/home/wklinux/spaCy/query_utter.json"  

# Decoded JSON content; a later cell indexes it by position and reads keys such as
# "user_utter" and "bot_utter" from each record.
data = load_json(filename)

In [None]:
# Visualise the dependency structure around each noun chunk: the chunk text, its root,
# the root's dependency label, the root's head, and the head's dependency label and lemma.
for chunk in doc.noun_chunks:
    root = chunk.root
    head = root.head
    # The previously assigned-but-unused chunk.ents / root.ent_type_ values are no longer fetched.
    print(
        f'1.chunk.text: {chunk.text:>15} | 2.ch.root: {root.text:>10} '
        f'| 3.chunk.root.dep_: {root.dep_:>6} | 4.ch.root.head: {head.text:>12} '
        f'| 5.ch.root.head.dep_: {head.dep_:>6} |  6.chunk.root.head.lemma_: {head.lemma_:>9}'
    )

In [None]:
# Compare each noun chunk with the lemma of its root's head.
# NOTE(review): `relevant_chunks` is never filled in this cell; the selection rule was only
# sketched (append the head lemma and the chunk text when they differ) and left unfinished.
relevant_chunks = []

# Pre-bound so the summary below cannot raise NameError when `doc` has no noun chunks.
chunk_text = None
chunk_root_head_lemma = None

for chunk in doc.noun_chunks:
    # Both sides are re-parsed as standalone Docs so Doc.similarity can be applied afterwards.
    chunk_text = nlp(chunk.text)
    print(chunk_text)
    chunk_root_head_lemma = nlp(chunk.root.head.lemma_)
    print(chunk_root_head_lemma)

# Only the values from the LAST chunk survive the loop, so the similarity printed here
# refers to that last chunk alone.
if chunk_text is not None:
    print(chunk_text.similarity(chunk_root_head_lemma))
    print()
    print(chunk_root_head_lemma)
    print()
print(relevant_chunks)

In [None]:
def relevant_chunk(doc, param, head_lemmas=("gostaria", "qual")):
    """Return the text of the first noun chunk attached to one of ``head_lemmas``.

    Args:
        doc: Object exposing ``noun_chunks``; each chunk must provide ``text``,
            ``root.dep_`` and ``root.head.lemma_``.
        param: Dependency label the chunk's root must carry.
        head_lemmas: Iterable of lemma strings accepted for the head of the chunk's
            root. Compared case-insensitively. Defaults to the two lemmas that were
            previously hard-coded.

    Returns:
        The text of the first matching chunk, or ``None`` when no chunk matches.
    """
    accepted = {lemma.lower() for lemma in head_lemmas}
    for chunk in doc.noun_chunks:
        root = chunk.root
        if root.head.lemma_.lower() in accepted and root.dep_ == param:
            return chunk.text
    # Explicit so callers can see that "no match" is an expected outcome.
    return None

In [None]:
# Match every noun chunk of `doc` against the noun chunks of each stored user utterance.
# Two chunks match when the heads of their roots share a lemma (compared case-insensitively)
# and their roots carry the same dependency label; on a match the stored bot answer and
# key fields of that record are printed.
print(doc)
print()
print("==============================================================")

# Parse every stored utterance once, up front; previously each utterance was re-parsed
# for every noun chunk of `doc`.
parsed_queries = [nlp(entry["user_utter"]) for entry in data]

for chunk in doc.noun_chunks:
    param_chunk_root_dep_ = chunk.root.dep_
    # Lower-cased so the comparison is symmetric with the lower-cased query lemma below
    # (the query side was lower-cased while this side was not).
    chunk_root_head_lemma = chunk.root.head.lemma_.lower()

    for i, (entry, doc_query) in enumerate(zip(data, parsed_queries)):
        print(f'\n2. indice: {i} | doc_query: {doc_query} ===================\n')

        for chunk_query in doc_query.noun_chunks:
            chunk_query_root_head_lemma = chunk_query.root.head.lemma_.lower()
            chunk_query_root_dep = chunk_query.root.dep_
            print(f'\n3. dentro segundo loop - chunk_query: {chunk_query.text} | chunk_query_root_head_lemma: {chunk_query_root_head_lemma} | chunk_query_root_dep: {chunk_query_root_dep}')
            if chunk_query_root_head_lemma == chunk_root_head_lemma:
                print(f'\n4. primeira condicao:  True para chunk_root_head_lemma_: {chunk_query_root_head_lemma}')
                if chunk_query_root_dep == param_chunk_root_dep_:
                    # The record fields are only printed, so they are used as stored instead of
                    # being run through the pipeline first (a Doc prints as its original text).
                    bot_utter = entry["bot_utter"]
                    user_keys = entry["user_keys"]
                    user_desire = entry["user_des"]
                    missed_keys = entry["miss_key"]
                    print(f'\n1. Resposta do bot: {bot_utter}   | 2. user_keys: {user_keys}  |  3. user_desire: {user_desire} | 4. missed_keys: {missed_keys}')
                else:
                    print(f'\n5. segunda cond:  False para chunk_query_root_dep')
                    print()
            else:
                print(f'\n4. deu False')
                print()

In [None]:
# Show the text of every token in the current `doc`.
[token.text for token in doc]

In [None]:
# Example record with the same keys the matching loop reads from `data`
# (user_utter, bot_utter, user_keys, user_des, miss_key).
# NOTE(review): the trailing comma makes this cell evaluate to a 1-tuple containing the dict.
{"user_utter":"Qual seria o saldo do meu contrato?","bot_utter":"Claro, por favor, informe o número do contrato.", "user_keys":"NM_FAZENDA,ID_SAFRA", "user_des":"QT_SALDO_CONTRATO", "miss_key":"NR_CONTRATO"},

In [None]:
# Print the stored bot answer of every record. Iterating over the records directly replaces
# the index loop and the redundant `i = 0` that preceded it.
for entry in data:
    print(entry["bot_utter"])

In [None]:
# One line per noun chunk: token span, chunk text, root, the root's head, the head's lemma
# and the dependency labels of root and head.
for chunk in doc.noun_chunks:
    root = chunk.root
    head = root.head
    param_chunk_root_dep_ = root.dep_
    print(
        f'start: {chunk.start:>2} end: {chunk.end:>2} | chunk.text: {chunk.text:>12} '
        f'|| chunk.root: {root.text:>12} | chunk.root.head: {head.text:>12} '
        f'| chunk.root.head.lemma_: {head.lemma_:>9} | chunk.root.dep_: {root.dep_:>6} '
        f'|| chunk.root.head.dep_: {head.dep_:>6}'
    )