## 1. Ents

In [1]:
import json
import spacy
from spacy import displacy
import pandas as pd
from spacy.tokens import Span


nlp = spacy.load("pt_core_news_sm")

In [2]:
# Swap the statistical NER for a rule-based EntityRuler.
# Guarded so the cell can be re-run on a live kernel: remove_pipe() raises when the
# component is already gone and add_pipe() raises when the name already exists.
if "ner" in nlp.pipe_names:
    ner = nlp.remove_pipe("ner")
if "entity_ruler" in nlp.pipe_names:
    ruler = nlp.get_pipe("entity_ruler")
else:
    ruler = nlp.add_pipe("entity_ruler")

In [3]:
# Principais funcoes

def show_ent_new(text, patterns):
    """Process ``text`` with the notebook pipeline after registering ``patterns``.

    Relies on the notebook-level ``nlp`` pipeline and its ``ruler`` (EntityRuler).

    Args:
        text: Raw text to analyse.
        patterns: EntityRuler pattern dicts (``label``, ``pattern``, optional ``id``).

    Returns:
        ``(doc, tokens, ents)``: the processed doc, a list of
        ``(token_text, start_char, end_char)`` tuples (one per token), and a list of
        labelled spans (one per entry of ``doc.ents``).
    """
    # This function is called repeatedly with the same pattern list; only register the
    # patterns the ruler does not hold yet so they do not pile up on every call.
    known = ruler.patterns
    fresh = [p for p in patterns if p not in known]
    if fresh:
        ruler.add_patterns(fresh)

    doc = nlp(text)

    ents = [
        doc.char_span(ent.start_char, ent.end_char, label=ent.label_)
        for ent in doc.ents
    ]
    tokens = [(token.text, token.idx, token.idx + len(token)) for token in doc]

    return doc, tokens, ents


def write_patterns_to_file(patterns, colors, filename):
    """Serialise ``patterns`` and ``colors`` to ``filename`` as a JSON object.

    The file holds ``{"patterns": ..., "colors": ...}``. It is written as UTF-8
    because ``ensure_ascii=False`` emits non-ASCII characters verbatim, and the
    matching loader reads the file as UTF-8.
    """
    data = {"patterns": patterns, "colors": colors}
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
        
def load_patterns_and_colors(filename):
    """Read a UTF-8 JSON file and return its ``"patterns"`` and ``"colors"`` entries."""
    with open(filename, "r", encoding="utf-8") as handle:
        payload = json.load(handle)
    return payload["patterns"], payload["colors"]




# chunk.text, chunk.start, chunk.end, chunk.root.head.lemma_, chunk.root.dep_, chunk.doc
def load_json(filename):
    """Parse the UTF-8 encoded JSON file at ``filename`` and return the decoded object."""
    with open(filename, "r", encoding="utf-8") as handle:
        return json.load(handle)

In [4]:
# displaCy background (CSS value) per entity label.
colors = {
    "CULTURA": "linear-gradient(90deg, #2ADB5E, #1FA346)",
    "TOTAL": "linear-gradient(90deg, #09D6FF, #08A0D1)",
    "ENTREGUE": "linear-gradient(90deg, #09D6FF, #08A0D1)",
    "SALDO": "linear-gradient(90deg, #09D6FF, #08A0D1)",
    "FAZENDA": "linear-gradient(90deg, #aa9cfc, #fc9ce7)",
    "SAFRA": "linear-gradient(90deg, #FFC90E, #BA930A)",
    "CONTRATO": "linear-gradient(90deg, #B5B5B5, #8A8A8A)",
}

# EntityRuler patterns. Every token pattern carries at least one mandatory token:
# a pattern made only of optional ("?" / "*") tokens matches any one of them on its
# own (e.g. a lone "quantidade" would be tagged ENTREGUE) as well as arbitrary runs.
# Alternative spellings are expressed with {"IN": [...]} instead.

patternsOthers = [
    {"label": "PERSON", "pattern": "Daniel", "id": "daniel"},
    {
        "label": "ORG",
        "pattern": [{"LOWER": "fast"}, {"LOWER": "innovation"}],
        "id": "fast-innovation",
    },
]

patternsCult = [
    {
        "label": "CULTURA",
        "pattern": [{"LOWER": {"IN": ["soja", "milho"]}}],
    },
]

patternsQuant = [
    {
        "label": "TOTAL",
        "pattern": [
            {"LOWER": "total"},
            {"LOWER": "quantidade", "OP": "?"},
        ],
    },
    {
        "label": "ENTREGUE",
        "pattern": [
            {"LOWER": "quantidade", "OP": "?"},
            {"LOWER": {"IN": ["entregue", "entregues", "entregado", "entreguei"]}},
        ],
    },
    {
        "label": "SALDO",
        "pattern": [
            {"LOWER": "saldo"},
            {"LOWER": "quantidade", "OP": "?"},
            {"LOWER": "total", "OP": "?"},
        ],
    },
]

patternsSafra = [
    {
        "label": "SAFRA",
        "pattern": [
            {"LOWER": {"IN": ["safra", "safras"]}, "OP": "?"},
            {"SHAPE": "dd/dd"},
        ],
    },
]

patternsFazenda = [
    {
        "label": "FAZENDA",
        "pattern": [{"ORTH": "Santa"}, {"ORTH": "Rita"}],
    },
]

patternsContrato = [
    # The word "contrato(s)", optionally followed by a dddX-shaped token.
    {
        "label": "CONTRATO",
        "pattern": [
            {"LOWER": {"IN": ["contrato", "contratos"]}},
            {"SHAPE": "dddX", "OP": "?"},
        ],
    },
    # A dddX-shaped token on its own (the original all-optional pattern matched it too).
    {
        "label": "CONTRATO",
        "pattern": [{"SHAPE": "dddX"}],
    },
]

patterns = patternsCult + patternsQuant + patternsFazenda + patternsSafra + patternsContrato + patternsOthers

In [5]:
text = "Quantos quilos de milho já foram entregues pela fazenda Santa Rita no contrato atual?"

In [6]:
doc, tokens, ents = show_ent_new(text, patterns=patterns)

# Entity texts are appended to this list by a later cell.
seq_tokens_valor = []

# Token indices of the sentence, in document (ascending) order.
seq_tokens_id = [token.i for token in doc]

In [7]:
displacy.render(doc, style="ent", options={"colors": colors})

In [8]:
# Entity table: token start | text | label | token end || char start | char end.
# Each entity text is also collected in seq_tokens_valor.
for entity in doc.ents:
    print(f'{entity.start:>2} | {entity.text:>20} | {entity.label_:>8} | {entity.end:>2} || {entity.start_char:>2} | {entity.end_char:>2}')
    seq_tokens_valor.append(entity.text)

 3 |                milho |  CULTURA |  4 || 18 | 23
 6 |            entregues | ENTREGUE |  7 || 33 | 42
 9 |           Santa Rita |  FAZENDA | 11 || 56 | 66
12 |             contrato | CONTRATO | 13 || 70 | 78


## 2. Noun_chunks

2.1 - analises noun_chunks

In [9]:
# Summary view of the noun chunks: text, token span and character span.
for chunk in doc.noun_chunks:
    print(f'1.chunk.text: {chunk.text:>15} |  {chunk.start:>2}  {chunk.end:>2} |  {chunk.start_char:>2} {chunk.end_char:>2} ')

1.chunk.text:  Quantos quilos |   0   2 |   0 14 
1.chunk.text:           milho |   3   4 |  18 23 
1.chunk.text:         fazenda |   8   9 |  48 55 
1.chunk.text:      Santa Rita |   9  11 |  56 66 
1.chunk.text:  contrato atual |  12  14 |  70 84 


In [16]:
# Summary view of the noun chunks: root token, its dependency label, the entity type
# of the root and the entities contained in the chunk.
for chunk in doc.noun_chunks:
    root = chunk.root
    print(f'1.chunk.text: {chunk.text:>15} | 2.ch.root: {root.text:>10} | 3.chunk.root.dep_: {root.dep_:>10} | 4.chunk_root_ent_type {root.ent_type_:>12} | 5.chunk_ents {chunk.ents} ')

1.chunk.text:  Quantos quilos | 2.ch.root:     quilos | 3.chunk.root.dep_: nsubj:pass | 4.chunk_root_ent_type              | 5.chunk_ents [] 
1.chunk.text:           milho | 2.ch.root:      milho | 3.chunk.root.dep_:       nmod | 4.chunk_root_ent_type      CULTURA | 5.chunk_ents [milho] 
1.chunk.text:         fazenda | 2.ch.root:    fazenda | 3.chunk.root.dep_:  obl:agent | 4.chunk_root_ent_type              | 5.chunk_ents [] 
1.chunk.text:      Santa Rita | 2.ch.root:      Santa | 3.chunk.root.dep_:      appos | 4.chunk_root_ent_type      FAZENDA | 5.chunk_ents [Santa Rita] 
1.chunk.text:  contrato atual | 2.ch.root:   contrato | 3.chunk.root.dep_:       nmod | 4.chunk_root_ent_type     CONTRATO | 5.chunk_ents [contrato] 


In [17]:
# Dependency structure per noun chunk: root, root dep label, and the root's head
# (text, dep label, lemma). Chunk texts are collected in chunks_valor.
chunks_valor = []

for chunk in doc.noun_chunks:
    root = chunk.root
    head = root.head
    chunks_valor.append(chunk.text)
    print(f'1.chunk.text: {chunk.text:>15} | 2.ch.root: {root.text:>10} | 3.chunk.root.dep_: {root.dep_:>15} | 4.ch.root.head: {head.text:>12} | 5.ch.root.head.dep_: {head.dep_:>10} |  6.chunk.root.head.lemma_: {head.lemma_:>9}')

1.chunk.text:  Quantos quilos | 2.ch.root:     quilos | 3.chunk.root.dep_:      nsubj:pass | 4.ch.root.head:    entregues | 5.ch.root.head.dep_:       ROOT |  6.chunk.root.head.lemma_:  entregar
1.chunk.text:           milho | 2.ch.root:      milho | 3.chunk.root.dep_:            nmod | 4.ch.root.head:       quilos | 5.ch.root.head.dep_: nsubj:pass |  6.chunk.root.head.lemma_:     quilo
1.chunk.text:         fazenda | 2.ch.root:    fazenda | 3.chunk.root.dep_:       obl:agent | 4.ch.root.head:    entregues | 5.ch.root.head.dep_:       ROOT |  6.chunk.root.head.lemma_:  entregar
1.chunk.text:      Santa Rita | 2.ch.root:      Santa | 3.chunk.root.dep_:           appos | 4.ch.root.head:      fazenda | 5.ch.root.head.dep_:  obl:agent |  6.chunk.root.head.lemma_:   fazenda
1.chunk.text:  contrato atual | 2.ch.root:   contrato | 3.chunk.root.dep_:            nmod | 4.ch.root.head:      fazenda | 5.ch.root.head.dep_:  obl:agent |  6.chunk.root.head.lemma_:   fazenda


<h3><mark>Merging noun_chunks</mark></h3>

In [10]:
# Utterance records used by the query loop further down.
# NOTE(review): absolute path on the author's machine — the notebook will not run
# elsewhere unless this is made configurable or relative to the project.
filename = "/home/wklinux/spaCy/query_utter.json"  

data = load_json(filename)

In [11]:
text = "Quantos quilos de milho já foram entregues pela fazenda Santa Rita no contrato atual?"

In [12]:
doc2, tokens, ents = show_ent_new(text, patterns=patterns)

# Entity texts are appended to this list by a later cell.
seq_tokens_valor = []

# Token indices of the sentence, in document (ascending) order.
seq_tokens_id = [token.i for token in doc2]

In [13]:
doc2

Quantos quilos de milho já foram entregues pela fazenda Santa Rita no contrato atual?

In [14]:
# add_pipe() raises if the component name is already in the pipeline, so guard the
# call to keep this cell safe to re-run. Docs created before this point are not
# affected; only texts processed afterwards get their noun chunks merged.
if "merge_noun_chunks" not in nlp.pipe_names:
    nlp.add_pipe("merge_noun_chunks")

<function spacy.pipeline.functions.merge_noun_chunks(doc: spacy.tokens.doc.Doc) -> spacy.tokens.doc.Doc>

In [15]:
# Dependency structure per noun chunk: root, root dep label, and the root's head
# (text, dep label, lemma). Chunk texts are collected in chunks_valor.
# NOTE(review): this iterates `doc`, which was parsed before merge_noun_chunks was
# added, so the output is the un-merged analysis — confirm whether a re-parsed doc
# was intended here.
chunks_valor = []

for chunk in doc.noun_chunks:
    root = chunk.root
    head = root.head
    chunks_valor.append(chunk.text)
    print(f'1.chunk.text: {chunk.text:>15} | 2.ch.root: {root.text:>10} | 3.chunk.root.dep_: {root.dep_:>15} | 4.ch.root.head: {head.text:>12} | 5.ch.root.head.dep_: {head.dep_:>10} |  6.chunk.root.head.lemma_: {head.lemma_:>9}')

1.chunk.text:  Quantos quilos | 2.ch.root:     quilos | 3.chunk.root.dep_:      nsubj:pass | 4.ch.root.head:    entregues | 5.ch.root.head.dep_:       ROOT |  6.chunk.root.head.lemma_:  entregar
1.chunk.text:           milho | 2.ch.root:      milho | 3.chunk.root.dep_:            nmod | 4.ch.root.head:       quilos | 5.ch.root.head.dep_: nsubj:pass |  6.chunk.root.head.lemma_:     quilo
1.chunk.text:         fazenda | 2.ch.root:    fazenda | 3.chunk.root.dep_:       obl:agent | 4.ch.root.head:    entregues | 5.ch.root.head.dep_:       ROOT |  6.chunk.root.head.lemma_:  entregar
1.chunk.text:      Santa Rita | 2.ch.root:      Santa | 3.chunk.root.dep_:           appos | 4.ch.root.head:      fazenda | 5.ch.root.head.dep_:  obl:agent |  6.chunk.root.head.lemma_:   fazenda
1.chunk.text:  contrato atual | 2.ch.root:   contrato | 3.chunk.root.dep_:            nmod | 4.ch.root.head:      fazenda | 5.ch.root.head.dep_:  obl:agent |  6.chunk.root.head.lemma_:   fazenda


In [16]:
# Dependency structure per noun chunk: root, root dep label, and the root's head
# (text, dep label, lemma). Chunk texts are collected in chunks_valor.
# NOTE(review): same analysis as the previous cell, again on the pre-merge `doc` —
# confirm whether this duplicate is needed.
chunks_valor = []

for chunk in doc.noun_chunks:
    root = chunk.root
    head = root.head
    chunks_valor.append(chunk.text)
    print(f'1.chunk.text: {chunk.text:>15} | 2.ch.root: {root.text:>10} | 3.chunk.root.dep_: {root.dep_:>15} | 4.ch.root.head: {head.text:>12} | 5.ch.root.head.dep_: {head.dep_:>10} |  6.chunk.root.head.lemma_: {head.lemma_:>9}')

1.chunk.text:  Quantos quilos | 2.ch.root:     quilos | 3.chunk.root.dep_:      nsubj:pass | 4.ch.root.head:    entregues | 5.ch.root.head.dep_:       ROOT |  6.chunk.root.head.lemma_:  entregar
1.chunk.text:           milho | 2.ch.root:      milho | 3.chunk.root.dep_:            nmod | 4.ch.root.head:       quilos | 5.ch.root.head.dep_: nsubj:pass |  6.chunk.root.head.lemma_:     quilo
1.chunk.text:         fazenda | 2.ch.root:    fazenda | 3.chunk.root.dep_:       obl:agent | 4.ch.root.head:    entregues | 5.ch.root.head.dep_:       ROOT |  6.chunk.root.head.lemma_:  entregar
1.chunk.text:      Santa Rita | 2.ch.root:      Santa | 3.chunk.root.dep_:           appos | 4.ch.root.head:      fazenda | 5.ch.root.head.dep_:  obl:agent |  6.chunk.root.head.lemma_:   fazenda
1.chunk.text:  contrato atual | 2.ch.root:   contrato | 3.chunk.root.dep_:            nmod | 4.ch.root.head:      fazenda | 5.ch.root.head.dep_:  obl:agent |  6.chunk.root.head.lemma_:   fazenda


In [17]:
seq_tokens_id

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]

In [18]:
displacy.render(doc2, style='ent',
                jupyter=True, options={'distance': 120, "colors": colors})


In [46]:
[chunk for chunk in doc.noun_chunks]

[Quantos quilos, milho, fazenda, Santa Rita, contrato atual]

In [20]:
chunk_text = [chunk.text for chunk in doc.noun_chunks][0]
chunk_text

'Quantos quilos'

In [21]:
chunk_root_dep = [chunk.root.dep_ for chunk in doc.noun_chunks][0]
chunk_root_dep

'nsubj:pass'

In [33]:
def relevant_chunk(doc, param, i, lemmas=("gostaria", "qual", "entregar")):
    """Return the first noun chunk headed by one of ``lemmas`` with dependency ``param``.

    Args:
        doc: Parsed document exposing ``noun_chunks``.
        param: Dependency label the chunk root must carry (compared to ``chunk.root.dep_``).
        i: Caller-supplied index; returned unchanged so the caller can correlate results.
        lemmas: Lower-case lemmas accepted for the head of the chunk root.

    Returns:
        ``(chunk.text, i, chunk.doc)`` for the first matching chunk, or ``None`` when
        no chunk matches.
    """
    for chunk in doc.noun_chunks:
        root = chunk.root
        # The head lemma is lower-cased before the lookup, so matching is case-insensitive.
        if root.head.lemma_.lower() in lemmas and root.dep_ == param:
            return chunk.text, i, chunk.doc
    return None

In [22]:
chunk_root_head_lemma = [chunk.root.head.lemma_ for chunk in doc.noun_chunks][0].lower()
chunk_root_head_lemma

'entregar'

In [36]:
# For every stored utterance whose parse contains a relevant noun chunk, print the
# associated bot reply, user keys, user desire and missing keys, then the utterance.
# Only the user utterance needs parsing; the other fields are printed as stored
# (running them through nlp() just to print them yields the same text).
for i, record in enumerate(data):
    doc_query = nlp(record["user_utter"])
    result = relevant_chunk(doc=doc_query, param=chunk_root_dep, i=i)
    if result:
        print(record["bot_utter"], record["user_keys"], record["user_des"], record["miss_key"])
        print(record["user_utter"])
    

Claro, por favor, informe o número do contrato. NM_FAZENDA,ID_SAFRA QT_SALDO_CONTRATO NR_CONTRATO
Quantos quilos de trigo foram entregues pela fazenda Passo Fundo?


In [25]:
# POS tagging: one row of token attributes per token of doc2.
# Rows are collected first and the frame is built once, instead of growing it cell by
# cell through .loc. T_head stores the head's text, not the Token object.
pos_rows = [
    {
        "id": token.i,
        "T_texto": token.text,
        "T_lemma_": token.lemma_,
        "T_pos_": token.pos_,
        "T_tag_": token.tag_,
        "T_dep_": token.dep_,
        "T_head": token.head.text,
        "T_is_sent_start": token.is_sent_start,
        "T_shape_": token.shape_,
        "T_is_alpha": token.is_alpha,
        "T_is_stop": token.is_stop,
    }
    for token in doc2
]
pos_tagging = pd.DataFrame(
    pos_rows,
    columns=["id", "T_texto", "T_lemma_", "T_pos_", "T_tag_", "T_dep_", "T_head",
             "T_is_sent_start", "T_shape_", "T_is_alpha", "T_is_stop"],
)

pos_tagging

Unnamed: 0,id,T_texto,T_lemma_,T_pos_,T_tag_,T_dep_,T_head,T_is_sent_start,T_shape_,T_is_alpha,T_is_stop
0,0,Quantos quilos,quanto quilo,NOUN,NOUN,nsubj:pass,entregues,True,Xxxxx xxxx,False,False
1,1,de,de,ADP,ADP,case,milho,False,xx,True,True
2,2,milho,milho,NOUN,NOUN,nmod,Quantos quilos,False,xxxx,True,False
3,3,já,já,ADV,ADV,advmod,entregues,False,xx,True,True
4,4,foram,ser,AUX,AUX,aux:pass,entregues,False,xxxx,True,True
5,5,entregues,entregar,VERB,VERB,ROOT,entregues,False,xxxx,True,False
6,6,pela,por o,ADP,ADP,case,fazenda,False,xxxx,True,True
7,7,fazenda,fazenda,NOUN,NOUN,obl:agent,entregues,False,xxxx,True,False
8,8,Santa Rita,Santa Rita,PROPN,PROPN,appos,fazenda,False,Xxxxx Xxxx,False,False
9,9,no,em o,ADP,ADP,case,contrato atual,False,xx,True,True


In [26]:
displacy.render(doc2, style='dep',
                jupyter=True, options={'distance': 120})

<h3><mark>Merge entities</mark></h3>

In [None]:
# Entity table: token start | text | label | token end || char start | char end.
# Each entity text is also collected in seq_tokens_valor.
for entity in doc.ents:
    print(f'{entity.start:>2} | {entity.text:>20} | {entity.label_:>8} | {entity.end:>2} || {entity.start_char:>2} | {entity.end_char:>2}')
    seq_tokens_valor.append(entity.text)

In [None]:
# add_pipe() raises if the component name is already in the pipeline, so guard the
# call to keep this cell safe to re-run.
if "merge_entities" not in nlp.pipe_names:
    nlp.add_pipe("merge_entities")

In [None]:
doc3, tokens, ents = show_ent_new(text, patterns=patterns)

# Entity texts are appended to this list by a later cell.
seq_tokens_valor = []

# Token indices of the sentence, in document (ascending) order.
seq_tokens_id = [token.i for token in doc3]

In [None]:
# Entity table for doc3: token start | text | label | token end || char start | char end.
# Each entity text is also collected in seq_tokens_valor.
for entity in doc3.ents:
    print(f'{entity.start:>2} | {entity.text:>20} | {entity.label_:>8} | {entity.end:>2} || {entity.start_char:>2} | {entity.end_char:>2}')
    seq_tokens_valor.append(entity.text)

In [None]:
[t.text for t in doc]

### Analise de dep

In [None]:
print(chunks_valor)
print(tokens)
print(ents)

In [None]:
displacy.render(doc3, style='dep',
                jupyter=True, options={'distance': 120})

In [None]:
# POS tagging: one row of token attributes per token of doc.
# Rows are collected first and the frame is built once, instead of growing it cell by
# cell through .loc. T_head stores the head's text, not the Token object.
pos_rows = [
    {
        "id": token.i,
        "T_texto": token.text,
        "T_lemma_": token.lemma_,
        "T_pos_": token.pos_,
        "T_tag_": token.tag_,
        "T_dep_": token.dep_,
        "T_head": token.head.text,
        "T_is_sent_start": token.is_sent_start,
        "T_shape_": token.shape_,
        "T_is_alpha": token.is_alpha,
        "T_is_stop": token.is_stop,
    }
    for token in doc
]
pos_tagging = pd.DataFrame(
    pos_rows,
    columns=["id", "T_texto", "T_lemma_", "T_pos_", "T_tag_", "T_dep_", "T_head",
             "T_is_sent_start", "T_shape_", "T_is_alpha", "T_is_stop"],
)

pos_tagging

In [None]:
# Lemmatization table: lemma, tag, POS and dependency of every token of doc, each
# with its spacy.explain() description.
# Rows are collected first and the frame is built once. The "dep explained" column
# holds the explanation of the dependency label (it previously held token.morph).
lemma_rows = [
    {
        "id": token.i,
        "Texto": token.text,
        "Lemma": token.lemma_,
        "Tag": token.tag_,
        "Tag_explainned": spacy.explain(token.tag_),
        "token_POS": token.pos_,
        "POS_explainned": spacy.explain(token.pos_),
        "dep": token.dep_,
        "T. Head": token.head.text,
        "dep explained": spacy.explain(token.dep_),
    }
    for token in doc
]
lemmatization = pd.DataFrame(
    lemma_rows,
    columns=["id", "Texto", "Lemma", "Tag", "Tag_explainned", "token_POS",
             "POS_explainned", "dep", "T. Head", "dep explained"],
)

lemmatization

### Tokens e funcoes

In [None]:
seq_tokens_id = [token.i for token in doc]
seq_tokens_id

In [None]:
# Immediate syntactic children of every token.

seq_tokens_id = [token.i for token in doc]

for tok in doc:
    texto = [child.text for child in tok.children]
    print(f'token: {tok.text:>10} | pos_: {tok.pos_:>7} | chil: {texto}')

In [None]:
# Subtree of every token (the tokens yielded by Token.subtree).
# The output column is labelled "subtree" (it was a copy-pasted "chil").

seq_tokens_id = [token.i for token in doc]

for tok in doc:
    texto = [node.text for node in tok.subtree]
    print(f'token: {tok.text:>10} | pos_: {tok.pos_:>7} | subtree: {texto}')

In [None]:
# n_rights: number of immediate children to the right of the word in the dependency parse.

seq_tokens_id = [token.i for token in doc]

for tok in doc:
    n_rights = tok.n_rights
    print(f'token: {tok.text:>10} | pos_: {tok.pos_:>7} | n_rights: {n_rights}')

In [None]:
# Token.rights: immediate children to the right of the word in the dependency parse.

seq_tokens_id = [token.i for token in doc]

for tok in doc:
    texto = [child.text for child in tok.rights]
    print(f'token: {tok.text:>10} | pos_: {tok.pos_:>7} | rights: {texto}')

In [None]:
# n_lefts: number of immediate children to the left of the word in the dependency parse.
# The output column is labelled "n_lefts" (it was a copy-pasted "n_rights").

seq_tokens_id = [token.i for token in doc]

for tok in doc:
    n_lefts = tok.n_lefts
    print(f'token: {tok.text:>10} | pos_: {tok.pos_:>7} | n_lefts: {n_lefts}')

In [None]:
# Token.lefts: immediate children to the left of the word in the dependency parse.

seq_tokens_id = [token.i for token in doc]

for tok in doc:
    texto = [child.text for child in tok.lefts]
    print(f'token: {tok.text:>10} | pos_: {tok.pos_:>7} | lefts: {texto}')

In [None]:
# Token.ancestors: the syntactic ancestors of the token (parents, grandparents, etc.).

seq_tokens_id = [token.i for token in doc]

for tok in doc:
    texto = [ancestor.text for ancestor in tok.ancestors]
    print(f'token: {tok.text:>10} | pos_: {tok.pos_:>7} | ancestors: {texto}')

In [None]:
# Token.nbor(): the token immediately to the right of each token.
seq_tokens_id = [token.i for token in doc]

for tok in doc:
    # nbor() raises IndexError when asked for a position outside the doc, which is
    # the case for the last token; show None for it instead of aborting the loop.
    nbor_text = tok.nbor().text if tok.i + 1 < len(doc) else None
    print(f'token: {tok.text:>10} | pos_: {tok.pos_:>7} | nbor: {nbor_text}')

In [None]:
# Attach a custom identifier to the doc. Doc.user_data is a plain dict, so the value
# is stored by key assignment (the previous `.values('my_id':'00102333')` call was a
# syntax error).
doc.user_data["my_id"] = "00102333"

<mark> Using morph </mark>

In [None]:
# Matches "love cats" or "likes flowers"
pattern1 = [{"LEMMA": {"IN": ["like", "love"]}},
            {"POS": "NOUN"}]

# Matches tokens of length >= 10
pattern2 = [{"LENGTH": {">=": 10}}]

# Match based on morph attributes
pattern3 = [{"MORPH": {"IS_SUBSET": ["Number=Sing", "Gender=Neut"]}}]
# "", "Number=Sing" and "Number=Sing|Gender=Neut" will match as subsets
# "Number=Plur|Gender=Neut" will not match
# "Number=Sing|Gender=Neut|Polite=Infm" will not match because it's a superset