## Solucao para identificacao das entidades do documento

### 1 - Importaçoes de modulos, pipeline e funcoes

In [2]:
import json
import spacy
from spacy import displacy
import pandas as pd
from spacy.tokens import Span


nlp = spacy.load("pt_core_news_sm")

In [3]:
ner = nlp.remove_pipe('ner')
ruler = nlp.add_pipe("entity_ruler")

In [4]:
# Principais funcoes

def show_ent_new(text, patterns):
    #nlp = spacy.blank("pt")
    #ruler = nlp.add_pipe("entity_ruler")
    ruler.add_patterns(patterns)
    doc = nlp(text)
    
    tokens = []
    ents = []
    
    for ent in doc.ents:
        span = doc.char_span(ent.start_char, ent.end_char, label=ent.label_)
        ents.append(span)
        
    for token in doc:
        start = token.idx
        end = start + len(token)
        tokens.append((token.text, start, end))
        
    return doc, tokens, ents


def write_patterns_to_file(patterns, colors, filename):
    data = {"patterns": patterns, "colors": colors}
    with open(filename, "w") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
        
def load_patterns_and_colors(filename):
    with open(filename, "r", encoding="utf-8") as f:
        data = json.load(f)
        patterns = data["patterns"]
        colors = data["colors"]
    return patterns, colors   


In [4]:
nlp.analyze_pipes(pretty=True)

[1m

#   Component         Assigns               Requires   Scores             Retokenizes
-   ---------------   -------------------   --------   ----------------   -----------
0   tok2vec           doc.tensor                                          False      
                                                                                     
1   morphologizer     token.morph                      pos_acc            False      
                      token.pos                        morph_acc                     
                                                       morph_per_feat                
                                                                                     
2   parser            token.dep                        dep_uas            False      
                      token.head                       dep_las                       
                      token.is_sent_start              dep_las_per_type              
                      doc.sents                 

{'summary': {'tok2vec': {'assigns': ['doc.tensor'],
   'requires': [],
   'scores': [],
   'retokenizes': False},
  'morphologizer': {'assigns': ['token.morph', 'token.pos'],
   'requires': [],
   'scores': ['pos_acc', 'morph_acc', 'morph_per_feat'],
   'retokenizes': False},
  'parser': {'assigns': ['token.dep',
    'token.head',
    'token.is_sent_start',
    'doc.sents'],
   'requires': [],
   'scores': ['dep_uas',
    'dep_las',
    'dep_las_per_type',
    'sents_p',
    'sents_r',
    'sents_f'],
   'retokenizes': False},
  'lemmatizer': {'assigns': ['token.lemma'],
   'requires': [],
   'scores': ['lemma_acc'],
   'retokenizes': False},
  'attribute_ruler': {'assigns': [],
   'requires': [],
   'scores': [],
   'retokenizes': False},
  'entity_ruler': {'assigns': ['doc.ents', 'token.ent_type', 'token.ent_iob'],
   'requires': [],
   'scores': ['ents_f', 'ents_p', 'ents_r', 'ents_per_type'],
   'retokenizes': False}},
 'problems': {'tok2vec': [],
  'morphologizer': [],
  'parser':

### 2 - Execucao das configuraçoes e testes (neste caso, em processo de criacao)

Ja temos o processo de transferir e carregar patterns de um json, mas estamos em processo de testes e ajustes do modelo de pattern, neste caso e melhor ter aqui o bloco de configuraçao

In [36]:
colors = {
        "CULTURA": "linear-gradient(90deg, #2ADB5E, #1FA346)", 
        "TOTAL": "linear-gradient(90deg, #09D6FF, #08A0D1)",
        "ENTREGUE": "linear-gradient(90deg, #09D6FF, #08A0D1)",
        "SALDO": "linear-gradient(90deg, #09D6FF, #08A0D1)", 
        "FAZENDA": "linear-gradient(90deg, #aa9cfc, #fc9ce7)", 
        "SAFRA": "linear-gradient(90deg, #FFC90E, #BA930A)", 
        "CONTRATO": "linear-gradient(90deg, #B5B5B5, #8A8A8A)"}


patternsOthers = [{"label": "PERSON", "pattern": "Daniel", "id": "daniel"},
                  {"label": "PERSON", "pattern": "Antônio", "id": "antonio"},
                  {"label": "ORG", "pattern": [{"LOWER": "fast"}, {"LOWER": "innovation"}], "id": "fast-innovation"},
                  {"label": "ORG", "pattern": {"LOWER": "agrobi"}, "id": "agrobi"},
                  {"label": "ORG", "pattern": {"LOWER": "AgroBi"}, "id": "agrobi"},
                  {"label": "ORG", "pattern": [{"LOWER": "agro"}, {"LOWER": "bi"}], "id": "agrobi"},
                  ] 
 

patternsCult = [
    {
        "label":"CULTURA",
        "pattern": [
            {"LOWER": "soja", "OP":"?"},
            {"LOWER": "milho", "OP":"?"},
            {"LOWER": "sorgo", "OP":"?"},
            {"LOWER": "trigo", "OP":"?"},
            
        ]    
    }
]


patternsQuant = [
    {
        "label":"TOTAL",
        "pattern": [
            {"LOWER": "total"},
            {"LOWER": "quantidade", "OP":"?"},
            
        ]    
    },
    {
        "label":"ENTREGUE",
        "pattern": [
            {"LOWER": "quantidade", "OP":"?"},
            {"LOWER": "entregue","OP":"?"},
            {"LOWER": "entregues","OP":"?"},
            {"LOWER": "entregado","OP":"?"},
            {"LOWER": "entreguei","OP":"?"},
        ]    
    },
    {
        "label":"SALDO",
        "pattern": [
            {"LOWER": "saldo"},
            {"LOWER": "quantidade", "OP":"?"},
            {"LOWER": "total", "OP":"?"},
        ]    
    }
    
]

patternsSafra = [
    {
        "label":"SAFRA",
        "pattern": [
            {"LOWER": "safra", "OP":"?"},
            {"LOWER": "safras", "OP":"?"},
            {"SHAPE": "dd/dd"},
        ]    
    }
]


patternsFazenda_antigo = [
    {
        "label":"FAZENDA",
        "pattern":[
            {"ORTH": "Santa"}, {"ORTH": "Rita"},
            {"ORTH": "Passo"}, {"ORTH": "Fundo"},
            {"ORTH": "Bela"}, {"ORTH": "Vista"},
            [{"LOWER": "bela"}, {"LOWER": "vista"}],
        ]
        
    }
]


patternsOthersFazenda = [{"label": "FAZENDA", "pattern": [{"LOWER": "santa"}, {"LOWER": "rita"}], "id": "faz-santa-rita"},
                         {"label": "FAZENDA", "pattern": [{"LOWER": "bela"}, {"LOWER": "vista"}], "id": "faz-bela-vista"},
                         {"label": "FAZENDA", "pattern": [{"LOWER": "passo"}, {"LOWER": "fundo"}], "id": "faz-passo-fundo"},
                         {"label": "FAZENDA", "pattern": [{"LOWER": "minha"}, {"LOWER": "fazenda"}], "id": "faz-produtor"}
                        ]
  





patternsContrato = [
    {
        "label":"CONTRATO",
        "pattern": [
            {"LOWER": "contrato", "OP":"*"},
            {"SHAPE": "dddX", "OP":"*"},
            {"LOWER": "contratos", "OP":"*"},
            
        ]
        
    }
    
]

patterns = patternsCult + patternsQuant + patternsOthersFazenda + patternsSafra + patternsContrato + patternsOthers

In [37]:
text = "Quanto de milho eu já entreguei pelo contrato 501S pela minha fazenda?"



doc, tokens, ents = show_ent_new(text, patterns=patterns)

## GARANTIR que a ordem dos Tokens esteja correta (ascendente)
seq_tokens_id = []

seq_tokens_valor = []


## tokens_ids + Lista de todos os Tokens da frase
for token in doc:
    seq_tokens_id.append(token.i)
    
    
displacy.render(doc, style="ent", options={"colors": colors})    

In [41]:
for ent in doc.ents:
    print(f'{ent.start:>2} | {ent.text:>20} | {ent.label_:>8} | {ent.id_:>12}  | {ent.end:>2} || {ent.start_char:>2} | {ent.end_char:>2}')
    seq_tokens_valor.append(ent.text)

 2 |                milho |  CULTURA |               |  3 || 10 | 15
 5 |            entreguei | ENTREGUE |               |  6 || 22 | 31
 7 |        contrato 501S | CONTRATO |               |  9 || 37 | 50
10 |        minha fazenda |  FAZENDA | faz-produtor  | 12 || 56 | 69


In [38]:
seq_tokens_valor.append([ent for ent in doc.ents])
seq_tokens_valor

[[milho, entreguei, contrato 501S, minha fazenda]]

In [7]:
token_root = [token.text for token in doc if token.dep_ == "ROOT"][0]
token_root

'entreguei'

In [8]:
# modelo de criaçao de uma lista com tokens de valor para o processo

seq_tokens_valor.append(token_root)

In [26]:
displacy.render(doc, style="ent", options={"colors": colors})

In [10]:
seq_tokens_valor.append([ent for ent in doc.ents])
seq_tokens_valor

['entreguei', [milho, entreguei, contrato 501S]]

Listagem das entidades com posicao dos tokens e posicao de inicio e fim de caracteres

In [11]:
for ent in doc.ents:
    print(f'{ent.start:>2} | {ent.text:>20} | {ent.label_:>8} | {ent.end:>2} || {ent.start_char:>2} | {ent.end_char:>2}')
    seq_tokens_valor.append(ent.text)

 2 |                milho |  CULTURA |  3 || 10 | 15
 5 |            entreguei | ENTREGUE |  6 || 22 | 31
 7 |        contrato 501S | CONTRATO |  9 || 37 | 50


### Algumas rotinas de tratamento de algumas entidades como contratos e safra que podem ter mais dados a serem descobertos

In [12]:
# Tratamento para contratos
for token in doc:
    if token.ent_type_ == "CONTRATO":
        # print(token.text, token.ent_type_, token.shape_)
        qtd_contratops = token.morph.get("Number")
        # print(qtd_contratops)
        if "Plur" in qtd_contratops:
            print("Ele esta falando mais de um contrato")
        else:
             print("Ele esta falando somente de um contrato")   
        if token.shape_ == "dddX":
            nro_contrato = token.text
            print(f'esse e o nro do contrato: {nro_contrato}')
        else:
            print(f'nao ha numero do contrato')
    
    # elif token.ent_type_ == "CONTRATO":    

Ele esta falando somente de um contrato
nao ha numero do contrato
Ele esta falando somente de um contrato
esse e o nro do contrato: 501S


In [13]:
seq_tokens_valor

['entreguei',
 [milho, entreguei, contrato 501S],
 'milho',
 'entreguei',
 'contrato 501S']

In [14]:
[ent for ent in doc.ents]

[milho, entreguei, contrato 501S]

In [15]:
[ent for ent in doc.ents if ent.label_ == "CONTRATO"]

[contrato 501S]

### Analise de noun_chunks

In [16]:
# VISUALIAZACAO RESUMIDA _ ENTS

for chunk in doc.noun_chunks:
  chunk_text = chunk.text
  chunk_root = chunk.root.text
  chunk_root_dep = chunk.root.dep_
  chunk_root_head = chunk.root.head.text
  chunk_root_head_dep = chunk.root.head.dep_
  chunk_root_head_lemma = chunk.root.head.lemma_
  
  chunk_ents = chunk.ents
  chunk_root_ent_type = chunk.root.ent_type_
  
  print(f'1.chunk.text: {chunk_text:>15} |  {chunk.start:>2}  {chunk.end:>2} |  {chunk.start_char:>2} {chunk.end_char:>2} ')

1.chunk.text: Quanto de milho |   0   3 |   0 15 
1.chunk.text:              eu |   3   4 |  16 18 
1.chunk.text:   contrato 501S |   7   9 |  37 50 


In [17]:
# VISUALIAZACAO RESUMIDA _ ENTS

for chunk in doc.noun_chunks:
  chunk_text = chunk.text
  chunk_root = chunk.root.text
  chunk_root_dep = chunk.root.dep_
  chunk_root_head = chunk.root.head.text
  chunk_root_head_dep = chunk.root.head.dep_
  chunk_root_head_lemma = chunk.root.head.lemma_
  
  chunk_ents = chunk.ents
  chunk_root_ent_type = chunk.root.ent_type_
  
  print(f'1.chunk.text: {chunk_text:>15} | 2.ch.root: {chunk_root:>10} | 3.chunk.root.dep_: {chunk_root_dep:>10} | 4.chunk_root_ent_type {chunk_root_ent_type:>12} | 5.chunk_ents {chunk_ents} ')

1.chunk.text: Quanto de milho | 2.ch.root:      milho | 3.chunk.root.dep_:        obl | 4.chunk_root_ent_type      CULTURA | 5.chunk_ents [milho] 
1.chunk.text:              eu | 2.ch.root:         eu | 3.chunk.root.dep_:      nsubj | 4.chunk_root_ent_type              | 5.chunk_ents [] 
1.chunk.text:   contrato 501S | 2.ch.root:   contrato | 3.chunk.root.dep_:        obl | 4.chunk_root_ent_type     CONTRATO | 5.chunk_ents [contrato 501S] 


In [18]:
# VISUALIAZACAO ESTRUTURA DEP, HEAD, ROOT, LEMMA
chunks_valor = []


for chunk in doc.noun_chunks:
  chunk_text = chunk.text
  chunks_valor.append(chunk_text)
  chunk_root = chunk.root.text
  chunk_root_dep = chunk.root.dep_
  chunk_root_head = chunk.root.head.text
  chunk_root_head_dep = chunk.root.head.dep_
  chunk_root_head_lemma = chunk.root.head.lemma_
  chunk_ents = chunk.ents
  chunk_root_ent_type = chunk.root.ent_type_
  
  print(f'1.chunk.text: {chunk_text:>15} | 2.ch.root: {chunk_root:>10} | 3.chunk.root.dep_: {chunk_root_dep:>15} | 4.ch.root.head: {chunk_root_head:>12} | 5.ch.root.head.dep_: {chunk_root_head_dep:>10} |  6.chunk.root.head.lemma_: {chunk_root_head_lemma:>9}')

1.chunk.text: Quanto de milho | 2.ch.root:      milho | 3.chunk.root.dep_:             obl | 4.ch.root.head:    entreguei | 5.ch.root.head.dep_:       ROOT |  6.chunk.root.head.lemma_: entregueir
1.chunk.text:              eu | 2.ch.root:         eu | 3.chunk.root.dep_:           nsubj | 4.ch.root.head:    entreguei | 5.ch.root.head.dep_:       ROOT |  6.chunk.root.head.lemma_: entregueir
1.chunk.text:   contrato 501S | 2.ch.root:   contrato | 3.chunk.root.dep_:             obl | 4.ch.root.head:    entreguei | 5.ch.root.head.dep_:       ROOT |  6.chunk.root.head.lemma_: entregueir


In [None]:
chunks_valor

In [None]:
# POS Tagging
pos_tagging = pd.DataFrame(data=[], \
  columns=["id", "T_texto","T_lemma_", "T_pos_", "T_tag_", "T_dep_", "T_shape_", "T_is_alpha", "T_is_stop"])
i = 0
for token in doc:
    pos_tagging.loc[i,"id"] = token.i
    pos_tagging.loc[i,"T_texto"] = token.text
    pos_tagging.loc[i,"T_lemma_"] = token.lemma_
    pos_tagging.loc[i,"T_pos_"] = token.pos_
    pos_tagging.loc[i,"T_tag_"] = token.tag_
    pos_tagging.loc[i,"T_dep_"] = token.dep_
    pos_tagging.loc[i,"T_shape_"] = token.shape_
    pos_tagging.loc[i,"T_is_alpha"] = token.is_alpha
    pos_tagging.loc[i,"T_is_stop"] = token.is_stop

    i = i+1

pos_tagging

In [None]:
print(tokens)
print(ents)

In [None]:
displacy.render(doc, style='dep',
                jupyter=True, options={'distance': 120})

In [None]:
# Lemmatization for tokens 
lemmatization = pd.DataFrame(data=[], \
  columns=["id", "Texto","Lemma", "Tag", "Tag_explainned", "token_POS", "POS_explainned", "dep", "T. Head", "dep explained"])
i = 0
for token in doc:
    lemmatization.loc[i,"id"] = token.i
    lemmatization.loc[i,"Texto"] = token.text
    lemmatization.loc[i,"Lemma"] = token.lemma_
    lemmatization.loc[i,"Tag"] = token.tag_
    lemmatization.loc[i,"Tag_explainned"] = spacy.explain(token.tag_)
    lemmatization.loc[i,"token_POS"] = token.pos_
    lemmatization.loc[i,"POS_explainned"] = spacy.explain(token.pos_)
    lemmatization.loc[i,"dep"] = token.dep_
    lemmatization.loc[i,"T. Head"] = token.head.text
    lemmatization.loc[i,"dep explained"] = token.morph
    
    i = i+1

lemmatization

In [None]:
safra_ents = [ent for ent in doc.ents if ent.label_ == "SAFRA"]

In [None]:
for ent in doc.ents:
    print(f'{ent.start:>2} | {ent.text:>20} | {ent.label_:>8} | {ent.end:>2} || {ent.start_char:>2} | {ent.end_char:>2}')

In [None]:
# Iterar char vs i do Token
chars_to_tokens = {}
for token in doc:
    for i in range(token.idx, token.idx + len(token.text)):
        chars_to_tokens[i] = token.i
        #print(i, chars_to_tokens[i])
        
chars_to_tokens[24]        

In [None]:
# Iterar i vs token.text
tokens_id_text = {}
for token in doc:
    tokens_id_text[token.i] = token.text
    
for key, value in tokens_id_text.items():
    if key == 4:
        print(value)    # saldo

In [None]:
for token in doc:    
    token_i =  token.i
    token_text = token.text
    token_ent_id = token.ent_id
    token_lemma = token.lemma_
    token_pos = token.pos_

    token_dep = token.dep_
    token_shape = token.shape_


In [None]:
[ent for ent in doc.ents if ent.label_ == "CONTRATO"]

In [None]:
[ent for ent in doc.ents]

In [None]:
for token in doc:
    print(token.text, token.has_vector, token.vector_norm, token.is_oov)

In [None]:
i = 4

In [None]:
print(doc[i].i, doc[i].text, doc[i].ent_type_, doc[i].shape_, doc[i].ent_iob_, doc[i].idx)

In [None]:
# EXEMPLOS: Busca de informaçoes morph

{doc[i].morph.get("VerbForm")} | {doc[i].morph.get("Tense")} | {doc[i].morph.get("Number")}

In [None]:
significative_tokens = pd.DataFrame(data=[], \
  columns=["idx", "T_texto","T_shape_", "T_ent_type_", "T_ent_id_", "T_ent_iob_", "T_pos_", "T_lemma_", "T_dep_", "T_head"])
i = 0
for idx in seq_tokens_id:
    significative_tokens.loc[i,"idx"] = doc[idx].i
    significative_tokens.loc[i,"T_texto"] = doc[idx].text
    significative_tokens.loc[i,"T_shape_"] = doc[idx].shape_
    significative_tokens.loc[i,"T_ent_type_"] = doc[idx].ent_type_
    significative_tokens.loc[i,"T_ent_id_"] = doc[idx].ent_id_
    significative_tokens.loc[i,"T_ent_iob_"] = doc[idx].ent_iob_
    significative_tokens.loc[i,"T_pos_"] = doc[idx].pos_
    significative_tokens.loc[i,"T_lemma_"] = doc[idx].lemma_
    significative_tokens.loc[i,"T_dep_"] = doc[idx].dep_
    significative_tokens.loc[i,"T_head"] = doc[idx].head

    

    i = i+1

significative_tokens

In [None]:
# POS Tagging
pos_tagging = pd.DataFrame(data=[], \
  columns=["id", "T_texto","T_lemma_", "T_pos_", "T_tag_", "T_dep_", "T_head", "T_is_sent_start", "T_shape_", "T_is_alpha", "T_is_stop"])
i = 0
for token in doc:
    pos_tagging.loc[i,"id"] = token.i
    pos_tagging.loc[i,"T_texto"] = token.text
    pos_tagging.loc[i,"T_lemma_"] = token.lemma_
    pos_tagging.loc[i,"T_pos_"] = token.pos_
    pos_tagging.loc[i,"T_tag_"] = token.tag_
    pos_tagging.loc[i,"T_dep_"] = token.dep_
    pos_tagging.loc[i,"T_head"] = token.head
    pos_tagging.loc[i,"T_is_sent_start"] = token.is_sent_start
    pos_tagging.loc[i,"T_shape_"] = token.shape_
    pos_tagging.loc[i,"T_is_alpha"] = token.is_alpha
    pos_tagging.loc[i,"T_is_stop"] = token.is_stop

    i = i+1

pos_tagging

## Ruled-based-matching and spans

In [None]:
from spacy.tokens import Span
from spacy.matcher import Matcher

In [None]:
matcher = Matcher(nlp.vocab)

<h3>Regular expressions</h3>

In [None]:
pattern = [{"TEXT": {"REGEX": "^[Cc](\\.?|omo)$"}},
           {"TEXT": {"REGEX": "^[Pp](\\.?|osso)$"}},
           {"LOWER": "saber"}]

In [None]:
matcher.add("DESEJO", [pattern])

In [None]:
doc = nlp("Como posso saber o total de milho que tenho para entregar para meu cliente?")
matches = matcher(doc)

In [None]:
for match_id, start, end in matches:
    string_id = nlp.vocab.strings[match_id]  # Get string representation
    #span = doc[start:end]  # The matched span
    span = Span(doc, start, end, label="PERGUNTA")
    print(f'match_id: {match_id:>20} | string_id: {string_id:>10} | start: {start} | end: {end} | span.text: {span.text:>12} | span.label_: {span.label_}')

In [None]:
displacy.render(doc, style="ent") 

In [None]:
# 2. Return Span objects directly
matches = matcher(doc, as_spans=True)
for span in matches:
    print(span.text, span.label_)

    colors = {"PERGUNTA": "linear-gradient(90deg, #09D6FF, #08A0D1)"}
    options = {"span": ["PERGUNTA"], "colors": colors}
    doc.spans["sc"] = [
    Span(doc, start, end, "PERGUNTA"),
    ]

    displacy.render(doc, style="span", options=options) 

<h3>Fuzzy matching</h3>

In [None]:
# Add attribute ruler with exception for "A Santa Rita" as NNP/PROPN NNP/PROPN
ruler = nlp.get_pipe("attribute_ruler")

In [None]:
pattern = [{"TEXT": {"FUZZY": {"IN": ["contrato", "contratos", "contract"]}}}]

In [None]:
# Pattern to match "A Santa Rita"
patterns = [[{"TEXT": {"FUZZY": "Fazenda"}}, {"LOWER": "Santa"}, {"TEXT": {"FUZZY": "Rita"}}]]
# The attributes to assign to the matched token
attrs = {"TAG": "NNP", "POS": "PROPN"}
# Add rules to the attribute ruler

In [None]:
# Add rules to the attribute ruler
ruler.add(patterns=patterns, attrs=attrs, index=0)  # "A" em "A Santa Rita"
ruler.add(patterns=patterns, attrs=attrs, index=1)  # "Santa" em "A Santa Rita"
ruler.add(patterns=patterns, attrs=attrs, index=2)

In [None]:
text = "A Fazenda Santa Rita ira produzir 10 toneladas de arroz este ano."
doc = nlp(text)

In [None]:
[(token.i, token.text) for token in doc]

In [None]:
tokens_to_check = [0, 1, 2, 3]

tokens_to_check

In [None]:
doc = doc
print(doc)
text_ruler = pd.DataFrame(data=[], \
  columns=["T_idx", "T_texto","T_Tag_", "T_Tag_explained", "T_pos_", "T_pos_explained"])
i = 0
for idx in tokens_to_check:
    text_ruler.loc[i,"T_idx"] = doc[idx].i
    text_ruler.loc[i,"T_texto"] = doc[idx].text
    text_ruler.loc[i,"T_Tag_"] = doc[idx].tag_
    text_ruler.loc[i,"T_Tag_explained"] = spacy.explain(doc[idx].tag_)
    text_ruler.loc[i,"T_pos_"] = doc[idx].pos_
    text_ruler.loc[i,"T_pos_explained"] = spacy.explain(doc[idx].pos_)
 
    
    i = i+1
text_ruler

<h3>Regex and fuzzy with lists</h3>

In [None]:
pattern = [{"TEXT": {"FUZZY": {"IN": ["fantastico", "top", "maravilhosa"]}}}]

pattern = [{"TEXT": {"REGEX": {"NOT_IN": ["^fan(tastico)?$", "^mara(vilhosa)?"]}}}]

### Adding on_match rules

In [None]:
from spacy.lang.pt import Portuguese

nlp = Portuguese()
matcher = Matcher(nlp.vocab)

In [None]:
def add_event_ent(matcher, doc, i, matches):
    # Get the current match and create tuple of entity label, start and end.
    # Append entity to the doc's entity. (Don't overwrite doc.ents!)
    match_id, start, end = matches[i]
    entity = Span(doc, start, end, label="FAZENDA")
    doc.ents += (entity,)
    print(entity.text, entity.label_)

In [None]:
pattern = [{"ORTH": "Santa"}, {"ORTH": "Rita"}]
matcher.add("SantaRita", [pattern], on_match=add_event_ent)

In [None]:
doc = nlp("Estamos nos aproximando da fazenda Santa Rita.")

In [None]:
matches = matcher(doc)

In [None]:
from spacy import displacy
html = displacy.render(doc, style="ent", page=True,
                       options={"ents": ["FAZENDA"]})

<h3>Importante: using <mark>on_match</mark> event</h3>

In [None]:
import spacy
from spacy import displacy
from spacy.matcher import Matcher

nlp = spacy.load("pt_core_news_sm")
matcher = Matcher(nlp.vocab)
matched_sents = []  # Collect data of matched sentences to be visualized

In [None]:
def collect_sents(matcher, doc, i, matches):
    match_id, start, end = matches[i]
    span = doc[start:end]  # Matched span
    sent = span.sent  # Sentence containing matched span
    # Append mock entity for match in displaCy style to matched_sents
    # get the match span by ofsetting the start and end of the span with the
    # start and end of the sentence in the doc
    match_ents = [{
        "start": span.start_char - sent.start_char,
        "end": span.end_char - sent.start_char,
        "label": "MATCH",
    }]
    matched_sents.append({"text": sent.text, "ents": match_ents}) 

In [None]:
pattern = [{"LOWER": "agrobi"}, {"LEMMA": "ser"}, {"POS": "ADV", "OP": "*"},
           {"POS": "ADJ"}]

In [None]:
matcher.add("AgrobiIs", [pattern], on_match=collect_sents)  # add pattern

In [None]:
doc = nlp("eu diria que a Agrobi seria legal se ela desse panettone no natal. – Agrobi é muito legal, certo?")

In [None]:
matches = matcher(doc)

In [None]:
displacy.render(matched_sents, style="ent", manual=True)

## Efficient phrase matching

If you need to match large terminology lists, you can also use the PhraseMatcher and create Doc objects instead of token patterns, which is much more efficient overall. The Doc patterns can contain single or multiple tokens.

In [None]:
import spacy
from spacy.matcher import PhraseMatcher

nlp = spacy.load("pt_core_news_sm")
matcher = PhraseMatcher(nlp.vocab)

In [None]:
terms = ["Santa Rita", "Passo Fundo", "Trem Bom"]

In [None]:
# Only run nlp.make_doc to speed things up
patterns = [nlp.make_doc(text) for text in terms]
matcher.add("TerminologyList", patterns)

In [None]:
doc = nlp("Fui passear na fazenda Santa Rita e descrobri que "
          "as bebidas que eles produzem se equiparam com as produzidas pela fazenda Passo Fundo.")

In [None]:
matches = matcher(doc)
for match_id, start, end in matches:
    span = doc[start:end]
    print(span.text, span.ents, span.ent_id_)

# Resultados e reserva de codigo

Esta funcionando

## Alguns exemplos e fragmentos que podem ser uteis no curto prazo

### 1. Processo de salvamento e recuperaçao de patterns e parametros das entidades

In [31]:
filename = "/home/wklinux/spaCy/configuracoes/patterns_padrao.json"  



In [None]:
patterns, colors = load_patterns_and_colors(filename)

In [32]:
write_patterns_to_file(patterns=patterns, colors=colors, filename=filename)

UnicodeEncodeError: 'ascii' codec can't encode character '\xf4' in position 4: ordinal not in range(128)

In [None]:
[token for token in head.children if token.dep_ == "prep"]

In [None]:
for ent in doc:
    # Because the entity is a span, we need to use its root token. The head
    # is the syntactic governor of the person, e.g. the verb
    head = ent.root.head
    print(head)
    if head.lemma_ == "milho":
        # Check if the children contain a preposition
        preps = [token for token in head.children if token.dep_ == "prep"]
        print(preps)
        for prep in preps:
            # Check if tokens part of ORG entities are in the preposition's
            # children, e.g. at -> Acme Corp Inc.
            orgs = [token for token in prep.children if token.ent_type_ == "ORG"]
            # If the verb is in past tense, the company was a previous company
            print({"person": ent, "orgs": orgs, "past": head.tag_ == "VBD"})

In [None]:
# Modelo superado - utilizado na versao 1

colors = {
        "CULTURA": "linear-gradient(90deg, #2ADB5E, #1FA346)", 
        "QUANTIDADE": "linear-gradient(90deg, #09D6FF, #08A0D1)", 
        "FAZENDA": "linear-gradient(90deg, #aa9cfc, #fc9ce7)", 
        "SAFRA": "linear-gradient(90deg, #FFC90E, #BA930A)", 
        "CONTRATO": "linear-gradient(90deg, #B5B5B5, #8A8A8A)"}


patternsOthers = [{"label": "PERSON", "pattern": "Daniel", "id": "daniel"},
                  {"label": "ORG", "pattern": [{"LOWER": "fast"}, {"LOWER": "innovation"}], "id": "fast-innovation"}] 
 

patternsCult = [
    {
        "label":"CULTURA",
        "pattern": [
            {"LOWER": "soja", "OP":"?"},
            {"LOWER": "milho", "OP":"?"},
        ]    
    }
]  

patternsQuant = [
    {
        "label":"QUANTIDADE",
        "pattern": [
            {"LOWER": "saldo", "OP":"?"},
            {"LOWER": "total"},
            {"LOWER": "entregue", "OP":"?"},
        ]    
    }
]

patternsSafra = [
    {
        "label":"SAFRA",
        "pattern": [
            {"LOWER": "safra", "OP":"?"},
            {"LOWER": "safras", "OP":"?"},
            {"SHAPE": "dd/dd"},
        ]    
    }
]


patternsFazenda = [
    {
        "label":"FAZENDA",
        "pattern":[
            {"LOWER": "fazenda", "OP":"?"},
            {"ORTH": "Santa"}, {"ORTH": "Rita"}
        ]
        
    }
]


patternsContrato = [
    {
        "label":"CONTRATO",
        "pattern": [
            {"LOWER": "contrato", "OP":"?"},
            {"LOWER": "contratos", "OP":"?"},
            {"SHAPE": "dddX", "OP":"?"},
        ]
        
    }
    
]

patterns = patternsCult + patternsQuant + patternsFazenda + patternsSafra + patternsContrato + patternsOthers

In [None]:
# Adicionar um novo entity_ruler + patterns

new_ruler = nlp.add_pipe("entity_ruler").from_disk("/home/wklinux/spaCy/configuracoes/pattern_test.json")

In [None]:
[ent for ent in doc.ents if ent.label_ == "ENTREGUE"]

In [None]:
entrega_entidade = [ent for ent in doc.ents if ent.label_ == "ENTREGUE"]
for ent in entrega_entidade:
    # Because the entity is a span, we need to use its root token. The head
    # is the syntactic governor of the person, e.g. the verb
    head = ent.root.head
    print(head)
    if head.lemma_ == "entregar":
        # Check if the children contain a preposition
        preps = [token for token in head.children if token.dep_ == "aux:pass"]
        print("preps: ", preps)
        for prep in preps:
            # Check if tokens part of ORG entities are in the preposition's
            # children, e.g. at -> Acme Corp Inc.
            prep_children = [token for token in prep.children]
            print(prep_children)
            orgs = [token for token in prep.children if token.ent_type_ == "ENTREGUE"]
            # If the verb is in past tense, the company was a previous company
            print({"acao": ent, "orgs": orgs, "past": head.tag_ == "VBD"})

In [None]:
for ent in doc.ents:
    head = ent.root.head
    print(f'{ent.start:>2} | {ent.text:>20} | {ent.label_:>8} | head: {head.text:>10} |  lemma: {head.lemma_:>8}  | {ent.end:>2} || {ent.start_char:>2} | {ent.end_char:>2}')

In [None]:
doc = nlp("Give it back! He pleaded.")
give_children = doc[0].children
assert [t.text for t in give_children] == ["it", "back", "!"]

In [None]:
person_entities

In [None]:
# Lemmatization for tokens 
lemmatization = pd.DataFrame(data=[], \
  columns=["id", "Texto","Lemma", "Tag", "Tag_explainned", "token_POS", "POS_explainned", "dep", "T. Head", "dep explained"])
i = 0
for token in doc:
    lemmatization.loc[i,"id"] = token.i
    lemmatization.loc[i,"Texto"] = token.text
    lemmatization.loc[i,"Lemma"] = token.lemma_
    lemmatization.loc[i,"Tag"] = token.tag_
    lemmatization.loc[i,"Tag_explainned"] = spacy.explain(token.tag_)
    lemmatization.loc[i,"token_POS"] = token.pos_
    lemmatization.loc[i,"POS_explainned"] = spacy.explain(token.pos_)
    lemmatization.loc[i,"dep"] = token.dep_
    lemmatization.loc[i,"T. Head"] = token.head.text
    lemmatization.loc[i,"dep explained"] = token.morph
    
    i = i+1

lemmatization

In [None]:
person_entities = [ent for ent in doc.ents]
for ent in person_entities:
    # Because the entity is a span, we need to use its root token. The head
    # is the syntactic governor of the person, e.g. the verb
    head = ent.root.head
    print(head)
    if head.lemma_ == "entregar":
        # Check if the children contain a preposition
        preps = [token for token in head.children if token.dep_ == "aux:pass"]
        print("preps: ", preps)
        for prep in preps:
            # Check if tokens part of ORG entities are in the preposition's
            # children, e.g. at -> Acme Corp Inc.
            prep_children = [token for token in prep.children]
            print(prep_children)
            orgs = [token for token in prep.children if token.ent_type_ == "ENTREGUE"]
            # If the verb is in past tense, the company was a previous company
            print({"acao": ent, "orgs": orgs, "past": head.tag_ == "VBD"})

In [None]:
print(tokens)
print(ents)

In [None]:
ruler.to_disk("/home/wklinux/spaCy/configuracoes/")

In [None]:
ruler = nlp.add_pipe("entity_ruler")
ruler.from_disk("/path/to/patterns.jsonl")  # loads patterns only
ruler.from_disk("/path/to/entity_ruler")    # loads patterns and config

In [None]:
type(tokens)

In [None]:
doc.ents = ents

In [None]:
for i in list(ents):
    print(i)

In [None]:
for i in list(tokens):
    print(i[0], i[1], i[2])

In [None]:
# POS Tagging
pos_tagging = pd.DataFrame(data=[], \
  columns=["id", "T_texto","T_lemma_", "T_pos_", "T_tag_", "T_dep_", "T_head", "T_is_sent_start", "T_shape_", "T_is_alpha", "T_is_stop"])
i = 0
for token in doc:
    pos_tagging.loc[i,"id"] = token.i
    pos_tagging.loc[i,"T_texto"] = token.text
    pos_tagging.loc[i,"T_lemma_"] = token.lemma_
    pos_tagging.loc[i,"T_pos_"] = token.pos_
    pos_tagging.loc[i,"T_tag_"] = token.tag_
    pos_tagging.loc[i,"T_dep_"] = token.dep_
    pos_tagging.loc[i,"T_head"] = token.head
    pos_tagging.loc[i,"T_is_sent_start"] = token.is_sent_start
    pos_tagging.loc[i,"T_shape_"] = token.shape_
    pos_tagging.loc[i,"T_is_alpha"] = token.is_alpha
    pos_tagging.loc[i,"T_is_stop"] = token.is_stop

    i = i+1

pos_tagging

In [None]:
nlp = spacy.blank("pt")
nlp.add_pipe("tagger")


In [None]:
nlp = spacy.blank("pt")
nlp.add_pipe("morphologizer")


In [None]:
nlp = spacy.blank("pt")
nlp.add_pipe("tagger")
nlp.add_pipe("morphologizer")


In [None]:
nlp.analyze_pipes(pretty=True)

### PS>: Novo modelo proposto - preservar as caracateristicas do doc

In [None]:
import spacy
from spacy import displacy
import pandas as pd


nlp = spacy.load("pt_core_news_sm")
# text = "Eu gostaria de saber o saldo total do meu contrato de soja e total entregue de milho para as safras 22/23 e 23/24."
# doc = nlp(text)

In [None]:
nlp.analyze_pipes(pretty=True)

In [None]:
ner = nlp.get_pipe('ner')

In [None]:
ner

In [None]:
attibute_ruler = nlp.get_pipe('attribute_ruler')

In [None]:
attibute_ruler

In [None]:
ner = nlp.remove_pipe('ner')

In [None]:
ner = nlp.add_pipe("ner")

In [None]:
from spacy.pipeline import EntityRecognizer

In [None]:
ner = EntityRecognizer(nlp.vocab, "pt_core_news_sm")

In [None]:
ner.initialize(lambda: examples, nlp=nlp)

In [None]:
nlp.analyze_pipes(pretty=True)

In [None]:
nlp = spacy.load("pt_core_news_sm")

In [None]:
ruler = nlp.add_pipe("entity_ruler")

In [None]:
ruler

In [None]:
colors = {
        "CULTURA": "linear-gradient(90deg, #2ADB5E, #1FA346)", 
        "QUANTIDADE": "linear-gradient(90deg, #09D6FF, #08A0D1)", 
        "FAZENDA": "linear-gradient(90deg, #aa9cfc, #fc9ce7)", 
        "SAFRA": "linear-gradient(90deg, #FFC90E, #BA930A)", 
        "CONTRATO": "linear-gradient(90deg, #B5B5B5, #8A8A8A)"}

patternsCult = [
    {
        "label":"CULTURA",
        "pattern": [
            {"LOWER": "soja", "OP":"?"},
            {"LOWER": "milho", "OP":"?"},
        ]    
    }
]  

patternsQuant = [
    {
        "label":"QUANTIDADE",
        "pattern": [
            {"LOWER": "saldo", "OP":"?"},
            {"LOWER": "total"},
            {"LOWER": "entregue", "OP":"?"},
        ]    
    }
]

patternsSafra = [
    {
        "label":"SAFRA",
        "pattern": [
            {"LOWER": "safra", "OP":"?"},
            {"LOWER": "safras", "OP":"?"},
            {"SHAPE": "dd/dd"},
        ]    
    }
]


patternsFazenda = [
    {
        "label":"FAZENDA",
        "pattern":[
            {"LOWER": "fazenda", "OP":"?"},
            {"ORTH": "Santa"}, {"ORTH": "Rita"}
        ]
        
    }
]

patternsContrato = [
    {
        "label":"CONTRATO",
        "pattern": [
            {"LOWER": "contrato", "OP":"?"},
            {"LOWER": "contratos", "OP":"?"},
            {"SHAPE": "dddX", "OP":"?"},
        ]
        
    }
]

patterns = patternsCult + patternsQuant + patternsFazenda + patternsSafra + patternsContrato

In [None]:
ruler.add_patterns(patterns)

In [None]:
text = "Eu gostaria de saber o saldo total do meu contrato 658S de soja e total entregue de milho para as safras 22/23 e 23/24 pela fazenda Santa Rita."
doc = nlp(text)

In [None]:
nlp.remove_pipe('entity_ruler')

In [None]:
# Adicionar novas entidades ao documento existente
new_ents = []
for ent in doc.ents:
    new_ents.append({
        "start": ent.start_char,
        "end": ent.end_char,
        "label": ent.label_
    })
new_ents    

In [None]:
for ent in nlp(text).ents:
    new_ents.append({
        "start": ent.start_char,
        "end": ent.end_char,
        "label": ent.label_
    })
    
new_ents    

In [None]:
doc.ents = new_ents

In [None]:
displacy.render(doc, style="ent", options={"colors": colors})

In [None]:
# Imprimir as entidades identificadas e as informações de POS, TAG e lemma
for token in doc:
    print(token.text, token.pos_, token.tag_, token.lemma_, token.ent_type_, token.ent_iob_)

In [None]:
# Lemmatization for tokens 
lemmatization = pd.DataFrame(data=[], \
  columns=["id", "Texto","Lemma", "Tag", "Tag_explainned", "token_POS", "POS_explainned", "dep", "T. Head", "dep explained"])
i = 0
for token in doc:
    lemmatization.loc[i,"id"] = token.i
    lemmatization.loc[i,"Texto"] = token.text
    lemmatization.loc[i,"Lemma"] = token.lemma_
    lemmatization.loc[i,"Tag"] = token.tag_
    lemmatization.loc[i,"Tag_explainned"] = spacy.explain(token.tag_)
    lemmatization.loc[i,"token_POS"] = token.pos_
    lemmatization.loc[i,"POS_explainned"] = spacy.explain(token.pos_)
    lemmatization.loc[i,"dep"] = token.dep_
    lemmatization.loc[i,"T. Head"] = token.head.text
    lemmatization.loc[i,"dep explained"] = token.morph
    
    i = i+1

lemmatization

In [None]:
# POS Tagging
pos_tagging = pd.DataFrame(data=[], \
  columns=["id", "T_texto","T_lemma_", "T_pos_", "T_tag_", "T_dep_", "T_head", "T_is_sent_start", "T_shape_", "T_is_alpha", "T_is_stop"])
i = 0
for token in doc:
    pos_tagging.loc[i,"id"] = token.i
    pos_tagging.loc[i,"T_texto"] = token.text
    pos_tagging.loc[i,"T_lemma_"] = token.lemma_
    pos_tagging.loc[i,"T_pos_"] = token.pos_
    pos_tagging.loc[i,"T_tag_"] = token.tag_
    pos_tagging.loc[i,"T_dep_"] = token.dep_
    pos_tagging.loc[i,"T_head"] = token.head
    pos_tagging.loc[i,"T_is_sent_start"] = token.is_sent_start
    pos_tagging.loc[i,"T_shape_"] = token.shape_
    pos_tagging.loc[i,"T_is_alpha"] = token.is_alpha
    pos_tagging.loc[i,"T_is_stop"] = token.is_stop

    i = i+1

pos_tagging

In [None]:
ruler = nlp.add_pipe("entity_ruler")
ruler.add_patterns(patternsCult)
nlp.remove_pipe(ruler)

In [None]:
nlp = spacy.blank("pt")
nlp.add_pipe("tagger")


In [None]:
nlp = spacy.blank("pt")
nlp.add_pipe("morphologizer")


In [None]:
nlp = spacy.blank("pt")
nlp.add_pipe("tagger")
nlp.add_pipe("morphologizer")


### Reserva

In [None]:
def show_ent(text, patterns):
    nlp = spacy.blank("pt")
    ruler = nlp.add_pipe("entity_ruler")
    ruler.add_patterns(patterns)
    doc = nlp(text)
    return doc

In [None]:
def save_patterns_to_file(patterns, filename):
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(patterns, f, ensure_ascii=False)

def load_patterns_from_file(filename):
    with open(filename, 'r', encoding='utf-8') as f:
        patterns = json.load(f)
    return patterns


In [None]:
def show_ent_new(text, patterns):
    nlp = spacy.blank("pt")
    ruler = nlp.add_pipe("entity_ruler")
    ruler.add_patterns(patterns)
    doc = nlp(text)
    
    tokens = []
    ents = []
    
    for ent in doc.ents:
        span = doc.char_span(ent.start_char, ent.end_char, label=ent.label_)
        ents.append(span)
        
    for token in doc:
        start = token.idx
        end = start + len(token)
        tokens.append((token.text, start, end))
        
    return tokens, ents

In [None]:
patternsCult = [
    {
        "label":"CULTURA",
        "pattern": [
            {"LOWER": "soja", "OP":"?"},

        ]    
    }
]

In [None]:
patternsQuant = [
    {
        "label":"QUANTIDADE",
        "pattern": [
            {"LOWER": "saldo", "OP":"?"},
            # {"LOWER": {"IN": ["total", "entregue"]}},
            {"LOWER": "total"},
            {"LOWER": "entregue", "OP":"?"},
        ]    
    }
]

In [None]:
text = "Eu gostaria de saber o saldo total do meu contrato de soja e total entregue de milho para as safras 22/23 e 23/24 pela fazenda Santa Rita."

In [None]:
text = "Eu gostaria de saber o saldo total do meu contrato de soja e total entregue de milho para as safras 22/23 e 23/24."

In [None]:
patterns = patternsCult + patternsQuant

In [None]:
doc = show_ent(text, patterns=patterns)



In [None]:
colors = {"CULTURA": "green", "QUANTIDADE": "orange"}

In [None]:
displacy.render(doc, style="ent", options={"colors": colors})

In [None]:
patternsOthers = [  
                    {"label": "ORG", "pattern": [{"LOWER": "fast"}, {"lower": "innovation"}], "id": "fastinnovation"},
                    {"label": "PERSON", "pattern": "LOWER": "daniel", "id": "daniel-nascimento"},
                    {"label": "PERSON", "pattern": [{"LOWER": "daniel"}, {"LOWER": "silva"}, {"LOWER": "do"}, {"LOWER": "nascimento"}], "id": "daniel-nascimento"},
                    {"label": "PERSON", "pattern": [{"LOWER": "daniel"}, {"LOWER": "nascimento"}], "id": "daniel-nascimento"} ]  