<h3>Overwriting custom extension attributes</h3>

Iterate over the base noun phrases in the span. Yields base noun-phrase Span objects, if the document has been syntactically parsed. A base noun phrase, or “NP chunk”, is a noun phrase that does not permit other NPs to be nested within it – so no NP-level coordination, no prepositional phrases, and no relative clauses.

If the noun_chunk syntax iterator has not been implemented for the given language, a NotImplementedError is raised.


https://spacy.io/usage/linguistic-features#retokenization

In [None]:
import spacy
from spacy.tokens import Token

# Register a custom token attribute, token._.is_musician.
# set_extension raises when the name is already registered, so the call is
# guarded to keep this cell safe to re-run within the same session.
if not Token.has_extension("is_musician"):
    Token.set_extension("is_musician", default=False)

nlp = spacy.load("en_core_web_sm")
doc = nlp("I like David Bowie")
print("Before:", [(token.text, token._.is_musician) for token in doc])

# Merge doc[2:4] into a single token and overwrite the custom attribute on
# the merged token through the special "_" key of attrs.
with doc.retokenize() as retokenizer:
    retokenizer.merge(doc[2:4], attrs={"_": {"is_musician": True}})
print("After:", [(token.text, token._.is_musician) for token in doc])

### What does the "yield" keyword do in Python?

In [None]:
def _get_child_candidates(self, distance, min_dist, max_dist):
    if self._leftchild and distance - max_dist < self._median:
        yield self._leftchild
    if self._rightchild and distance + max_dist >= self._median:
        yield self._rightchild 

In [None]:
# Fragment of a search routine: the enclosing ``def`` is not part of this
# cell, so ``self``, ``obj``, ``min_dist`` and ``max_dist`` have to come from
# it (the trailing ``return`` is only valid inside a function body).
# ``candidates`` is used as a stack of nodes that still have to be visited.
result, candidates = [], [self]
while candidates:
    node = candidates.pop()
    distance = node._get_dist(obj)
    # Keep the node's values when its distance lies in [min_dist, max_dist].
    if distance <= max_dist and distance >= min_dist:
        result.extend(node._values)
    # Push whatever children the node yields so a later iteration visits them.
    candidates.extend(node._get_child_candidates(distance, min_dist, max_dist))
return result

### Teste substituição

In [None]:
import spacy
from spacy.language import Language
from spacy.matcher import Matcher
from spacy.tokens import Token

In [None]:
# The merger has to be built from the vocab shared by the pipeline, and that
# vocab is only reachable through the nlp object, so the component is
# registered as a factory.
@Language.factory("html_merger")
def create_bad_html_merger(nlp, name):
    """Create the ``html_merger`` component from the pipeline's vocab."""
    shared_vocab = nlp.vocab
    return BadHTMLMerger(shared_vocab)

class BadHTMLMerger:
    """Pipeline component that merges known markup tags into single tokens.

    Every span matched by one of the tag patterns is merged into one token,
    and ``token._.bad_html`` is set to True on the tokens of that span.
    """

    def __init__(self, vocab):
        """Build the tag matcher on top of ``vocab``.

        Args:
            vocab: Vocab the ``Matcher`` is created with (the factory passes
                ``nlp.vocab``).
        """
        patterns = [
            [{"ORTH": "<"}, {"LOWER": "r"}, {"ORTH": ">"}],
            [{"ORTH": "<"}, {"LOWER": "/r"}, {"ORTH": ">"}],
            [{"ORTH": "<"}, {"LOWER": "user_keys"}, {"ORTH": ">"}],
            [{"ORTH": "<"}, {"LOWER": "/user_keys"}, {"ORTH": ">"}],

            [{"ORTH": "<"}, {"LOWER": "user_intent"}, {"ORTH": ">"}],
            [{"ORTH": "<"}, {"LOWER": "/user_intent"}, {"ORTH": ">"}],

            [{"ORTH": "<"}, {"LOWER": "missing_keys"}, {"ORTH": ">"}],
            [{"ORTH": "<"}, {"LOWER": "/missing_keys"}, {"ORTH": ">"}],

            [{"ORTH": "<"}, {"LOWER": "/n"}, {"ORTH": ">"}],
            # Each pattern must itself be a list of token dicts, and a LOWER
            # value has to be lower-case to be able to match anything.
            [{"LOWER": "qt_saldo_contrato"}, {"ORTH": ">"}],
            [{"ORTH": "<"}, {"LOWER": "nro_safra"}, {"ORTH": ">"}],
        ]
        # Register the token extension that flags bad HTML. The guard keeps a
        # second instantiation of the component from raising on an
        # already-registered extension.
        if not Token.has_extension("bad_html"):
            Token.set_extension("bad_html", default=False)
        self.matcher = Matcher(vocab)
        self.matcher.add("BAD_HTML", patterns)

    def __call__(self, doc):
        """Merge every matched tag span in ``doc`` and flag its tokens.

        Args:
            doc: The Doc the pipeline is processing.

        Returns:
            The same ``doc`` after retokenization.
        """
        matches = self.matcher(doc)
        # Collect the matched spans before retokenizing.
        spans = [doc[start:end] for _match_id, start, end in matches]
        with doc.retokenize() as retokenizer:
            for span in spans:
                retokenizer.merge(span)
                for token in span:
                    token._.bad_html = True  # Mark token as bad HTML
        return doc

In [None]:
nlp = spacy.load("en_core_web_sm")

nlp.add_pipe("html_merger", last=True)  # Add component to the pipeline

In [None]:
print("Pipeline:", nlp.pipe_names)

In [None]:
# The closing tag is spelled "</r>": inside a normal string literal "\r" is a
# carriage-return escape, so "<\r>" never contained the "/r" text that the
# tag patterns look for.
doc = nlp("Gostaria de saber o saldo do meu contrato.<r>Claro, por favor, informe o número do contrato</r><user_keys>NM_FAZENDA</user_keys><user_intent>QT_SALDO_CONTRATO</user_intent><missing_keys>NR_CONTRATO</missing_keys>")

In [None]:
doc = nlp("Hello<br>world! <br/> This is a test </n> for the contract: <nro_contrato> and the safra: <nro_safra>.")

In [None]:
for token in doc:
    token_bad_html = token._.bad_html
    print(f'idx: {token.i:>3} | token.text: {token.text:>15} | token._.bad_html: {token_bad_html}')

Instead of hard-coding the patterns into the component, you could also make it take a path to a JSON file containing the patterns. This lets you reuse the component with different patterns, depending on your application. When adding the component to the pipeline with nlp.add_pipe, you can pass in the argument via the config:

In [None]:
# Variant of the factory that forwards a ``path`` setting from the component
# config (None unless overridden) to the component.
# NOTE(review): the BadHTMLMerger class defined earlier in this notebook only
# accepts ``vocab``; it would need a ``path`` parameter (and to receive
# ``nlp.vocab`` rather than ``nlp``) for this call to work - confirm before
# running this cell.
@Language.factory("html_merger", default_config={"path": None})
def create_bad_html_merger(nlp, name, path):
    return BadHTMLMerger(nlp, path=path)

# NOTE(review): "/path/to/patterns.json" looks like a placeholder path.
nlp.add_pipe("html_merger", config={"path": "/path/to/patterns.json"})

In [None]:
doc = nlp("I like New York in Autumn.")

In [None]:
i, like, new, york, in_, autumn, dot = range(len(doc))

In [None]:
dot

In [None]:
doc[new].head.text == "York"

In [None]:
doc[york].head.text == "like"

In [None]:
new_york = doc[new:york+1]

In [None]:
new_york.root.text == "York"

In [None]:


assert doc[new].head.text == "York"
assert doc[york].head.text == "like"
new_york = doc[new:york+1]
assert new_york.root.text == "York"

### Span

In [None]:
import spacy
from spacy.language import Language
from spacy.matcher import Matcher
from spacy.tokens import Token
from spacy.tokens import Span

In [None]:
doc = nlp("I like New York in Autumn.")

In [None]:
lefts = [t.text for t in doc[3:7].lefts]

In [None]:
lefts == ["New"]

In [None]:
doc = nlp("I like New York in Autumn.")

In [None]:
doc[3:7].n_lefts == 1

In [None]:
doc = nlp("Give it back! He pleaded.")


In [None]:
doc[:1]

In [None]:
doc[:2]

In [None]:
doc[:3]

In [None]:
doc[:5]

In [None]:
doc[1:4]

In [None]:
subtree = [t.text for t in doc[:5].subtree]

In [None]:
subtree == ["Give", "it", "back", "!"]

In [None]:
subtree

In [None]:
doc = nlp("Give it back! He pleaded.")
span = doc[4:5]

In [None]:
span

In [None]:
span.sent.text == "He pleaded."

In [None]:
doc = nlp("I like New York")
span = doc.char_span(7, 15, label="GPE")


In [None]:
span.text == "New York"

In [None]:
span.label_

### Retokenizer.merge

Mark a span for merging. The attrs will be applied to the resulting token (if they’re context-dependent token attributes like LEMMA or DEP) or to the underlying lexeme (if they’re context-independent lexical attributes like LOWER or IS_STOP). Writable custom extension attributes can be provided using the "_" key and specifying a dictionary that maps attribute names to values.

In [None]:
doc = nlp("I like David Bowie")
with doc.retokenize() as retokenizer:
    attrs = {"LEMMA": "David Bowie"}
    retokenizer.merge(doc[2:4], attrs=attrs)

In [None]:
doc[2]

### Retokenizer.split


Mark a token for splitting, into the specified orths. The heads are required to specify how the new subtokens should be integrated into the dependency tree. The list of per-token heads can either be a token in the original document, e.g. doc[2], or a tuple consisting of the token in the original document and its subtoken index. For example, (doc[3], 1) will attach the subtoken to the second subtoken of doc[3].

This mechanism allows attaching subtokens to other newly created subtokens, without having to keep track of the changing token indices. If the specified head token will be split within the retokenizer block and no subtoken index is specified, it will default to 0. Attributes to set on subtokens can be provided as a list of values. They’ll be applied to the resulting token (if they’re context-dependent token attributes like LEMMA or DEP) or to the underlying lexeme (if they’re context-independent lexical attributes like LOWER or IS_STOP).

In [None]:
import spacy
from spacy import displacy
import pandas as pd

In [None]:
nlp = spacy.load("pt_core_news_sm")

In [None]:
doc = nlp("Eu moro em NovoHamburgo")

In [None]:
# The closing tag is spelled "</r>": inside a normal string literal "\r" is a
# carriage-return escape, so "<\r>" put a control character into the text
# instead of the "/r" tag name.
doc = nlp("Gostaria de saber o saldo do meu contrato.<&>Claro, por favor, informe o número do contrato</r><user_keys>NM_FAZENDA</user_keys><user_intent>QT_SALDO_CONTRATO</user_intent><missing_keys>NR_CONTRATO</missing_keys>")

In [None]:
displacy.render(doc, style='dep',
                jupyter=True, options={'distance': 120})

In [None]:
# Per-token overview of the parsed doc: lemma, tag, POS and dependency info.
# The rows are collected first and the frame is built in one go, instead of
# growing an empty DataFrame cell by cell through .loc with a manual counter.
lemmatization = pd.DataFrame(
    [
        {
            "id": token.i,
            "Texto": token.text,
            "Lemma": token.lemma_,
            "Tag": token.tag_,
            "Tag_explainned": spacy.explain(token.tag_),
            "token_POS": token.pos_,
            "POS_explainned": spacy.explain(token.pos_),
            "dep": token.dep_,
            "T. Head": token.head.text,
            # NOTE(review): the column is labelled "dep explained" but holds
            # token.morph; spacy.explain(token.dep_) may have been intended.
            "dep explained": token.morph,
        }
        for token in doc
    ],
    # Passing the columns explicitly keeps the header even for an empty doc.
    columns=["id", "Texto", "Lemma", "Tag", "Tag_explainned", "token_POS",
             "POS_explainned", "dep", "T. Head", "dep explained"],
)

lemmatization

In [None]:
# Token patterns for the markup tags; each entry is one pattern, i.e. a list
# of per-token attribute dicts.
patterns = [
    [{"ORTH": "<"}, {"LOWER": "r"}, {"ORTH": ">"}],
    [{"ORTH": "<"}, {"LOWER": "/r"}, {"ORTH": ">"}],
    [{"ORTH": "<"}, {"LOWER": "user_keys"}, {"ORTH": ">"}],
    [{"ORTH": "<"}, {"LOWER": "/user_keys"}, {"ORTH": ">"}],

    [{"ORTH": "<"}, {"LOWER": "user_intent"}, {"ORTH": ">"}],
    [{"ORTH": "<"}, {"LOWER": "/user_intent"}, {"ORTH": ">"}],

    [{"ORTH": "<"}, {"LOWER": "missing_keys"}, {"ORTH": ">"}],
    [{"ORTH": "<"}, {"LOWER": "/missing_keys"}, {"ORTH": ">"}],

    [{"ORTH": "<"}, {"LOWER": "/n"}, {"ORTH": ">"}],
    # This entry used to be two bare dicts sitting directly in the outer list
    # with an upper-case LOWER value: it is now a proper pattern list, and the
    # value is lower-case so that it can match.
    [{"LOWER": "qt_saldo_contrato"}, {"ORTH": ">"}],
    [{"ORTH": "<"}, {"LOWER": "nro_safra"}, {"ORTH": ">"}],
]

In [None]:
# Start from the sentence this split was written for: doc[3] has to be the
# single token "NovoHamburgo", because the new orths must join back into the
# text of the token being split.
doc = nlp("Eu moro em NovoHamburgo")
with doc.retokenize() as retokenizer:
    # First subtoken attaches to subtoken 1 of doc[3]; the second attaches to
    # doc[2] of the original document.
    heads = [(doc[3], 1), doc[2]]
    attrs = {"POS": ["PROPN", "PROPN"],
             "DEP": ["pobj", "compound"]}
    retokenizer.split(doc[3], ["Novo", "Hamburgo"], heads=heads, attrs=attrs)

In [None]:
doc

In [None]:
# Per-token overview of the parsed doc: lemma, tag, POS and dependency info.
# The rows are collected first and the frame is built in one go, instead of
# growing an empty DataFrame cell by cell through .loc with a manual counter.
lemmatization = pd.DataFrame(
    [
        {
            "id": token.i,
            "Texto": token.text,
            "Lemma": token.lemma_,
            "Tag": token.tag_,
            "Tag_explainned": spacy.explain(token.tag_),
            "token_POS": token.pos_,
            "POS_explainned": spacy.explain(token.pos_),
            "dep": token.dep_,
            "T. Head": token.head.text,
            # NOTE(review): the column is labelled "dep explained" but holds
            # token.morph; spacy.explain(token.dep_) may have been intended.
            "dep explained": token.morph,
        }
        for token in doc
    ],
    # Passing the columns explicitly keeps the header even for an empty doc.
    columns=["id", "Texto", "Lemma", "Tag", "Tag_explainned", "token_POS",
             "POS_explainned", "dep", "T. Head", "dep explained"],
)

lemmatization

In [None]:
displacy.render(doc, style='dep',
                jupyter=True, options={'distance': 120})

In [None]:
a = '  string with spaces  '

In [None]:
a = a.strip()

In [None]:
a

In [None]:
a = '....string....'

In [None]:
a.strip('.')

In [None]:
a

In [None]:
a = 'thisthat'


In [None]:
print(a.rstrip('hat'))

In [None]:
message = '     Learn Python  '

# strip() without an argument trims whitespace from both ends
trimmed = message.strip()
print('Message:', trimmed)

# Prints -> Message: Learn Python

In [None]:
string = '  xoxo love xoxo   '

# Three strip sets, tried in turn:
#   None   -> only the leading and trailing whitespace is removed
#   ' xoe' -> every space, x, o and e on the left and right is removed
#   'stx'  -> the set has no space, so no characters are removed
for strip_chars in (None, ' xoe', 'stx'):
    print(string.strip(strip_chars))

string = 'android is awesome'
# Strips any run of 'a' / 'n' characters from both ends
print(string.strip('an'))

In [None]:
# Sample records, one string each, with newline-separated fields.
user_queries = [
    "Gostaria de saber o saldo do meu contrato\nClaro, por favor, informe o número do contrato.\nNM_FAZENDA,ID_SAFRA\nQT_SALDO_CONTRATO\nNR_CONTRATO",
    "Qual é o saldo do meu contrato?\nClaro, por favor, informe o número do contrato.\nNM_FAZENDA,ID_SAFRA\nQT_SALDO_CONTRATO\nNR_CONTRATO",
]
# ``text`` keeps pointing at the record that was added last.
text = user_queries[-1]
user_queries

In [None]:
user_queries

In [None]:
s = '\n \t \v\f Como é bom estudar MAC0110! \n\t\n\v'

In [None]:
s

In [None]:
s_limpa = s.strip()

In [None]:
s_limpa

In [None]:
lst = s.split()

In [None]:
lst 

In [None]:
s = ' linha 1 \n   linha 2 \n   linha3 ' 

In [None]:
s

In [None]:
text = "Gostaria de saber o sabor da minha pizza.\nClaro, por favor, informe o número do contrato.\nNM_FAZENDA,ID_SAFRA\nQT_SALDO_CONTRATO\nNR_CONTRATO"
textList = text.split('\n')
doc_list = nlp(textList[0])

In [None]:
doc = nlp("Gostaria de saber o saldo do meu contrato")

In [None]:
doc.similarity(doc_list)

In [None]:
apples = nlp("Gostaria de saber o saldo do meu contrato")
oranges = nlp("I like oranges")
apples_oranges = apples.similarity(oranges)

In [None]:
user_utter = textList[0]
user_utter

In [None]:
bot_utter = textList[1]
bot_utter

In [None]:
user_keys = textList[2].split(',')
user_keys 

In [None]:
intent_keys = textList[3].split(',')
intent_keys

In [None]:
missing_keys = textList[4]
missing_keys

In [None]:
# Sort on the similarity score only: the entries are (score, Doc) tuples, and
# a plain tuple sort falls back to comparing the Doc objects whenever two
# scores are equal.
analise_similiar.sort(key=lambda entry: entry[0])

## Similaridade e tals

In [1]:
import spacy
from spacy import displacy
import pandas as pd
import json

In [2]:
nlp = spacy.load("pt_core_news_lg")

In [4]:
doc = nlp("Qual é o saldo do meu contrato?")

In [5]:
filename = "/home/wklinux/spaCy/query_utter.json"  

In [6]:
def load_json(filename):
    """Open ``filename`` as UTF-8 text and return the decoded JSON content."""
    with open(filename, mode="r", encoding="utf-8") as handle:
        return json.load(handle)

In [7]:
def relevant_chunk(doc, param):
    """Return the text of the first matching noun chunk of ``doc``.

    A chunk matches when the lower-cased lemma of its root's head is
    "gostaria" or "qual" and the dependency label of its root equals
    ``param``. Returns None when no chunk matches.
    """
    trigger_lemmas = ["gostaria", "qual"]
    for chunk in doc.noun_chunks:
        root = chunk.root
        if root.head.lemma_.lower() not in trigger_lemmas:
            continue
        if root.dep_ == param:
            return chunk.text
    return None

# chunk.text, chunk.start, chunk.end, chunk.root.head.lemma_, chunk.root.dep_, chunk.doc            

In [8]:
data = load_json(filename)

In [9]:
# Visualize the DEP, HEAD, ROOT and LEMMA structure of each noun chunk in doc

for chunk in doc.noun_chunks:
  # Pull out the fields that the report line below prints.
  chunk_text = chunk.text
  chunk_root = chunk.root.text
  chunk_root_dep = chunk.root.dep_
  chunk_root_head = chunk.root.head.text
  chunk_root_head_dep = chunk.root.head.dep_
  chunk_root_head_lemma = chunk.root.head.lemma_
  
  # Entity information is read here but not included in the printed line.
  chunk_ents = chunk.ents
  chunk_root_ent_type = chunk.root.ent_type_
  
  # One right-aligned report line per chunk.
  print(f'1.chunk.text: {chunk_text:>15} | 2.ch.root: {chunk_root:>10} | 3.chunk.root.dep_: {chunk_root_dep:>6} | 4.ch.root.head: {chunk_root_head:>12} | 5.ch.root.head.dep_: {chunk_root_head_dep:>6} |  6.chunk.root.head.lemma_: {chunk_root_head_lemma:>9}')

1.chunk.text:            Qual | 2.ch.root:       Qual | 3.chunk.root.dep_:   ROOT | 4.ch.root.head:         Qual | 5.ch.root.head.dep_:   ROOT |  6.chunk.root.head.lemma_:      qual
1.chunk.text:         o saldo | 2.ch.root:      saldo | 3.chunk.root.dep_:  nsubj | 4.ch.root.head:         Qual | 5.ch.root.head.dep_:   ROOT |  6.chunk.root.head.lemma_:      qual
1.chunk.text:    meu contrato | 2.ch.root:   contrato | 3.chunk.root.dep_:   nmod | 4.ch.root.head:        saldo | 5.ch.root.head.dep_:  nsubj |  6.chunk.root.head.lemma_:     saldo


In [None]:
def relevant_chunk(doc, param):
    """Find the first noun chunk hanging off a trigger lemma with dep ``param``.

    Chunks are scanned in order; the text of the first one whose root's head
    lemma (lower-cased) is "gostaria" or "qual" and whose root dependency
    label equals ``param`` is returned. None is returned when none qualifies.
    """
    matching_texts = (
        chunk.text
        for chunk in doc.noun_chunks
        if chunk.root.head.lemma_.lower() in ["gostaria", "qual"]
        and chunk.root.dep_ == param
    )
    return next(matching_texts, None)

In [21]:
# Debug trace: for every noun chunk of the reference doc, walk all stored
# user utterances and report which of their noun chunks hang off one of the
# trigger lemmas.
# The utterances are parsed once up front: the parse does not depend on the
# outer chunk, so calling nlp() again for every chunk was repeated work.
doc_queries = [nlp(entry["user_utter"]) for entry in data]
for chunk in doc.noun_chunks:
    param_chunk_root_dep_ = chunk.root.dep_
    print(f'\n1. dentro primeiro loop - chunk: {chunk.text} | {param_chunk_root_dep_} | {chunk.doc} \n')
    for i, doc_query in enumerate(doc_queries):
        print(f'\n2. ============indice: {i} | doc_query: {doc_query} ===================\n')

        for chunk_query in doc_query.noun_chunks:
            chunk_root_head_lemma_ = (chunk_query.root.head.lemma_).lower()
            print(f'\n3. dentro segundo loop - chunk_query: {chunk_query.text} | {chunk_root_head_lemma_}')
            if chunk_root_head_lemma_ in ["gostaria", "qual"]:
                print(f'\n4. deu True para chunk_root_head_lemma_: {chunk_root_head_lemma_}')
            else:
                print(f'\n4. deu False')


1. dentro primeiro loop - chunk: Qual | ROOT | Qual é o saldo do meu contrato? 




3. dentro segundo loop - chunk_query: Qual | qual

4. deu True para chunk_root_head_lemma_: qual

3. dentro segundo loop - chunk_query: seria o saldo | qual

4. deu True para chunk_root_head_lemma_: qual

3. dentro segundo loop - chunk_query: meu contrato | saldo

4. deu False



3. dentro segundo loop - chunk_query: o saldo | saber

4. deu False

3. dentro segundo loop - chunk_query: meu contrato | saldo

4. deu False



3. dentro segundo loop - chunk_query: o saldo | saber

4. deu False

3. dentro segundo loop - chunk_query: meu contrato | saldo

4. deu False

1. dentro primeiro loop - chunk: o saldo | nsubj | Qual é o saldo do meu contrato? 




3. dentro segundo loop - chunk_query: Qual | qual

4. deu True para chunk_root_head_lemma_: qual

3. dentro segundo loop - chunk_query: seria o saldo | qual

4. deu True para chunk_root_head_lemma_: qual

3. dentro segundo loop - chunk_query: meu contrato | 

In [None]:
# Parse every stored user utterance and call relevant_chunk once per noun
# chunk of the parsed utterance.
# NOTE(review): relevant_chunk is defined in this notebook as
# relevant_chunk(doc, param); this call omits ``param`` and discards the
# return value - confirm the intended dependency label before running.
i = 0
for i in range(len(data)):
    doc_query = nlp(data[i]["user_utter"])
    for chunk in doc_query.noun_chunks:
        relevant_chunk(doc_query)
        

In [None]:
# NOTE(review): relevant_chunk(doc, param) as defined in this notebook needs a
# second argument and returns a single chunk text (or None), not an iterable
# of chunks - this comprehension needs revisiting before it can run.
[chunk for doc in nlp.pipe(user_queries) for chunk in relevant_chunk(doc)]

In [None]:
# Print the bot reply stored with each entry; iterating the entries directly
# replaces the dead ``i = 0`` initialisation and the index-based loop.
for entry in data:
    print(entry["bot_utter"])

In [None]:
# Score every stored query against the reference ``doc``. Only the text
# before the first newline of each query is parsed and compared.
analise_similiar = []
for query in user_queries:  # no need to copy the list just to iterate it
    doc_query = nlp(query.split('\n')[0])
    indice = doc.similarity(doc_query)
    analise_similiar.append((indice, doc_query))
    

In [None]:
# Print one report line per noun chunk of doc: token offsets, text, root,
# the root's head (text and lemma) and the dependency labels of both.
for chunk in doc.noun_chunks:
  # Assigned here but not used by the print below.
  param_chunk_root_dep_ = chunk.root.dep_
  print(f'start: {chunk.start:>2} end: {chunk.end:>2} | chunk.text: {chunk.text:>12} || chunk.root: {chunk.root.text:>12} | chunk.root.head: {chunk.root.head.text:>12} | chunk.root.head.lemma_: {chunk.root.head.lemma_:>9} | chunk.root.dep_: {chunk.root.dep_:>6} || chunk.root.head.dep_: {chunk.root.head.dep_:>6}')