In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import tqdm
import os
import glob

In [None]:
import spacy

In [None]:
# Load the `en_core_web_lg` pipeline once; most cells below reuse this `nlp` object.
nlp = spacy.load('en_core_web_lg')

## Spacy
- https://spacy.io/usage/spacy-101

In [None]:
# Run the four words of the analogy through the pipeline, in the same order as before.
word1, word2, word3, word4 = (nlp(w) for w in ('man', 'woman', 'king', 'queen'))

In [None]:
# Analogy vector: king - man + woman (compared against "queen" in the next cell).
test = word3.vector - word1.vector + word2.vector

In [None]:
# Cosine similarity between the analogy vector and the vector of word4 ("queen").
queen_vec = word4.vector
norm_product = np.linalg.norm(test) * np.linalg.norm(queen_vec)
cos_sim = np.dot(test, queen_vec) / norm_product

In [None]:
cos_sim

## Linguistic Annotations and Features
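- Attribute names without a trailing underscore (e.g. `token.pos`) return integer hash values; the underscore variants (e.g. `token.pos_`) return the readable string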
- https://spacy.io/usage/linguistic-features

In [None]:
# Example sentence; `doc` is reused by the annotation, entity, tokenizer and lexeme cells below.
doc = nlp("Apple is looking at buying U.K.'s startup for $1 billion")

### Part of speech tagging, morphology, lemmatization, and entity linking
- parse trees

In [None]:
# One row per token of `doc`: surface form, lemma, tags, dependency head,
# entity annotations and the token's syntactic children.
token_cols = [
    'text', 'lemma', 'pos', 'tag', 'dep',
    'shape', 'alpha', 'stop', 'morph',
    'head text', 'head pos', 'ent iob', 'ent iob_', 'ent type',
    'kb id', 'children',
]
token_rows = [
    [
        tok.text, tok.lemma_, tok.pos_, tok.tag_, tok.dep_,
        tok.shape_, tok.is_alpha, tok.is_stop, tok.morph,
        tok.head.text, tok.head.pos_, tok.ent_iob, tok.ent_iob_, tok.ent_type_,
        tok.ent_kb_id_, list(tok.children),
    ]
    for tok in doc
]
df_token = pd.DataFrame(token_rows, columns=token_cols)

In [None]:
df_token

### Dependency parsing

In [None]:
# Noun chunks: each chunk with its root token, the root's dependency label
# and the text of the root's head.
chunk_rows = [
    [chunk.text, chunk.root.text, chunk.root.dep_, chunk.root.head.text]
    for chunk in doc.noun_chunks
]
df_chunk = pd.DataFrame(
    chunk_rows,
    columns=['text', 'root text', 'root dep', 'root head text'],
)

In [None]:
df_chunk

In [None]:
# Finding a verb with a subject: collect the heads of all nominal-subject
# tokens whose head is tagged as a verb (compared via integer symbol IDs).
nsubj = spacy.symbols.nsubj
VERB = spacy.symbols.VERB
verbs = {
    tok.head
    for tok in doc
    if tok.dep == nsubj and tok.head.pos == VERB
}
print(verbs)

In [None]:
# Iterating around a local tree: the syntactic children found to the left
# and to the right of one token of `doc`.
word = doc[2]
doc_lefts = [child.text for child in word.lefts]
doc_rights = [child.text for child in word.rights]

In [None]:
# Report the token plus its left/right children alongside the child counts
# that spaCy exposes on the token itself.
print('Word:', word)
for side, count, texts in (
    ('lefts', word.n_lefts, doc_lefts),
    ('rights', word.n_rights, doc_rights),
):
    print(f'Doc {side} ({count}): {texts}')

In [None]:
# Render `doc` with displaCy's dependency ('dep') style.
spacy.displacy.render(doc, style='dep')

#### Credit example

In [None]:
# Sentence used by the subtree and span-merging cells below.
doc_credit = nlp("Credit and mortgage account holders must submit their requests")

In [None]:
# Find ancestors in subtree
# The root is the token that is its own head; the subject is taken to be the
# root's first left child.
root = [tok for tok in doc_credit if tok.head == tok][0]
subject = list(root.lefts)[0]

subtree_rows = []
for node in subject.subtree:
    # Every member of the subtree is the subject itself or one of its descendants.
    assert subject is node or subject.is_ancestor(node)
    ancestor_texts = [anc.text for anc in node.ancestors]
    subtree_rows.append(
        [node.text, node.dep_, node.n_lefts, node.n_rights, ancestor_texts]
    )

df_subtree = pd.DataFrame(
    subtree_rows,
    columns=['text', 'dep', 'n lefts', 'n rights', 'ancestors'],
)

In [None]:
df_subtree

In [None]:
# Use edges to create a span
# The span runs from the left edge to the right edge of doc_credit[4].
anchor = doc_credit[4]
span = doc_credit[anchor.left_edge.i : anchor.right_edge.i + 1]

# NOTE: the merge retokenizes `doc_credit` in place, so re-running this cell
# operates on the already-merged document.
with doc_credit.retokenize() as retok:
    retok.merge(span)

span_rows = [[tok.text, tok.pos_, tok.dep_, tok.head.text] for tok in doc_credit]
df_span = pd.DataFrame(span_rows, columns=['text', 'pos', 'dep', 'head text'])

In [None]:
df_span

In [None]:
# Manually extract information from text
# Merge noun phrases and entities for easier analysis
# (left disabled here, so MONEY entities are inspected token by token)
#nlp.add_pipe("merge_entities")
#nlp.add_pipe("merge_noun_chunks")

TEXTS = [
    "Net income was $9.4 million compared to the prior year of $2.7 million.",
    "Revenue exceeded twelve billion dollars, with a loss of $1b.",
]
for doc_text in nlp.pipe(TEXTS):
    for tok in doc_text:
        # Only tokens that belong to a MONEY entity are of interest.
        if tok.ent_type_ != "MONEY":
            continue
        if tok.dep_ in ("attr", "dobj"):
            # Attribute or direct object: look for a subject left of the head.
            subjects = [w for w in tok.head.lefts if w.dep_ == "nsubj"]
            if subjects:
                print(subjects[0], "-->", tok)
        elif tok.dep_ == "pobj" and tok.head.dep_ == "prep":
            # Prepositional object: report the word the preposition attaches to.
            print(tok.head.head, "-->", tok)

### Named entities

In [None]:
# One row per named entity of `doc`: text, character offsets, label and KB id.
ent_rows = [
    [ent.text, ent.start_char, ent.end_char, ent.label_, ent.kb_id_]
    for ent in doc.ents
]
df_ent = pd.DataFrame(
    ent_rows,
    columns=['text', 'start', 'end', 'label', 'knowledge base id'],
)

In [None]:
df_ent

In [None]:
# Render `doc` with displaCy's entity ("ent") style.
spacy.displacy.render(doc, style="ent")

#### FB example

In [None]:
# Set fb as an entity
# Both options below are executed one after the other on the same doc.
doc_fb = nlp("fb is hiring a new vice president of global policy")
# "Before" lists character offsets (start_char / end_char).
ents = [(e.text, e.start_char, e.end_char, e.label_) for e in doc_fb.ents]
print('Before', ents)
# The model didn't recognize "fb" as an entity :(

# Create a span for the new entity (token 0 up to, not including, token 1)
fb_ent = spacy.tokens.Span(doc_fb, 0, 1, label="ORG")
# Snapshot the entities *before* any modification; Option 2 builds on this list.
orig_ents = list(doc_fb.ents)

# Option 1: Modify the provided entity spans, leaving the rest unmodified
doc_fb.set_ents([fb_ent], default="unmodified")

# Option 2: Assign a complete list of ents to doc.ents
doc_fb.ents = orig_ents + [fb_ent]

# "After" lists token indices (start / end), not character offsets as in "Before".
ents = [(e.text, e.start, e.end, e.label_) for e in doc_fb.ents]
print('After', ents)

In [None]:
# Set fb as an entity using arrays
# make_doc only tokenizes, so the doc starts without entity annotations.
doc_fb = nlp.make_doc("fb is hiring a new vice president of global policy")
print("Before", doc_fb.ents)  # expected: ()

# One row per token, one column per attribute listed in `header`.
header = [spacy.attrs.ENT_IOB, spacy.attrs.ENT_TYPE]
attr_array = np.zeros((len(doc_fb), len(header)), dtype="uint64")
attr_array[0, 0] = 3  # B
attr_array[0, 1] = doc_fb.vocab.strings["ORG"]
doc_fb.from_array(header, attr_array)
print("After", doc_fb.ents)  # expected: (fb,)

### Tokenization

- customizing tokenizer class
- modify existing rule sets
- building a basic white space tokenizer
- using third party tokenizers (e.g. BERT)
- training with a custom tokenizer

In [None]:
# Add special case rule
# NOTE: this modifies the tokenizer of the shared `nlp` object, so it affects
# every later call to `nlp`.
ORTH = spacy.symbols.ORTH
special_case = [{ORTH: "gim"}, {ORTH: "me"}]
nlp.tokenizer.add_special_case("gimme", special_case)

# Check new tokenization
print([w.text for w in nlp("...gimme! that")])  # expected: ['...', 'gim', 'me', '!', 'that']

In [None]:
# Debug view of the tokenizer on the sample sentence (per the spaCy docs, it
# reports which rule or pattern produced each token).
nlp.tokenizer.explain(doc.text)

In [None]:
# Peek at the first few default prefix, suffix and infix patterns.
nlp.Defaults.prefixes[:10], nlp.Defaults.suffixes[:10], nlp.Defaults.infixes[:2]

In [None]:
# Number of default prefix, suffix and infix patterns.
len(nlp.Defaults.prefixes), len(nlp.Defaults.suffixes), len(nlp.Defaults.infixes)

In [None]:
# Build a Doc by hand from (word, followed-by-space) pairs instead of running
# the tokenizer.
word_space_pairs = [("Hello", False), (",", True), ("world", False), ("!", False)]
words = [w for w, _ in word_space_pairs]
spaces = [s for _, s in word_space_pairs]
doc_from_words = spacy.tokens.Doc(nlp.vocab, words=words, spaces=spaces)
print(doc_from_words.text)

In [None]:
# Align a foreign tokenization ("a") with spaCy's tokenization ("b") of the
# same text and print the mappings in both directions.
other_tokens = ["i", "listened", "to", "obama", "'", "s", "podcasts", "."]
spacy_tokens = ["i", "listened", "to", "obama", "'s", "podcasts", "."]
align = spacy.training.Alignment.from_strings(other_tokens, spacy_tokens)

a_to_b = align.x2y
# array([1, 1, 1, 1, 1, 1, 1, 1])
print(f"a -> b, lengths: {a_to_b.lengths}")
# array([0, 1, 2, 3, 4, 4, 5, 6]) : two tokens both refer to "'s"
print(f"a -> b, mapping: {a_to_b.data}")

b_to_a = align.y2x
# array([1, 1, 1, 1, 2, 1, 1])   : the token "'s" refers to two tokens
print(f"b -> a, lengths: {b_to_a.lengths}")
# array([0, 1, 2, 3, 4, 5, 6, 7])
print(f"b -> a, mappings: {b_to_a.data}")

### Merging and splitting

In [None]:
# Parse a sentence in which "New York" is still two separate tokens.
doc_ny = nlp("I live in New York")
token_texts = [t.text for t in doc_ny]
print("Before:", token_texts)
spacy.displacy.render(doc_ny)

In [None]:
# Merge tokens 3-4 into a single token (in place) and give it an explicit lemma.
with doc_ny.retokenize() as retok:
    retok.merge(doc_ny[3:5], attrs={"LEMMA": "new york"})
token_texts = [t.text for t in doc_ny]
print("After:", token_texts)
spacy.displacy.render(doc_ny)

In [None]:
# Same sentence, but with "NewYork" written as one word, to demonstrate splitting.
doc_ny = nlp("I live in NewYork")
token_texts = [t.text for t in doc_ny]
print("Before:", token_texts)
spacy.displacy.render(doc_ny)

In [None]:
# Split token 3 ("NewYork") into "New" + "York", in place.
with doc_ny.retokenize() as retokenizer:
    # One head per new subtoken. Per the spaCy retokenizer.split docs, a
    # (token, i) tuple refers to the i-th subtoken of the token being split and
    # a plain token refers to an existing token: "New" attaches to "York",
    # and "York" attaches to doc_ny[2].
    heads = [(doc_ny[3], 1), doc_ny[2]]
    # Attribute values are given per subtoken, in subtoken order.
    attrs = {"POS": ["PROPN", "PROPN"], "DEP": ["pobj", "compound"]}
    retokenizer.split(doc_ny[3], ["New", "York"], heads=heads, attrs=attrs)
print("After:", [token.text for token in doc_ny])
spacy.displacy.render(doc_ny)

In [None]:
# Same split on a fresh doc, without attrs; each head tuple points a subtoken
# at its own index (subtoken 0 -> 0, subtoken 1 -> 1), i.e. at itself
# (per the spaCy retokenizer.split docs).
doc_ny = nlp("I live in NewYork")
with doc_ny.retokenize() as retokenizer:
    heads = [(doc_ny[3], 0), (doc_ny[3], 1)]
    retokenizer.split(doc_ny[3], ["New", "York"], heads=heads)
print("After:", [token.text for token in doc_ny])
spacy.displacy.render(doc_ny)

In [None]:
# Register the custom `token._.is_city` attribute (False unless set).
# force=True lets this cell be re-run in the same kernel: without it, spaCy
# raises because the extension name is already registered.
spacy.tokens.Token.set_extension("is_city", default=False, force=True)

In [None]:
# Custom extension attributes can be written during a merge through the "_"
# key of `attrs`.
doc_ny = nlp("I live in New York")
print("Before:", [(t.text, t._.is_city) for t in doc_ny])

with doc_ny.retokenize() as retok:
    retok.merge(doc_ny[3:5], attrs={"_": {"is_city": True}})
print("After:", [(t.text, t._.is_city) for t in doc_ny])

### Sentence segmentation

In [None]:
# dependency parse (default)
# The shared `nlp` pipeline is used here; the printed flag shows whether the
# doc carries sentence-start annotation.
doc_sent = nlp("This is a sentence. This is another sentence.")
print('Doc has sent start annotation:', doc_sent.has_annotation("SENT_START"))
for sent in doc_sent.sents:
    print(sent.text)

In [None]:
# statistical segmenter
# Load the pipeline without the parser and switch on the "senter" component.
nlp_sent = spacy.load("en_core_web_lg", exclude=["parser"])
nlp_sent.enable_pipe("senter")
# Process the text with `nlp_sent` (not the global `nlp`), otherwise the
# sentences would still come from the default parser-based pipeline.
doc_sent = nlp_sent("This is a sentence. This is another sentence.")
for sent in doc_sent.sents:
    print(sent.text)

In [None]:
# rule based pipeline
nlp_sent = spacy.lang.en.English()  # just the language with no pipeline
nlp_sent.add_pipe("sentencizer")
# Process the text with `nlp_sent` (not the global `nlp`) so the sentence
# boundaries actually come from the rule-based sentencizer.
doc_sent = nlp_sent("This is a sentence. This is another sentence.")
for sent in doc_sent.sents:
    print(sent.text)

In [None]:
# custom rule based
text = "this is a sentence...hello...and another sentence."

nlp_sent = spacy.load("en_core_web_lg")
# "Before" uses the same freshly loaded pipeline that receives the custom
# component below, so the two printouts differ only by that component.
doc_sent = nlp_sent(text)
print("Before:", [sent.text for sent in doc_sent.sents])

@spacy.language.Language.component("set_custom_boundaries")
def set_custom_boundaries(doc):
    """Mark the token following every "..." token as a sentence start.

    The last token is skipped because nothing follows it. The doc is
    modified in place and returned, as pipeline components must do.
    """
    for token in doc[:-1]:
        if token.text == "...":
            doc[token.i + 1].is_sent_start = True
    return doc

# Inserted before the parser, so the boundaries are set before the parser runs.
nlp_sent.add_pipe("set_custom_boundaries", before="parser")
doc_sent = nlp_sent(text)
print("After:", [sent.text for sent in doc_sent.sents])

### Mappings and exceptions

In [None]:
# Separate pipeline instance, so the extra attribute-ruler rules do not touch `nlp`.
nlp_who = spacy.load("en_core_web_lg")
text = "I saw The Who perform. Who did you see?"

# Tags before adding any rule; expected: "DT DET" then "WP PRON".
doc_who = nlp_who(text)
for idx in (2, 3):
    print(doc_who[idx].tag_, doc_who[idx].pos_)

# Add attribute ruler with exception for "The Who" as NNP/PROPN NNP/PROPN
ruler = nlp_who.get_pipe("attribute_ruler")
# Pattern to match "The Who"
patterns = [[{"LOWER": "the"}, {"TEXT": "Who"}]]
# The attributes to assign to the matched token
attrs = {"TAG": "NNP", "POS": "PROPN"}
# One rule per token of the match: index 0 is "The", index 1 is "Who".
for match_index in (0, 1):
    ruler.add(patterns=patterns, attrs=attrs, index=match_index)

# Tags after adding the rules; expected: "NNP PROPN", "NNP PROPN", and
# "WP PRON" for the second "Who", which remains unmodified.
doc_who_ruler = nlp_who(text)
for idx in (2, 3, 5):
    print(doc_who_ruler[idx].tag_, doc_who_ruler[idx].pos_)

### Vectors and Similarity
- similarity is subjective
- sentence/document embeddings here are the mean of the word vectors (i.e. insensitive to word order)

In [None]:
# Three ordinary words plus one made-up string, inspected for vectors in the next cell.
tokens = nlp('dog cat banana afskfsd')

In [None]:
# One row per token: whether it has a vector, the vector's norm and whether
# the token is out of vocabulary.
vec_rows = [
    [tok.text, tok.has_vector, tok.vector_norm, tok.is_oov]
    for tok in tokens
]
# The first column was misspelled 'texx'; it holds the token text.
df_vec = pd.DataFrame(vec_rows, columns=['text', 'vector', 'norm', 'oov'])

In [None]:
df_vec

In [None]:
# Two sentences for the similarity demos; `doc1` and `doc2` are reused below.
doc1 = nlp('I like salty fries and hamburgers.')
doc2 = nlp('Fast food tastes very good.')

# Whole-document similarity.
sentence_sim = doc1.similarity(doc2)
print('Sentence similarity:', sentence_sim)
# Similarity between the span doc1[2:4] and the single token doc1[5].
span_token_sim = doc1[2:4].similarity(doc1[5])
print('Word similarity:', span_token_sim)

In [None]:
# Keep only nouns and proper nouns of each sentence, re-parse the reduced
# text and compare the results.
noun_tags = ('NOUN', 'PROPN')
doc1_nouns = nlp(' '.join(str(t) for t in doc1 if t.pos_ in noun_tags))
doc2_nouns = nlp(' '.join(str(t) for t in doc2 if t.pos_ in noun_tags))
print('Doc1:', doc1_nouns)
print('Doc2:', doc2_nouns)
print('Similarity:', doc1_nouns.similarity(doc2_nouns))

In [None]:
# Drop stop words from each sentence, re-parse the reduced text and compare.
doc1_kept = [str(t) for t in doc1 if not t.is_stop]
doc2_kept = [str(t) for t in doc2 if not t.is_stop]
doc1_no_stop_words = nlp(' '.join(doc1_kept))
doc2_no_stop_words = nlp(' '.join(doc2_kept))
print('Doc1:', doc1_no_stop_words)
print('Doc2:', doc2_no_stop_words)
print('Similarity:', doc1_no_stop_words.similarity(doc2_no_stop_words))

In [None]:
# Build a small standalone vocab with random 300-dimensional vectors.
# A seeded generator makes the vectors identical on every Restart & Run All.
rng = np.random.default_rng(42)
vector_data = {
    key: rng.uniform(-1, 1, (300,))
    for key in ("dog", "cat", "orange")
}

vocab_new = spacy.vocab.Vocab()
# The loop names are deliberately distinct from the notebook-level `word`
# token defined in the local-tree cell, so this loop no longer overwrites it.
for vec_key, vec in vector_data.items():
    vocab_new.set_vector(vec_key, vec)

## Pipelines, Architecture, Serialization, Training, and Language data

- text → nlp → doc
  - text → (tokenizer → processing pipeline) → doc
  - text → (tokenizer → (tagger → parser → ner → lemmatizer → textcat → custom)) → doc
- doc, docbin, example, language, lexeme, span, spangroup, token
- creating and registering custom language subclass (e.g. adding stop words)

In [None]:
# Look up each token's text in the vocab and tabulate the attributes of the
# resulting lexeme.
lex_cols = ['text', 'orth', 'shape', 'prefix', 'suffix', 'alpha', 'digit', 'title', 'lang']
lex_rows = []
for tok in doc:
    lex = doc.vocab[tok.text]
    lex_rows.append([
        lex.text, lex.orth, lex.shape_, lex.prefix_, lex.suffix_,
        lex.is_alpha, lex.is_digit, lex.is_title, lex.lang_,
    ])
df_lex = pd.DataFrame(lex_rows, columns=lex_cols)

In [None]:
df_lex

In [None]:
# Round trip through the string store: string -> hash -> string.
string_store = nlp.vocab.strings
apple_hash = string_store['apple']
apple_str = string_store[apple_hash]
apple_hash, apple_str

In [None]:
class CustomEnglishDefaults(spacy.lang.en.English.Defaults):
    """English defaults whose stop-word set is replaced by two custom words."""

    stop_words = {"custom", "stop"}


class CustomEnglish(spacy.lang.en.English):
    """English subclass with its own `lang` code and the custom defaults."""

    lang = "custom_en"
    Defaults = CustomEnglishDefaults


nlp1 = spacy.lang.en.English()
nlp2 = CustomEnglish()

# Same text under the stock and the custom language class: print the language
# code and the per-token stop-word flags.
for pipeline in (nlp1, nlp2):
    print(pipeline.lang, [token.is_stop for token in pipeline("custom stop")])

## Sentence Encoders
- Universal sentence encoder via [spacy](https://spacy.io/universe/project/spacy-universal-sentence-encoder) and [tensorflow](https://www.tensorflow.org/hub/tutorials/semantic_similarity_with_tf_hub_universal_encoder)
- [Sentence transformers](https://huggingface.co/sentence-transformers)
- Can also embed paragraphs

In [None]:
#import spacy_universal_sentence_encoder

#nlp = spacy_universal_sentence_encoder.load_model('en_use_lg')
#doc = nlp("This is a test sentence.")
#print(doc.vector)

In [None]:
#import tensorflow_hub as hub
#import tensorflow as tf

#embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
#embeddings = embed(["This is a test sentence.", "And another one."])
#print(embeddings)